feat(matrix-bridge-dagi): add egress, audit integration, fix router endpoint (PR-M1.4)

Closes the full Matrix ↔ DAGI loop.

Egress:
- invoke Router POST /v1/agents/{agent_id}/infer (request field: prompt, response field: response)
- send_text() reply to the Matrix room with idempotent txn_id = make_txn_id(room_id, event_id)
- empty reply → skip send (no spam)
- reply truncated to 4000 chars if needed

Audit (via sofiia-console POST /api/audit/internal):
- matrix.message.received (on ingress)
- matrix.agent.replied (on successful reply)
- matrix.error (on router/send failure, with error_code)
- best-effort (awaited, errors logged): audit failures never crash the loop

Router URL fix:
- DAGI_GATEWAY_URL now points to dagi-router-node1:8000 (not gateway:9300)
- Session ID: stable per room — matrix:{room_id with "!" removed and ":" replaced by "_"} (memory context)

9 tests: invoke endpoint, fallback fields, audit write, full cycle, dedupe, empty reply skip, metric callbacks

Made-with: Cursor
2026-03-03 08:06:49 -08:00
parent 8d564fbbe5
commit cad3663508
4 changed files with 540 additions and 307 deletions
--- a/services/matrix-bridge-dagi/app/ingress.py
+++ b/services/matrix-bridge-dagi/app/ingress.py
@@ -1,21 +1,25 @@
 """
-Matrix Ingress Loop — Phase M1.3
+Matrix Ingress + Egress Loop — Phase M1.4

-Polls Matrix /sync for new messages, invokes DAGI Gateway for mapped rooms.
-Does NOT send replies back (that is PR-M1.4 egress).
+Polls Matrix /sync for new messages, invokes DAGI Router for mapped rooms,
+sends agent replies back to Matrix, writes audit events to sofiia-console.

-Design:
-  - asyncio task, driven by run_ingress_loop()
-  - sync_poll() → extract_room_messages() per mapped room
-  - for each message: dedupe → invoke gateway → audit (fire-and-forget)
-  - next_batch token persisted in memory (restart resets to None — acceptable for M1)
-  - graceful shutdown via asyncio.Event
+Pipeline:
+  sync_poll() → extract_room_messages()
+  → for each message:
+    1. dedupe (mark_seen)
+    2. audit: matrix.message.received
+    3. invoke DAGI Router (/v1/agents/{agent_id}/infer)
+    4. send_text() reply to Matrix room
+    5. audit: matrix.agent.replied | matrix.error
+
+Graceful shutdown via asyncio.Event.
 """

 import asyncio
 import logging
 import time
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional

 import httpx

@@ -26,64 +30,113 @@ logger = logging.getLogger(__name__)

 # ── Constants ──────────────────────────────────────────────────────────────────

-# Max wait between sync retries on error (seconds)
 _MAX_RETRY_BACKOFF = 60.0
 _INIT_RETRY_BACKOFF = 2.0
-
-# Gateway invoke timeout
-_GATEWAY_TIMEOUT_S = 30.0
+_ROUTER_TIMEOUT_S = 45.0       # Router may call DeepSeek/Mistral
+_AUDIT_TIMEOUT_S = 5.0
+_REPLY_TEXT_MAX = 4000          # Matrix message cap (chars)
+_ERROR_REPLY_TEXT = "⚠️ Тимчасова помилка. Спробуйте ще раз."


-# ── Gateway invoke ─────────────────────────────────────────────────────────────
+# ── Router invoke ──────────────────────────────────────────────────────────────

-async def _invoke_gateway(
+async def _invoke_router(
    http_client: httpx.AsyncClient,
-    gateway_url: str,
+    router_url: str,
    agent_id: str,
    node_id: str,
-    message_text: str,
-    matrix_room_id: str,
-    matrix_event_id: str,
-    matrix_sender: str,
-) -> Dict[str, Any]:
+    prompt: str,
+    session_id: str,
+) -> str:
    """
-    POST to DAGI Gateway /v1/invoke (or /debug/agent_ping equivalent).
-    Returns parsed JSON response or raises httpx.HTTPError.
-
-    Payload format matches existing Gateway invoke schema.
+    POST /v1/agents/{agent_id}/infer and return the stripped reply text.
+    Reads 'response' (confirmed from NODA1 test); falls back to 'text', 'content', 'message', else empty.
+    Raises httpx.HTTPError on transport failure or an error HTTP status.
    """
-    url = f"{gateway_url.rstrip('/')}/v1/invoke"
+    url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
    payload = {
-        "agent_id": agent_id,
-        "node_id": node_id,
-        "message": message_text,
+        "prompt": prompt,
+        "session_id": session_id,
+        "user_id": "matrix_bridge",
        "metadata": {
            "transport": "matrix",
-            "matrix_room_id": matrix_room_id,
-            "matrix_event_id": matrix_event_id,
-            "matrix_sender": matrix_sender,
            "node_id": node_id,
        },
    }
-    resp = await http_client.post(url, json=payload, timeout=_GATEWAY_TIMEOUT_S)
+    resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S)
    resp.raise_for_status()
-    return resp.json()
+    data = resp.json()
+    # Extract text — field confirmed as 'response'
+    text = (
+        data.get("response")
+        or data.get("text")
+        or data.get("content")
+        or data.get("message")
+        or ""
+    )
+    if not isinstance(text, str):
+        text = str(text)
+    return text.strip()
+
+
+# ── Audit write ────────────────────────────────────────────────────────────────
+
+async def _write_audit(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    internal_token: str,
+    event: str,
+    agent_id: str,
+    node_id: str,
+    room_id: str,
+    event_id: str,
+    status: str = "ok",
+    error_code: Optional[str] = None,
+    duration_ms: Optional[int] = None,
+    data: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Best-effort audit write to the sofiia-console internal endpoint (awaited, _AUDIT_TIMEOUT_S timeout).
+    Never raises — logs a warning if the request fails; no-op when console_url or internal_token is empty.
+    """
+    if not console_url or not internal_token:
+        return
+    try:
+        url = f"{console_url.rstrip('/')}/api/audit/internal"
+        await http_client.post(
+            url,
+            json={
+                "event": event,
+                "operator_id": "matrix_bridge",
+                "node_id": node_id,
+                "agent_id": agent_id,
+                "chat_id": room_id,
+                "status": status,
+                "error_code": error_code,
+                "duration_ms": duration_ms,
+                "data": {
+                    "matrix_event_id": event_id,
+                    "matrix_room_id": room_id,
+                    **(data or {}),
+                },
+            },
+            headers={"X-Internal-Service-Token": internal_token},
+            timeout=_AUDIT_TIMEOUT_S,
+        )
+    except Exception as exc:
+        logger.warning("Audit write failed (non-blocking): %s", exc)


 # ── Ingress loop ───────────────────────────────────────────────────────────────

 class MatrixIngressLoop:
    """
-    Drives the Matrix sync-poll → gateway-invoke pipeline.
+    Drives Matrix sync-poll → router-invoke → Matrix send_text pipeline.

    Usage:
-        loop = MatrixIngressLoop(cfg, room_map)
+        loop = MatrixIngressLoop(...)
        stop_event = asyncio.Event()
        await loop.run(stop_event)
-
-    Metrics callbacks (optional, injected to avoid hard dependency):
-        on_message_received(room_id, agent_id) — called after successful dedupe
-        on_gateway_error(error_type)           — called on gateway invoke error
    """

    def __init__(
@@ -91,115 +144,92 @@ class MatrixIngressLoop:
        matrix_homeserver_url: str,
        matrix_access_token: str,
        matrix_user_id: str,
-        gateway_url: str,
+        router_url: str,
        node_id: str,
        room_map: RoomMappingConfig,
-        on_message_received=None,
-        on_gateway_error=None,
+        sofiia_console_url: str = "",
+        sofiia_internal_token: str = "",
+        on_message_received: Optional[Callable[[str, str], None]] = None,
+        on_message_replied: Optional[Callable[[str, str, str], None]] = None,
+        on_gateway_error: Optional[Callable[[str], None]] = None,
    ) -> None:
        self._hs_url = matrix_homeserver_url
        self._token = matrix_access_token
        self._user_id = matrix_user_id
-        self._gateway_url = gateway_url
+        self._router_url = router_url
        self._node_id = node_id
        self._room_map = room_map
+        self._console_url = sofiia_console_url
+        self._internal_token = sofiia_internal_token
        self._on_message_received = on_message_received
+        self._on_message_replied = on_message_replied
        self._on_gateway_error = on_gateway_error
-
        self._next_batch: Optional[str] = None
-        self._running = False

    @property
    def next_batch(self) -> Optional[str]:
        return self._next_batch

    async def run(self, stop_event: asyncio.Event) -> None:
-        """
-        Main loop. Runs until stop_event is set.
-        Handles errors with exponential backoff.
-        """
-        self._running = True
+        """Main loop until stop_event is set."""
        backoff = _INIT_RETRY_BACKOFF
        logger.info(
-            "Matrix ingress loop started | hs=%s node=%s mappings=%d",
+            "Matrix ingress/egress loop started | hs=%s node=%s mappings=%d",
            self._hs_url, self._node_id, self._room_map.total_mappings,
        )

        if self._room_map.total_mappings == 0:
-            logger.warning("No room mappings configured — ingress loop is idle")
+            logger.warning("No room mappings — ingress loop is idle")

-        async with MatrixClient(
-            self._hs_url, self._token, self._user_id
-        ) as client:
-            # Join all mapped rooms at startup
+        async with MatrixClient(self._hs_url, self._token, self._user_id) as client:
            for mapping in self._room_map.mappings:
                if mapping.agent_id in self._room_map.allowed_agents:
                    try:
                        await client.join_room(mapping.room_id)
-                        logger.info("Joined room %s → agent %s", mapping.room_id, mapping.agent_id)
                    except Exception as exc:
                        logger.warning("Could not join room %s: %s", mapping.room_id, exc)

-            async with httpx.AsyncClient(timeout=_GATEWAY_TIMEOUT_S) as gw_client:
+            async with httpx.AsyncClient() as http_client:
                while not stop_event.is_set():
                    try:
                        sync_resp = await client.sync_poll(since=self._next_batch)
                        self._next_batch = sync_resp.get("next_batch")
-                        backoff = _INIT_RETRY_BACKOFF  # reset on success
-
-                        await self._process_sync(client, gw_client, sync_resp)
-
+                        backoff = _INIT_RETRY_BACKOFF
+                        await self._process_sync(client, http_client, sync_resp)
                    except asyncio.CancelledError:
-                        logger.info("Ingress loop cancelled")
                        break
                    except Exception as exc:
-                        logger.error(
-                            "Ingress loop error (retry in %.1fs): %s",
-                            backoff, exc,
-                        )
+                        logger.error("Ingress loop error (retry in %.1fs): %s", backoff, exc)
                        if self._on_gateway_error:
                            self._on_gateway_error("sync_error")
                        try:
-                            await asyncio.wait_for(
-                                stop_event.wait(), timeout=backoff
-                            )
+                            await asyncio.wait_for(stop_event.wait(), timeout=backoff)
                        except asyncio.TimeoutError:
                            pass
                        backoff = min(backoff * 2, _MAX_RETRY_BACKOFF)

-        self._running = False
-        logger.info("Matrix ingress loop stopped")
+        logger.info("Matrix ingress/egress loop stopped")

    async def _process_sync(
        self,
        client: MatrixClient,
-        gw_client: httpx.AsyncClient,
+        http_client: httpx.AsyncClient,
        sync_resp: Dict[str, Any],
    ) -> None:
-        """Process all mapped rooms in a sync response."""
        for mapping in self._room_map.mappings:
            if mapping.agent_id not in self._room_map.allowed_agents:
                continue
-
            messages = client.extract_room_messages(sync_resp, mapping.room_id)
            for event in messages:
-                await self._handle_message(client, gw_client, event, mapping)
+                await self._handle_message(client, http_client, event, mapping)

    async def _handle_message(
        self,
        client: MatrixClient,
-        gw_client: httpx.AsyncClient,
+        http_client: httpx.AsyncClient,
        event: Dict[str, Any],
        mapping,
    ) -> None:
-        """
-        Process a single Matrix message event:
-          1. Mark as seen (dedupe)
-          2. Invoke DAGI gateway
-          3. Fire metrics callback
-
-        Note: Reply sending (egress) is PR-M1.4 — not done here.
-        """
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")
        text = event.get("content", {}).get("body", "").strip()
@@ -207,61 +237,157 @@ class MatrixIngressLoop:
        agent_id = mapping.agent_id

        if not text:
-            logger.debug("Skipping empty message from %s in %s", sender, room_id)
            return

-        # Mark event as seen before invoke (prevents duplicate on retry)
+        # Dedupe — mark seen before any IO (prevents double-process on retry)
        client.mark_seen(event_id)

        logger.info(
-            "Matrix message: room=%s sender=%s agent=%s event=%s text_len=%d",
+            "Matrix message: room=%s sender=%s agent=%s event=%s len=%d",
            room_id, sender, agent_id, event_id, len(text),
        )

        if self._on_message_received:
            self._on_message_received(room_id, agent_id)

+        # Audit: received
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.message.received",
+            agent_id=agent_id, node_id=self._node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={"sender": sender, "text_len": len(text)},
+        )
+
+        # Session ID: stable per room (allows memory context across messages)
+        session_id = f"matrix:{room_id.replace('!', '').replace(':', '_')}"
+
        t0 = time.monotonic()
+        reply_text: Optional[str] = None
+        invoke_ok = False
+
        try:
-            await _invoke_gateway(
-                gw_client,
-                self._gateway_url,
+            reply_text = await _invoke_router(
+                http_client,
+                self._router_url,
                agent_id=agent_id,
                node_id=self._node_id,
-                message_text=text,
-                matrix_room_id=room_id,
-                matrix_event_id=event_id,
-                matrix_sender=sender,
+                prompt=text,
+                session_id=session_id,
            )
-            duration = time.monotonic() - t0
+            invoke_ok = True
+            duration_ms = int((time.monotonic() - t0) * 1000)
            logger.info(
-                "Gateway invoke ok: agent=%s event=%s duration=%.2fs",
-                agent_id, event_id, duration,
+                "Router invoke ok: agent=%s event=%s reply_len=%d duration=%dms",
+                agent_id, event_id, len(reply_text or ""), duration_ms,
            )

        except httpx.HTTPStatusError as exc:
-            duration = time.monotonic() - t0
+            duration_ms = int((time.monotonic() - t0) * 1000)
            logger.error(
-                "Gateway HTTP error %d for agent=%s event=%s duration=%.2fs",
-                exc.response.status_code, agent_id, event_id, duration,
+                "Router HTTP %d for agent=%s event=%s duration=%dms",
+                exc.response.status_code, agent_id, event_id, duration_ms,
            )
            if self._on_gateway_error:
                self._on_gateway_error(f"http_{exc.response.status_code}")
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.error",
+                agent_id=agent_id, node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code=f"router_http_{exc.response.status_code}",
+                duration_ms=duration_ms,
+            )

        except (httpx.ConnectError, httpx.TimeoutException) as exc:
-            duration = time.monotonic() - t0
+            duration_ms = int((time.monotonic() - t0) * 1000)
            logger.error(
-                "Gateway network error for agent=%s event=%s: %s duration=%.2fs",
-                agent_id, event_id, exc, duration,
+                "Router network error agent=%s event=%s: %s duration=%dms",
+                agent_id, event_id, exc, duration_ms,
            )
            if self._on_gateway_error:
                self._on_gateway_error("network_error")
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.error",
+                agent_id=agent_id, node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="router_network_error",
+                duration_ms=duration_ms,
+            )

        except Exception as exc:
-            duration = time.monotonic() - t0
+            duration_ms = int((time.monotonic() - t0) * 1000)
            logger.error(
-                "Unexpected error invoking gateway for agent=%s event=%s: %s",
+                "Unexpected router error agent=%s event=%s: %s",
                agent_id, event_id, exc,
            )
            if self._on_gateway_error:
                self._on_gateway_error("unexpected")
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.error",
+                agent_id=agent_id, node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="router_unexpected",
+                duration_ms=duration_ms,
+            )
+
+        # ── Egress: send reply back to Matrix ──────────────────────────────────
+        if not invoke_ok:
+            # No reply on error in M1 — just audit (avoids spam in room)
+            return
+
+        if not reply_text:
+            logger.warning("Empty reply from router for agent=%s event=%s", agent_id, event_id)
+            return
+
+        # Truncate if needed
+        send_text = reply_text[:_REPLY_TEXT_MAX]
+        txn_id = MatrixClient.make_txn_id(room_id, event_id)
+
+        send_t0 = time.monotonic()
+        try:
+            await client.send_text(room_id, send_text, txn_id)
+            send_duration_ms = int((time.monotonic() - send_t0) * 1000)
+
+            if self._on_message_replied:
+                self._on_message_replied(room_id, agent_id, "ok")
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.agent.replied",
+                agent_id=agent_id, node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="ok",
+                duration_ms=send_duration_ms,
+                data={
+                    "reply_len": len(send_text),
+                    "truncated": len(reply_text) > _REPLY_TEXT_MAX,
+                    "router_duration_ms": duration_ms,
+                },
+            )
+            logger.info(
+                "Reply sent: agent=%s event=%s reply_len=%d send_ms=%d",
+                agent_id, event_id, len(send_text), send_duration_ms,
+            )
+
+        except Exception as exc:
+            send_duration_ms = int((time.monotonic() - send_t0) * 1000)
+            logger.error(
+                "Failed to send Matrix reply agent=%s event=%s: %s",
+                agent_id, event_id, exc,
+            )
+            if self._on_message_replied:
+                self._on_message_replied(room_id, agent_id, "error")
+            if self._on_gateway_error:
+                self._on_gateway_error("matrix_send_error")
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.error",
+                agent_id=agent_id, node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="matrix_send_failed",
+                duration_ms=send_duration_ms,
+            )
--- a/services/matrix-bridge-dagi/app/main.py
+++ b/services/matrix-bridge-dagi/app/main.py
@@ -141,14 +141,23 @@ async def lifespan(app_: Any):
                if _PROM_OK:
                    _gateway_errors.labels(error_type=error_type).inc()

+            def _on_replied(room_id: str, agent_id: str, status: str) -> None:
+                if _PROM_OK:
+                    _messages_replied.labels(
+                        room_id=room_id, agent_id=agent_id, status=status
+                    ).inc()
+
            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
                matrix_user_id=_cfg.matrix_user_id,
-                gateway_url=_cfg.dagi_gateway_url,
+                router_url=_cfg.dagi_gateway_url,
                node_id=_cfg.node_id,
                room_map=_room_map,
+                sofiia_console_url=_cfg.sofiia_console_url,
+                sofiia_internal_token=_cfg.sofiia_internal_token,
                on_message_received=_on_msg,
+                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
            )
            _ingress_task = asyncio.create_task(