"""
Matrix Ingress + Egress Loop — Phase M1.4 + H1/H3

Polls Matrix /sync for new messages, invokes DAGI Router for mapped rooms,
sends agent replies back to Matrix, writes audit events to sofiia-console.

Pipeline:
  sync_poll() → extract_room_messages()
  → for each message:
      1. rate_limit check (room + sender)                  ← H1
      2. dedupe (mark_seen)
      3. audit: matrix.message.received
      4. invoke DAGI Router (timed → on_invoke_latency)    ← H3
      5. send_text() reply (timed → on_send_latency)       ← H3
      6. audit: matrix.agent.replied | matrix.error

Graceful shutdown via asyncio.Event.
"""

import asyncio
import logging
import time
from typing import Any, Callable, Dict, List, Optional

import httpx

from .matrix_client import MatrixClient
from .rate_limit import InMemoryRateLimiter
from .room_mapping import RoomMappingConfig

logger = logging.getLogger(__name__)

# ── Constants ──────────────────────────────────────────────────────────────────

# Upper bound (seconds) for the exponential retry delay used by the sync loop.
_MAX_RETRY_BACKOFF = 60.0
# First retry delay (seconds); the delay doubles after every failed iteration.
_INIT_RETRY_BACKOFF = 2.0
# Per-request timeout (seconds) for the router inference call.
_ROUTER_TIMEOUT_S = 45.0
# Per-request timeout (seconds) for audit writes.
_AUDIT_TIMEOUT_S = 5.0
# Maximum number of characters of a router reply that is sent back to Matrix.
_REPLY_TEXT_MAX = 4000


# ── Router invoke ──────────────────────────────────────────────────────────────

async def _invoke_router(
    http_client: httpx.AsyncClient,
    router_url: str,
    agent_id: str,
    node_id: str,
    prompt: str,
    session_id: str,
) -> str:
    """
    POST /v1/agents/{agent_id}/infer — returns response text string.

    Field confirmed as 'response' on NODA1; 'text', 'content' and 'message'
    are tried next, in that order, and an empty string is returned when none
    of them holds a truthy value. A non-string value is converted with str().
    The returned text is stripped of surrounding whitespace.

    Raises httpx.HTTPError on failure (raise_for_status() is called on the
    response, so non-2xx statuses raise as well).
    """
    url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
    payload = {
        "prompt": prompt,
        "session_id": session_id,
        "user_id": "matrix_bridge",
        "metadata": {
            "transport": "matrix",
            "node_id": node_id,
        },
    }
    resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S)
    resp.raise_for_status()
    data = resp.json()
    # First truthy candidate field wins.
    text = (
        data.get("response")
        or data.get("text")
        or data.get("content")
        or data.get("message")
        or ""
    )
    if not isinstance(text, str):
        text = str(text)
    return text.strip()


# ── Audit write ────────────────────────────────────────────────────────────────

async def _write_audit(
    http_client: httpx.AsyncClient,
    console_url: str,
    internal_token: str,
    event: str,
    agent_id: str,
    node_id: str,
    room_id: str,
    event_id: str,
    status: str = "ok",
    error_code: Optional[str] = None,
    duration_ms: Optional[int] = None,
    data: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Fire-and-forget audit write. Never raises.

    POSTs one audit record to ``{console_url}/api/audit/internal`` with the
    internal service token in the ``X-Internal-Service-Token`` header.
    Does nothing when ``console_url`` or ``internal_token`` is empty.
    Any exception raised while posting is logged as a warning and swallowed.
    The HTTP status of the response is not inspected.

    ``data`` entries are merged into the record's ``data`` object after the
    Matrix event/room ids, so they override those keys on collision.
    """
    if not console_url or not internal_token:
        return
    try:
        url = f"{console_url.rstrip('/')}/api/audit/internal"
        await http_client.post(
            url,
            json={
                "event": event,
                "operator_id": "matrix_bridge",
                "node_id": node_id,
                "agent_id": agent_id,
                "chat_id": room_id,
                "status": status,
                "error_code": error_code,
                "duration_ms": duration_ms,
                "data": {
                    "matrix_event_id": event_id,
                    "matrix_room_id": room_id,
                    **(data or {}),
                },
            },
            headers={"X-Internal-Service-Token": internal_token},
            timeout=_AUDIT_TIMEOUT_S,
        )
    except Exception as exc:
        # Auditing is best-effort: it must not interrupt message handling.
        logger.warning("Audit write failed (non-blocking): %s", exc)


# ── Ingress loop ───────────────────────────────────────────────────────────────

class MatrixIngressLoop:
    """
    Drives Matrix sync-poll → rate-check → router-invoke → Matrix send_text.

    Metric callbacks (all optional, called synchronously):
      on_message_received(room_id, agent_id)
      on_message_replied(room_id, agent_id, status)
      on_gateway_error(error_type)
      on_rate_limited(room_id, agent_id, limit_type)     ← H1
      on_invoke_latency(agent_id, duration_seconds)      ← H3
      on_send_latency(agent_id, duration_seconds)        ← H3
    """

    def __init__(
        self,
        matrix_homeserver_url: str,
        matrix_access_token: str,
        matrix_user_id: str,
        router_url: str,
        node_id: str,
        room_map: RoomMappingConfig,
        sofiia_console_url: str = "",
        sofiia_internal_token: str = "",
        rate_limiter: Optional[InMemoryRateLimiter] = None,
        on_message_received: Optional[Callable[[str, str], None]] = None,
        on_message_replied: Optional[Callable[[str, str, str], None]] = None,
        on_gateway_error: Optional[Callable[[str], None]] = None,
        on_rate_limited: Optional[Callable[[str, str, str], None]] = None,
        on_invoke_latency: Optional[Callable[[str, float], None]] = None,
        on_send_latency: Optional[Callable[[str, float], None]] = None,
    ) -> None:
        """
        Store configuration and callbacks; performs no I/O.

        Audit writes are skipped when ``sofiia_console_url`` or
        ``sofiia_internal_token`` is empty. Rate limiting is skipped when
        ``rate_limiter`` is None.
        """
        self._hs_url = matrix_homeserver_url
        self._token = matrix_access_token
        self._user_id = matrix_user_id
        self._router_url = router_url
        self._node_id = node_id
        self._room_map = room_map
        self._console_url = sofiia_console_url
        self._internal_token = sofiia_internal_token
        self._rate_limiter = rate_limiter
        self._on_message_received = on_message_received
        self._on_message_replied = on_message_replied
        self._on_gateway_error = on_gateway_error
        self._on_rate_limited = on_rate_limited
        self._on_invoke_latency = on_invoke_latency
        self._on_send_latency = on_send_latency
        # Sync token from the most recent /sync response; None until the first one.
        self._next_batch: Optional[str] = None

    @property
    def next_batch(self) -> Optional[str]:
        """Sync token taken from the most recent sync response, if any."""
        return self._next_batch

    async def run(self, stop_event: asyncio.Event) -> None:
        """
        Run the sync loop until ``stop_event`` is set or the task is cancelled.

        Joins every mapped room whose agent is allowed (join failures are
        logged and ignored), then repeatedly polls sync and processes the
        result. Any exception in an iteration is logged, reported through
        ``on_gateway_error("sync_error")`` and followed by a wait of
        ``backoff`` seconds (doubling up to ``_MAX_RETRY_BACKOFF``); the wait
        ends early when ``stop_event`` is set. A successful sync poll resets
        the backoff.
        """
        backoff = _INIT_RETRY_BACKOFF
        logger.info(
            "Matrix ingress/egress loop started | hs=%s node=%s mappings=%d",
            self._hs_url, self._node_id, self._room_map.total_mappings,
        )

        if self._room_map.total_mappings == 0:
            logger.warning("No room mappings — ingress loop is idle")

        async with MatrixClient(self._hs_url, self._token, self._user_id) as client:
            for mapping in self._room_map.mappings:
                if mapping.agent_id in self._room_map.allowed_agents:
                    try:
                        await client.join_room(mapping.room_id)
                    except Exception as exc:
                        logger.warning("Could not join room %s: %s", mapping.room_id, exc)

            async with httpx.AsyncClient() as http_client:
                while not stop_event.is_set():
                    try:
                        sync_resp = await client.sync_poll(since=self._next_batch)
                        # NOTE: the token is advanced before the batch is
                        # processed, so a failure in _process_sync does not
                        # cause the same batch to be requested again.
                        self._next_batch = sync_resp.get("next_batch")
                        backoff = _INIT_RETRY_BACKOFF
                        await self._process_sync(client, http_client, sync_resp)
                    except asyncio.CancelledError:
                        # Cancellation ends the loop without being re-raised.
                        break
                    except Exception as exc:
                        logger.error("Ingress loop error (retry in %.1fs): %s", backoff, exc)
                        if self._on_gateway_error:
                            self._on_gateway_error("sync_error")
                        try:
                            # Sleep for `backoff` seconds, waking early on shutdown.
                            await asyncio.wait_for(stop_event.wait(), timeout=backoff)
                        except asyncio.TimeoutError:
                            pass
                        backoff = min(backoff * 2, _MAX_RETRY_BACKOFF)

        logger.info("Matrix ingress/egress loop stopped")

    async def _process_sync(
        self,
        client: MatrixClient,
        http_client: httpx.AsyncClient,
        sync_resp: Dict[str, Any],
    ) -> None:
        """
        Handle every message of one sync response, mapping by mapping.

        Mappings whose agent is not in ``allowed_agents`` are skipped.
        Messages are handled sequentially, in the order returned by
        ``client.extract_room_messages``.
        """
        for mapping in self._room_map.mappings:
            if mapping.agent_id not in self._room_map.allowed_agents:
                continue
            messages = client.extract_room_messages(sync_resp, mapping.room_id)
            for event in messages:
                await self._handle_message(client, http_client, event, mapping)

    async def _handle_message(
        self,
        client: MatrixClient,
        http_client: httpx.AsyncClient,
        event: Dict[str, Any],
        mapping,
    ) -> None:
        """
        Process one Matrix message event for ``mapping``.

        ``mapping`` is an entry of the room map; only its ``room_id`` and
        ``agent_id`` attributes are read.

        Steps: validate body → rate-limit check → mark seen → audit
        "matrix.message.received" → invoke router (timed) → send reply
        (timed) → audit "matrix.agent.replied" or "matrix.error".

        Events without a usable text body (missing, empty, whitespace-only,
        or not a string) are skipped silently. Rate-limited events return
        before ``mark_seen`` is called. Router and send failures are logged,
        reported via callbacks and audited; they are not raised to the caller.
        """
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")
        room_id = mapping.room_id
        agent_id = mapping.agent_id

        # Event content comes from remote users; do not assume its shape.
        # A non-dict content or non-string body previously raised
        # AttributeError here, which aborted the rest of the sync batch.
        content = event.get("content")
        body = content.get("body") if isinstance(content, dict) else None
        if not isinstance(body, str):
            logger.debug(
                "Skipping event without text body: room=%s event=%s",
                room_id, event_id,
            )
            return
        text = body.strip()

        if not text:
            return

        # ── H1: Rate limit check ───────────────────────────────────────────────
        if self._rate_limiter is not None:
            allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
            if not allowed:
                # One normalized label for both the metric and the audit code,
                # so a missing limit type never yields "rate_limit_None".
                limit_label = limit_type or "unknown"
                logger.warning(
                    "Rate limited: room=%s sender=%s limit_type=%s event=%s",
                    room_id, sender, limit_type, event_id,
                )
                if self._on_rate_limited:
                    self._on_rate_limited(room_id, agent_id, limit_label)
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.rate_limited",
                    agent_id=agent_id, node_id=self._node_id,
                    room_id=room_id, event_id=event_id,
                    status="error", error_code=f"rate_limit_{limit_label}",
                    data={"sender": sender, "limit_type": limit_type},
                )
                return

        # Dedupe — mark seen before any IO
        client.mark_seen(event_id)

        logger.info(
            "Matrix message: room=%s sender=%s agent=%s event=%s len=%d",
            room_id, sender, agent_id, event_id, len(text),
        )

        if self._on_message_received:
            self._on_message_received(room_id, agent_id)

        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.message.received",
            agent_id=agent_id, node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data={"sender": sender, "text_len": len(text)},
        )

        # One router session per room: "!" is dropped and ":" becomes "_".
        session_id = f"matrix:{room_id.replace('!', '').replace(':', '_')}"

        # ── H3: Invoke with latency measurement ───────────────────────────────
        t0 = time.monotonic()
        reply_text: Optional[str] = None
        invoke_ok = False
        invoke_duration_s: float = 0.0

        try:
            reply_text = await _invoke_router(
                http_client, self._router_url,
                agent_id=agent_id, node_id=self._node_id,
                prompt=text, session_id=session_id,
            )
            invoke_ok = True
            invoke_duration_s = time.monotonic() - t0
            duration_ms = int(invoke_duration_s * 1000)

            if self._on_invoke_latency:
                self._on_invoke_latency(agent_id, invoke_duration_s)

            logger.info(
                "Router invoke ok: agent=%s event=%s reply_len=%d duration=%dms",
                agent_id, event_id, len(reply_text or ""), duration_ms,
            )

        except httpx.HTTPStatusError as exc:
            # Router answered with a non-2xx status.
            invoke_duration_s = time.monotonic() - t0
            duration_ms = int(invoke_duration_s * 1000)
            logger.error(
                "Router HTTP %d for agent=%s event=%s duration=%dms",
                exc.response.status_code, agent_id, event_id, duration_ms,
            )
            if self._on_gateway_error:
                self._on_gateway_error(f"http_{exc.response.status_code}")
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.error",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code=f"router_http_{exc.response.status_code}",
                duration_ms=duration_ms,
            )

        except (httpx.ConnectError, httpx.TimeoutException) as exc:
            # Router unreachable or too slow.
            invoke_duration_s = time.monotonic() - t0
            duration_ms = int(invoke_duration_s * 1000)
            logger.error(
                "Router network error agent=%s event=%s: %s duration=%dms",
                agent_id, event_id, exc, duration_ms,
            )
            if self._on_gateway_error:
                self._on_gateway_error("network_error")
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.error",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="router_network_error",
                duration_ms=duration_ms,
            )

        except Exception as exc:
            # Anything else (e.g. an unparsable response body).
            invoke_duration_s = time.monotonic() - t0
            duration_ms = int(invoke_duration_s * 1000)
            logger.error(
                "Unexpected router error agent=%s event=%s: %s",
                agent_id, event_id, exc,
            )
            if self._on_gateway_error:
                self._on_gateway_error("unexpected")
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.error",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="router_unexpected",
                duration_ms=duration_ms,
            )

        if not invoke_ok:
            return

        if not reply_text:
            logger.warning("Empty reply from router for agent=%s event=%s", agent_id, event_id)
            return

        # ── H3: Send with latency measurement ─────────────────────────────────
        send_text = reply_text[:_REPLY_TEXT_MAX]
        txn_id = MatrixClient.make_txn_id(room_id, event_id)
        send_t0 = time.monotonic()

        try:
            await client.send_text(room_id, send_text, txn_id)
            send_duration_s = time.monotonic() - send_t0
            send_duration_ms = int(send_duration_s * 1000)

            if self._on_send_latency:
                self._on_send_latency(agent_id, send_duration_s)
            if self._on_message_replied:
                self._on_message_replied(room_id, agent_id, "ok")

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.agent.replied",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="ok",
                duration_ms=send_duration_ms,
                data={
                    "reply_len": len(send_text),
                    "truncated": len(reply_text) > _REPLY_TEXT_MAX,
                    "router_duration_ms": int(invoke_duration_s * 1000),
                },
            )
            logger.info(
                "Reply sent: agent=%s event=%s reply_len=%d send_ms=%d",
                agent_id, event_id, len(send_text), send_duration_ms,
            )

        except Exception as exc:
            send_duration_s = time.monotonic() - send_t0
            send_duration_ms = int(send_duration_s * 1000)
            logger.error(
                "Failed to send Matrix reply agent=%s event=%s: %s",
                agent_id, event_id, exc,
            )
            if self._on_message_replied:
                self._on_message_replied(room_id, agent_id, "error")
            if self._on_gateway_error:
                self._on_gateway_error("matrix_send_error")
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.error",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="matrix_send_failed",
                duration_ms=send_duration_ms,
            )