feat(matrix-bridge-dagi): add egress, audit integration, fix router endpoint (PR-M1.4)
Closes the full Matrix ↔ DAGI loop:
Egress:
- invoke Router POST /v1/agents/{agent_id}/infer (field: prompt, response: response)
- send_text() reply to Matrix room with idempotent txn_id = make_txn_id(room_id, event_id)
- empty reply → skip send (no spam)
- reply truncated to 4000 chars if needed
Audit (via sofiia-console POST /api/audit/internal):
- matrix.message.received (on ingress)
- matrix.agent.replied (on successful reply)
- matrix.error (on router/send failure, with error_code)
- fire-and-forget: audit failures never crash the loop
Router URL fix:
- DAGI_GATEWAY_URL now points to dagi-router-node1:8000 (not gateway:9300)
- Session ID: stable per room — matrix:{room_localpart} (memory context)
9 tests: invoke endpoint, fallback fields, audit write, full cycle,
dedupe, empty reply skip, metric callbacks
Made-with: Cursor
This commit is contained in:
@@ -1,21 +1,25 @@
|
||||
"""
|
||||
Matrix Ingress Loop — Phase M1.3
|
||||
Matrix Ingress + Egress Loop — Phase M1.4
|
||||
|
||||
Polls Matrix /sync for new messages, invokes DAGI Gateway for mapped rooms.
|
||||
Does NOT send replies back (that is PR-M1.4 egress).
|
||||
Polls Matrix /sync for new messages, invokes DAGI Router for mapped rooms,
|
||||
sends agent replies back to Matrix, writes audit events to sofiia-console.
|
||||
|
||||
Design:
|
||||
- asyncio task, driven by run_ingress_loop()
|
||||
- sync_poll() → extract_room_messages() per mapped room
|
||||
- for each message: dedupe → invoke gateway → audit (fire-and-forget)
|
||||
- next_batch token persisted in memory (restart resets to None — acceptable for M1)
|
||||
- graceful shutdown via asyncio.Event
|
||||
Pipeline:
|
||||
sync_poll() → extract_room_messages()
|
||||
→ for each message:
|
||||
1. dedupe (mark_seen)
|
||||
2. audit: matrix.message.received
|
||||
3. invoke DAGI Router (/v1/agents/{agent_id}/infer)
|
||||
4. send_text() reply to Matrix room
|
||||
5. audit: matrix.agent.replied | matrix.error
|
||||
|
||||
Graceful shutdown via asyncio.Event.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -26,64 +30,113 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Constants ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# Max wait between sync retries on error (seconds)
|
||||
_MAX_RETRY_BACKOFF = 60.0
|
||||
_INIT_RETRY_BACKOFF = 2.0
|
||||
|
||||
# Gateway invoke timeout
|
||||
_GATEWAY_TIMEOUT_S = 30.0
|
||||
_ROUTER_TIMEOUT_S = 45.0 # Router may call DeepSeek/Mistral
|
||||
_AUDIT_TIMEOUT_S = 5.0
|
||||
_REPLY_TEXT_MAX = 4000 # Matrix message cap (chars)
|
||||
_ERROR_REPLY_TEXT = "⚠️ Тимчасова помилка. Спробуйте ще раз."
|
||||
|
||||
|
||||
# ── Gateway invoke ─────────────────────────────────────────────────────────────
|
||||
# ── Router invoke ──────────────────────────────────────────────────────────────
|
||||
|
||||
async def _invoke_gateway(
|
||||
async def _invoke_router(
|
||||
http_client: httpx.AsyncClient,
|
||||
gateway_url: str,
|
||||
router_url: str,
|
||||
agent_id: str,
|
||||
node_id: str,
|
||||
message_text: str,
|
||||
matrix_room_id: str,
|
||||
matrix_event_id: str,
|
||||
matrix_sender: str,
|
||||
) -> Dict[str, Any]:
|
||||
prompt: str,
|
||||
session_id: str,
|
||||
) -> str:
|
||||
"""
|
||||
POST to DAGI Gateway /v1/invoke (or /debug/agent_ping equivalent).
|
||||
Returns parsed JSON response or raises httpx.HTTPError.
|
||||
|
||||
Payload format matches existing Gateway invoke schema.
|
||||
POST /v1/agents/{agent_id}/infer — returns response text string.
|
||||
Field: response['response'] (confirmed from NODA1 test).
|
||||
Raises httpx.HTTPError on failure.
|
||||
"""
|
||||
url = f"{gateway_url.rstrip('/')}/v1/invoke"
|
||||
url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
|
||||
payload = {
|
||||
"agent_id": agent_id,
|
||||
"node_id": node_id,
|
||||
"message": message_text,
|
||||
"prompt": prompt,
|
||||
"session_id": session_id,
|
||||
"user_id": "matrix_bridge",
|
||||
"metadata": {
|
||||
"transport": "matrix",
|
||||
"matrix_room_id": matrix_room_id,
|
||||
"matrix_event_id": matrix_event_id,
|
||||
"matrix_sender": matrix_sender,
|
||||
"node_id": node_id,
|
||||
},
|
||||
}
|
||||
resp = await http_client.post(url, json=payload, timeout=_GATEWAY_TIMEOUT_S)
|
||||
resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
data = resp.json()
|
||||
# Extract text — field confirmed as 'response'
|
||||
text = (
|
||||
data.get("response")
|
||||
or data.get("text")
|
||||
or data.get("content")
|
||||
or data.get("message")
|
||||
or ""
|
||||
)
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
# ── Audit write ────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _write_audit(
|
||||
http_client: httpx.AsyncClient,
|
||||
console_url: str,
|
||||
internal_token: str,
|
||||
event: str,
|
||||
agent_id: str,
|
||||
node_id: str,
|
||||
room_id: str,
|
||||
event_id: str,
|
||||
status: str = "ok",
|
||||
error_code: Optional[str] = None,
|
||||
duration_ms: Optional[int] = None,
|
||||
data: Optional[Dict[str, Any]] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Fire-and-forget audit write to sofiia-console internal endpoint.
|
||||
Never raises — logs warning on failure.
|
||||
"""
|
||||
if not console_url or not internal_token:
|
||||
return
|
||||
try:
|
||||
url = f"{console_url.rstrip('/')}/api/audit/internal"
|
||||
await http_client.post(
|
||||
url,
|
||||
json={
|
||||
"event": event,
|
||||
"operator_id": "matrix_bridge",
|
||||
"node_id": node_id,
|
||||
"agent_id": agent_id,
|
||||
"chat_id": room_id,
|
||||
"status": status,
|
||||
"error_code": error_code,
|
||||
"duration_ms": duration_ms,
|
||||
"data": {
|
||||
"matrix_event_id": event_id,
|
||||
"matrix_room_id": room_id,
|
||||
**(data or {}),
|
||||
},
|
||||
},
|
||||
headers={"X-Internal-Service-Token": internal_token},
|
||||
timeout=_AUDIT_TIMEOUT_S,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("Audit write failed (non-blocking): %s", exc)
|
||||
|
||||
|
||||
# ── Ingress loop ───────────────────────────────────────────────────────────────
|
||||
|
||||
class MatrixIngressLoop:
|
||||
"""
|
||||
Drives the Matrix sync-poll → gateway-invoke pipeline.
|
||||
Drives Matrix sync-poll → router-invoke → Matrix send_text pipeline.
|
||||
|
||||
Usage:
|
||||
loop = MatrixIngressLoop(cfg, room_map)
|
||||
loop = MatrixIngressLoop(...)
|
||||
stop_event = asyncio.Event()
|
||||
await loop.run(stop_event)
|
||||
|
||||
Metrics callbacks (optional, injected to avoid hard dependency):
|
||||
on_message_received(room_id, agent_id) — called after successful dedupe
|
||||
on_gateway_error(error_type) — called on gateway invoke error
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -91,115 +144,92 @@ class MatrixIngressLoop:
|
||||
matrix_homeserver_url: str,
|
||||
matrix_access_token: str,
|
||||
matrix_user_id: str,
|
||||
gateway_url: str,
|
||||
router_url: str,
|
||||
node_id: str,
|
||||
room_map: RoomMappingConfig,
|
||||
on_message_received=None,
|
||||
on_gateway_error=None,
|
||||
sofiia_console_url: str = "",
|
||||
sofiia_internal_token: str = "",
|
||||
on_message_received: Optional[Callable[[str, str], None]] = None,
|
||||
on_message_replied: Optional[Callable[[str, str, str], None]] = None,
|
||||
on_gateway_error: Optional[Callable[[str], None]] = None,
|
||||
) -> None:
|
||||
self._hs_url = matrix_homeserver_url
|
||||
self._token = matrix_access_token
|
||||
self._user_id = matrix_user_id
|
||||
self._gateway_url = gateway_url
|
||||
self._router_url = router_url
|
||||
self._node_id = node_id
|
||||
self._room_map = room_map
|
||||
self._console_url = sofiia_console_url
|
||||
self._internal_token = sofiia_internal_token
|
||||
self._on_message_received = on_message_received
|
||||
self._on_message_replied = on_message_replied
|
||||
self._on_gateway_error = on_gateway_error
|
||||
|
||||
self._next_batch: Optional[str] = None
|
||||
self._running = False
|
||||
|
||||
@property
|
||||
def next_batch(self) -> Optional[str]:
|
||||
return self._next_batch
|
||||
|
||||
async def run(self, stop_event: asyncio.Event) -> None:
|
||||
"""
|
||||
Main loop. Runs until stop_event is set.
|
||||
Handles errors with exponential backoff.
|
||||
"""
|
||||
self._running = True
|
||||
"""Main loop until stop_event is set."""
|
||||
backoff = _INIT_RETRY_BACKOFF
|
||||
logger.info(
|
||||
"Matrix ingress loop started | hs=%s node=%s mappings=%d",
|
||||
"Matrix ingress/egress loop started | hs=%s node=%s mappings=%d",
|
||||
self._hs_url, self._node_id, self._room_map.total_mappings,
|
||||
)
|
||||
|
||||
if self._room_map.total_mappings == 0:
|
||||
logger.warning("No room mappings configured — ingress loop is idle")
|
||||
logger.warning("No room mappings — ingress loop is idle")
|
||||
|
||||
async with MatrixClient(
|
||||
self._hs_url, self._token, self._user_id
|
||||
) as client:
|
||||
# Join all mapped rooms at startup
|
||||
async with MatrixClient(self._hs_url, self._token, self._user_id) as client:
|
||||
for mapping in self._room_map.mappings:
|
||||
if mapping.agent_id in self._room_map.allowed_agents:
|
||||
try:
|
||||
await client.join_room(mapping.room_id)
|
||||
logger.info("Joined room %s → agent %s", mapping.room_id, mapping.agent_id)
|
||||
except Exception as exc:
|
||||
logger.warning("Could not join room %s: %s", mapping.room_id, exc)
|
||||
|
||||
async with httpx.AsyncClient(timeout=_GATEWAY_TIMEOUT_S) as gw_client:
|
||||
async with httpx.AsyncClient() as http_client:
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
sync_resp = await client.sync_poll(since=self._next_batch)
|
||||
self._next_batch = sync_resp.get("next_batch")
|
||||
backoff = _INIT_RETRY_BACKOFF # reset on success
|
||||
|
||||
await self._process_sync(client, gw_client, sync_resp)
|
||||
|
||||
backoff = _INIT_RETRY_BACKOFF
|
||||
await self._process_sync(client, http_client, sync_resp)
|
||||
except asyncio.CancelledError:
|
||||
logger.info("Ingress loop cancelled")
|
||||
break
|
||||
except Exception as exc:
|
||||
logger.error(
|
||||
"Ingress loop error (retry in %.1fs): %s",
|
||||
backoff, exc,
|
||||
)
|
||||
logger.error("Ingress loop error (retry in %.1fs): %s", backoff, exc)
|
||||
if self._on_gateway_error:
|
||||
self._on_gateway_error("sync_error")
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
stop_event.wait(), timeout=backoff
|
||||
)
|
||||
await asyncio.wait_for(stop_event.wait(), timeout=backoff)
|
||||
except asyncio.TimeoutError:
|
||||
pass
|
||||
backoff = min(backoff * 2, _MAX_RETRY_BACKOFF)
|
||||
|
||||
self._running = False
|
||||
logger.info("Matrix ingress loop stopped")
|
||||
logger.info("Matrix ingress/egress loop stopped")
|
||||
|
||||
async def _process_sync(
|
||||
self,
|
||||
client: MatrixClient,
|
||||
gw_client: httpx.AsyncClient,
|
||||
http_client: httpx.AsyncClient,
|
||||
sync_resp: Dict[str, Any],
|
||||
) -> None:
|
||||
"""Process all mapped rooms in a sync response."""
|
||||
for mapping in self._room_map.mappings:
|
||||
if mapping.agent_id not in self._room_map.allowed_agents:
|
||||
continue
|
||||
|
||||
messages = client.extract_room_messages(sync_resp, mapping.room_id)
|
||||
for event in messages:
|
||||
await self._handle_message(client, gw_client, event, mapping)
|
||||
await self._handle_message(client, http_client, event, mapping)
|
||||
|
||||
async def _handle_message(
|
||||
self,
|
||||
client: MatrixClient,
|
||||
gw_client: httpx.AsyncClient,
|
||||
http_client: httpx.AsyncClient,
|
||||
event: Dict[str, Any],
|
||||
mapping,
|
||||
) -> None:
|
||||
"""
|
||||
Process a single Matrix message event:
|
||||
1. Mark as seen (dedupe)
|
||||
2. Invoke DAGI gateway
|
||||
3. Fire metrics callback
|
||||
|
||||
Note: Reply sending (egress) is PR-M1.4 — not done here.
|
||||
"""
|
||||
event_id = event.get("event_id", "")
|
||||
sender = event.get("sender", "")
|
||||
text = event.get("content", {}).get("body", "").strip()
|
||||
@@ -207,61 +237,157 @@ class MatrixIngressLoop:
|
||||
agent_id = mapping.agent_id
|
||||
|
||||
if not text:
|
||||
logger.debug("Skipping empty message from %s in %s", sender, room_id)
|
||||
return
|
||||
|
||||
# Mark event as seen before invoke (prevents duplicate on retry)
|
||||
# Dedupe — mark seen before any IO (prevents double-process on retry)
|
||||
client.mark_seen(event_id)
|
||||
|
||||
logger.info(
|
||||
"Matrix message: room=%s sender=%s agent=%s event=%s text_len=%d",
|
||||
"Matrix message: room=%s sender=%s agent=%s event=%s len=%d",
|
||||
room_id, sender, agent_id, event_id, len(text),
|
||||
)
|
||||
|
||||
if self._on_message_received:
|
||||
self._on_message_received(room_id, agent_id)
|
||||
|
||||
# Audit: received
|
||||
await _write_audit(
|
||||
http_client, self._console_url, self._internal_token,
|
||||
event="matrix.message.received",
|
||||
agent_id=agent_id, node_id=self._node_id,
|
||||
room_id=room_id, event_id=event_id,
|
||||
status="ok",
|
||||
data={"sender": sender, "text_len": len(text)},
|
||||
)
|
||||
|
||||
# Session ID: stable per room (allows memory context across messages)
|
||||
session_id = f"matrix:{room_id.replace('!', '').replace(':', '_')}"
|
||||
|
||||
t0 = time.monotonic()
|
||||
reply_text: Optional[str] = None
|
||||
invoke_ok = False
|
||||
|
||||
try:
|
||||
await _invoke_gateway(
|
||||
gw_client,
|
||||
self._gateway_url,
|
||||
reply_text = await _invoke_router(
|
||||
http_client,
|
||||
self._router_url,
|
||||
agent_id=agent_id,
|
||||
node_id=self._node_id,
|
||||
message_text=text,
|
||||
matrix_room_id=room_id,
|
||||
matrix_event_id=event_id,
|
||||
matrix_sender=sender,
|
||||
prompt=text,
|
||||
session_id=session_id,
|
||||
)
|
||||
duration = time.monotonic() - t0
|
||||
invoke_ok = True
|
||||
duration_ms = int((time.monotonic() - t0) * 1000)
|
||||
logger.info(
|
||||
"Gateway invoke ok: agent=%s event=%s duration=%.2fs",
|
||||
agent_id, event_id, duration,
|
||||
"Router invoke ok: agent=%s event=%s reply_len=%d duration=%dms",
|
||||
agent_id, event_id, len(reply_text or ""), duration_ms,
|
||||
)
|
||||
|
||||
except httpx.HTTPStatusError as exc:
|
||||
duration = time.monotonic() - t0
|
||||
duration_ms = int((time.monotonic() - t0) * 1000)
|
||||
logger.error(
|
||||
"Gateway HTTP error %d for agent=%s event=%s duration=%.2fs",
|
||||
exc.response.status_code, agent_id, event_id, duration,
|
||||
"Router HTTP %d for agent=%s event=%s duration=%dms",
|
||||
exc.response.status_code, agent_id, event_id, duration_ms,
|
||||
)
|
||||
if self._on_gateway_error:
|
||||
self._on_gateway_error(f"http_{exc.response.status_code}")
|
||||
await _write_audit(
|
||||
http_client, self._console_url, self._internal_token,
|
||||
event="matrix.error",
|
||||
agent_id=agent_id, node_id=self._node_id,
|
||||
room_id=room_id, event_id=event_id,
|
||||
status="error", error_code=f"router_http_{exc.response.status_code}",
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
|
||||
except (httpx.ConnectError, httpx.TimeoutException) as exc:
|
||||
duration = time.monotonic() - t0
|
||||
duration_ms = int((time.monotonic() - t0) * 1000)
|
||||
logger.error(
|
||||
"Gateway network error for agent=%s event=%s: %s duration=%.2fs",
|
||||
agent_id, event_id, exc, duration,
|
||||
"Router network error agent=%s event=%s: %s duration=%dms",
|
||||
agent_id, event_id, exc, duration_ms,
|
||||
)
|
||||
if self._on_gateway_error:
|
||||
self._on_gateway_error("network_error")
|
||||
await _write_audit(
|
||||
http_client, self._console_url, self._internal_token,
|
||||
event="matrix.error",
|
||||
agent_id=agent_id, node_id=self._node_id,
|
||||
room_id=room_id, event_id=event_id,
|
||||
status="error", error_code="router_network_error",
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
duration = time.monotonic() - t0
|
||||
duration_ms = int((time.monotonic() - t0) * 1000)
|
||||
logger.error(
|
||||
"Unexpected error invoking gateway for agent=%s event=%s: %s",
|
||||
"Unexpected router error agent=%s event=%s: %s",
|
||||
agent_id, event_id, exc,
|
||||
)
|
||||
if self._on_gateway_error:
|
||||
self._on_gateway_error("unexpected")
|
||||
await _write_audit(
|
||||
http_client, self._console_url, self._internal_token,
|
||||
event="matrix.error",
|
||||
agent_id=agent_id, node_id=self._node_id,
|
||||
room_id=room_id, event_id=event_id,
|
||||
status="error", error_code="router_unexpected",
|
||||
duration_ms=duration_ms,
|
||||
)
|
||||
|
||||
# ── Egress: send reply back to Matrix ──────────────────────────────────
|
||||
if not invoke_ok:
|
||||
# No reply on error in M1 — just audit (avoids spam in room)
|
||||
return
|
||||
|
||||
if not reply_text:
|
||||
logger.warning("Empty reply from router for agent=%s event=%s", agent_id, event_id)
|
||||
return
|
||||
|
||||
# Truncate if needed
|
||||
send_text = reply_text[:_REPLY_TEXT_MAX]
|
||||
txn_id = MatrixClient.make_txn_id(room_id, event_id)
|
||||
|
||||
send_t0 = time.monotonic()
|
||||
try:
|
||||
await client.send_text(room_id, send_text, txn_id)
|
||||
send_duration_ms = int((time.monotonic() - send_t0) * 1000)
|
||||
|
||||
if self._on_message_replied:
|
||||
self._on_message_replied(room_id, agent_id, "ok")
|
||||
|
||||
await _write_audit(
|
||||
http_client, self._console_url, self._internal_token,
|
||||
event="matrix.agent.replied",
|
||||
agent_id=agent_id, node_id=self._node_id,
|
||||
room_id=room_id, event_id=event_id,
|
||||
status="ok",
|
||||
duration_ms=send_duration_ms,
|
||||
data={
|
||||
"reply_len": len(send_text),
|
||||
"truncated": len(reply_text) > _REPLY_TEXT_MAX,
|
||||
"router_duration_ms": duration_ms,
|
||||
},
|
||||
)
|
||||
logger.info(
|
||||
"Reply sent: agent=%s event=%s reply_len=%d send_ms=%d",
|
||||
agent_id, event_id, len(send_text), send_duration_ms,
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
send_duration_ms = int((time.monotonic() - send_t0) * 1000)
|
||||
logger.error(
|
||||
"Failed to send Matrix reply agent=%s event=%s: %s",
|
||||
agent_id, event_id, exc,
|
||||
)
|
||||
if self._on_message_replied:
|
||||
self._on_message_replied(room_id, agent_id, "error")
|
||||
if self._on_gateway_error:
|
||||
self._on_gateway_error("matrix_send_error")
|
||||
await _write_audit(
|
||||
http_client, self._console_url, self._internal_token,
|
||||
event="matrix.error",
|
||||
agent_id=agent_id, node_id=self._node_id,
|
||||
room_id=room_id, event_id=event_id,
|
||||
status="error", error_code="matrix_send_failed",
|
||||
duration_ms=send_duration_ms,
|
||||
)
|
||||
|
||||
@@ -141,14 +141,23 @@ async def lifespan(app_: Any):
|
||||
if _PROM_OK:
|
||||
_gateway_errors.labels(error_type=error_type).inc()
|
||||
|
||||
def _on_replied(room_id: str, agent_id: str, status: str) -> None:
|
||||
if _PROM_OK:
|
||||
_messages_replied.labels(
|
||||
room_id=room_id, agent_id=agent_id, status=status
|
||||
).inc()
|
||||
|
||||
ingress = MatrixIngressLoop(
|
||||
matrix_homeserver_url=_cfg.matrix_homeserver_url,
|
||||
matrix_access_token=_cfg.matrix_access_token,
|
||||
matrix_user_id=_cfg.matrix_user_id,
|
||||
gateway_url=_cfg.dagi_gateway_url,
|
||||
router_url=_cfg.dagi_gateway_url,
|
||||
node_id=_cfg.node_id,
|
||||
room_map=_room_map,
|
||||
sofiia_console_url=_cfg.sofiia_console_url,
|
||||
sofiia_internal_token=_cfg.sofiia_internal_token,
|
||||
on_message_received=_on_msg,
|
||||
on_message_replied=_on_replied,
|
||||
on_gateway_error=_on_gw_error,
|
||||
)
|
||||
_ingress_task = asyncio.create_task(
|
||||
|
||||
Reference in New Issue
Block a user