feat(matrix-bridge-dagi): add matrix client wrapper and synapse setup (PR-M1.1)

- adds MatrixClient with send_text/sync_poll/join_room/whoami (idempotent via txn_id)
- LRU dedupe for incoming event_ids (2048 capacity)
- exponential backoff retry (max 3 attempts) for 429/5xx/network errors
- extract_room_messages: filters own messages, non-text, duplicates
- health endpoint now probes matrix_reachable + gateway_reachable at startup
- adds docker-compose.synapse-node1.yml (Synapse + Postgres for NODA1)
- adds ops/runbook-matrix-setup.md (10-step setup: DNS, config, bot, room, .env)
- 19 tests passing, no real Synapse required

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:38:54 -08:00
parent 1d8482f4c1
commit d8506da179
5 changed files with 1013 additions and 5 deletions

View File

@@ -8,11 +8,18 @@ import logging
import os
import time
from contextlib import asynccontextmanager
from typing import Any, Dict
from typing import Any, Dict, Optional
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
try:
import httpx as _httpx
_HTTPX_OK = True
except ImportError: # pragma: no cover
_httpx = None # type: ignore
_HTTPX_OK = False
try:
from prometheus_client import (
Counter, Histogram, Gauge,
@@ -60,13 +67,27 @@ if _PROM_OK:
# ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic()
_cfg: BridgeConfig | None = None
_config_error: str | None = None
_cfg: Optional[BridgeConfig] = None
_config_error: Optional[str] = None
_matrix_reachable: Optional[bool] = None # probed at startup
_gateway_reachable: Optional[bool] = None # probed at startup
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
"""Quick GET probe — returns True if HTTP 2xx."""
if not _HTTPX_OK or not url:
return False
try:
async with _httpx.AsyncClient(timeout=timeout) as client:
r = await client.get(url)
return r.status_code < 400
except Exception:
return False
# ── Lifespan ──────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app_: Any):
global _cfg, _config_error
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
try:
_cfg = load_config()
logger.info(
@@ -74,6 +95,21 @@ async def lifespan(app_: Any):
_cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
_cfg.sofiia_room_id, list(_cfg.bridge_allowed_agents),
)
# Connectivity smoke probes (non-blocking failures)
_matrix_reachable = await _probe_url(
f"{_cfg.matrix_homeserver_url}/_matrix/client/versions"
)
_gateway_reachable = await _probe_url(
f"{_cfg.dagi_gateway_url}/health"
)
if _matrix_reachable:
logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url)
else:
logger.warning("⚠️ Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url)
if _gateway_reachable:
logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url)
else:
logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
if _PROM_OK:
_bridge_up.set(1)
except RuntimeError as exc:
@@ -111,8 +147,11 @@ async def health() -> Dict[str, Any]:
"uptime_s": uptime,
"error": _config_error or "service not initialised",
}
matrix_ok = _matrix_reachable is True
gateway_ok = _gateway_reachable is True
overall_ok = matrix_ok and gateway_ok
return {
"ok": True,
"ok": overall_ok,
"service": "matrix-bridge-dagi",
"version": "0.1.0",
"build": _cfg.build_sha,
@@ -121,10 +160,12 @@ async def health() -> Dict[str, Any]:
"uptime_s": uptime,
"node_id": _cfg.node_id,
"homeserver": _cfg.matrix_homeserver_url,
"matrix_reachable": _matrix_reachable,
"bridge_user": _cfg.matrix_user_id,
"sofiia_room_id": _cfg.sofiia_room_id,
"allowed_agents": list(_cfg.bridge_allowed_agents),
"gateway": _cfg.dagi_gateway_url,
"gateway_reachable": _gateway_reachable,
"config_ok": True,
}