# microdao-daarion/services/matrix-bridge-dagi/app/main.py

"""
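matrix-bridge-dagi — FastAPI entry point (began as the Phase M1 scaffold).
Bridges Matrix/Element rooms to DAGI agents via the Gateway.

M1 scope: 1 room ↔ 1 agent (Sofiia), audit via the sofiia-console internal endpoint.
Later phases wired in below: mixed rooms with 1 room → N agents (M2.x), the
operator control channel (M3.0), rate limiting (H1), the backpressure queue
(H2) and Prometheus metrics (H3).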
"""
import asyncio
import logging
import os
import time
from contextlib import asynccontextmanager
from typing import Any, Dict, Optional

from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware

try:
    import httpx as _httpx
    _HTTPX_OK = True
except ImportError:  # pragma: no cover
    _httpx = None  # type: ignore
    _HTTPX_OK = False

try:
    from prometheus_client import (
        Counter, Histogram, Gauge,
        generate_latest, CONTENT_TYPE_LATEST,
        CollectorRegistry, REGISTRY,
    )
    _PROM_OK = True
except ImportError:  # pragma: no cover
    _PROM_OK = False

from .config import BridgeConfig, load_config
from .control import ControlConfig, parse_control_config
from .ingress import MatrixIngressLoop
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
from .rate_limit import InMemoryRateLimiter
from .room_mapping import RoomMappingConfig, parse_room_map

# Process-wide logging: INFO level, one timestamped line per record.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(name)s %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Service logger shared by everything in this module.
logger = logging.getLogger("matrix-bridge-dagi")

# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
# The metric objects below exist only when prometheus_client could be imported,
# so every use elsewhere in this module must stay guarded by `_PROM_OK`.
if _PROM_OK:
    # ── Message flow counters ────────────────────────────────────────────────
    _messages_received = Counter(
        name="matrix_bridge_messages_received_total",
        documentation="Total Matrix messages received",
        labelnames=["room_id", "agent_id"],
    )
    _messages_replied = Counter(
        name="matrix_bridge_messages_replied_total",
        documentation="Total agent replies sent to Matrix",
        labelnames=["room_id", "agent_id", "status"],
    )
    _messages_rate_limited = Counter(
        name="matrix_bridge_rate_limited_total",
        documentation="Messages dropped by rate limiter",
        labelnames=["room_id", "agent_id", "limit_type"],
    )
    _gateway_errors = Counter(
        name="matrix_bridge_gateway_errors_total",
        documentation="Errors by stage (sync, invoke, send, audit)",
        labelnames=["error_type"],
    )

    # ── Latency histograms ───────────────────────────────────────────────────
    _invoke_latency = Histogram(
        name="matrix_bridge_invoke_duration_seconds",
        documentation="Latency of DAGI Router infer call",
        labelnames=["agent_id"],
        buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
    )
    _send_latency = Histogram(
        name="matrix_bridge_send_duration_seconds",
        documentation="Latency of Matrix send_text call",
        labelnames=["agent_id"],
        buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    )

    # ── Service / rate-limiter gauges ────────────────────────────────────────
    _bridge_up = Gauge(
        name="matrix_bridge_up",
        documentation="1 if bridge started successfully",
    )
    _rate_limiter_active_rooms = Gauge(
        name="matrix_bridge_rate_limiter_active_rooms",
        documentation="Rooms with activity in the current rate-limit window",
    )
    _rate_limiter_active_senders = Gauge(
        name="matrix_bridge_rate_limiter_active_senders",
        documentation="Senders with activity in the current rate-limit window",
    )

    # ── H2: work-queue metrics ───────────────────────────────────────────────
    _queue_size = Gauge(
        name="matrix_bridge_queue_size",
        documentation="Current number of pending items in the work queue",
    )
    _queue_dropped = Counter(
        name="matrix_bridge_queue_dropped_total",
        documentation="Messages dropped because queue was full",
        labelnames=["room_id", "agent_id"],
    )
    _queue_wait = Histogram(
        name="matrix_bridge_queue_wait_seconds",
        documentation="Time between enqueue and worker start processing",
        labelnames=["agent_id"],
        buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
    )

    # ── M2.2: mixed-room routing metrics ─────────────────────────────────────
    _routed_total = Counter(
        name="matrix_bridge_routed_total",
        documentation="Successful message routing by reason",
        labelnames=["agent_id", "reason"],
    )
    _route_rejected_total = Counter(
        name="matrix_bridge_route_rejected_total",
        documentation="Messages rejected during routing (unknown agent, bad slash, etc.)",
        labelnames=["room_id", "reason"],
    )
    _active_room_agent_locks = Gauge(
        name="matrix_bridge_active_room_agent_locks",
        documentation="Number of room-agent pairs currently holding a concurrency lock",
    )

    # ── M3.0: control channel ────────────────────────────────────────────────
    _control_commands_total = Counter(
        name="matrix_bridge_control_commands_total",
        documentation="Total control commands received from authorized operators",
        labelnames=["sender", "verb", "subcommand"],
    )

# ── Startup state ─────────────────────────────────────────────────────────────
# Monotonic timestamp taken at import time; /health derives `uptime_s` from it.
_START_TIME = time.monotonic()
# Loaded bridge configuration; None until lifespan() has called load_config().
_cfg: Optional[BridgeConfig] = None
# Message of the RuntimeError/ValueError caught during startup, if any.
_config_error: Optional[str] = None
# Results of the one-off startup probes; None means "not probed yet".
_matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None
# Parsed room → agent mapping built from the BRIDGE_ROOM_MAP env variable.
_room_map: Optional[RoomMappingConfig] = None
# Parsed mixed-room (1 room → N agents) mapping; None when not configured.
_mixed_room_config: Optional[MixedRoomConfig] = None
# Parsed operator control-channel settings; None when not configured.
_control_config: Optional[ControlConfig] = None
# In-memory rate limiter created during startup.
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None   # read by /health (queue size, active lock count)
# Handle of the background ingress task (None while no ingress loop is running).
_ingress_task: Optional[asyncio.Task] = None
# Event used to ask the ingress loop to stop.
_ingress_stop: Optional[asyncio.Event] = None


async def _probe_url(url: str, timeout: float = 5.0) -> bool:
    """Quick GET probe — returns True if HTTP 2xx."""
    if not _HTTPX_OK or not url:
        return False
    try:
        async with _httpx.AsyncClient(timeout=timeout) as client:
            r = await client.get(url)
            return r.status_code < 400
    except Exception:
        return False

# ── Lifespan ──────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app_: Any):
    """Application lifespan: load config, probe dependencies, run the ingress loop.

    Startup (before ``yield``):
      1. Load the bridge config and parse the room map, the optional mixed-room
         map and the optional operator control config.
      2. Create the in-memory rate limiter.
      3. Probe the Matrix homeserver and the DAGI Gateway; the results are only
         recorded and logged, they do not abort startup.
      4. If at least one room is mapped, start ``MatrixIngressLoop.run`` as a
         background task with the Prometheus callbacks wired in.

    A ``RuntimeError``/``ValueError`` raised during startup is stored in
    ``_config_error`` and logged; the app still starts so ``/health`` can
    report the failure.

    Shutdown (after ``yield``): set the stop event, cancel the ingress task and
    wait up to 5 seconds for it to finish.

    Args:
        app_: The application instance passed in by the framework (unused).
    """
    global _cfg, _config_error, _matrix_reachable, _gateway_reachable
    global _room_map, _mixed_room_config, _control_config, _rate_limiter, _ingress_loop
    # Both names are assigned below. Without this declaration they would be
    # function-locals, and the shutdown code would raise UnboundLocalError
    # whenever the ingress loop was never started (no rooms / config error).
    global _ingress_task, _ingress_stop
    try:
        _cfg = load_config()

        # Parse regular room mapping (M1/M2.0: 1 room → 1 agent)
        _room_map = parse_room_map(
            os.getenv("BRIDGE_ROOM_MAP", ""),
            _cfg.bridge_allowed_agents,
        )

        # Parse mixed room mapping (M2.1: 1 room → N agents)
        if _cfg.bridge_mixed_room_map:
            _mixed_room_config = parse_mixed_room_map(
                _cfg.bridge_mixed_room_map,
                _cfg.bridge_mixed_defaults,
                _cfg.bridge_allowed_agents,
                max_agents_per_room=_cfg.max_agents_per_mixed_room,
            )
            logger.info(
                "✅ Mixed room config: %d rooms, agents=%s",
                _mixed_room_config.total_rooms,
                [a for r in _mixed_room_config.rooms.values() for a in r.agents],
            )
        else:
            _mixed_room_config = None

        # H1: Rate limiter (inmemory, per config)
        _rate_limiter = InMemoryRateLimiter(
            room_rpm=_cfg.rate_limit_room_rpm,
            sender_rpm=_cfg.rate_limit_sender_rpm,
        )
        logger.info(
            "✅ Rate limiter: room_rpm=%d sender_rpm=%d",
            _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
        )

        # M3.0: Operator control channel
        if _cfg.bridge_operator_allowlist or _cfg.bridge_control_rooms:
            _control_config = parse_control_config(
                _cfg.bridge_operator_allowlist,
                _cfg.bridge_control_rooms,
            )
        else:
            _control_config = None

        mixed_count = _mixed_room_config.total_rooms if _mixed_room_config else 0
        ctrl_rooms = len(_control_config.control_rooms) if _control_config else 0
        ctrl_ops = len(_control_config.operator_allowlist) if _control_config else 0
        logger.info(
            "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
            "agents=%s mappings=%d mixed_rooms=%d ctrl_rooms=%d ctrl_operators=%d",
            _cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
            list(_cfg.bridge_allowed_agents),
            _room_map.total_mappings, mixed_count, ctrl_rooms, ctrl_ops,
        )

        # Connectivity smoke probes (non-blocking failures)
        _matrix_reachable = await _probe_url(
            f"{_cfg.matrix_homeserver_url}/_matrix/client/versions"
        )
        _gateway_reachable = await _probe_url(
            f"{_cfg.dagi_gateway_url}/health"
        )
        if _matrix_reachable:
            logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url)
        else:
            logger.warning("⚠️  Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url)
        if _gateway_reachable:
            logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url)
        else:
            logger.warning("⚠️  DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
        if _PROM_OK:
            _bridge_up.set(1)

        # Start ingress loop (fire-and-forget asyncio task)
        _has_rooms = (_room_map and _room_map.total_mappings > 0) or (
            _mixed_room_config and _mixed_room_config.total_rooms > 0
        )
        if _has_rooms:
            _ingress_stop = asyncio.Event()

            # The callbacks below translate ingress-loop events into metric
            # updates; each is a no-op when prometheus_client is unavailable.
            def _on_msg(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _messages_received.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_gw_error(error_type: str) -> None:
                if _PROM_OK:
                    _gateway_errors.labels(error_type=error_type).inc()

            def _on_replied(room_id: str, agent_id: str, status: str) -> None:
                if _PROM_OK:
                    _messages_replied.labels(
                        room_id=room_id, agent_id=agent_id, status=status
                    ).inc()

            def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
                if _PROM_OK:
                    _messages_rate_limited.labels(
                        room_id=room_id, agent_id=agent_id, limit_type=limit_type
                    ).inc()
                    # Refresh the limiter gauges while we are here.
                    if _rate_limiter is not None:
                        stats = _rate_limiter.stats()
                        _rate_limiter_active_rooms.set(stats["active_rooms"])
                        _rate_limiter_active_senders.set(stats["active_senders"])

            def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _invoke_latency.labels(agent_id=agent_id).observe(duration_s)

            def _on_send_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _send_latency.labels(agent_id=agent_id).observe(duration_s)

            # H2 callbacks
            def _on_queue_dropped(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_queue_size(size: int) -> None:
                if _PROM_OK:
                    _queue_size.set(size)

            def _on_queue_wait(agent_id: str, wait_s: float) -> None:
                if _PROM_OK:
                    _queue_wait.labels(agent_id=agent_id).observe(wait_s)

            # M2.2 callbacks
            def _on_routed(agent_id: str, reason: str) -> None:
                if _PROM_OK:
                    _routed_total.labels(agent_id=agent_id, reason=reason).inc()

            def _on_route_rejected(room_id: str, reason: str) -> None:
                if _PROM_OK:
                    _route_rejected_total.labels(room_id=room_id, reason=reason).inc()

            # M3.0 callbacks
            def _on_control_command(sender: str, verb: str, subcommand: str) -> None:
                if _PROM_OK:
                    _control_commands_total.labels(
                        sender=sender, verb=verb, subcommand=subcommand
                    ).inc()

            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
                matrix_user_id=_cfg.matrix_user_id,
                router_url=_cfg.dagi_gateway_url,
                node_id=_cfg.node_id,
                room_map=_room_map,
                sofiia_console_url=_cfg.sofiia_console_url,
                sofiia_internal_token=_cfg.sofiia_internal_token,
                rate_limiter=_rate_limiter,
                queue_max_events=_cfg.queue_max_events,
                worker_concurrency=_cfg.worker_concurrency,
                queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
                mixed_room_config=_mixed_room_config,
                unknown_agent_behavior=_cfg.unknown_agent_behavior,
                max_slash_len=_cfg.max_slash_len,
                mixed_concurrency_cap=_cfg.mixed_concurrency_cap,
                on_message_received=_on_msg,
                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
                on_rate_limited=_on_rate_limited,
                on_queue_dropped=_on_queue_dropped,
                on_queue_size=_on_queue_size,
                on_invoke_latency=_on_invoke_latency,
                on_send_latency=_on_send_latency,
                on_queue_wait=_on_queue_wait,
                on_routed=_on_routed,
                on_route_rejected=_on_route_rejected,
                control_config=_control_config,
                control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
                on_control_command=_on_control_command,
            )
            logger.info(
                "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
                _cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
            )
            _ingress_loop = ingress
            _ingress_task = asyncio.create_task(
                ingress.run(_ingress_stop),
                name="matrix_ingress_loop",
            )
            logger.info("✅ Ingress loop task started")
        else:
            logger.warning("⚠️  No room mappings — ingress loop NOT started")

    except (RuntimeError, ValueError) as exc:
        # Keep the app serving so /health can surface the configuration error.
        _config_error = str(exc)
        logger.error("❌ Config error: %s", _config_error)
        if _PROM_OK:
            _bridge_up.set(0)
    try:
        yield
    finally:
        # Shutdown: cancel ingress loop. Runs even if the serving phase ends
        # with an exception, so the background task is never left dangling.
        if _ingress_stop:
            _ingress_stop.set()
        if _ingress_task and not _ingress_task.done():
            # NOTE(review): the task is cancelled right after the stop event is
            # set, so a graceful drain inside ingress.run() presumably gets no
            # time to finish — confirm whether cancel() should be delayed.
            _ingress_task.cancel()
            try:
                await asyncio.wait_for(_ingress_task, timeout=5.0)
            except (asyncio.CancelledError, asyncio.TimeoutError):
                pass
        logger.info("matrix-bridge-dagi shutting down")

# ── App ───────────────────────────────────────────────────────────────────────
# ASGI application; startup and shutdown are driven by `lifespan`.
app = FastAPI(lifespan=lifespan, title="matrix-bridge-dagi", version="0.1.0")

# NOTE(review): CORS is open to every origin and header for GET/POST —
# presumably acceptable because the service is internal-only; confirm.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["GET", "POST"],
    allow_origins=["*"],
)

# ── Health ────────────────────────────────────────────────────────────────────
@app.get("/health")
async def health() -> Dict[str, Any]:
    """Report service status plus a snapshot of the loaded configuration.

    When configuration failed or has not been loaded, a short payload with
    ``ok: False`` and an ``error`` message is returned. Otherwise ``ok`` is
    True only if both startup probes (Matrix homeserver and DAGI Gateway)
    succeeded. As a side effect, the active room-agent lock gauge is refreshed
    when metrics are available and an ingress loop exists.
    """
    elapsed_s = int(time.monotonic() - _START_TIME)

    if _PROM_OK and _ingress_loop is not None:
        _active_room_agent_locks.set(_ingress_loop.active_lock_count)

    # Degraded answer: nothing beyond static service info can be reported.
    if _cfg is None or _config_error:
        return {
            "ok": False,
            "service": "matrix-bridge-dagi",
            "version": "0.1.0",
            "build": os.getenv("BUILD_SHA", "dev"),
            "uptime_s": elapsed_s,
            "error": _config_error or "service not initialised",
        }

    cfg = _cfg
    probes_ok = (_matrix_reachable is True) and (_gateway_reachable is True)

    # Mixed-room figures default to zero when no mixed rooms are configured.
    if _mixed_room_config:
        mixed_rooms_count = _mixed_room_config.total_rooms
        mixed_agents_total = sum(
            len(room.agents) for room in _mixed_room_config.rooms.values()
        )
    else:
        mixed_rooms_count = 0
        mixed_agents_total = 0

    limiter_stats = _rate_limiter.stats() if _rate_limiter else None

    if _ingress_loop:
        pending_items = _ingress_loop.queue_size
        held_locks = _ingress_loop.active_lock_count
    else:
        pending_items = 0
        held_locks = 0

    if _control_config:
        control_enabled = _control_config.is_enabled
        control_rooms_count = len(_control_config.control_rooms)
        operators_count = len(_control_config.operator_allowlist)
    else:
        control_enabled = False
        control_rooms_count = 0
        operators_count = 0

    queue_info = {
        "size": pending_items,
        "max": cfg.queue_max_events,
        "workers": cfg.worker_concurrency,
    }
    guard_rails = {
        "max_agents_per_room": cfg.max_agents_per_mixed_room,
        "max_slash_len": cfg.max_slash_len,
        "unknown_agent_behavior": cfg.unknown_agent_behavior,
        "concurrency_cap": cfg.mixed_concurrency_cap,
        "active_room_agent_locks": held_locks,
    }
    control_info = {
        "enabled": control_enabled,
        "control_rooms_count": control_rooms_count,
        "operators_count": operators_count,
        "unauthorized_behavior": cfg.control_unauthorized_behavior,
    }

    return {
        "ok": probes_ok,
        "service": "matrix-bridge-dagi",
        "version": "0.1.0",
        "build": cfg.build_sha,
        "build_time": cfg.build_time,
        "env": os.getenv("ENV", "dev"),
        "uptime_s": elapsed_s,
        "node_id": cfg.node_id,
        "homeserver": cfg.matrix_homeserver_url,
        "matrix_reachable": _matrix_reachable,
        "bridge_user": cfg.matrix_user_id,
        "sofiia_room_id": cfg.sofiia_room_id,
        "allowed_agents": list(cfg.bridge_allowed_agents),
        "gateway": cfg.dagi_gateway_url,
        "gateway_reachable": _gateway_reachable,
        "mappings_count": _room_map.total_mappings if _room_map else 0,
        "mixed_rooms_count": mixed_rooms_count,
        "total_agents_in_mixed_rooms": mixed_agents_total,
        "config_ok": True,
        "rate_limiter": limiter_stats,
        "queue": queue_info,
        "mixed_guard_rails": guard_rails,
        "control_channel": control_info,
    }


# ── Bridge Mappings (read-only ops endpoint) ───────────────────────────────────
@app.get("/bridge/mappings")
async def bridge_mappings() -> Dict[str, Any]:
    """Summarise the room-to-agent mappings for operators (read-only).

    Returns ``ok: False`` with empty lists while the config or the room map is
    not loaded. Otherwise returns the regular mapping summary, the mixed-room
    summary, the sorted control rooms and the operator count. No access tokens
    are read here; the contents of the ``as_summary()`` results are defined by
    the mapping classes.
    """
    if _room_map is None or _cfg is None:
        return {
            "ok": False,
            "error": _config_error or "service not initialised",
            "mappings": [],
            "mixed_rooms": [],
        }

    # Optional sections fall back to empty/zero values when not configured.
    if _mixed_room_config:
        mixed_total = _mixed_room_config.total_rooms
        mixed_summary = _mixed_room_config.as_summary()
    else:
        mixed_total = 0
        mixed_summary = []

    if _control_config:
        control_rooms = sorted(_control_config.control_rooms)
        operators_count = len(_control_config.operator_allowlist)
    else:
        control_rooms = []
        operators_count = 0

    return {
        "ok": True,
        "total": _room_map.total_mappings,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "mappings": _room_map.as_summary(),
        "mixed_rooms_total": mixed_total,
        "mixed_rooms": mixed_summary,
        "control_rooms": control_rooms,
        "control_operators_count": operators_count,
    }


# ── Metrics ───────────────────────────────────────────────────────────────────
@app.get("/metrics")
async def metrics():
    """Expose the default Prometheus registry in text exposition format.

    Falls back to a one-line plain-text comment when prometheus_client is not
    installed.
    """
    if _PROM_OK:
        payload = generate_latest(REGISTRY)
        content_type = CONTENT_TYPE_LATEST
    else:
        payload = "# prometheus_client not available\n"
        content_type = "text/plain"
    return Response(payload, media_type=content_type)