915 lines
39 KiB
Python
915 lines
39 KiB
Python
"""
|
||
matrix-bridge-dagi — Phase M1 scaffold
|
||
Bridges Matrix/Element rooms to DAGI agents via Gateway.
|
||
|
||
M1 scope: 1 room ↔ 1 agent (Sofiia), audit via sofiia-console internal endpoint.
|
||
"""
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
import time
|
||
from contextlib import asynccontextmanager
|
||
from typing import Any, Dict, Optional
|
||
|
||
from fastapi import FastAPI, Response
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
|
||
try:
|
||
import httpx as _httpx
|
||
_HTTPX_OK = True
|
||
except ImportError: # pragma: no cover
|
||
_httpx = None # type: ignore
|
||
_HTTPX_OK = False
|
||
|
||
try:
|
||
from prometheus_client import (
|
||
Counter, Histogram, Gauge,
|
||
generate_latest, CONTENT_TYPE_LATEST,
|
||
CollectorRegistry, REGISTRY,
|
||
)
|
||
_PROM_OK = True
|
||
except ImportError: # pragma: no cover
|
||
_PROM_OK = False
|
||
|
||
from .config import BridgeConfig, load_config
|
||
from .control import ControlConfig, parse_control_config
|
||
from .control_limiter import ControlRateLimiter
|
||
from .event_store import EventStore
|
||
from .node_policy import parse_node_policy
|
||
from .ingress import MatrixIngressLoop
|
||
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
|
||
from .rate_limit import InMemoryRateLimiter
|
||
from .room_mapping import RoomMappingConfig, parse_room_map
|
||
|
||
# Root logging setup: single-line records with timestamp, level and logger name.
# basicConfig is a no-op if a host process already configured the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
logger = logging.getLogger("matrix-bridge-dagi")
|
||
|
||
# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
# All metric objects below exist only when prometheus_client imported cleanly;
# every call site therefore guards on _PROM_OK before touching them.
if _PROM_OK:
    _messages_received = Counter(
        "matrix_bridge_messages_received_total",
        "Total Matrix messages received",
        ["room_id", "agent_id"],
    )
    _messages_replied = Counter(
        "matrix_bridge_messages_replied_total",
        "Total agent replies sent to Matrix",
        ["room_id", "agent_id", "status"],
    )
    _messages_rate_limited = Counter(
        "matrix_bridge_rate_limited_total",
        "Messages dropped by rate limiter",
        ["room_id", "agent_id", "limit_type"],
    )
    _gateway_errors = Counter(
        "matrix_bridge_gateway_errors_total",
        "Errors by stage (sync, invoke, send, audit)",
        ["error_type"],
    )
    _invoke_latency = Histogram(
        "matrix_bridge_invoke_duration_seconds",
        "Latency of DAGI Router infer call",
        ["agent_id", "node_id"],  # M5.1: per-node latency breakdown
        buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
    )
    _send_latency = Histogram(
        "matrix_bridge_send_duration_seconds",
        "Latency of Matrix send_text call",
        ["agent_id"],
        buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    )
    _bridge_up = Gauge(
        "matrix_bridge_up",
        "1 if bridge started successfully; 0 on config error",
        ["node_id"],  # M7.1: per-node label for multi-node deployments
    )
    _rate_limiter_active_rooms = Gauge(
        "matrix_bridge_rate_limiter_active_rooms",
        "Rooms with activity in the current rate-limit window",
    )
    _rate_limiter_active_senders = Gauge(
        "matrix_bridge_rate_limiter_active_senders",
        "Senders with activity in the current rate-limit window",
    )
    # H2: Queue metrics
    _queue_size = Gauge(
        "matrix_bridge_queue_size",
        "Current number of pending items in the work queue",
    )
    _queue_dropped = Counter(
        "matrix_bridge_queue_dropped_total",
        "Messages dropped because queue was full",
        ["room_id", "agent_id"],
    )
    _queue_wait = Histogram(
        "matrix_bridge_queue_wait_seconds",
        "Time between enqueue and worker start processing",
        ["agent_id"],
        buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
    )
    # M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct)
    # M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter
    _routing_reasons_total = Counter(
        "matrix_bridge_routing_reasons_total",
        "Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
        ["agent_id", "reason"],
    )
    _route_rejected_total = Counter(
        "matrix_bridge_route_rejected_total",
        "Messages rejected during routing (unknown agent, bad slash, etc.)",
        ["room_id", "reason"],
    )
    _active_room_agent_locks = Gauge(
        "matrix_bridge_active_room_agent_locks",
        "Number of room-agent pairs currently holding a concurrency lock",
    )
    # M3.0: Control channel
    _control_commands_total = Counter(
        "matrix_bridge_control_commands_total",
        "Total control commands received from authorized operators",
        ["sender", "verb", "subcommand"],
    )
    _control_rate_limited_total = Counter(
        "matrix_bridge_control_rate_limited_total",
        "Total control commands rejected by rate limiter or cooldown",
        ["scope"],
    )
    _dedupe_persistent_hits_total = Counter(
        "matrix_bridge_dedupe_persistent_hits_total",
        "Total events dropped by persistent (SQLite) deduplication",
        ["room_id"],
    )
    _dedupe_persistent_inserts_total = Counter(
        "matrix_bridge_dedupe_persistent_inserts_total",
        "Total events marked as processed in persistent dedupe store",
    )
    # M5.0: node-aware routing — primary routed counter (unique name, no collision)
    _routed_total = Counter(
        "matrix_bridge_routed_total",
        "Total messages successfully routed, by agent, resolved node, and node source",
        ["agent_id", "node_id", "source"],
    )
    _node_rejected_total = Counter(
        "matrix_bridge_node_rejected_total",
        "Total messages with rejected (non-allowlisted) node kwarg",
        ["node_id"],
    )
    # M8.0: soft-failover metrics
    _failover_total = Counter(
        "matrix_bridge_failover_total",
        "Total successful soft-failovers by node transition and reason",
        ["from_node", "to_node", "reason"],
    )
    _node_health_state = Gauge(
        "matrix_bridge_node_health_state",
        "Node health state: 1=healthy 0.5=degraded 0=down",
        ["node_id"],
    )
    # M8.1: sticky routing metrics
    _sticky_set_total = Counter(
        "matrix_bridge_sticky_node_total",
        "Total sticky routing entries set after failover, by preferred node and scope",
        ["node_id", "scope"],
    )
    _sticky_active = Gauge(
        "matrix_bridge_sticky_node_active",
        "Current count of active sticky routing entries",
        [],
    )
|
||
|
||
# ── Startup state ─────────────────────────────────────────────────────────────
# Module-level mutable state, populated by lifespan() and read by the HTTP
# endpoints (/health, /bridge/mappings, /v1/debug/inject_event).
_START_TIME = time.monotonic()  # baseline for /health "uptime_s"
_cfg: Optional[BridgeConfig] = None
# M5.1: in-memory per-node counters (lightweight, for !status reply)
from collections import defaultdict as _defaultdict
_node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0})
_config_error: Optional[str] = None  # set when config loading/parsing fails
_matrix_reachable: Optional[bool] = None  # None until startup probe runs
_gateway_reachable: Optional[bool] = None  # None until startup probe runs
_room_map: Optional[RoomMappingConfig] = None
_mixed_room_config: Optional[MixedRoomConfig] = None
_control_config: Optional[ControlConfig] = None
_event_store: Optional[EventStore] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None  # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
_sticky_cache: Optional[Any] = None  # M8.1: StickyNodeCache instance
_confirm_store: Optional[Any] = None  # M9.0: ConfirmStore instance
_dummy_http_client: Optional[Any] = None  # M11: soak inject endpoint (debug only)
|
||
|
||
|
||
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
    """Fire one GET at *url*; True iff it answers with a non-error (< 400) status.

    Returns False when httpx is unavailable, the URL is empty, or any
    network/protocol error occurs — this is a best-effort smoke probe only.
    """
    if not url or not _HTTPX_OK:
        return False
    try:
        async with _httpx.AsyncClient(timeout=timeout) as probe:
            resp = await probe.get(url)
            return resp.status_code < 400
    except Exception:
        return False
|
||
|
||
# ── Lifespan ──────────────────────────────────────────────────────────────────
|
||
@asynccontextmanager
async def lifespan(app_: Any):
    """FastAPI lifespan: load config, wire metric callbacks, run the ingress loop.

    Startup errors from config loading/parsing are caught and recorded in
    ``_config_error`` so the app still boots and serves /health with ok=false.
    Everything after ``yield`` is best-effort shutdown cleanup.
    """
    global _cfg, _config_error, _matrix_reachable, _gateway_reachable
    global _room_map, _mixed_room_config, _control_config, _rate_limiter, _ingress_loop
    # Fix: these three were previously assigned WITHOUT a `global` declaration,
    # so the module-level copies stayed None (/health misreported persistent
    # dedupe) and the shutdown path raised UnboundLocalError whenever startup
    # aborted before the `if _has_rooms:` branch ran.
    global _event_store, _ingress_stop, _ingress_task
    global _sticky_cache, _confirm_store, _dummy_http_client
    # Locals the shutdown path must see even when startup aborts early
    # (replaces the previous fragile `"_name" in dir()` checks).
    _prune_task: Optional[asyncio.Task] = None
    _policy_store: Optional[Any] = None
    try:
        _cfg = load_config()

        # Parse regular room mapping (M1/M2.0: 1 room → 1 agent)
        _room_map = parse_room_map(
            os.getenv("BRIDGE_ROOM_MAP", ""),
            _cfg.bridge_allowed_agents,
        )

        # Parse mixed room mapping (M2.1: 1 room → N agents)
        if _cfg.bridge_mixed_room_map:
            _mixed_room_config = parse_mixed_room_map(
                _cfg.bridge_mixed_room_map,
                _cfg.bridge_mixed_defaults,
                _cfg.bridge_allowed_agents,
                max_agents_per_room=_cfg.max_agents_per_mixed_room,
            )
            logger.info(
                "✅ Mixed room config: %d rooms, agents=%s",
                _mixed_room_config.total_rooms,
                [a for r in _mixed_room_config.rooms.values() for a in r.agents],
            )
        else:
            _mixed_room_config = None

        # H1: Rate limiter (inmemory, per config)
        _rate_limiter = InMemoryRateLimiter(
            room_rpm=_cfg.rate_limit_room_rpm,
            sender_rpm=_cfg.rate_limit_sender_rpm,
        )
        logger.info(
            "✅ Rate limiter: room_rpm=%d sender_rpm=%d",
            _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
        )

        # M3.0: Operator control channel
        if _cfg.bridge_operator_allowlist or _cfg.bridge_control_rooms:
            _control_config = parse_control_config(
                _cfg.bridge_operator_allowlist,
                _cfg.bridge_control_rooms,
            )
        else:
            _control_config = None

        mixed_count = _mixed_room_config.total_rooms if _mixed_room_config else 0
        ctrl_rooms = len(_control_config.control_rooms) if _control_config else 0
        ctrl_ops = len(_control_config.operator_allowlist) if _control_config else 0
        logger.info(
            "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
            "agents=%s mappings=%d mixed_rooms=%d ctrl_rooms=%d ctrl_operators=%d",
            _cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
            list(_cfg.bridge_allowed_agents),
            _room_map.total_mappings, mixed_count, ctrl_rooms, ctrl_ops,
        )

        # Connectivity smoke probes (non-blocking failures)
        _matrix_reachable = await _probe_url(
            f"{_cfg.matrix_homeserver_url}/_matrix/client/versions"
        )
        _gateway_reachable = await _probe_url(
            f"{_cfg.dagi_gateway_url}/health"
        )
        if _matrix_reachable:
            logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url)
        else:
            logger.warning("⚠️ Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url)
        if _gateway_reachable:
            logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url)
        else:
            logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
        if _PROM_OK:
            _bridge_up.labels(node_id=_cfg.node_id or "").set(1)  # M7.1: labeled

        # Start ingress loop (fire-and-forget asyncio task)
        _has_rooms = (_room_map and _room_map.total_mappings > 0) or (
            _mixed_room_config and _mixed_room_config.total_rooms > 0
        )
        if _has_rooms:
            _ingress_stop = asyncio.Event()

            # Metric callbacks handed to the ingress loop; each one is a no-op
            # when prometheus_client is unavailable.
            def _on_msg(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _messages_received.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_gw_error(error_type: str) -> None:
                if _PROM_OK:
                    _gateway_errors.labels(error_type=error_type).inc()

            def _on_replied(room_id: str, agent_id: str, status: str) -> None:
                if _PROM_OK:
                    _messages_replied.labels(
                        room_id=room_id, agent_id=agent_id, status=status
                    ).inc()

            def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
                if _PROM_OK:
                    _messages_rate_limited.labels(
                        room_id=room_id, agent_id=agent_id, limit_type=limit_type
                    ).inc()
                    if _rate_limiter is not None:
                        stats = _rate_limiter.stats()
                        _rate_limiter_active_rooms.set(stats["active_rooms"])
                        _rate_limiter_active_senders.set(stats["active_senders"])

            def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None:
                if _PROM_OK:
                    _invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s)

            def _on_send_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _send_latency.labels(agent_id=agent_id).observe(duration_s)

            # H2 callbacks
            def _on_queue_dropped(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_queue_size(size: int) -> None:
                if _PROM_OK:
                    _queue_size.set(size)

            def _on_queue_wait(agent_id: str, wait_s: float) -> None:
                if _PROM_OK:
                    _queue_wait.labels(agent_id=agent_id).observe(wait_s)

            # M2.2 callbacks
            def _on_routed(agent_id: str, reason: str) -> None:
                if _PROM_OK:
                    _routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc()  # M7.1: renamed

            def _on_route_rejected(room_id: str, reason: str) -> None:
                if _PROM_OK:
                    _route_rejected_total.labels(room_id=room_id, reason=reason).inc()

            # M3.0 callbacks
            def _on_control_command(sender: str, verb: str, subcommand: str) -> None:
                if _PROM_OK:
                    _control_commands_total.labels(
                        sender=sender, verb=verb, subcommand=subcommand
                    ).inc()

            # M3.4: control safety rate limiter
            _control_limiter = ControlRateLimiter(
                room_rpm=_cfg.control_room_rpm,
                operator_rpm=_cfg.control_operator_rpm,
                run_next_rpm=_cfg.control_run_next_rpm,
                cooldown_s=_cfg.control_cooldown_s,
            ) if _control_config and _control_config.is_enabled else None

            def _on_control_rate_limited(scope: str) -> None:
                if _PROM_OK:
                    _control_rate_limited_total.labels(scope=scope).inc()

            # M2.3: Persistent event deduplication
            if _cfg.persistent_dedupe:
                db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db")
                _event_store = EventStore(
                    db_path=db_path,
                    ttl_h=_cfg.processed_events_ttl_h,
                    prune_batch=_cfg.processed_events_prune_batch,
                )
                store_ok = await _event_store.open()
                if store_ok:
                    logger.info(
                        "✅ Persistent dedupe: %s (ttl_h=%d)",
                        db_path, _cfg.processed_events_ttl_h,
                    )
                    # Best-effort prune on startup
                    pruned = await _event_store.prune()
                    if pruned:
                        logger.info("Startup prune removed %d stale events", pruned)
                    # Periodic prune task
                    if _cfg.processed_events_prune_interval_s > 0:
                        async def _prune_loop() -> None:
                            while True:
                                await asyncio.sleep(_cfg.processed_events_prune_interval_s)
                                if _event_store:
                                    await _event_store.prune()
                        _prune_task = asyncio.create_task(_prune_loop(), name="event_store_prune")
                else:
                    # Degraded mode: bridge still runs, just without persistence.
                    logger.warning("⚠️ EventStore init failed — persistent dedupe disabled (degraded)")
                    _event_store = None
            else:
                logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)")

            def _on_dedupe_hit(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _dedupe_persistent_hits_total.labels(room_id=room_id).inc()

            def _on_dedupe_insert() -> None:
                if _PROM_OK:
                    _dedupe_persistent_inserts_total.inc()

            # M5.0: node-aware routing policy
            _node_policy = parse_node_policy(
                raw_allowed=_cfg.bridge_allowed_nodes,
                default_node=_cfg.bridge_default_node,
                raw_room_map=_cfg.bridge_room_node_map,
            )
            logger.info(
                "✅ Node policy: default=%s allowed=%s room_overrides=%d",
                _node_policy.default_node,
                sorted(_node_policy.allowed_nodes),
                len(_node_policy.room_node_map),
            )

            # M6.0: Persistent policy store for dynamic room-node overrides
            try:
                from .policy_store import PolicyStore as _PolicyStore
                _ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db")
                _policy_store = _PolicyStore(db_path=_ps_path)
                _policy_store.open()
                logger.info(
                    "✅ Policy store: %s (%d overrides)",
                    _ps_path, _policy_store.count_overrides(),
                )
            except Exception as _ps_exc:
                logger.warning("Policy store init failed (non-fatal): %s", _ps_exc)
                _policy_store = None

            def _on_node_selected(agent_id: str, node_id: str, source: str) -> None:
                if _PROM_OK:
                    _routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc()
                # In-memory mirror for the !status reply (independent of prometheus)
                _node_stats[node_id]["routed"] += 1

            def _on_node_rejected(rejected_node: str) -> None:
                if _PROM_OK:
                    _node_rejected_total.labels(node_id=rejected_node).inc()
                _node_stats[rejected_node]["rejected"] += 1

            # M8.0: Node health tracker + soft-failover
            from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc
            _health_cfg = _parse_nhc(
                fail_consecutive=_cfg.node_fail_consecutive,
                lat_ewma_s=_cfg.node_lat_ewma_s,
                ewma_alpha=_cfg.node_ewma_alpha,
            )
            _node_health_tracker = _NodeHealthTracker(_health_cfg)
            logger.info(
                "✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f",
                _cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha,
            )

            def _update_health_gauges() -> None:
                # Reflect tracker state into the per-node gauge (1 / 0.5 / 0).
                if not _PROM_OK or _node_health_tracker is None or _node_policy is None:
                    return
                _STATE_MAP = {"healthy": 1.0, "degraded": 0.5, "down": 0.0}
                for nid in _node_policy.allowed_nodes:
                    state = _node_health_tracker.state(nid)
                    _node_health_state.labels(node_id=nid).set(_STATE_MAP.get(state, 1.0))

            def _on_failover(from_node: str, to_node: str, reason: str) -> None:
                if _PROM_OK:
                    _failover_total.labels(
                        from_node=from_node, to_node=to_node, reason=reason
                    ).inc()
                    _update_health_gauges()
                logger.info("⚡ Failover: %s → %s reason=%s", from_node, to_node, reason)

            # M8.1: Sticky failover cache
            from .sticky_cache import StickyNodeCache as _StickyNodeCache
            if _cfg.failover_sticky_ttl_s > 0:
                _sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s)
                logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s)
            else:
                _sticky_cache = None
                logger.info("ℹ️ Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)")

            # M9.0: Confirm store
            from .confirm_store import ConfirmStore as _ConfirmStore
            if _cfg.confirm_ttl_s > 0:
                _confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s)
                logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s)
            else:
                _confirm_store = None
                logger.info("ℹ️ Confirm store disabled (CONFIRM_TTL_S=0)")

            # M11: debug inject client (only created when inject is enabled)
            if _cfg.debug_inject_enabled and _HTTPX_OK:
                _dummy_http_client = _httpx.AsyncClient(timeout=30.0)
                logger.warning(
                    "⚠️ DEBUG_INJECT_ENABLED=true — synthetic event injection active. "
                    "NEVER use in production!"
                )

            def _on_sticky_set(node_id: str, scope: str) -> None:
                if _PROM_OK:
                    _sticky_set_total.labels(node_id=node_id, scope=scope).inc()
                    if _sticky_cache is not None:
                        _sticky_active.labels().set(_sticky_cache.active_count())

            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
                matrix_user_id=_cfg.matrix_user_id,
                router_url=_cfg.dagi_gateway_url,
                node_id=_cfg.node_id,
                room_map=_room_map,
                sofiia_console_url=_cfg.sofiia_console_url,
                sofiia_internal_token=_cfg.sofiia_internal_token,
                rate_limiter=_rate_limiter,
                queue_max_events=_cfg.queue_max_events,
                worker_concurrency=_cfg.worker_concurrency,
                queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
                mixed_room_config=_mixed_room_config,
                unknown_agent_behavior=_cfg.unknown_agent_behavior,
                max_slash_len=_cfg.max_slash_len,
                mixed_concurrency_cap=_cfg.mixed_concurrency_cap,
                on_message_received=_on_msg,
                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
                on_rate_limited=_on_rate_limited,
                on_queue_dropped=_on_queue_dropped,
                on_queue_size=_on_queue_size,
                on_invoke_latency=_on_invoke_latency,
                on_send_latency=_on_send_latency,
                on_queue_wait=_on_queue_wait,
                on_routed=_on_routed,
                on_route_rejected=_on_route_rejected,
                control_config=_control_config,
                control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
                sofiia_control_token=_cfg.sofiia_control_token,
                control_limiter=_control_limiter,
                on_control_command=_on_control_command,
                on_control_rate_limited=_on_control_rate_limited,
                event_store=_event_store,
                on_dedupe_persistent_hit=_on_dedupe_hit,
                on_dedupe_persistent_insert=_on_dedupe_insert,
                # M4.0: agent discovery
                discovery_rpm=_cfg.discovery_rpm,
                # M5.0: node-aware routing
                node_policy=_node_policy,
                on_node_selected=_on_node_selected,
                on_node_rejected=_on_node_rejected,
                # M5.1: node stats getter for !status
                node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()},
                # M6.0: dynamic room-node policy store
                policy_store=_policy_store,
                # M6.2: data directory for policy exports/imports
                bridge_data_dir=_cfg.bridge_data_dir,
                # M8.0: node health tracker + failover callback
                node_health_tracker=_node_health_tracker,
                on_failover=_on_failover,
                # M8.1: sticky failover cache
                sticky_cache=_sticky_cache,
                on_sticky_set=_on_sticky_set,
                # M8.2: HA persistence config
                ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s,
                ha_health_max_age_s=_cfg.ha_health_max_age_s,
                # M9.0: Two-step confirmation store
                confirm_store=_confirm_store,
                policy_export_retention_days=_cfg.policy_export_retention_days,
                policy_history_limit=_cfg.policy_history_limit,
            )
            logger.info(
                "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
                _cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
            )
            _ingress_loop = ingress
            _ingress_task = asyncio.create_task(
                ingress.run(_ingress_stop),
                name="matrix_ingress_loop",
            )
            logger.info("✅ Ingress loop task started")
        else:
            logger.warning("⚠️ No room mappings — ingress loop NOT started")

    except (RuntimeError, ValueError) as exc:
        _config_error = str(exc)
        logger.error("❌ Config error: %s", _config_error)
        if _PROM_OK:
            _cfg_node = _cfg.node_id if _cfg else ""
            _bridge_up.labels(node_id=_cfg_node or "").set(0)  # M7.1: labeled

    yield

    # Shutdown: cancel ingress loop
    if _ingress_stop:
        _ingress_stop.set()
    if _ingress_task and not _ingress_task.done():
        _ingress_task.cancel()
        try:
            await asyncio.wait_for(_ingress_task, timeout=5.0)
        except (asyncio.CancelledError, asyncio.TimeoutError):
            pass
    # Shutdown: cancel prune task + close EventStore
    if _prune_task and not _prune_task.done():
        _prune_task.cancel()
    if _event_store is not None:
        await _event_store.close()
    # M6.0: close policy store
    if _policy_store is not None:
        try:
            _policy_store.close()
        except Exception:  # noqa: BLE001
            pass
    # M11: close debug http client if open
    if _dummy_http_client is not None:
        try:
            await _dummy_http_client.aclose()
        except Exception:  # noqa: BLE001
            pass
    logger.info("matrix-bridge-dagi shutting down")
|
||
|
||
# ── App ───────────────────────────────────────────────────────────────────────
|
||
# ── App ───────────────────────────────────────────────────────────────────────
app = FastAPI(
    title="matrix-bridge-dagi",
    version="0.1.0",
    lifespan=lifespan,
)

# NOTE(review): wildcard CORS origins — presumably fine for an internal ops
# surface, but confirm this service is never exposed to the public internet.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
|
||
|
||
# ── Health ────────────────────────────────────────────────────────────────────
|
||
@app.get("/health")
async def health() -> Dict[str, Any]:
    """Diagnostics snapshot: config, connectivity probes, queue and subsystem state.

    Returns a reduced payload with ok=false when config failed or the lifespan
    has not initialised the service yet; otherwise ok reflects the startup
    reachability probes for Matrix and the DAGI Gateway.
    """
    uptime = int(time.monotonic() - _START_TIME)
    # Opportunistically refresh the lock gauge on every poll.
    if _PROM_OK and _ingress_loop is not None:
        _active_room_agent_locks.set(_ingress_loop.active_lock_count)
    # Degraded payload: config error or lifespan not (yet) run.
    if _config_error or _cfg is None:
        return {
            "ok": False,
            "service": "matrix-bridge-dagi",
            "version": "0.1.0",
            "build": os.getenv("BUILD_SHA", "dev"),
            "uptime_s": uptime,
            "error": _config_error or "service not initialised",
        }
    # Probes are Optional[bool]; only an explicit True counts as reachable.
    matrix_ok = _matrix_reachable is True
    gateway_ok = _gateway_reachable is True
    overall_ok = matrix_ok and gateway_ok
    return {
        "ok": overall_ok,
        "service": "matrix-bridge-dagi",
        "version": "0.1.0",
        "build": _cfg.build_sha,
        "build_time": _cfg.build_time,
        "env": os.getenv("ENV", "dev"),
        "uptime_s": uptime,
        "node_id": _cfg.node_id,
        "homeserver": _cfg.matrix_homeserver_url,
        "matrix_reachable": _matrix_reachable,
        "bridge_user": _cfg.matrix_user_id,
        "sofiia_room_id": _cfg.sofiia_room_id,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "gateway": _cfg.dagi_gateway_url,
        "gateway_reachable": _gateway_reachable,
        "mappings_count": _room_map.total_mappings if _room_map else 0,
        "mixed_rooms_count": _mixed_room_config.total_rooms if _mixed_room_config else 0,
        "total_agents_in_mixed_rooms": sum(
            len(r.agents) for r in _mixed_room_config.rooms.values()
        ) if _mixed_room_config else 0,
        "config_ok": True,
        "rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
        "queue": {
            "size": _ingress_loop.queue_size if _ingress_loop else 0,
            "max": _cfg.queue_max_events,
            "workers": _cfg.worker_concurrency,
        },
        "mixed_guard_rails": {
            "max_agents_per_room": _cfg.max_agents_per_mixed_room,
            "max_slash_len": _cfg.max_slash_len,
            "unknown_agent_behavior": _cfg.unknown_agent_behavior,
            "concurrency_cap": _cfg.mixed_concurrency_cap,
            "active_room_agent_locks": _ingress_loop.active_lock_count if _ingress_loop else 0,
        },
        "control_channel": {
            "enabled": _control_config.is_enabled if _control_config else False,
            "control_rooms_count": len(_control_config.control_rooms) if _control_config else 0,
            "operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
            "unauthorized_behavior": _cfg.control_unauthorized_behavior,
        },
        "control_safety": {
            "enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0,
            "room_rpm": _cfg.control_room_rpm,
            "operator_rpm": _cfg.control_operator_rpm,
            "run_next_rpm": _cfg.control_run_next_rpm,
            "cooldown_s": _cfg.control_cooldown_s,
        },
        # NOTE(review): _event_store is only populated here if lifespan declares
        # it global — verify, otherwise this branch always reports disabled.
        "persistent_dedupe": _event_store.as_health_dict() if _event_store else {
            "enabled": False,
            "db_path": None,
            "ttl_h": _cfg.processed_events_ttl_h,
            "ok": False,
            "last_prune_at": None,
            "pruned_rows_last": 0,
        },
        # M6.0: policy store health
        "policy_store": _health_policy_store_dict(),
        # M8.1: sticky failover cache health
        "sticky_cache": _health_sticky_dict(),
        # M8.2: HA state persistence info
        "ha_state": _health_ha_dict(),
        # M9.0: confirm store
        "confirm_store": _health_confirm_dict(),
    }
|
||
|
||
|
||
def _health_confirm_dict() -> Dict[str, Any]:
    """Summarise the two-step confirm store (M9.0) for the /health payload."""
    store = _confirm_store
    if store is None:
        return {"enabled": False}
    return {
        "enabled": True,
        "pending": store.pending_count(),
        "ttl_s": store.ttl_s,
    }
|
||
|
||
|
||
def _health_ha_dict() -> Dict[str, Any]:
    """Summarise HA state persistence (M8.2) for the /health payload."""
    fallback = {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0}
    if _ingress_loop is None:
        return fallback
    try:
        status = _ingress_loop.get_status()
        return {
            "sticky_loaded": status.get("ha_sticky_loaded", 0),
            "health_loaded": status.get("ha_health_loaded", False),
            "snapshot_interval_s": status.get("ha_health_snapshot_interval_s", 0),
        }
    except Exception:  # noqa: BLE001
        # Best-effort: health must never fail because of an ingress hiccup.
        return fallback
|
||
|
||
|
||
def _health_sticky_dict() -> Dict[str, Any]:
    """Summarise the sticky failover cache (M8.1) for the /health payload."""
    cache = _sticky_cache
    if cache is None:
        return {"enabled": False, "active_keys": 0, "ttl_s": 0}
    return {
        "enabled": True,
        "active_keys": cache.active_count(),
        "ttl_s": cache.ttl_s,
    }
|
||
|
||
|
||
def _health_policy_store_dict() -> Dict[str, Any]:
    """Report dynamic room-node policy store state (M6.x) for /health."""
    if _ingress_loop is not None:
        try:
            s = _ingress_loop.get_status()
            return {
                "ok": s.get("policy_store_ok", False),
                "path": s.get("policy_store_path"),
                "overrides_count": s.get("policy_overrides_count", 0),
                "agent_overrides_count": s.get("policy_agent_overrides_count", 0),  # M6.1
                "last_export_at": s.get("policy_last_export_at"),  # M6.2
                "last_import_at": s.get("policy_last_import_at"),  # M6.2
                "db_mtime": s.get("policy_db_mtime"),  # M6.2
            }
        except Exception:  # noqa: BLE001
            pass  # fall through to the not-available shape below
    return {
        "ok": False,
        "path": None,
        "overrides_count": 0,
        "agent_overrides_count": 0,
        "last_export_at": None,
        "last_import_at": None,
        "db_mtime": None,
    }
|
||
|
||
|
||
# ── Bridge Mappings (read-only ops endpoint) ───────────────────────────────────
|
||
@app.get("/bridge/mappings")
async def bridge_mappings() -> Dict[str, Any]:
    """Read-only room-to-agent mapping summary for ops — contains no secrets."""
    if _cfg is None or _room_map is None:
        # Service never initialised (or config failed): empty, explicit error.
        return {
            "ok": False,
            "error": _config_error or "service not initialised",
            "mappings": [],
            "mixed_rooms": [],
        }
    mixed = _mixed_room_config
    ctrl = _control_config
    return {
        "ok": True,
        "total": _room_map.total_mappings,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "mappings": _room_map.as_summary(),
        "mixed_rooms_total": mixed.total_rooms if mixed else 0,
        "mixed_rooms": mixed.as_summary() if mixed else [],
        "control_rooms": sorted(ctrl.control_rooms) if ctrl else [],
        "control_operators_count": len(ctrl.operator_allowlist) if ctrl else 0,
    }
|
||
|
||
|
||
# ── Debug / Soak (M11) ────────────────────────────────────────────────────────
|
||
@app.post("/v1/debug/inject_event")
async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]:
    """
    Synthetic event injection for soak/load testing.

    Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production).

    Body: { "room_id": "!room:server", "event": { Matrix event dict } }
    The event is enqueued directly into the ingress loop, bypassing Matrix poll.

    Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str }
    """
    # NOTE(review): returns a raw 403 Response despite the Dict annotation —
    # FastAPI handles this, but the annotation is misleading.
    if _cfg is None or not _cfg.debug_inject_enabled:
        return Response(  # type: ignore[return-value]
            '{"ok":false,"error":"debug inject disabled"}',
            status_code=403,
            media_type="application/json",
        )
    if _ingress_loop is None:
        return {"ok": False, "error": "ingress loop not running"}

    room_id = body.get("room_id", "")
    event = body.get("event", {})
    if not room_id or not event:
        return {"ok": False, "error": "missing room_id or event"}

    # Ensure event has minimum required fields for ingress processing
    if not event.get("event_id"):
        import time as _time
        # Synthetic id from the monotonic clock; unique enough for soak runs.
        event["event_id"] = f"!inject-{int(_time.monotonic() * 1e6)}"
    if not event.get("type"):
        event["type"] = "m.room.message"
    if not event.get("content"):
        event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")}

    # Build a minimal sync_resp that looks like a real Matrix /sync response
    # so _enqueue_from_sync can pick it up via extract_room_messages.
    # We bypass Matrix polling by directly calling _try_enqueue on the right mapping.
    enqueued = False
    try:
        # Find the matching room mapping (direct rooms only for soak)
        # NOTE(review): reaches into _ingress_loop private attributes
        # (_room_map, _queue, _try_enqueue) — acceptable for a debug-only
        # endpoint, but brittle against ingress refactors.
        mapping = None
        if _ingress_loop._room_map is not None:
            for m in _ingress_loop._room_map.mappings:
                if m.room_id == room_id:
                    mapping = m
                    break

        if mapping is None:
            return {"ok": False, "error": f"no mapping for room_id={room_id!r}"}

        # Build a minimal stub Matrix client — replies are discarded for soak events
        from .matrix_client import MatrixClient

        class _SoakMatrixClient(MatrixClient):  # type: ignore[misc]
            """No-op Matrix client for synthetic soak events."""
            def __init__(self) -> None:  # noqa: D107
                pass  # skip real __init__

            async def mark_seen(self, room_id: str, event_id: str) -> None:  # type: ignore[override]
                pass

            async def send_text(self, room_id: str, text: str,  # type: ignore[override]
                                txn: Optional[str] = None) -> None:
                pass

        _stub_client = _SoakMatrixClient()

        # Debug client is created by lifespan only when inject is enabled.
        if _dummy_http_client is None:
            return {"ok": False, "error": "debug http client not initialised"}

        await _ingress_loop._try_enqueue(
            _stub_client,  # type: ignore[arg-type]
            _ingress_loop._queue,
            _dummy_http_client,
            event,
            mapping,
        )
        enqueued = True
    except Exception as exc:  # noqa: BLE001
        # Debug endpoint: surface the failure to the caller instead of raising.
        return {"ok": False, "error": str(exc), "enqueued": False}

    return {
        "ok": True,
        "enqueued": enqueued,
        "room_id": room_id,
        "event_id": event.get("event_id"),
    }
|
||
|
||
|
||
async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None:
|
||
"""Discard replies from injected soak events."""
|
||
|
||
|
||
# ── Metrics ───────────────────────────────────────────────────────────────────
|
||
@app.get("/metrics")
async def metrics():
    """Prometheus exposition endpoint; plain-text notice when the client lib is absent."""
    if _PROM_OK:
        return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST)
    return Response("# prometheus_client not available\n", media_type="text/plain")
|