""" matrix-bridge-dagi — Phase M1 scaffold Bridges Matrix/Element rooms to DAGI agents via Gateway. M1 scope: 1 room ↔ 1 agent (Sofiia), audit via sofiia-console internal endpoint. """ import asyncio import logging import os import time from contextlib import asynccontextmanager from typing import Any, Dict, Optional from fastapi import FastAPI, Response from fastapi.middleware.cors import CORSMiddleware try: import httpx as _httpx _HTTPX_OK = True except ImportError: # pragma: no cover _httpx = None # type: ignore _HTTPX_OK = False try: from prometheus_client import ( Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST, CollectorRegistry, REGISTRY, ) _PROM_OK = True except ImportError: # pragma: no cover _PROM_OK = False from .config import BridgeConfig, load_config from .control import ControlConfig, parse_control_config from .control_limiter import ControlRateLimiter from .event_store import EventStore from .node_policy import parse_node_policy from .ingress import MatrixIngressLoop from .mixed_routing import MixedRoomConfig, parse_mixed_room_map from .rate_limit import InMemoryRateLimiter from .room_mapping import RoomMappingConfig, parse_room_map logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s", ) logger = logging.getLogger("matrix-bridge-dagi") # ── Prometheus metrics (H3) ─────────────────────────────────────────────────── if _PROM_OK: _messages_received = Counter( "matrix_bridge_messages_received_total", "Total Matrix messages received", ["room_id", "agent_id"], ) _messages_replied = Counter( "matrix_bridge_messages_replied_total", "Total agent replies sent to Matrix", ["room_id", "agent_id", "status"], ) _messages_rate_limited = Counter( "matrix_bridge_rate_limited_total", "Messages dropped by rate limiter", ["room_id", "agent_id", "limit_type"], ) _gateway_errors = Counter( "matrix_bridge_gateway_errors_total", "Errors by stage (sync, invoke, send, audit)", ["error_type"], ) _invoke_latency = Histogram( "matrix_bridge_invoke_duration_seconds", "Latency of DAGI Router infer call", ["agent_id", "node_id"], # M5.1: per-node latency breakdown buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0], ) _send_latency = Histogram( "matrix_bridge_send_duration_seconds", "Latency of Matrix send_text call", ["agent_id"], buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0], ) _bridge_up = Gauge( "matrix_bridge_up", "1 if bridge started successfully; 0 on config error", ["node_id"], # M7.1: per-node label for multi-node deployments ) _rate_limiter_active_rooms = Gauge( "matrix_bridge_rate_limiter_active_rooms", "Rooms with activity in the current rate-limit window", ) _rate_limiter_active_senders = Gauge( "matrix_bridge_rate_limiter_active_senders", "Senders with activity in the current rate-limit window", ) # H2: Queue metrics _queue_size = Gauge( "matrix_bridge_queue_size", "Current number of pending items in the work queue", ) _queue_dropped = Counter( "matrix_bridge_queue_dropped_total", "Messages dropped because queue was full", ["room_id", "agent_id"], ) _queue_wait = Histogram( "matrix_bridge_queue_wait_seconds", "Time between enqueue and worker start processing", ["agent_id"], buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0], ) # M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct) # M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter _routing_reasons_total = Counter( "matrix_bridge_routing_reasons_total", "Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)", ["agent_id", "reason"], ) _route_rejected_total = Counter( "matrix_bridge_route_rejected_total", "Messages rejected during routing (unknown agent, bad slash, etc.)", ["room_id", "reason"], ) _active_room_agent_locks = Gauge( "matrix_bridge_active_room_agent_locks", "Number of room-agent pairs currently holding a concurrency lock", ) # M3.0: Control channel _control_commands_total = Counter( "matrix_bridge_control_commands_total", "Total control commands received from authorized operators", ["sender", "verb", "subcommand"], ) _control_rate_limited_total = Counter( "matrix_bridge_control_rate_limited_total", "Total control commands rejected by rate limiter or cooldown", ["scope"], ) _dedupe_persistent_hits_total = Counter( "matrix_bridge_dedupe_persistent_hits_total", "Total events dropped by persistent (SQLite) deduplication", ["room_id"], ) _dedupe_persistent_inserts_total = Counter( "matrix_bridge_dedupe_persistent_inserts_total", "Total events marked as processed in persistent dedupe store", ) # M5.0: node-aware routing — primary routed counter (unique name, no collision) _routed_total = Counter( "matrix_bridge_routed_total", "Total messages successfully routed, by agent, resolved node, and node source", ["agent_id", "node_id", "source"], ) _node_rejected_total = Counter( "matrix_bridge_node_rejected_total", "Total messages with rejected (non-allowlisted) node kwarg", ["node_id"], ) # M8.0: soft-failover metrics _failover_total = Counter( "matrix_bridge_failover_total", "Total successful soft-failovers by node transition and reason", ["from_node", "to_node", "reason"], ) _node_health_state = Gauge( "matrix_bridge_node_health_state", "Node health state: 1=healthy 0.5=degraded 0=down", ["node_id"], ) # M8.1: sticky routing metrics _sticky_set_total = Counter( "matrix_bridge_sticky_node_total", "Total sticky routing entries set after failover, by preferred node and scope", ["node_id", "scope"], ) _sticky_active = Gauge( "matrix_bridge_sticky_node_active", "Current count of active sticky routing entries", [], ) # ── Startup state ───────────────────────────────────────────────────────────── _START_TIME = time.monotonic() _cfg: Optional[BridgeConfig] = None # M5.1: in-memory per-node counters (lightweight, for !status reply) from collections import defaultdict as _defaultdict _node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0}) _config_error: Optional[str] = None _matrix_reachable: Optional[bool] = None _gateway_reachable: Optional[bool] = None _room_map: Optional[RoomMappingConfig] = None _mixed_room_config: Optional[MixedRoomConfig] = None _control_config: Optional[ControlConfig] = None _event_store: Optional[EventStore] = None _rate_limiter: Optional[InMemoryRateLimiter] = None _ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size _ingress_task: Optional[asyncio.Task] = None _ingress_stop: Optional[asyncio.Event] = None _sticky_cache: Optional[Any] = None # M8.1: StickyNodeCache instance _confirm_store: Optional[Any] = None # M9.0: ConfirmStore instance _dummy_http_client: Optional[Any] = None # M11: soak inject endpoint (debug only) async def _probe_url(url: str, timeout: float = 5.0) -> bool: """Quick GET probe — returns True if HTTP 2xx.""" if not _HTTPX_OK or not url: return False try: async with _httpx.AsyncClient(timeout=timeout) as client: r = await client.get(url) return r.status_code < 400 except Exception: return False # ── Lifespan ────────────────────────────────────────────────────────────────── @asynccontextmanager async def lifespan(app_: Any): global _cfg, _config_error, _matrix_reachable, _gateway_reachable global _room_map, _mixed_room_config, _control_config, _rate_limiter, _ingress_loop try: _cfg = load_config() # Parse regular room mapping (M1/M2.0: 1 room → 1 agent) _room_map = parse_room_map( os.getenv("BRIDGE_ROOM_MAP", ""), _cfg.bridge_allowed_agents, ) # Parse mixed room mapping (M2.1: 1 room → N agents) if _cfg.bridge_mixed_room_map: _mixed_room_config = parse_mixed_room_map( _cfg.bridge_mixed_room_map, _cfg.bridge_mixed_defaults, _cfg.bridge_allowed_agents, max_agents_per_room=_cfg.max_agents_per_mixed_room, ) logger.info( "✅ Mixed room config: %d rooms, agents=%s", _mixed_room_config.total_rooms, [a for r in _mixed_room_config.rooms.values() for a in r.agents], ) else: _mixed_room_config = None # H1: Rate limiter (inmemory, per config) _rate_limiter = InMemoryRateLimiter( room_rpm=_cfg.rate_limit_room_rpm, sender_rpm=_cfg.rate_limit_sender_rpm, ) logger.info( "✅ Rate limiter: room_rpm=%d sender_rpm=%d", _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm, ) # M3.0: Operator control channel if _cfg.bridge_operator_allowlist or _cfg.bridge_control_rooms: _control_config = parse_control_config( _cfg.bridge_operator_allowlist, _cfg.bridge_control_rooms, ) else: _control_config = None mixed_count = _mixed_room_config.total_rooms if _mixed_room_config else 0 ctrl_rooms = len(_control_config.control_rooms) if _control_config else 0 ctrl_ops = len(_control_config.operator_allowlist) if _control_config else 0 logger.info( "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s " "agents=%s mappings=%d mixed_rooms=%d ctrl_rooms=%d ctrl_operators=%d", _cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url, list(_cfg.bridge_allowed_agents), _room_map.total_mappings, mixed_count, ctrl_rooms, ctrl_ops, ) # Connectivity smoke probes (non-blocking failures) _matrix_reachable = await _probe_url( f"{_cfg.matrix_homeserver_url}/_matrix/client/versions" ) _gateway_reachable = await _probe_url( f"{_cfg.dagi_gateway_url}/health" ) if _matrix_reachable: logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url) else: logger.warning("⚠️ Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url) if _gateway_reachable: logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url) else: logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url) if _PROM_OK: _bridge_up.labels(node_id=_cfg.node_id or "").set(1) # M7.1: labeled # Start ingress loop (fire-and-forget asyncio task) _has_rooms = (_room_map and _room_map.total_mappings > 0) or ( _mixed_room_config and _mixed_room_config.total_rooms > 0 ) if _has_rooms: _ingress_stop = asyncio.Event() def _on_msg(room_id: str, agent_id: str) -> None: if _PROM_OK: _messages_received.labels(room_id=room_id, agent_id=agent_id).inc() def _on_gw_error(error_type: str) -> None: if _PROM_OK: _gateway_errors.labels(error_type=error_type).inc() def _on_replied(room_id: str, agent_id: str, status: str) -> None: if _PROM_OK: _messages_replied.labels( room_id=room_id, agent_id=agent_id, status=status ).inc() def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None: if _PROM_OK: _messages_rate_limited.labels( room_id=room_id, agent_id=agent_id, limit_type=limit_type ).inc() if _rate_limiter is not None: stats = _rate_limiter.stats() _rate_limiter_active_rooms.set(stats["active_rooms"]) _rate_limiter_active_senders.set(stats["active_senders"]) def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None: if _PROM_OK: _invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s) def _on_send_latency(agent_id: str, duration_s: float) -> None: if _PROM_OK: _send_latency.labels(agent_id=agent_id).observe(duration_s) # H2 callbacks def _on_queue_dropped(room_id: str, agent_id: str) -> None: if _PROM_OK: _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc() def _on_queue_size(size: int) -> None: if _PROM_OK: _queue_size.set(size) def _on_queue_wait(agent_id: str, wait_s: float) -> None: if _PROM_OK: _queue_wait.labels(agent_id=agent_id).observe(wait_s) # M2.2 callbacks def _on_routed(agent_id: str, reason: str) -> None: if _PROM_OK: _routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc() # M7.1: renamed def _on_route_rejected(room_id: str, reason: str) -> None: if _PROM_OK: _route_rejected_total.labels(room_id=room_id, reason=reason).inc() # M3.0 callbacks def _on_control_command(sender: str, verb: str, subcommand: str) -> None: if _PROM_OK: _control_commands_total.labels( sender=sender, verb=verb, subcommand=subcommand ).inc() # M3.4: control safety rate limiter _control_limiter = ControlRateLimiter( room_rpm=_cfg.control_room_rpm, operator_rpm=_cfg.control_operator_rpm, run_next_rpm=_cfg.control_run_next_rpm, cooldown_s=_cfg.control_cooldown_s, ) if _control_config and _control_config.is_enabled else None def _on_control_rate_limited(scope: str) -> None: if _PROM_OK: _control_rate_limited_total.labels(scope=scope).inc() # M2.3: Persistent event deduplication _prune_task: Optional[asyncio.Task] = None if _cfg.persistent_dedupe: import os db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db") _event_store = EventStore( db_path=db_path, ttl_h=_cfg.processed_events_ttl_h, prune_batch=_cfg.processed_events_prune_batch, ) store_ok = await _event_store.open() if store_ok: logger.info( "✅ Persistent dedupe: %s (ttl_h=%d)", db_path, _cfg.processed_events_ttl_h, ) # Best-effort prune on startup pruned = await _event_store.prune() if pruned: logger.info("Startup prune removed %d stale events", pruned) # Periodic prune task if _cfg.processed_events_prune_interval_s > 0: async def _prune_loop() -> None: while True: await asyncio.sleep(_cfg.processed_events_prune_interval_s) if _event_store: await _event_store.prune() _prune_task = asyncio.create_task(_prune_loop(), name="event_store_prune") else: logger.warning("⚠️ EventStore init failed — persistent dedupe disabled (degraded)") _event_store = None else: logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)") def _on_dedupe_hit(room_id: str, agent_id: str) -> None: if _PROM_OK: _dedupe_persistent_hits_total.labels(room_id=room_id).inc() def _on_dedupe_insert() -> None: if _PROM_OK: _dedupe_persistent_inserts_total.inc() # M5.0: node-aware routing policy _node_policy = parse_node_policy( raw_allowed=_cfg.bridge_allowed_nodes, default_node=_cfg.bridge_default_node, raw_room_map=_cfg.bridge_room_node_map, ) logger.info( "✅ Node policy: default=%s allowed=%s room_overrides=%d", _node_policy.default_node, sorted(_node_policy.allowed_nodes), len(_node_policy.room_node_map), ) # M6.0: Persistent policy store for dynamic room-node overrides _policy_store: Optional[Any] = None try: from .policy_store import PolicyStore as _PolicyStore import os _ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db") _policy_store = _PolicyStore(db_path=_ps_path) _policy_store.open() logger.info( "✅ Policy store: %s (%d overrides)", _ps_path, _policy_store.count_overrides(), ) except Exception as _ps_exc: logger.warning("Policy store init failed (non-fatal): %s", _ps_exc) _policy_store = None def _on_node_selected(agent_id: str, node_id: str, source: str) -> None: if _PROM_OK: _routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc() _node_stats[node_id]["routed"] += 1 def _on_node_rejected(rejected_node: str) -> None: if _PROM_OK: _node_rejected_total.labels(node_id=rejected_node).inc() _node_stats[rejected_node]["rejected"] += 1 # M8.0: Node health tracker + soft-failover from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc _health_cfg = _parse_nhc( fail_consecutive=_cfg.node_fail_consecutive, lat_ewma_s=_cfg.node_lat_ewma_s, ewma_alpha=_cfg.node_ewma_alpha, ) _node_health_tracker = _NodeHealthTracker(_health_cfg) logger.info( "✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f", _cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha, ) def _on_failover(from_node: str, to_node: str, reason: str) -> None: if _PROM_OK: _failover_total.labels( from_node=from_node, to_node=to_node, reason=reason ).inc() if _PROM_OK: _update_health_gauges() logger.info("⚡ Failover: %s → %s reason=%s", from_node, to_node, reason) def _update_health_gauges() -> None: if not _PROM_OK or _node_health_tracker is None or _node_policy is None: return _STATE_MAP = {"healthy": 1.0, "degraded": 0.5, "down": 0.0} for nid in _node_policy.allowed_nodes: state = _node_health_tracker.state(nid) _node_health_state.labels(node_id=nid).set(_STATE_MAP.get(state, 1.0)) # M8.1: Sticky failover cache from .sticky_cache import StickyNodeCache as _StickyNodeCache global _sticky_cache if _cfg.failover_sticky_ttl_s > 0: _sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s) logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s) else: _sticky_cache = None logger.info("ℹ️ Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)") # M9.0: Confirm store from .confirm_store import ConfirmStore as _ConfirmStore global _confirm_store if _cfg.confirm_ttl_s > 0: _confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s) logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s) else: _confirm_store = None logger.info("ℹ️ Confirm store disabled (CONFIRM_TTL_S=0)") # M11: debug inject client (only created when inject is enabled) global _dummy_http_client if _cfg.debug_inject_enabled and _HTTPX_OK: _dummy_http_client = _httpx.AsyncClient(timeout=30.0) logger.warning( "⚠️ DEBUG_INJECT_ENABLED=true — synthetic event injection active. " "NEVER use in production!" ) def _on_sticky_set(node_id: str, scope: str) -> None: if _PROM_OK: _sticky_set_total.labels(node_id=node_id, scope=scope).inc() if _sticky_cache is not None: _sticky_active.labels().set(_sticky_cache.active_count()) ingress = MatrixIngressLoop( matrix_homeserver_url=_cfg.matrix_homeserver_url, matrix_access_token=_cfg.matrix_access_token, matrix_user_id=_cfg.matrix_user_id, router_url=_cfg.dagi_gateway_url, node_id=_cfg.node_id, room_map=_room_map, sofiia_console_url=_cfg.sofiia_console_url, sofiia_internal_token=_cfg.sofiia_internal_token, rate_limiter=_rate_limiter, queue_max_events=_cfg.queue_max_events, worker_concurrency=_cfg.worker_concurrency, queue_drain_timeout_s=_cfg.queue_drain_timeout_s, mixed_room_config=_mixed_room_config, unknown_agent_behavior=_cfg.unknown_agent_behavior, max_slash_len=_cfg.max_slash_len, mixed_concurrency_cap=_cfg.mixed_concurrency_cap, on_message_received=_on_msg, on_message_replied=_on_replied, on_gateway_error=_on_gw_error, on_rate_limited=_on_rate_limited, on_queue_dropped=_on_queue_dropped, on_queue_size=_on_queue_size, on_invoke_latency=_on_invoke_latency, on_send_latency=_on_send_latency, on_queue_wait=_on_queue_wait, on_routed=_on_routed, on_route_rejected=_on_route_rejected, control_config=_control_config, control_unauthorized_behavior=_cfg.control_unauthorized_behavior, sofiia_control_token=_cfg.sofiia_control_token, control_limiter=_control_limiter, on_control_command=_on_control_command, on_control_rate_limited=_on_control_rate_limited, event_store=_event_store, on_dedupe_persistent_hit=_on_dedupe_hit, on_dedupe_persistent_insert=_on_dedupe_insert, # M4.0: agent discovery discovery_rpm=_cfg.discovery_rpm, # M5.0: node-aware routing node_policy=_node_policy, on_node_selected=_on_node_selected, on_node_rejected=_on_node_rejected, # M5.1: node stats getter for !status node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()}, # M6.0: dynamic room-node policy store policy_store=_policy_store, # M6.2: data directory for policy exports/imports bridge_data_dir=_cfg.bridge_data_dir, # M8.0: node health tracker + failover callback node_health_tracker=_node_health_tracker, on_failover=_on_failover, # M8.1: sticky failover cache sticky_cache=_sticky_cache, on_sticky_set=_on_sticky_set, # M8.2: HA persistence config ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s, ha_health_max_age_s=_cfg.ha_health_max_age_s, # M9.0: Two-step confirmation store confirm_store=_confirm_store, policy_export_retention_days=_cfg.policy_export_retention_days, policy_history_limit=_cfg.policy_history_limit, ) logger.info( "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs", _cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s, ) _ingress_loop = ingress _ingress_task = asyncio.create_task( ingress.run(_ingress_stop), name="matrix_ingress_loop", ) logger.info("✅ Ingress loop task started") else: logger.warning("⚠️ No room mappings — ingress loop NOT started") except (RuntimeError, ValueError) as exc: _config_error = str(exc) logger.error("❌ Config error: %s", _config_error) if _PROM_OK: _cfg_node = _cfg.node_id if _cfg else "" _bridge_up.labels(node_id=_cfg_node or "").set(0) # M7.1: labeled yield # Shutdown: cancel ingress loop if _ingress_stop: _ingress_stop.set() if _ingress_task and not _ingress_task.done(): _ingress_task.cancel() try: await asyncio.wait_for(_ingress_task, timeout=5.0) except (asyncio.CancelledError, asyncio.TimeoutError): pass # Shutdown: cancel prune task + close EventStore if "_prune_task" in dir() and _prune_task and not _prune_task.done(): # type: ignore[name-defined] _prune_task.cancel() # type: ignore[name-defined] if _event_store is not None: await _event_store.close() # M6.0: close policy store if "_policy_store" in dir() and _policy_store is not None: # type: ignore[name-defined] try: _policy_store.close() # type: ignore[name-defined] except Exception: # noqa: BLE001 pass # M11: close debug http client if open if _dummy_http_client is not None: try: await _dummy_http_client.aclose() except Exception: # noqa: BLE001 pass logger.info("matrix-bridge-dagi shutting down") # ── App ─────────────────────────────────────────────────────────────────────── app = FastAPI( title="matrix-bridge-dagi", version="0.1.0", lifespan=lifespan, ) app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_methods=["GET", "POST"], allow_headers=["*"], ) # ── Health ──────────────────────────────────────────────────────────────────── @app.get("/health") async def health() -> Dict[str, Any]: uptime = int(time.monotonic() - _START_TIME) if _PROM_OK and _ingress_loop is not None: _active_room_agent_locks.set(_ingress_loop.active_lock_count) if _config_error or _cfg is None: return { "ok": False, "service": "matrix-bridge-dagi", "version": "0.1.0", "build": os.getenv("BUILD_SHA", "dev"), "uptime_s": uptime, "error": _config_error or "service not initialised", } matrix_ok = _matrix_reachable is True gateway_ok = _gateway_reachable is True overall_ok = matrix_ok and gateway_ok return { "ok": overall_ok, "service": "matrix-bridge-dagi", "version": "0.1.0", "build": _cfg.build_sha, "build_time": _cfg.build_time, "env": os.getenv("ENV", "dev"), "uptime_s": uptime, "node_id": _cfg.node_id, "homeserver": _cfg.matrix_homeserver_url, "matrix_reachable": _matrix_reachable, "bridge_user": _cfg.matrix_user_id, "sofiia_room_id": _cfg.sofiia_room_id, "allowed_agents": list(_cfg.bridge_allowed_agents), "gateway": _cfg.dagi_gateway_url, "gateway_reachable": _gateway_reachable, "mappings_count": _room_map.total_mappings if _room_map else 0, "mixed_rooms_count": _mixed_room_config.total_rooms if _mixed_room_config else 0, "total_agents_in_mixed_rooms": sum( len(r.agents) for r in _mixed_room_config.rooms.values() ) if _mixed_room_config else 0, "config_ok": True, "rate_limiter": _rate_limiter.stats() if _rate_limiter else None, "queue": { "size": _ingress_loop.queue_size if _ingress_loop else 0, "max": _cfg.queue_max_events, "workers": _cfg.worker_concurrency, }, "mixed_guard_rails": { "max_agents_per_room": _cfg.max_agents_per_mixed_room, "max_slash_len": _cfg.max_slash_len, "unknown_agent_behavior": _cfg.unknown_agent_behavior, "concurrency_cap": _cfg.mixed_concurrency_cap, "active_room_agent_locks": _ingress_loop.active_lock_count if _ingress_loop else 0, }, "control_channel": { "enabled": _control_config.is_enabled if _control_config else False, "control_rooms_count": len(_control_config.control_rooms) if _control_config else 0, "operators_count": len(_control_config.operator_allowlist) if _control_config else 0, "unauthorized_behavior": _cfg.control_unauthorized_behavior, }, "control_safety": { "enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0, "room_rpm": _cfg.control_room_rpm, "operator_rpm": _cfg.control_operator_rpm, "run_next_rpm": _cfg.control_run_next_rpm, "cooldown_s": _cfg.control_cooldown_s, }, "persistent_dedupe": _event_store.as_health_dict() if _event_store else { "enabled": False, "db_path": None, "ttl_h": _cfg.processed_events_ttl_h, "ok": False, "last_prune_at": None, "pruned_rows_last": 0, }, # M6.0: policy store health "policy_store": _health_policy_store_dict(), # M8.1: sticky failover cache health "sticky_cache": _health_sticky_dict(), # M8.2: HA state persistence info "ha_state": _health_ha_dict(), # M9.0: confirm store "confirm_store": _health_confirm_dict(), } def _health_confirm_dict() -> Dict[str, Any]: """Return confirm store info for /health endpoint (M9.0).""" if _confirm_store is None: return {"enabled": False} return { "enabled": True, "pending": _confirm_store.pending_count(), "ttl_s": _confirm_store.ttl_s, } def _health_ha_dict() -> Dict[str, Any]: """Return HA persistence info for /health endpoint (M8.2).""" if _ingress_loop is None: return {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0} try: s = _ingress_loop.get_status() return { "sticky_loaded": s.get("ha_sticky_loaded", 0), "health_loaded": s.get("ha_health_loaded", False), "snapshot_interval_s": s.get("ha_health_snapshot_interval_s", 0), } except Exception: # noqa: BLE001 return {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0} def _health_sticky_dict() -> Dict[str, Any]: """Return sticky failover cache health for /health endpoint (M8.1).""" if _sticky_cache is None: return {"enabled": False, "active_keys": 0, "ttl_s": 0} return { "enabled": True, "active_keys": _sticky_cache.active_count(), "ttl_s": _sticky_cache.ttl_s, } def _health_policy_store_dict() -> Dict[str, Any]: """Return policy store health info for /health endpoint.""" try: if _ingress_loop is not None: s = _ingress_loop.get_status() return { "ok": s.get("policy_store_ok", False), "path": s.get("policy_store_path"), "overrides_count": s.get("policy_overrides_count", 0), "agent_overrides_count": s.get("policy_agent_overrides_count", 0), # M6.1 "last_export_at": s.get("policy_last_export_at"), # M6.2 "last_import_at": s.get("policy_last_import_at"), # M6.2 "db_mtime": s.get("policy_db_mtime"), # M6.2 } except Exception: # noqa: BLE001 pass return { "ok": False, "path": None, "overrides_count": 0, "agent_overrides_count": 0, "last_export_at": None, "last_import_at": None, "db_mtime": None, } # ── Bridge Mappings (read-only ops endpoint) ─────────────────────────────────── @app.get("/bridge/mappings") async def bridge_mappings() -> Dict[str, Any]: """ Returns room-to-agent mapping summary. Safe for ops visibility — no secrets included. """ if _cfg is None or _room_map is None: return { "ok": False, "error": _config_error or "service not initialised", "mappings": [], "mixed_rooms": [], } return { "ok": True, "total": _room_map.total_mappings, "allowed_agents": list(_cfg.bridge_allowed_agents), "mappings": _room_map.as_summary(), "mixed_rooms_total": _mixed_room_config.total_rooms if _mixed_room_config else 0, "mixed_rooms": _mixed_room_config.as_summary() if _mixed_room_config else [], "control_rooms": sorted(_control_config.control_rooms) if _control_config else [], "control_operators_count": len(_control_config.operator_allowlist) if _control_config else 0, } # ── Debug / Soak (M11) ──────────────────────────────────────────────────────── @app.post("/v1/debug/inject_event") async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]: """ Synthetic event injection for soak/load testing. Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production). Body: { "room_id": "!room:server", "event": { Matrix event dict } } The event is enqueued directly into the ingress loop, bypassing Matrix poll. Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str } """ if _cfg is None or not _cfg.debug_inject_enabled: return Response( # type: ignore[return-value] '{"ok":false,"error":"debug inject disabled"}', status_code=403, media_type="application/json", ) if _ingress_loop is None: return {"ok": False, "error": "ingress loop not running"} room_id = body.get("room_id", "") event = body.get("event", {}) if not room_id or not event: return {"ok": False, "error": "missing room_id or event"} # Ensure event has minimum required fields for ingress processing if not event.get("event_id"): import time as _time event["event_id"] = f"!inject-{int(_time.monotonic() * 1e6)}" if not event.get("type"): event["type"] = "m.room.message" if not event.get("content"): event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")} # Build a minimal sync_resp that looks like a real Matrix /sync response # so _enqueue_from_sync can pick it up via extract_room_messages. # We bypass Matrix polling by directly calling _try_enqueue on the right mapping. enqueued = False try: # Find the matching room mapping (direct rooms only for soak) mapping = None if _ingress_loop._room_map is not None: for m in _ingress_loop._room_map.mappings: if m.room_id == room_id: mapping = m break if mapping is None: return {"ok": False, "error": f"no mapping for room_id={room_id!r}"} # Build a minimal stub Matrix client — replies are discarded for soak events from .matrix_client import MatrixClient class _SoakMatrixClient(MatrixClient): # type: ignore[misc] """No-op Matrix client for synthetic soak events.""" def __init__(self) -> None: # noqa: D107 pass # skip real __init__ async def mark_seen(self, room_id: str, event_id: str) -> None: # type: ignore[override] pass async def send_text(self, room_id: str, text: str, # type: ignore[override] txn: Optional[str] = None) -> None: pass _stub_client = _SoakMatrixClient() if _dummy_http_client is None: return {"ok": False, "error": "debug http client not initialised"} await _ingress_loop._try_enqueue( _stub_client, # type: ignore[arg-type] _ingress_loop._queue, _dummy_http_client, event, mapping, ) enqueued = True except Exception as exc: # noqa: BLE001 return {"ok": False, "error": str(exc), "enqueued": False} return { "ok": True, "enqueued": enqueued, "room_id": room_id, "event_id": event.get("event_id"), } async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None: """Discard replies from injected soak events.""" # ── Metrics ─────────────────────────────────────────────────────────────────── @app.get("/metrics") async def metrics(): if not _PROM_OK: return Response("# prometheus_client not available\n", media_type="text/plain") return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST)