915 lines
39 KiB
Python
915 lines
39 KiB
Python
"""
|
||
matrix-bridge-dagi — Phase M1 scaffold
|
||
Bridges Matrix/Element rooms to DAGI agents via Gateway.
|
||
|
||
M1 scope: 1 room ↔ 1 agent (Sofiia), audit via sofiia-console internal endpoint.
|
||
"""
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
import time
|
||
from contextlib import asynccontextmanager
|
||
from typing import Any, Dict, Optional
|
||
|
||
from fastapi import FastAPI, Response
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
|
||
try:
|
||
import httpx as _httpx
|
||
_HTTPX_OK = True
|
||
except ImportError: # pragma: no cover
|
||
_httpx = None # type: ignore
|
||
_HTTPX_OK = False
|
||
|
||
try:
|
||
from prometheus_client import (
|
||
Counter, Histogram, Gauge,
|
||
generate_latest, CONTENT_TYPE_LATEST,
|
||
CollectorRegistry, REGISTRY,
|
||
)
|
||
_PROM_OK = True
|
||
except ImportError: # pragma: no cover
|
||
_PROM_OK = False
|
||
|
||
from .config import BridgeConfig, load_config
|
||
from .control import ControlConfig, parse_control_config
|
||
from .control_limiter import ControlRateLimiter
|
||
from .event_store import EventStore
|
||
from .node_policy import parse_node_policy
|
||
from .ingress import MatrixIngressLoop
|
||
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
|
||
from .rate_limit import InMemoryRateLimiter
|
||
from .room_mapping import RoomMappingConfig, parse_room_map
|
||
|
||
# Root logging setup: single-line records with timestamp, level and logger name.
# basicConfig is a no-op if a host process already configured the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
logger = logging.getLogger("matrix-bridge-dagi")
|
||
|
||
# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
# All metric objects below exist only when prometheus_client imported cleanly;
# every call site therefore guards on _PROM_OK before touching them.
if _PROM_OK:
    _messages_received = Counter(
        "matrix_bridge_messages_received_total",
        "Total Matrix messages received",
        ["room_id", "agent_id"],
    )
    _messages_replied = Counter(
        "matrix_bridge_messages_replied_total",
        "Total agent replies sent to Matrix",
        ["room_id", "agent_id", "status"],
    )
    _messages_rate_limited = Counter(
        "matrix_bridge_rate_limited_total",
        "Messages dropped by rate limiter",
        ["room_id", "agent_id", "limit_type"],
    )
    _gateway_errors = Counter(
        "matrix_bridge_gateway_errors_total",
        "Errors by stage (sync, invoke, send, audit)",
        ["error_type"],
    )
    _invoke_latency = Histogram(
        "matrix_bridge_invoke_duration_seconds",
        "Latency of DAGI Router infer call",
        ["agent_id", "node_id"],  # M5.1: per-node latency breakdown
        buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
    )
    _send_latency = Histogram(
        "matrix_bridge_send_duration_seconds",
        "Latency of Matrix send_text call",
        ["agent_id"],
        buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    )
    _bridge_up = Gauge(
        "matrix_bridge_up",
        "1 if bridge started successfully; 0 on config error",
        ["node_id"],  # M7.1: per-node label for multi-node deployments
    )
    _rate_limiter_active_rooms = Gauge(
        "matrix_bridge_rate_limiter_active_rooms",
        "Rooms with activity in the current rate-limit window",
    )
    _rate_limiter_active_senders = Gauge(
        "matrix_bridge_rate_limiter_active_senders",
        "Senders with activity in the current rate-limit window",
    )
    # H2: Queue metrics
    _queue_size = Gauge(
        "matrix_bridge_queue_size",
        "Current number of pending items in the work queue",
    )
    _queue_dropped = Counter(
        "matrix_bridge_queue_dropped_total",
        "Messages dropped because queue was full",
        ["room_id", "agent_id"],
    )
    _queue_wait = Histogram(
        "matrix_bridge_queue_wait_seconds",
        "Time between enqueue and worker start processing",
        ["agent_id"],
        buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
    )
    # M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct)
    # M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter
    _routing_reasons_total = Counter(
        "matrix_bridge_routing_reasons_total",
        "Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
        ["agent_id", "reason"],
    )
    _route_rejected_total = Counter(
        "matrix_bridge_route_rejected_total",
        "Messages rejected during routing (unknown agent, bad slash, etc.)",
        ["room_id", "reason"],
    )
    _active_room_agent_locks = Gauge(
        "matrix_bridge_active_room_agent_locks",
        "Number of room-agent pairs currently holding a concurrency lock",
    )
    # M3.0: Control channel
    _control_commands_total = Counter(
        "matrix_bridge_control_commands_total",
        "Total control commands received from authorized operators",
        ["sender", "verb", "subcommand"],
    )
    _control_rate_limited_total = Counter(
        "matrix_bridge_control_rate_limited_total",
        "Total control commands rejected by rate limiter or cooldown",
        ["scope"],
    )
    _dedupe_persistent_hits_total = Counter(
        "matrix_bridge_dedupe_persistent_hits_total",
        "Total events dropped by persistent (SQLite) deduplication",
        ["room_id"],
    )
    _dedupe_persistent_inserts_total = Counter(
        "matrix_bridge_dedupe_persistent_inserts_total",
        "Total events marked as processed in persistent dedupe store",
    )
    # M5.0: node-aware routing — primary routed counter (unique name, no collision)
    _routed_total = Counter(
        "matrix_bridge_routed_total",
        "Total messages successfully routed, by agent, resolved node, and node source",
        ["agent_id", "node_id", "source"],
    )
    _node_rejected_total = Counter(
        "matrix_bridge_node_rejected_total",
        "Total messages with rejected (non-allowlisted) node kwarg",
        ["node_id"],
    )
    # M8.0: soft-failover metrics
    _failover_total = Counter(
        "matrix_bridge_failover_total",
        "Total successful soft-failovers by node transition and reason",
        ["from_node", "to_node", "reason"],
    )
    _node_health_state = Gauge(
        "matrix_bridge_node_health_state",
        "Node health state: 1=healthy 0.5=degraded 0=down",
        ["node_id"],
    )
    # M8.1: sticky routing metrics
    _sticky_set_total = Counter(
        "matrix_bridge_sticky_node_total",
        "Total sticky routing entries set after failover, by preferred node and scope",
        ["node_id", "scope"],
    )
    _sticky_active = Gauge(
        "matrix_bridge_sticky_node_active",
        "Current count of active sticky routing entries",
        [],
    )
|
||
|
||
# ── Startup state ─────────────────────────────────────────────────────────────
# Module-level mutable state, populated by lifespan() and read by the HTTP
# endpoints (/health, /bridge/mappings, /v1/debug/inject_event).
_START_TIME = time.monotonic()  # baseline for /health "uptime_s"
_cfg: Optional[BridgeConfig] = None
# M5.1: in-memory per-node counters (lightweight, for !status reply)
from collections import defaultdict as _defaultdict
_node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0})
_config_error: Optional[str] = None  # set when config loading/parsing fails
_matrix_reachable: Optional[bool] = None  # None until startup probe runs
_gateway_reachable: Optional[bool] = None  # None until startup probe runs
_room_map: Optional[RoomMappingConfig] = None
_mixed_room_config: Optional[MixedRoomConfig] = None
_control_config: Optional[ControlConfig] = None
_event_store: Optional[EventStore] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None  # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
_sticky_cache: Optional[Any] = None  # M8.1: StickyNodeCache instance
_confirm_store: Optional[Any] = None  # M9.0: ConfirmStore instance
_dummy_http_client: Optional[Any] = None  # M11: soak inject endpoint (debug only)
|
||
|
||
|
||
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
    """Fire one GET at *url*; True iff it answers with a non-error (< 400) status.

    Returns False when httpx is unavailable, the URL is empty, or any
    network/protocol error occurs — this is a best-effort smoke probe only.
    """
    if not url or not _HTTPX_OK:
        return False
    try:
        async with _httpx.AsyncClient(timeout=timeout) as probe:
            resp = await probe.get(url)
            return resp.status_code < 400
    except Exception:
        return False
|
||
|
||
# ── Lifespan ──────────────────────────────────────────────────────────────────
|
||
@asynccontextmanager
async def lifespan(app_: Any):
    """FastAPI lifespan: load config, wire metric callbacks, run the ingress loop.

    Startup errors from config loading/parsing are caught and recorded in
    ``_config_error`` so the app still boots and serves /health with ok=false.
    Everything after ``yield`` is best-effort shutdown cleanup.
    """
    global _cfg, _config_error, _matrix_reachable, _gateway_reachable
    global _room_map, _mixed_room_config, _control_config, _rate_limiter, _ingress_loop
    # Fix: these three were previously assigned WITHOUT a `global` declaration,
    # so the module-level copies stayed None (/health misreported persistent
    # dedupe) and the shutdown path raised UnboundLocalError whenever startup
    # aborted before the `if _has_rooms:` branch ran.
    global _event_store, _ingress_stop, _ingress_task
    global _sticky_cache, _confirm_store, _dummy_http_client
    # Locals the shutdown path must see even when startup aborts early
    # (replaces the previous fragile `"_name" in dir()` checks).
    _prune_task: Optional[asyncio.Task] = None
    _policy_store: Optional[Any] = None
    try:
        _cfg = load_config()

        # Parse regular room mapping (M1/M2.0: 1 room → 1 agent)
        _room_map = parse_room_map(
            os.getenv("BRIDGE_ROOM_MAP", ""),
            _cfg.bridge_allowed_agents,
        )

        # Parse mixed room mapping (M2.1: 1 room → N agents)
        if _cfg.bridge_mixed_room_map:
            _mixed_room_config = parse_mixed_room_map(
                _cfg.bridge_mixed_room_map,
                _cfg.bridge_mixed_defaults,
                _cfg.bridge_allowed_agents,
                max_agents_per_room=_cfg.max_agents_per_mixed_room,
            )
            logger.info(
                "✅ Mixed room config: %d rooms, agents=%s",
                _mixed_room_config.total_rooms,
                [a for r in _mixed_room_config.rooms.values() for a in r.agents],
            )
        else:
            _mixed_room_config = None

        # H1: Rate limiter (inmemory, per config)
        _rate_limiter = InMemoryRateLimiter(
            room_rpm=_cfg.rate_limit_room_rpm,
            sender_rpm=_cfg.rate_limit_sender_rpm,
        )
        logger.info(
            "✅ Rate limiter: room_rpm=%d sender_rpm=%d",
            _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
        )

        # M3.0: Operator control channel
        if _cfg.bridge_operator_allowlist or _cfg.bridge_control_rooms:
            _control_config = parse_control_config(
                _cfg.bridge_operator_allowlist,
                _cfg.bridge_control_rooms,
            )
        else:
            _control_config = None

        mixed_count = _mixed_room_config.total_rooms if _mixed_room_config else 0
        ctrl_rooms = len(_control_config.control_rooms) if _control_config else 0
        ctrl_ops = len(_control_config.operator_allowlist) if _control_config else 0
        logger.info(
            "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
            "agents=%s mappings=%d mixed_rooms=%d ctrl_rooms=%d ctrl_operators=%d",
            _cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
            list(_cfg.bridge_allowed_agents),
            _room_map.total_mappings, mixed_count, ctrl_rooms, ctrl_ops,
        )

        # Connectivity smoke probes (non-blocking failures)
        _matrix_reachable = await _probe_url(
            f"{_cfg.matrix_homeserver_url}/_matrix/client/versions"
        )
        _gateway_reachable = await _probe_url(
            f"{_cfg.dagi_gateway_url}/health"
        )
        if _matrix_reachable:
            logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url)
        else:
            logger.warning("⚠️ Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url)
        if _gateway_reachable:
            logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url)
        else:
            logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
        if _PROM_OK:
            _bridge_up.labels(node_id=_cfg.node_id or "").set(1)  # M7.1: labeled

        # Start ingress loop (fire-and-forget asyncio task)
        _has_rooms = (_room_map and _room_map.total_mappings > 0) or (
            _mixed_room_config and _mixed_room_config.total_rooms > 0
        )
        if _has_rooms:
            _ingress_stop = asyncio.Event()

            # Metric callbacks handed to the ingress loop; each one is a no-op
            # when prometheus_client is unavailable.
            def _on_msg(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _messages_received.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_gw_error(error_type: str) -> None:
                if _PROM_OK:
                    _gateway_errors.labels(error_type=error_type).inc()

            def _on_replied(room_id: str, agent_id: str, status: str) -> None:
                if _PROM_OK:
                    _messages_replied.labels(
                        room_id=room_id, agent_id=agent_id, status=status
                    ).inc()

            def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
                if _PROM_OK:
                    _messages_rate_limited.labels(
                        room_id=room_id, agent_id=agent_id, limit_type=limit_type
                    ).inc()
                    if _rate_limiter is not None:
                        stats = _rate_limiter.stats()
                        _rate_limiter_active_rooms.set(stats["active_rooms"])
                        _rate_limiter_active_senders.set(stats["active_senders"])

            def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None:
                if _PROM_OK:
                    _invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s)

            def _on_send_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _send_latency.labels(agent_id=agent_id).observe(duration_s)

            # H2 callbacks
            def _on_queue_dropped(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_queue_size(size: int) -> None:
                if _PROM_OK:
                    _queue_size.set(size)

            def _on_queue_wait(agent_id: str, wait_s: float) -> None:
                if _PROM_OK:
                    _queue_wait.labels(agent_id=agent_id).observe(wait_s)

            # M2.2 callbacks
            def _on_routed(agent_id: str, reason: str) -> None:
                if _PROM_OK:
                    _routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc()  # M7.1: renamed

            def _on_route_rejected(room_id: str, reason: str) -> None:
                if _PROM_OK:
                    _route_rejected_total.labels(room_id=room_id, reason=reason).inc()

            # M3.0 callbacks
            def _on_control_command(sender: str, verb: str, subcommand: str) -> None:
                if _PROM_OK:
                    _control_commands_total.labels(
                        sender=sender, verb=verb, subcommand=subcommand
                    ).inc()

            # M3.4: control safety rate limiter
            _control_limiter = ControlRateLimiter(
                room_rpm=_cfg.control_room_rpm,
                operator_rpm=_cfg.control_operator_rpm,
                run_next_rpm=_cfg.control_run_next_rpm,
                cooldown_s=_cfg.control_cooldown_s,
            ) if _control_config and _control_config.is_enabled else None

            def _on_control_rate_limited(scope: str) -> None:
                if _PROM_OK:
                    _control_rate_limited_total.labels(scope=scope).inc()

            # M2.3: Persistent event deduplication
            if _cfg.persistent_dedupe:
                db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db")
                _event_store = EventStore(
                    db_path=db_path,
                    ttl_h=_cfg.processed_events_ttl_h,
                    prune_batch=_cfg.processed_events_prune_batch,
                )
                store_ok = await _event_store.open()
                if store_ok:
                    logger.info(
                        "✅ Persistent dedupe: %s (ttl_h=%d)",
                        db_path, _cfg.processed_events_ttl_h,
                    )
                    # Best-effort prune on startup
                    pruned = await _event_store.prune()
                    if pruned:
                        logger.info("Startup prune removed %d stale events", pruned)
                    # Periodic prune task
                    if _cfg.processed_events_prune_interval_s > 0:
                        async def _prune_loop() -> None:
                            while True:
                                await asyncio.sleep(_cfg.processed_events_prune_interval_s)
                                if _event_store:
                                    await _event_store.prune()
                        _prune_task = asyncio.create_task(_prune_loop(), name="event_store_prune")
                else:
                    # Degraded mode: bridge still runs, just without persistence.
                    logger.warning("⚠️ EventStore init failed — persistent dedupe disabled (degraded)")
                    _event_store = None
            else:
                logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)")

            def _on_dedupe_hit(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _dedupe_persistent_hits_total.labels(room_id=room_id).inc()

            def _on_dedupe_insert() -> None:
                if _PROM_OK:
                    _dedupe_persistent_inserts_total.inc()

            # M5.0: node-aware routing policy
            _node_policy = parse_node_policy(
                raw_allowed=_cfg.bridge_allowed_nodes,
                default_node=_cfg.bridge_default_node,
                raw_room_map=_cfg.bridge_room_node_map,
            )
            logger.info(
                "✅ Node policy: default=%s allowed=%s room_overrides=%d",
                _node_policy.default_node,
                sorted(_node_policy.allowed_nodes),
                len(_node_policy.room_node_map),
            )

            # M6.0: Persistent policy store for dynamic room-node overrides
            try:
                from .policy_store import PolicyStore as _PolicyStore
                _ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db")
                _policy_store = _PolicyStore(db_path=_ps_path)
                _policy_store.open()
                logger.info(
                    "✅ Policy store: %s (%d overrides)",
                    _ps_path, _policy_store.count_overrides(),
                )
            except Exception as _ps_exc:
                logger.warning("Policy store init failed (non-fatal): %s", _ps_exc)
                _policy_store = None

            def _on_node_selected(agent_id: str, node_id: str, source: str) -> None:
                if _PROM_OK:
                    _routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc()
                # In-memory mirror for the !status reply (independent of prometheus)
                _node_stats[node_id]["routed"] += 1

            def _on_node_rejected(rejected_node: str) -> None:
                if _PROM_OK:
                    _node_rejected_total.labels(node_id=rejected_node).inc()
                _node_stats[rejected_node]["rejected"] += 1

            # M8.0: Node health tracker + soft-failover
            from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc
            _health_cfg = _parse_nhc(
                fail_consecutive=_cfg.node_fail_consecutive,
                lat_ewma_s=_cfg.node_lat_ewma_s,
                ewma_alpha=_cfg.node_ewma_alpha,
            )
            _node_health_tracker = _NodeHealthTracker(_health_cfg)
            logger.info(
                "✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f",
                _cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha,
            )

            def _update_health_gauges() -> None:
                # Reflect tracker state into the per-node gauge (1 / 0.5 / 0).
                if not _PROM_OK or _node_health_tracker is None or _node_policy is None:
                    return
                _STATE_MAP = {"healthy": 1.0, "degraded": 0.5, "down": 0.0}
                for nid in _node_policy.allowed_nodes:
                    state = _node_health_tracker.state(nid)
                    _node_health_state.labels(node_id=nid).set(_STATE_MAP.get(state, 1.0))

            def _on_failover(from_node: str, to_node: str, reason: str) -> None:
                if _PROM_OK:
                    _failover_total.labels(
                        from_node=from_node, to_node=to_node, reason=reason
                    ).inc()
                    _update_health_gauges()
                logger.info("⚡ Failover: %s → %s reason=%s", from_node, to_node, reason)

            # M8.1: Sticky failover cache
            from .sticky_cache import StickyNodeCache as _StickyNodeCache
            if _cfg.failover_sticky_ttl_s > 0:
                _sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s)
                logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s)
            else:
                _sticky_cache = None
                logger.info("ℹ️ Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)")

            # M9.0: Confirm store
            from .confirm_store import ConfirmStore as _ConfirmStore
            if _cfg.confirm_ttl_s > 0:
                _confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s)
                logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s)
            else:
                _confirm_store = None
                logger.info("ℹ️ Confirm store disabled (CONFIRM_TTL_S=0)")

            # M11: debug inject client (only created when inject is enabled)
            if _cfg.debug_inject_enabled and _HTTPX_OK:
                _dummy_http_client = _httpx.AsyncClient(timeout=30.0)
                logger.warning(
                    "⚠️ DEBUG_INJECT_ENABLED=true — synthetic event injection active. "
                    "NEVER use in production!"
                )

            def _on_sticky_set(node_id: str, scope: str) -> None:
                if _PROM_OK:
                    _sticky_set_total.labels(node_id=node_id, scope=scope).inc()
                    if _sticky_cache is not None:
                        _sticky_active.labels().set(_sticky_cache.active_count())

            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
                matrix_user_id=_cfg.matrix_user_id,
                router_url=_cfg.dagi_gateway_url,
                node_id=_cfg.node_id,
                room_map=_room_map,
                sofiia_console_url=_cfg.sofiia_console_url,
                sofiia_internal_token=_cfg.sofiia_internal_token,
                rate_limiter=_rate_limiter,
                queue_max_events=_cfg.queue_max_events,
                worker_concurrency=_cfg.worker_concurrency,
                queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
                mixed_room_config=_mixed_room_config,
                unknown_agent_behavior=_cfg.unknown_agent_behavior,
                max_slash_len=_cfg.max_slash_len,
                mixed_concurrency_cap=_cfg.mixed_concurrency_cap,
                on_message_received=_on_msg,
                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
                on_rate_limited=_on_rate_limited,
                on_queue_dropped=_on_queue_dropped,
                on_queue_size=_on_queue_size,
                on_invoke_latency=_on_invoke_latency,
                on_send_latency=_on_send_latency,
                on_queue_wait=_on_queue_wait,
                on_routed=_on_routed,
                on_route_rejected=_on_route_rejected,
                control_config=_control_config,
                control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
                sofiia_control_token=_cfg.sofiia_control_token,
                control_limiter=_control_limiter,
                on_control_command=_on_control_command,
                on_control_rate_limited=_on_control_rate_limited,
                event_store=_event_store,
                on_dedupe_persistent_hit=_on_dedupe_hit,
                on_dedupe_persistent_insert=_on_dedupe_insert,
                # M4.0: agent discovery
                discovery_rpm=_cfg.discovery_rpm,
                # M5.0: node-aware routing
                node_policy=_node_policy,
                on_node_selected=_on_node_selected,
                on_node_rejected=_on_node_rejected,
                # M5.1: node stats getter for !status
                node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()},
                # M6.0: dynamic room-node policy store
                policy_store=_policy_store,
                # M6.2: data directory for policy exports/imports
                bridge_data_dir=_cfg.bridge_data_dir,
                # M8.0: node health tracker + failover callback
                node_health_tracker=_node_health_tracker,
                on_failover=_on_failover,
                # M8.1: sticky failover cache
                sticky_cache=_sticky_cache,
                on_sticky_set=_on_sticky_set,
                # M8.2: HA persistence config
                ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s,
                ha_health_max_age_s=_cfg.ha_health_max_age_s,
                # M9.0: Two-step confirmation store
                confirm_store=_confirm_store,
                policy_export_retention_days=_cfg.policy_export_retention_days,
                policy_history_limit=_cfg.policy_history_limit,
            )
            logger.info(
                "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
                _cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
            )
            _ingress_loop = ingress
            _ingress_task = asyncio.create_task(
                ingress.run(_ingress_stop),
                name="matrix_ingress_loop",
            )
            logger.info("✅ Ingress loop task started")
        else:
            logger.warning("⚠️ No room mappings — ingress loop NOT started")

    except (RuntimeError, ValueError) as exc:
        _config_error = str(exc)
        logger.error("❌ Config error: %s", _config_error)
        if _PROM_OK:
            _cfg_node = _cfg.node_id if _cfg else ""
            _bridge_up.labels(node_id=_cfg_node or "").set(0)  # M7.1: labeled

    yield

    # Shutdown: cancel ingress loop
    if _ingress_stop:
        _ingress_stop.set()
    if _ingress_task and not _ingress_task.done():
        _ingress_task.cancel()
        try:
            await asyncio.wait_for(_ingress_task, timeout=5.0)
        except (asyncio.CancelledError, asyncio.TimeoutError):
            pass
    # Shutdown: cancel prune task + close EventStore
    if _prune_task and not _prune_task.done():
        _prune_task.cancel()
    if _event_store is not None:
        await _event_store.close()
    # M6.0: close policy store
    if _policy_store is not None:
        try:
            _policy_store.close()
        except Exception:  # noqa: BLE001
            pass
    # M11: close debug http client if open
    if _dummy_http_client is not None:
        try:
            await _dummy_http_client.aclose()
        except Exception:  # noqa: BLE001
            pass
    logger.info("matrix-bridge-dagi shutting down")
|
||
|
||
# ── App ───────────────────────────────────────────────────────────────────────
|
||
# ── App ───────────────────────────────────────────────────────────────────────
app = FastAPI(
    title="matrix-bridge-dagi",
    version="0.1.0",
    lifespan=lifespan,
)

# NOTE(review): wildcard CORS origins — presumably fine for an internal ops
# surface, but confirm this service is never exposed to the public internet.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
|
||
|
||
# ── Health ────────────────────────────────────────────────────────────────────
|
||
@app.get("/health")
async def health() -> Dict[str, Any]:
    """Diagnostics snapshot: config, connectivity probes, queue and subsystem state.

    Returns a reduced payload with ok=false when config failed or the lifespan
    has not initialised the service yet; otherwise ok reflects the startup
    reachability probes for Matrix and the DAGI Gateway.
    """
    uptime = int(time.monotonic() - _START_TIME)
    # Opportunistically refresh the lock gauge on every poll.
    if _PROM_OK and _ingress_loop is not None:
        _active_room_agent_locks.set(_ingress_loop.active_lock_count)
    # Degraded payload: config error or lifespan not (yet) run.
    if _config_error or _cfg is None:
        return {
            "ok": False,
            "service": "matrix-bridge-dagi",
            "version": "0.1.0",
            "build": os.getenv("BUILD_SHA", "dev"),
            "uptime_s": uptime,
            "error": _config_error or "service not initialised",
        }
    # Probes are Optional[bool]; only an explicit True counts as reachable.
    matrix_ok = _matrix_reachable is True
    gateway_ok = _gateway_reachable is True
    overall_ok = matrix_ok and gateway_ok
    return {
        "ok": overall_ok,
        "service": "matrix-bridge-dagi",
        "version": "0.1.0",
        "build": _cfg.build_sha,
        "build_time": _cfg.build_time,
        "env": os.getenv("ENV", "dev"),
        "uptime_s": uptime,
        "node_id": _cfg.node_id,
        "homeserver": _cfg.matrix_homeserver_url,
        "matrix_reachable": _matrix_reachable,
        "bridge_user": _cfg.matrix_user_id,
        "sofiia_room_id": _cfg.sofiia_room_id,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "gateway": _cfg.dagi_gateway_url,
        "gateway_reachable": _gateway_reachable,
        "mappings_count": _room_map.total_mappings if _room_map else 0,
        "mixed_rooms_count": _mixed_room_config.total_rooms if _mixed_room_config else 0,
        "total_agents_in_mixed_rooms": sum(
            len(r.agents) for r in _mixed_room_config.rooms.values()
        ) if _mixed_room_config else 0,
        "config_ok": True,
        "rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
        "queue": {
            "size": _ingress_loop.queue_size if _ingress_loop else 0,
            "max": _cfg.queue_max_events,
            "workers": _cfg.worker_concurrency,
        },
        "mixed_guard_rails": {
            "max_agents_per_room": _cfg.max_agents_per_mixed_room,
            "max_slash_len": _cfg.max_slash_len,
            "unknown_agent_behavior": _cfg.unknown_agent_behavior,
            "concurrency_cap": _cfg.mixed_concurrency_cap,
            "active_room_agent_locks": _ingress_loop.active_lock_count if _ingress_loop else 0,
        },
        "control_channel": {
            "enabled": _control_config.is_enabled if _control_config else False,
            "control_rooms_count": len(_control_config.control_rooms) if _control_config else 0,
            "operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
            "unauthorized_behavior": _cfg.control_unauthorized_behavior,
        },
        "control_safety": {
            "enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0,
            "room_rpm": _cfg.control_room_rpm,
            "operator_rpm": _cfg.control_operator_rpm,
            "run_next_rpm": _cfg.control_run_next_rpm,
            "cooldown_s": _cfg.control_cooldown_s,
        },
        # NOTE(review): _event_store is only populated here if lifespan declares
        # it global — verify, otherwise this branch always reports disabled.
        "persistent_dedupe": _event_store.as_health_dict() if _event_store else {
            "enabled": False,
            "db_path": None,
            "ttl_h": _cfg.processed_events_ttl_h,
            "ok": False,
            "last_prune_at": None,
            "pruned_rows_last": 0,
        },
        # M6.0: policy store health
        "policy_store": _health_policy_store_dict(),
        # M8.1: sticky failover cache health
        "sticky_cache": _health_sticky_dict(),
        # M8.2: HA state persistence info
        "ha_state": _health_ha_dict(),
        # M9.0: confirm store
        "confirm_store": _health_confirm_dict(),
    }
|
||
|
||
|
||
def _health_confirm_dict() -> Dict[str, Any]:
    """Summarise the two-step confirm store (M9.0) for the /health payload."""
    store = _confirm_store
    if store is None:
        return {"enabled": False}
    return {
        "enabled": True,
        "pending": store.pending_count(),
        "ttl_s": store.ttl_s,
    }
|
||
|
||
|
||
def _health_ha_dict() -> Dict[str, Any]:
    """Summarise HA state persistence (M8.2) for the /health payload."""
    fallback = {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0}
    if _ingress_loop is None:
        return fallback
    try:
        status = _ingress_loop.get_status()
        return {
            "sticky_loaded": status.get("ha_sticky_loaded", 0),
            "health_loaded": status.get("ha_health_loaded", False),
            "snapshot_interval_s": status.get("ha_health_snapshot_interval_s", 0),
        }
    except Exception:  # noqa: BLE001
        # Best-effort: health must never fail because of an ingress hiccup.
        return fallback
|
||
|
||
|
||
def _health_sticky_dict() -> Dict[str, Any]:
    """Summarise the sticky failover cache (M8.1) for the /health payload."""
    cache = _sticky_cache
    if cache is None:
        return {"enabled": False, "active_keys": 0, "ttl_s": 0}
    return {
        "enabled": True,
        "active_keys": cache.active_count(),
        "ttl_s": cache.ttl_s,
    }
|
||
|
||
|
||
def _health_policy_store_dict() -> Dict[str, Any]:
    """Report dynamic room-node policy store state (M6.x) for /health."""
    if _ingress_loop is not None:
        try:
            s = _ingress_loop.get_status()
            return {
                "ok": s.get("policy_store_ok", False),
                "path": s.get("policy_store_path"),
                "overrides_count": s.get("policy_overrides_count", 0),
                "agent_overrides_count": s.get("policy_agent_overrides_count", 0),  # M6.1
                "last_export_at": s.get("policy_last_export_at"),  # M6.2
                "last_import_at": s.get("policy_last_import_at"),  # M6.2
                "db_mtime": s.get("policy_db_mtime"),  # M6.2
            }
        except Exception:  # noqa: BLE001
            pass  # fall through to the not-available shape below
    return {
        "ok": False,
        "path": None,
        "overrides_count": 0,
        "agent_overrides_count": 0,
        "last_export_at": None,
        "last_import_at": None,
        "db_mtime": None,
    }
|
||
|
||
|
||
# ── Bridge Mappings (read-only ops endpoint) ───────────────────────────────────
|
||
@app.get("/bridge/mappings")
async def bridge_mappings() -> Dict[str, Any]:
    """Read-only room-to-agent mapping summary for ops — contains no secrets."""
    if _cfg is None or _room_map is None:
        # Service never initialised (or config failed): empty, explicit error.
        return {
            "ok": False,
            "error": _config_error or "service not initialised",
            "mappings": [],
            "mixed_rooms": [],
        }
    mixed = _mixed_room_config
    ctrl = _control_config
    return {
        "ok": True,
        "total": _room_map.total_mappings,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "mappings": _room_map.as_summary(),
        "mixed_rooms_total": mixed.total_rooms if mixed else 0,
        "mixed_rooms": mixed.as_summary() if mixed else [],
        "control_rooms": sorted(ctrl.control_rooms) if ctrl else [],
        "control_operators_count": len(ctrl.operator_allowlist) if ctrl else 0,
    }
|
||
|
||
|
||
# ── Debug / Soak (M11) ────────────────────────────────────────────────────────
|
||
@app.post("/v1/debug/inject_event")
async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]:
    """
    Synthetic event injection for soak/load testing.

    Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production).

    Body: { "room_id": "!room:server", "event": { Matrix event dict } }
    The event is enqueued directly into the ingress loop, bypassing Matrix poll.

    Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str }
    """
    # NOTE(review): returns a raw 403 Response despite the Dict annotation —
    # FastAPI handles this, but the annotation is misleading.
    if _cfg is None or not _cfg.debug_inject_enabled:
        return Response(  # type: ignore[return-value]
            '{"ok":false,"error":"debug inject disabled"}',
            status_code=403,
            media_type="application/json",
        )
    if _ingress_loop is None:
        return {"ok": False, "error": "ingress loop not running"}

    room_id = body.get("room_id", "")
    event = body.get("event", {})
    if not room_id or not event:
        return {"ok": False, "error": "missing room_id or event"}

    # Ensure event has minimum required fields for ingress processing
    if not event.get("event_id"):
        import time as _time
        # Synthetic id from the monotonic clock; unique enough for soak runs.
        event["event_id"] = f"!inject-{int(_time.monotonic() * 1e6)}"
    if not event.get("type"):
        event["type"] = "m.room.message"
    if not event.get("content"):
        event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")}

    # Build a minimal sync_resp that looks like a real Matrix /sync response
    # so _enqueue_from_sync can pick it up via extract_room_messages.
    # We bypass Matrix polling by directly calling _try_enqueue on the right mapping.
    enqueued = False
    try:
        # Find the matching room mapping (direct rooms only for soak)
        # NOTE(review): reaches into _ingress_loop private attributes
        # (_room_map, _queue, _try_enqueue) — acceptable for a debug-only
        # endpoint, but brittle against ingress refactors.
        mapping = None
        if _ingress_loop._room_map is not None:
            for m in _ingress_loop._room_map.mappings:
                if m.room_id == room_id:
                    mapping = m
                    break

        if mapping is None:
            return {"ok": False, "error": f"no mapping for room_id={room_id!r}"}

        # Build a minimal stub Matrix client — replies are discarded for soak events
        from .matrix_client import MatrixClient

        class _SoakMatrixClient(MatrixClient):  # type: ignore[misc]
            """No-op Matrix client for synthetic soak events."""
            def __init__(self) -> None:  # noqa: D107
                pass  # skip real __init__

            async def mark_seen(self, room_id: str, event_id: str) -> None:  # type: ignore[override]
                pass

            async def send_text(self, room_id: str, text: str,  # type: ignore[override]
                                txn: Optional[str] = None) -> None:
                pass

        _stub_client = _SoakMatrixClient()

        # Debug client is created by lifespan only when inject is enabled.
        if _dummy_http_client is None:
            return {"ok": False, "error": "debug http client not initialised"}

        await _ingress_loop._try_enqueue(
            _stub_client,  # type: ignore[arg-type]
            _ingress_loop._queue,
            _dummy_http_client,
            event,
            mapping,
        )
        enqueued = True
    except Exception as exc:  # noqa: BLE001
        # Debug endpoint: surface the failure to the caller instead of raising.
        return {"ok": False, "error": str(exc), "enqueued": False}

    return {
        "ok": True,
        "enqueued": enqueued,
        "room_id": room_id,
        "event_id": event.get("event_id"),
    }
|
||
|
||
|
||
async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None:
|
||
"""Discard replies from injected soak events."""
|
||
|
||
|
||
# ── Metrics ───────────────────────────────────────────────────────────────────
|
||
@app.get("/metrics")
async def metrics():
    """Prometheus exposition endpoint; plain-text notice when the client lib is absent."""
    if _PROM_OK:
        return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST)
    return Response("# prometheus_client not available\n", media_type="text/plain")
|