feat(platform): add new services, tools, tests and crews modules

New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:14:14 -08:00
parent e9dedffa48
commit 129e4ea1fc
241 changed files with 69349 additions and 0 deletions

View File

@@ -0,0 +1,138 @@
"""
alert_ingest.py — Alert ingestion business logic.
Handles:
- AlertEvent validation and normalization
- Dedupe-aware ingestion via AlertStore
- list/get/ack helpers used by alert_ingest_tool handler
"""
from __future__ import annotations
import hashlib
import re
import logging
from typing import Any, Dict, List, Optional
from alert_store import (
AlertStore,
_compute_dedupe_key,
_redact_text,
_sanitize_alert,
MAX_LOG_SAMPLES,
)
logger = logging.getLogger(__name__)
# ─── Validation ────────────────────────────────────────────────────────────────
VALID_SEVERITIES = {"P0", "P1", "P2", "P3", "INFO"}
VALID_KINDS = {
    "slo_breach", "crashloop", "latency", "error_rate",
    "disk", "oom", "deploy", "security", "custom",
}
VALID_ENVS = {"prod", "staging", "dev", "any"}
def validate_alert(data: Dict) -> Optional[str]:
    """Check required fields and enum values; return an error message, or None when valid."""
    for required in ("service", "title"):
        if not data.get(required):
            return f"alert.{required} is required"
    if data.get("severity", "P2") not in VALID_SEVERITIES:
        return f"alert.severity must be one of {VALID_SEVERITIES}"
    if data.get("kind", "custom") not in VALID_KINDS:
        return f"alert.kind must be one of {VALID_KINDS}"
    return None
def normalize_alert(data: Dict) -> Dict:
    """Sanitize an alert payload and fill in default fields.

    Log samples in evidence are capped at MAX_LOG_SAMPLES entries and each
    sample is redacted down to 300 characters.
    """
    alert = _sanitize_alert(data)
    defaults = {
        "kind": "custom",
        "env": "prod",
        "severity": "P2",
        "labels": {},
        "metrics": {},
        "links": [],
        "evidence": {},
    }
    for key, value in defaults.items():
        alert.setdefault(key, value)
    evidence = alert.get("evidence", {})
    samples = evidence.get("log_samples", [])
    trimmed = [_redact_text(sample, 300) for sample in samples[:MAX_LOG_SAMPLES]]
    alert["evidence"] = {**evidence, "log_samples": trimmed}
    return alert
# ─── Ingest ────────────────────────────────────────────────────────────────────
def ingest_alert(
    store: AlertStore,
    alert_data: Dict,
    dedupe_ttl_minutes: int = 30,
) -> Dict:
    """
    Validate and normalize the alert, then hand it to the store with dedupe.
    Returns the store's result dict, or {"accepted": False, "error": ...}
    when validation fails.
    """
    problem = validate_alert(alert_data)
    if problem is not None:
        return {"accepted": False, "error": problem}
    normalized = normalize_alert(alert_data)
    return store.ingest(normalized, dedupe_ttl_minutes=dedupe_ttl_minutes)
# ─── List/Get/Ack ──────────────────────────────────────────────────────────────
def list_alerts(
    store: AlertStore,
    service: Optional[str] = None,
    env: Optional[str] = None,
    window_minutes: int = 240,
    limit: int = 50,
) -> List[Dict]:
    """List recent alerts via the store, filtered by service/env.

    env="any" disables the environment filter; limit is capped at 200.
    """
    criteria: Dict = {}
    if service:
        criteria["service"] = service
    if env and env != "any":
        criteria["env"] = env
    criteria["window_minutes"] = window_minutes
    return store.list_alerts(criteria, limit=min(limit, 200))
def get_alert(store: AlertStore, alert_ref: str) -> Optional[Dict]:
    """Fetch a single alert by id/reference; None when not found."""
    found = store.get_alert(alert_ref)
    return found
def ack_alert(store: AlertStore, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
    """Acknowledge an alert; the free-text note is redacted and capped at 500 chars.

    Returns None for an empty alert_ref, otherwise the store's ack result.
    """
    if not alert_ref:
        return None
    redacted_note = _redact_text(note, 500)
    return store.ack_alert(alert_ref, actor, redacted_note)
# ─── Dedupe helpers ────────────────────────────────────────────────────────────
def build_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str:
    """Build the canonical dedupe key for (service, env, kind[, fingerprint])."""
    key = _compute_dedupe_key(service, env, kind, fingerprint)
    return key
def map_alert_severity_to_incident(
    alert_severity: str,
    cap: str = "P1",
) -> str:
    """
    Map an alert severity onto an incident severity, never exceeding `cap`.
    e.g. alert P0 with cap P1 → P1. Unknown severities fall back to P2,
    unknown caps to P1.
    """
    rank = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
    severity = alert_severity if alert_severity in rank else "P2"
    ceiling = cap if cap in rank else "P1"
    # Keep whichever is LESS critical (numerically larger rank).
    return severity if rank[severity] >= rank[ceiling] else ceiling

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,574 @@
"""
architecture_pressure.py — Architecture Pressure Index (APIx) Engine.
DAARION.city | deterministic, no LLM.
Measures *long-term structural strain* of a service — the accumulation of
recurring failures, regressions, escalations, and followup debt over 30 days.
Contrast with Risk Engine (short-term operational health).
Public API:
load_pressure_policy() -> Dict
compute_pressure(service, env, ...) -> PressureReport
compute_pressure_dashboard(env, services, ...) -> DashboardResult
list_known_services(policy) -> List[str]
"""
from __future__ import annotations
import datetime
import logging
import yaml
from pathlib import Path
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
_PRESSURE_POLICY_CACHE: Optional[Dict] = None
_PRESSURE_POLICY_PATHS = [
    Path("config/architecture_pressure_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "architecture_pressure_policy.yml",
]
def load_pressure_policy() -> Dict:
    """Load and memoize the pressure policy YAML; fall back to built-in defaults."""
    global _PRESSURE_POLICY_CACHE
    if _PRESSURE_POLICY_CACHE is not None:
        return _PRESSURE_POLICY_CACHE
    for candidate in _PRESSURE_POLICY_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                loaded = yaml.safe_load(fh) or {}
            _PRESSURE_POLICY_CACHE = loaded
            return loaded
        except Exception as exc:
            logger.warning("Failed to load architecture_pressure_policy from %s: %s", candidate, exc)
    # Every candidate missing or unreadable → use the baked-in defaults.
    _PRESSURE_POLICY_CACHE = _builtin_pressure_defaults()
    return _PRESSURE_POLICY_CACHE
def _reload_pressure_policy() -> None:
global _PRESSURE_POLICY_CACHE
_PRESSURE_POLICY_CACHE = None
def _builtin_pressure_defaults() -> Dict:
return {
"defaults": {"lookback_days": 30, "top_n": 10},
"weights": {
"recurrence_high_30d": 20,
"recurrence_warn_30d": 10,
"regressions_30d": 15,
"escalations_30d": 12,
"followups_created_30d": 8,
"followups_overdue": 15,
"drift_failures_30d": 10,
"dependency_high_30d": 10,
},
"bands": {"low_max": 20, "medium_max": 45, "high_max": 70},
"priority_rules": {
"require_arch_review_at": 70,
"auto_create_followup": True,
"followup_priority": "P1",
"followup_due_days": 14,
"followup_owner": "cto",
},
"release_gate": {
"platform_review_required": {"enabled": True, "warn_at": 60, "fail_at": 85}
},
"digest": {
"output_dir": "ops/reports/platform",
"max_chars": 12000,
"top_n_in_digest": 10,
},
}
# ─── Band classifier ──────────────────────────────────────────────────────────
def classify_pressure_band(score: int, policy: Dict) -> str:
    """Bucket a pressure score into low/medium/high/critical per policy bands."""
    bands = policy.get("bands", {})
    ladder = (
        ("low", int(bands.get("low_max", 20))),
        ("medium", int(bands.get("medium_max", 45))),
        ("high", int(bands.get("high_max", 70))),
    )
    for band, upper in ladder:
        if score <= upper:
            return band
    return "critical"
# ─── Signal scoring helpers ───────────────────────────────────────────────────
def _score_signals(components: Dict, policy: Dict) -> int:
"""
Additive scoring:
recurrence_high_30d, recurrence_warn_30d — boolean (1/0)
regressions_30d, escalations_30d, ... — counts (capped internally)
"""
weights = policy.get("weights", {})
score = 0
# Boolean presence signals
for bool_key in ("recurrence_high_30d", "recurrence_warn_30d"):
if components.get(bool_key, 0):
score += int(weights.get(bool_key, 0))
# Count-based signals: weight applied per unit, capped at 3× weight
for count_key in (
"regressions_30d", "escalations_30d", "followups_created_30d",
"followups_overdue", "drift_failures_30d", "dependency_high_30d",
):
count = int(components.get(count_key, 0))
if count:
w = int(weights.get(count_key, 0))
# First occurrence = full weight, subsequent = half (diminishing)
score += w + (count - 1) * max(1, w // 2)
return max(0, score)
def _signals_summary(components: Dict, policy: Dict) -> List[str]:
"""Generate human-readable signal descriptions."""
summaries = []
if components.get("recurrence_high_30d"):
summaries.append("High-recurrence alert buckets in last 30d")
if components.get("recurrence_warn_30d"):
summaries.append("Warn-level recurrence in last 30d")
regressions = int(components.get("regressions_30d", 0))
if regressions:
summaries.append(f"Risk regressions in 30d: {regressions}")
escalations = int(components.get("escalations_30d", 0))
if escalations:
summaries.append(f"Escalations in 30d: {escalations}")
fu_created = int(components.get("followups_created_30d", 0))
if fu_created:
summaries.append(f"Follow-ups created in 30d: {fu_created}")
fu_overdue = int(components.get("followups_overdue", 0))
if fu_overdue:
summaries.append(f"Overdue follow-ups: {fu_overdue}")
drift = int(components.get("drift_failures_30d", 0))
if drift:
summaries.append(f"Drift gate failures in 30d: {drift}")
dep = int(components.get("dependency_high_30d", 0))
if dep:
summaries.append(f"Dependency HIGH/CRITICAL findings in 30d: {dep}")
return summaries
# ─── Signal collection from stores ───────────────────────────────────────────
def fetch_pressure_signals(
    service: str,
    env: str,
    lookback_days: int = 30,
    *,
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Collect all signals needed for compute_pressure from existing stores.

    Always non-fatal per store: each store is queried inside its own try
    block and a failure only logs a warning, leaving that signal at 0.
    Returns a components dict ready to pass to compute_pressure.
    """
    if policy is None:
        policy = load_pressure_policy()
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(days=lookback_days)
    ).isoformat()
    components: Dict = {
        "recurrence_high_30d": 0,
        "recurrence_warn_30d": 0,
        "regressions_30d": 0,
        "escalations_30d": 0,
        "followups_created_30d": 0,
        "followups_overdue": 0,
        "drift_failures_30d": 0,
        "dependency_high_30d": 0,
    }
    # ── Escalations + followups from incident_store ───────────────────────────
    try:
        if incident_store is not None:
            # Hoisted out of the event loop — one "today" per call.
            today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
            incs = incident_store.list_incidents({"service": service}, limit=100)
            for inc in incs:
                inc_id = inc.get("id", "")
                try:
                    events = incident_store.get_events(inc_id, limit=200)
                    for ev in events:
                        # ISO-8601 strings compare chronologically.
                        if ev.get("ts", "") < cutoff:
                            continue
                        ev_type = ev.get("type", "")
                        msg = ev.get("message") or ""
                        # Escalation decisions (message mentions "Escalat…").
                        if ev_type == "decision" and "Escalat" in msg:
                            components["escalations_30d"] += 1
                        # Follow-up creation events.
                        if ev_type in ("followup", "follow_up") or "followup" in msg.lower():
                            components["followups_created_30d"] += 1
                        # Overdue follow-ups: still open and past due date.
                        if ev_type == "followup":
                            due = ev.get("due_date", "")
                            if ev.get("status", "") == "open" and due and due < today:
                                components["followups_overdue"] += 1
                except Exception as e:
                    logger.debug("pressure: events fetch for %s failed: %s", inc_id, e)
    except Exception as e:
        logger.warning("pressure: incident_store fetch failed: %s", e)
    # ── Regressions from risk_history_store ───────────────────────────────────
    try:
        if risk_history_store is not None:
            series = risk_history_store.get_series(service, env, limit=90)
            # BUGFIX: the previous code re-ran this pairwise comparison once
            # per snapshot in the outer loop, inflating regressions_30d by up
            # to len(series)×. Count each consecutive score increase once.
            ordered = sorted(series, key=lambda s: s.get("ts", ""))
            for prev, curr in zip(ordered, ordered[1:]):
                if (curr.get("ts", "") >= cutoff
                        and curr.get("score", 0) > prev.get("score", 0)):
                    components["regressions_30d"] += 1
    except Exception as e:
        logger.warning("pressure: risk_history_store fetch failed: %s", e)
    # ── Recurrence from alert_store top_signatures ───────────────────────────
    try:
        if alert_store is not None:
            # Approximate the 30-day window via a large window_minutes.
            sigs = alert_store.top_signatures(
                window_minutes=lookback_days * 24 * 60, limit=30
            )
            # Simplified recurrence thresholds: >=6 occurrences → high,
            # >=3 → warn (flags, not counts).
            for sig in sigs:
                occ = int(sig.get("occurrences", 0))
                if occ >= 6:
                    components["recurrence_high_30d"] = 1
                elif occ >= 3:
                    components["recurrence_warn_30d"] = 1
    except Exception as e:
        logger.warning("pressure: alert_store recurrence fetch failed: %s", e)
    return components
# ─── Core engine ──────────────────────────────────────────────────────────────
def compute_pressure(
    service: str,
    env: str = "prod",
    *,
    components: Optional[Dict] = None,
    lookback_days: int = 30,
    policy: Optional[Dict] = None,
    # Optional stores for signal collection when components not pre-fetched
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
) -> Dict:
    """
    Compute the Architecture Pressure score for a single service.

    When `components` is supplied the stores are never touched; otherwise
    signals are collected from the stores (each failure is non-fatal).
    Returns a PressureReport dict.
    """
    policy = policy if policy is not None else load_pressure_policy()
    effective_days = lookback_days or int(
        policy.get("defaults", {}).get("lookback_days", 30)
    )
    if components is None:
        signal_set = fetch_pressure_signals(
            service, env, effective_days,
            incident_store=incident_store,
            alert_store=alert_store,
            risk_history_store=risk_history_store,
            policy=policy,
        )
    else:
        signal_set = dict(components)
    # Guarantee every expected key exists so scoring never KeyErrors.
    for key in (
        "recurrence_high_30d", "recurrence_warn_30d", "regressions_30d",
        "escalations_30d", "followups_created_30d", "followups_overdue",
        "drift_failures_30d", "dependency_high_30d",
    ):
        signal_set.setdefault(key, 0)
    score = _score_signals(signal_set, policy)
    review_threshold = int(
        policy.get("priority_rules", {}).get("require_arch_review_at", 70)
    )
    return {
        "service": service,
        "env": env,
        "lookback_days": effective_days,
        "score": score,
        "band": classify_pressure_band(score, policy),
        "components": signal_set,
        "signals_summary": _signals_summary(signal_set, policy),
        "requires_arch_review": score >= review_threshold,
        "computed_at": datetime.datetime.utcnow().isoformat(),
    }
# ─── Dashboard ────────────────────────────────────────────────────────────────
def compute_pressure_dashboard(
    env: str = "prod",
    services: Optional[List[str]] = None,
    top_n: int = 10,
    *,
    policy: Optional[Dict] = None,
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
    risk_reports: Optional[Dict[str, Dict]] = None,
) -> Dict:
    """
    Compute Architecture Pressure for multiple services and return a dashboard.

    `risk_reports` is an optional {service: RiskReport} mapping used to attach
    the current risk score/band to each entry for side-by-side comparison.
    """
    if policy is None:
        policy = load_pressure_policy()
    effective_top_n = top_n or int(policy.get("defaults", {}).get("top_n", 10))
    # Fall back to store-derived service discovery when no list is given.
    target_services = services or _list_services_from_stores(
        env=env, incident_store=incident_store, policy=policy
    )
    reports: List[Dict] = []
    for svc in target_services:
        try:
            entry = compute_pressure(
                svc, env,
                policy=policy,
                incident_store=incident_store,
                alert_store=alert_store,
                risk_history_store=risk_history_store,
            )
            # Enrich with the service's current risk report when available.
            if risk_reports and svc in risk_reports:
                rr = risk_reports[svc]
                entry["risk_score"] = rr.get("score")
                entry["risk_band"] = rr.get("band")
                entry["risk_delta_24h"] = (rr.get("trend") or {}).get("delta_24h")
            reports.append(entry)
        except Exception as e:
            logger.warning("pressure dashboard: compute_pressure failed for %s: %s", svc, e)
    reports.sort(key=lambda r: -r.get("score", 0))
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for entry in reports:
        band = entry.get("band", "low")
        band_counts[band] = band_counts.get(band, 0) + 1
    return {
        "env": env,
        "computed_at": datetime.datetime.utcnow().isoformat(),
        "top_pressure_services": reports[:effective_top_n],
        "band_counts": band_counts,
        "critical_services": [r["service"] for r in reports if r.get("band") == "critical"],
        "high_services": [r["service"] for r in reports if r.get("band") in ("high", "critical")],
        "arch_review_required": [r["service"] for r in reports if r.get("requires_arch_review")],
        "total_services_evaluated": len(reports),
    }
def _list_services_from_stores(
env: str,
incident_store=None,
policy: Optional[Dict] = None,
) -> List[str]:
"""Infer known services from incident store, falling back to SLO policy."""
services: set = set()
try:
if incident_store is not None:
incs = incident_store.list_incidents({}, limit=200)
for inc in incs:
svc = inc.get("service")
if svc:
services.add(svc)
except Exception as e:
logger.warning("pressure: list_services from incident_store failed: %s", e)
if not services:
# Fallback: read from SLO policy
try:
slo_paths = [
Path("config/slo_policy.yml"),
Path(__file__).resolve().parent.parent.parent / "config" / "slo_policy.yml",
]
for p in slo_paths:
if p.exists():
import yaml as _yaml
with open(p) as f:
slo = _yaml.safe_load(f) or {}
services.update(slo.get("services", {}).keys())
break
except Exception:
pass
return sorted(services)
# ─── Auto followup creation ───────────────────────────────────────────────────
def maybe_create_arch_review_followup(
    pressure_report: Dict,
    *,
    incident_store=None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
) -> Dict:
    """
    If pressure score >= require_arch_review_at and auto_create_followup=True,
    create an architecture-review follow-up on the latest open incident,
    creating a synthetic architecture_review incident when none is open.
    Deduped by key: arch_review:{YYYY-WW}:{service}

    Returns: {"created": bool, "dedupe_key": str|None, "skipped_reason": str|None}
    plus incident_id/due_date/priority on success.
    """
    if policy is None:
        policy = load_pressure_policy()
    service = pressure_report.get("service", "")
    score = int(pressure_report.get("score", 0))
    rules = policy.get("priority_rules", {})
    review_at = int(rules.get("require_arch_review_at", 70))
    auto_create = bool(rules.get("auto_create_followup", True))
    if score < review_at:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": f"score {score} < require_arch_review_at {review_at}"}
    if not auto_create:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": "auto_create_followup disabled"}
    if incident_store is None:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": "incident_store not available"}
    if week_str is None:
        week_str = datetime.datetime.utcnow().strftime("%Y-W%V")
    dedupe_key = f"arch_review:{week_str}:{service}"
    priority = rules.get("followup_priority", "P1")
    owner = rules.get("followup_owner", "cto")
    due_days = int(rules.get("followup_due_days", 14))
    due_date = (
        datetime.datetime.utcnow() + datetime.timedelta(days=due_days)
    ).strftime("%Y-%m-%d")
    try:
        incs = incident_store.list_incidents({"service": service}, limit=50)
        open_inc = next(
            (i for i in incs if i.get("status") in ("open", "triaged", "escalated")),
            None,
        )
        # BUGFIX: the dedupe check previously read the loop variable `inc`
        # (the last incident iterated — or NameError when the list was empty)
        # instead of the selected open incident. Inspect open_inc's events.
        if open_inc is not None:
            try:
                events = incident_store.get_events(open_inc.get("id", ""), limit=100)
                for ev in events:
                    if ev.get("dedupe_key") == dedupe_key:
                        return {"created": False, "dedupe_key": dedupe_key,
                                "skipped_reason": f"already exists: {dedupe_key}"}
            except Exception:
                pass
        else:
            # No open incident — create a synthetic architecture_review incident.
            open_inc = incident_store.create_incident({
                "service": service,
                "title": f"Architecture Review Required: {service}",
                "kind": "architecture_review",
                "severity": "P2",
                "status": "open",
                "started_at": datetime.datetime.utcnow().isoformat(),
                "source": "architecture_pressure_engine",
            })
        inc_id = open_inc.get("id", "")
        followup_event = {
            "type": "followup",
            "ts": datetime.datetime.utcnow().isoformat(),
            "message": (
                f"[Architecture Pressure] Score={score} >= {review_at}. "
                f"Schedule architecture review for '{service}'."
            ),
            "owner": owner,
            "priority": priority,
            "due_date": due_date,
            "status": "open",
            "dedupe_key": dedupe_key,
            "source": "architecture_pressure_engine",
        }
        # Stores expose either add_event or append_event; otherwise log only.
        if hasattr(incident_store, "add_event"):
            incident_store.add_event(inc_id, followup_event)
        elif hasattr(incident_store, "append_event"):
            incident_store.append_event(inc_id, followup_event)
        else:
            logger.info(
                "pressure: would create followup for %s (inc=%s, key=%s)",
                service, inc_id, dedupe_key
            )
        return {"created": True, "dedupe_key": dedupe_key, "skipped_reason": None,
                "incident_id": inc_id, "due_date": due_date, "priority": priority}
    except Exception as e:
        logger.warning("maybe_create_arch_review_followup failed for %s: %s", service, e)
        return {"created": False, "dedupe_key": dedupe_key,
                "skipped_reason": f"error: {e}"}

View File

@@ -0,0 +1,573 @@
"""
Audit Store — persistence layer for ToolGovernance audit events.
Backends:
memory — in-process list (testing; not persistent)
jsonl — append-only JSONL file with daily rotation (default, zero-config)
postgres — asyncpg INSERT into tool_audit_events table
Selection: env var AUDIT_BACKEND=jsonl|postgres|memory (default: jsonl)
Security / Privacy:
- Payload is NEVER written (only hash + sizes)
- Each write is fire-and-forget: errors → log warning, do NOT raise
- Postgres writes are non-blocking (asyncio task)
JSONL schema per line (matches AuditEvent fields):
{ts, req_id, workspace_id, user_id, agent_id, tool, action,
status, duration_ms, in_size, out_size, input_hash,
graph_run_id?, graph_node?, job_id?}
Postgres DDL (run once — or apply via migration):
See _POSTGRES_DDL constant below.
"""
from __future__ import annotations
import asyncio
import datetime
import json
import logging
import os
import threading
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── DDL ──────────────────────────────────────────────────────────────────────
# Schema for the Postgres audit backend. PostgresAuditStore executes this
# idempotently (IF NOT EXISTS) when its connection pool is first created.
# The four indexes match the read() filter columns: ts, (tool, ts),
# (agent_id, ts) and (workspace_id, ts).
_POSTGRES_DDL = """
CREATE TABLE IF NOT EXISTS tool_audit_events (
id BIGSERIAL PRIMARY KEY,
ts TIMESTAMPTZ NOT NULL,
req_id TEXT NOT NULL,
workspace_id TEXT NOT NULL,
user_id TEXT NOT NULL,
agent_id TEXT NOT NULL,
tool TEXT NOT NULL,
action TEXT NOT NULL,
status TEXT NOT NULL,
duration_ms INT NOT NULL,
in_size INT NOT NULL,
out_size INT NOT NULL,
input_hash TEXT NOT NULL,
graph_run_id TEXT,
graph_node TEXT,
job_id TEXT
);
CREATE INDEX IF NOT EXISTS idx_tool_audit_ts ON tool_audit_events(ts);
CREATE INDEX IF NOT EXISTS idx_tool_audit_tool_ts ON tool_audit_events(tool, ts);
CREATE INDEX IF NOT EXISTS idx_tool_audit_agent_ts ON tool_audit_events(agent_id, ts);
CREATE INDEX IF NOT EXISTS idx_tool_audit_ws_ts ON tool_audit_events(workspace_id, ts);
"""
# ─── Canonical event dict ─────────────────────────────────────────────────────
def _event_to_dict(event: "AuditEventLike") -> Dict[str, Any]:
"""Convert an AuditEvent (dataclass) or dict to canonical storage dict."""
if isinstance(event, dict):
return event
return {
"ts": getattr(event, "ts", ""),
"req_id": getattr(event, "req_id", ""),
"workspace_id": getattr(event, "workspace_id", ""),
"user_id": getattr(event, "user_id", ""),
"agent_id": getattr(event, "agent_id", ""),
"tool": getattr(event, "tool", ""),
"action": getattr(event, "action", ""),
"status": getattr(event, "status", ""),
"duration_ms": round(float(getattr(event, "duration_ms", 0))),
"in_size": int(getattr(event, "input_chars", 0)),
"out_size": int(getattr(event, "output_size_bytes", 0)),
"input_hash": getattr(event, "input_hash", ""),
"graph_run_id": getattr(event, "graph_run_id", None),
"graph_node": getattr(event, "graph_node", None),
"job_id": getattr(event, "job_id", None),
}
# Type alias (avoid circular imports)
AuditEventLike = Any
# ─── Interface ────────────────────────────────────────────────────────────────
class AuditStore(ABC):
    """Abstract persistence backend for tool-audit events."""
    @abstractmethod
    def write(self, event: AuditEventLike) -> None:
        """Non-blocking write. MUST NOT raise on error."""
        ...
    @abstractmethod
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict[str, Any]]:
        """Return events matching the filters, as a list of dicts."""
        ...
    def close(self) -> None:
        """Release backend resources; the default implementation is a no-op."""
        pass
# ─── Memory store ─────────────────────────────────────────────────────────────
class MemoryAuditStore(AuditStore):
    """In-process store for testing. Thread-safe; keeps at most max_events entries."""
    def __init__(self, max_events: int = 100_000):
        # _events is guarded by _lock and trimmed to the newest _max on overflow.
        self._events: List[Dict] = []
        self._lock = threading.Lock()
        self._max = max_events
    def write(self, event: AuditEventLike) -> None:
        """Append one event; never raises (errors are logged as warnings)."""
        try:
            record = _event_to_dict(event)
            with self._lock:
                self._events.append(record)
                overflow = len(self._events) - self._max
                if overflow > 0:
                    self._events = self._events[overflow:]
        except Exception as e:
            logger.warning("MemoryAuditStore.write error: %s", e)
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Filter the in-memory events; returns at most `limit` newest matches."""
        with self._lock:
            snapshot = list(self._events)
        def keep(row: Dict) -> bool:
            ts = row.get("ts", "")
            if from_ts and ts < from_ts:
                return False
            if to_ts and ts > to_ts:
                return False
            if tool and row.get("tool") != tool:
                return False
            if agent_id and row.get("agent_id") != agent_id:
                return False
            if workspace_id and row.get("workspace_id") != workspace_id:
                return False
            return True
        matched = [row for row in snapshot if keep(row)]
        return matched[-limit:]
    def clear(self) -> None:
        """Drop all stored events (test helper)."""
        with self._lock:
            self._events.clear()
# ─── JSONL store ──────────────────────────────────────────────────────────────
class JsonlAuditStore(AuditStore):
    """
    Append-only JSONL files with daily rotation.
    File pattern: ops/audit/tool_audit_YYYY-MM-DD.jsonl
    Writes are serialised through a threading.Lock (safe for multi-thread, not multi-process).
    """
    def __init__(self, directory: str = "ops/audit"):
        self._dir = Path(directory)
        self._dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._current_file: Optional[Path] = None
        self._current_date: Optional[str] = None
        self._fh = None  # open handle for the current day's file
    def _get_fh(self, date_str: str):
        """Return a line-buffered handle for date_str, rotating when the day changes."""
        if date_str == self._current_date:
            return self._fh
        if self._fh:
            try:
                self._fh.close()
            except Exception:
                pass
        target = self._dir / f"tool_audit_{date_str}.jsonl"
        self._fh = open(target, "a", encoding="utf-8", buffering=1)  # line-buffered
        self._current_date = date_str
        self._current_file = target
        return self._fh
    def write(self, event: AuditEventLike) -> None:
        """Serialize one event and append it to the day file; never raises."""
        try:
            record = _event_to_dict(event)
            # Day is derived from the event's own timestamp, not wall clock.
            day = (record.get("ts") or "")[:10] or datetime.date.today().isoformat()
            payload = json.dumps(record, ensure_ascii=False)
            with self._lock:
                self._get_fh(day).write(payload + "\n")
        except Exception as e:
            logger.warning("JsonlAuditStore.write error: %s", e)
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Stream-read the JSONL files whose dates overlap [from_ts, to_ts]."""
        # Prune whole files by the date embedded in the filename first.
        candidates = sorted(self._dir.glob("tool_audit_*.jsonl"))
        if from_ts:
            candidates = [f for f in candidates if f.stem[-10:] >= from_ts[:10]]
        if to_ts:
            candidates = [f for f in candidates if f.stem[-10:] <= to_ts[:10]]
        out: List[Dict] = []
        for fpath in candidates:
            try:
                with open(fpath, "r", encoding="utf-8") as fh:
                    for raw in fh:
                        raw = raw.strip()
                        if not raw:
                            continue
                        try:
                            record = json.loads(raw)
                        except Exception:
                            continue  # skip corrupt lines
                        ts = record.get("ts", "")
                        if from_ts and ts < from_ts:
                            continue
                        if to_ts and ts > to_ts:
                            continue
                        if tool and record.get("tool") != tool:
                            continue
                        if agent_id and record.get("agent_id") != agent_id:
                            continue
                        if workspace_id and record.get("workspace_id") != workspace_id:
                            continue
                        out.append(record)
                        if len(out) >= limit:
                            break
            except Exception as e:
                logger.warning("JsonlAuditStore.read error %s: %s", fpath, e)
            if len(out) >= limit:
                break
        return out
    def close(self) -> None:
        """Close the current day's file handle, if any."""
        with self._lock:
            if self._fh:
                try:
                    self._fh.close()
                except Exception:
                    pass
            self._fh = None
# ─── Postgres store ───────────────────────────────────────────────────────────
class PostgresAuditStore(AuditStore):
    """
    Async Postgres store using asyncpg.
    Writes are enqueued to an asyncio queue and flushed in batches (up to 50,
    2s max wait) by a background task. Falls back gracefully (drops events,
    logs warnings) if Postgres is unavailable.
    """
    def __init__(self, dsn: str):
        self._dsn = dsn
        self._pool = None  # lazily-created asyncpg pool
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=10_000)
        self._task: Optional[asyncio.Task] = None
        self._started = False
    def _ensure_started(self):
        """Start the background flush task once, if an event loop is running."""
        if self._started:
            return
        try:
            # BUGFIX: use get_running_loop() — get_event_loop() is deprecated
            # outside a running loop and could implicitly create a loop that
            # never runs, leaving _started=True while no flush ever happens.
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop yet: stay un-started; write() will retry later.
            return
        self._task = loop.create_task(self._flush_loop())
        self._started = True
    async def _get_pool(self):
        """Create the connection pool and apply the DDL on first use."""
        if self._pool is None:
            import asyncpg
            self._pool = await asyncpg.create_pool(self._dsn, min_size=1, max_size=3)
            async with self._pool.acquire() as conn:
                await conn.execute(_POSTGRES_DDL)
        return self._pool
    async def _flush_loop(self):
        """Forever: batch up to 50 queued events (2s max wait) and INSERT them."""
        while True:
            events = []
            try:
                evt = await asyncio.wait_for(self._queue.get(), timeout=2.0)
                events.append(evt)
                while not self._queue.empty() and len(events) < 50:
                    events.append(self._queue.get_nowait())
            except asyncio.TimeoutError:
                pass
            except Exception:
                pass
            if not events:
                continue
            try:
                pool = await self._get_pool()
                async with pool.acquire() as conn:
                    await conn.executemany(
                        """
                        INSERT INTO tool_audit_events
                        (ts, req_id, workspace_id, user_id, agent_id, tool, action,
                         status, duration_ms, in_size, out_size, input_hash,
                         graph_run_id, graph_node, job_id)
                        VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
                        """,
                        [
                            (
                                e["ts"], e["req_id"], e["workspace_id"], e["user_id"],
                                e["agent_id"], e["tool"], e["action"], e["status"],
                                e["duration_ms"], e["in_size"], e["out_size"],
                                e["input_hash"], e.get("graph_run_id"),
                                e.get("graph_node"), e.get("job_id"),
                            )
                            for e in events
                        ],
                    )
            except Exception as ex:
                # Audit writes are best-effort: the failed batch is dropped.
                logger.warning("PostgresAuditStore flush error: %s", ex)
    def write(self, event: AuditEventLike) -> None:
        """Enqueue one event for background flush; drops events when the queue is full."""
        try:
            record = _event_to_dict(event)
            self._ensure_started()
            if self._started and not self._queue.full():
                self._queue.put_nowait(record)
        except Exception as e:
            logger.warning("PostgresAuditStore.write error: %s", e)
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Synchronous read via asyncio.run() — for analyzer queries. Returns [] on error."""
        try:
            return asyncio.run(self._async_read(from_ts, to_ts, tool, agent_id, workspace_id, limit))
        except Exception as e:
            logger.warning("PostgresAuditStore.read error: %s", e)
            return []
    async def _async_read(self, from_ts, to_ts, tool, agent_id, workspace_id, limit):
        """Build a parameterized WHERE clause and fetch matching rows ordered by ts.

        `limit` is interpolated directly, which is safe only because the
        signature types it as int.
        """
        pool = await self._get_pool()
        conditions = ["TRUE"]
        params: List[Any] = []
        p = 1
        if from_ts:
            conditions.append(f"ts >= ${p}"); params.append(from_ts); p += 1
        if to_ts:
            conditions.append(f"ts <= ${p}"); params.append(to_ts); p += 1
        if tool:
            conditions.append(f"tool = ${p}"); params.append(tool); p += 1
        if agent_id:
            conditions.append(f"agent_id = ${p}"); params.append(agent_id); p += 1
        if workspace_id:
            conditions.append(f"workspace_id = ${p}"); params.append(workspace_id); p += 1
        sql = f"SELECT * FROM tool_audit_events WHERE {' AND '.join(conditions)} ORDER BY ts LIMIT {limit}"
        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)
        return [dict(r) for r in rows]
# ─── Null store ───────────────────────────────────────────────────────────────
class NullAuditStore(AuditStore):
    """Disabled-audit backend: accepts every write, answers every read with nothing."""

    def write(self, event: AuditEventLike) -> None:
        """Discard the event."""
        return None

    def read(self, **kwargs) -> List[Dict]:
        """Report an empty history regardless of filters."""
        return []
# ─── Global singleton ─────────────────────────────────────────────────────────
# Process-wide audit store instance; created lazily by get_audit_store().
_store: Optional[AuditStore] = None
# Guards initialisation/replacement of _store across threads.
_store_lock = threading.Lock()
def get_audit_store() -> AuditStore:
    """Return the process-wide audit store, building it on first use.

    Double-checked locking: the common path returns without taking the lock.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
def set_audit_store(store: AuditStore) -> None:
    """Install *store* as the process-wide audit store (primarily a test seam)."""
    global _store
    _store_lock.acquire()
    try:
        _store = store
    finally:
        _store_lock.release()
class AutoAuditStore(AuditStore):
    """
    Smart backend: tries Postgres first, falls back to JSONL on failure.
    Used when AUDIT_BACKEND=auto (or unset with DATABASE_URL present).
    - Writes go to whichever backend is currently healthy.
    - On Postgres failure, transparently falls back to JsonlAuditStore.
    - Recovers to Postgres on next health check (every ~5 min).
    Non-fatal: write errors are logged as warnings.
    """
    _RECOVERY_INTERVAL_S = 300  # retry Postgres after 5 minutes

    def __init__(self, pg_dsn: str, jsonl_dir: str):
        # Both backends are built lazily, so constructing this store never
        # touches the database or the filesystem.
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        self._primary: Optional[PostgresAuditStore] = None
        self._fallback: Optional[JsonlAuditStore] = None
        self._using_fallback = False
        # time.monotonic() of the moment we demoted to JSONL; 0.0 = never.
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()

    def _get_primary(self) -> Optional[PostgresAuditStore]:
        """Lazily build the Postgres store (double-checked locking)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresAuditStore(self._pg_dsn)
        return self._primary

    def _get_fallback(self) -> JsonlAuditStore:
        """Lazily build the JSONL store (double-checked locking)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlAuditStore(self._jsonl_dir)
        return self._fallback

    def _maybe_recover(self) -> None:
        """Try to switch back to Postgres if enough time has passed since fallback."""
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoAuditStore: attempting Postgres recovery")
                # Optimistic flip: the next write/read retries Postgres and
                # re-enters fallback mode if it is still unhealthy.
                self._using_fallback = False
                self._fallback_since = 0.0

    def write(self, event: AuditEventLike) -> None:
        """Write to the healthy backend; demote to JSONL on a Postgres error.

        NOTE(review): PostgresAuditStore.write swallows its own exceptions, so
        this except branch rarely fires in practice — confirm demotion still
        happens as intended when Postgres is down.
        """
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    primary.write(event)
                    return
            except Exception as pg_err:
                logger.warning(
                    "AutoAuditStore: Postgres write failed (%s), switching to JSONL fallback", pg_err
                )
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        # Write to JSONL fallback
        try:
            self._get_fallback().write(event)
        except Exception as jl_err:
            logger.warning("AutoAuditStore: JSONL fallback write failed: %s", jl_err)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Read from Postgres if available, else JSONL."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    return primary.read(from_ts=from_ts, to_ts=to_ts, tool=tool,
                                        agent_id=agent_id, workspace_id=workspace_id, limit=limit)
            except Exception as pg_err:
                # A failed read also demotes, so subsequent writes go to JSONL.
                logger.warning("AutoAuditStore: Postgres read failed (%s), using JSONL", pg_err)
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        return self._get_fallback().read(
            from_ts=from_ts, to_ts=to_ts, tool=tool,
            agent_id=agent_id, workspace_id=workspace_id, limit=limit,
        )

    def active_backend(self) -> str:
        """Return the name of the currently active backend."""
        return "jsonl_fallback" if self._using_fallback else "postgres"

    def close(self) -> None:
        # Close whichever backends were actually instantiated; ignore errors.
        if self._primary:
            try:
                self._primary.close()
            except Exception:
                pass
        if self._fallback:
            try:
                self._fallback.close()
            except Exception:
                pass
def _create_store() -> AuditStore:
    """Instantiate the audit store selected by the AUDIT_BACKEND env var.

    Backends:
        memory   — in-process only (tests)
        postgres — PostgresAuditStore (requires DATABASE_URL / POSTGRES_DSN)
        auto     — AutoAuditStore (Postgres with JSONL fallback)
        null     — NullAuditStore (audit disabled)
        jsonl    — JsonlAuditStore (default; also the fallback for misconfig)
    Unknown values fall through to jsonl.
    """
    backend = os.getenv("AUDIT_BACKEND", "jsonl").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("POSTGRES_DSN", "")
    audit_dir = os.getenv(
        "AUDIT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
    )
    # Log only the host/database part of the DSN. The previous `dsn[:30]`
    # leaked "postgresql://user:password@..." credentials into the logs.
    dsn_label = dsn.rsplit("@", 1)[-1] if dsn else ""
    if backend == "memory":
        logger.info("AuditStore: in-memory (testing only)")
        return MemoryAuditStore()
    if backend == "postgres":
        if not dsn:
            logger.warning("AUDIT_BACKEND=postgres but DATABASE_URL not set; falling back to jsonl")
        else:
            logger.info("AuditStore: postgres dsn=%s", dsn_label)
            return PostgresAuditStore(dsn)
    if backend == "auto":
        if dsn:
            logger.info("AuditStore: auto (postgres→jsonl fallback) dsn=%s", dsn_label)
            return AutoAuditStore(pg_dsn=dsn, jsonl_dir=audit_dir)
        else:
            logger.info("AuditStore: auto — no DATABASE_URL, using jsonl")
    if backend == "null":
        return NullAuditStore()
    # Default / jsonl
    logger.info("AuditStore: jsonl dir=%s", audit_dir)
    return JsonlAuditStore(audit_dir)

View File

@@ -0,0 +1,530 @@
"""
backlog_generator.py — Auto-generation of Engineering Backlog items
from Platform Priority / Risk digests.
DAARION.city | deterministic, no LLM.
Public API:
load_backlog_policy() -> Dict
generate_from_pressure_digest(digest_data, env, ...) -> GenerateResult
generate_from_risk_digest(digest_data, env, ...) -> GenerateResult
_build_item_from_rule(service, rule, context, policy, week_str, env) -> BacklogItem | None
_make_dedupe_key(prefix, week_str, env, service, category) -> str
"""
from __future__ import annotations
import datetime
import json
import logging
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
from backlog_store import (
BacklogItem, BacklogEvent, BacklogStore,
_new_id, _now_iso,
)
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
# Parsed policy cache; populated by the first load_backlog_policy() call.
_BACKLOG_POLICY_CACHE: Optional[Dict] = None
# Candidate policy-file locations, probed in order: CWD-relative first,
# then relative to this module's grandparent directory.
_BACKLOG_POLICY_PATHS = [
    Path("config/backlog_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "backlog_policy.yml",
]
def load_backlog_policy() -> Dict:
    """Load backlog_policy.yml from the first readable candidate path, caching it.

    Falls back to the built-in defaults when no candidate can be read.
    """
    global _BACKLOG_POLICY_CACHE
    if _BACKLOG_POLICY_CACHE is not None:
        return _BACKLOG_POLICY_CACHE
    for candidate in _BACKLOG_POLICY_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                parsed = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("Failed to load backlog_policy from %s: %s", candidate, exc)
            continue
        _BACKLOG_POLICY_CACHE = parsed
        return parsed
    _BACKLOG_POLICY_CACHE = _builtin_backlog_defaults()
    return _BACKLOG_POLICY_CACHE
def _reload_backlog_policy() -> None:
global _BACKLOG_POLICY_CACHE
_BACKLOG_POLICY_CACHE = None
def _builtin_backlog_defaults() -> Dict:
return {
"defaults": {"env": "prod", "retention_days": 180, "max_items_per_run": 50},
"dedupe": {
"scheme": "YYYY-WW",
"key_fields": ["service", "category", "env"],
"key_prefix": "platform_backlog",
},
"categories": {
"arch_review": {"priority": "P1", "due_days": 14},
"refactor": {"priority": "P1", "due_days": 21},
"slo_hardening": {"priority": "P2", "due_days": 30},
"cleanup_followups": {"priority": "P2", "due_days": 14},
"security": {"priority": "P0", "due_days": 7},
},
"generation": {
"weekly_from_pressure_digest": True,
"daily_from_risk_digest": False,
"rules": [
{
"name": "arch_review_required",
"when": {"pressure_requires_arch_review": True},
"create": {
"category": "arch_review",
"title_template": "[ARCH] Review required: {service}",
},
},
{
"name": "high_pressure_refactor",
"when": {
"pressure_band_in": ["high", "critical"],
"risk_band_in": ["high", "critical"],
},
"create": {
"category": "refactor",
"title_template": "[REF] Reduce pressure & risk: {service}",
},
},
{
"name": "slo_violations",
"when": {"risk_has_slo_violations": True},
"create": {
"category": "slo_hardening",
"title_template": "[SLO] Fix violations: {service}",
},
},
{
"name": "followup_backlog",
"when": {"followups_overdue_gt": 0},
"create": {
"category": "cleanup_followups",
"title_template": "[OPS] Close overdue followups: {service}",
},
},
],
},
"ownership": {
"default_owner": "oncall",
"overrides": {"gateway": "cto"},
},
"workflow": {
"statuses": ["open", "in_progress", "blocked", "done", "canceled"],
"allowed_transitions": {
"open": ["in_progress", "blocked", "canceled"],
"in_progress": ["blocked", "done", "canceled"],
"blocked": ["open", "in_progress", "canceled"],
"done": [],
"canceled": [],
},
},
}
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _now_week() -> str:
return datetime.datetime.utcnow().strftime("%Y-W%V")
def _make_dedupe_key(prefix: str, week_str: str, env: str,
service: str, category: str) -> str:
return f"{prefix}:{week_str}:{env}:{service}:{category}"
def _due_date(due_days: int) -> str:
return (
datetime.datetime.utcnow() + datetime.timedelta(days=due_days)
).strftime("%Y-%m-%d")
def _owner_for(service: str, policy: Dict) -> str:
overrides = policy.get("ownership", {}).get("overrides", {})
return overrides.get(service, policy.get("ownership", {}).get("default_owner", "oncall"))
def _match_rule(rule: Dict, ctx: Dict) -> bool:
"""
Evaluate a rule's `when` conditions against the service context dict.
All conditions must hold (AND logic).
"""
when = rule.get("when", {})
for key, expected in when.items():
if key == "pressure_requires_arch_review":
if bool(ctx.get("pressure_requires_arch_review")) is not bool(expected):
return False
elif key == "pressure_band_in":
if ctx.get("pressure_band") not in expected:
return False
elif key == "risk_band_in":
if ctx.get("risk_band") not in expected:
return False
elif key == "risk_has_slo_violations":
slo_v = int(ctx.get("slo_violations", 0))
if (slo_v > 0) is not bool(expected):
return False
elif key == "followups_overdue_gt":
overdue = int(ctx.get("followups_overdue", 0))
if not (overdue > int(expected)):
return False
return True
def _build_description(service: str, ctx: Dict, rule: Dict) -> str:
"""Generate deterministic bullet-list description from context."""
lines = [f"Auto-generated by Engineering Backlog Bridge — rule: {rule.get('name', '?')}.", ""]
p_score = ctx.get("pressure_score")
p_band = ctx.get("pressure_band")
r_score = ctx.get("risk_score")
r_band = ctx.get("risk_band")
r_delta = ctx.get("risk_delta_24h")
if p_score is not None:
lines.append(f"- Architecture Pressure: {p_score} ({p_band})")
if r_score is not None:
lines.append(f"- Risk Score: {r_score} ({r_band})"
+ (f" Δ24h: +{r_delta}" if r_delta else ""))
slo_v = int(ctx.get("slo_violations", 0))
if slo_v:
lines.append(f"- Active SLO violations: {slo_v}")
overdue = int(ctx.get("followups_overdue", 0))
if overdue:
lines.append(f"- Overdue follow-ups: {overdue}")
if ctx.get("signals_summary"):
lines.append(f"- Pressure signals: {'; '.join(ctx['signals_summary'][:3])}")
if ctx.get("risk_reasons"):
lines.append(f"- Risk signals: {'; '.join(ctx['risk_reasons'][:3])}")
return "\n".join(lines)
def _build_item_from_rule(
    service: str,
    rule: Dict,
    ctx: Dict,
    policy: Dict,
    week_str: str,
    env: str,
) -> Optional[BacklogItem]:
    """Materialise a BacklogItem for *service* from a matched generation rule."""
    create_cfg = rule.get("create", {})
    category = create_cfg.get("category", "arch_review")
    title = create_cfg.get("title_template", "[BACKLOG] {service}").format(service=service)
    cat_cfg = policy.get("categories", {}).get(category, {})
    key_prefix = policy.get("dedupe", {}).get("key_prefix", "platform_backlog")
    return BacklogItem(
        id=_new_id("bl"),
        created_at=_now_iso(),
        updated_at=_now_iso(),
        env=env,
        service=service,
        category=category,
        title=title,
        description=_build_description(service, ctx, rule),
        priority=cat_cfg.get("priority", "P2"),
        status="open",
        owner=_owner_for(service, policy),
        due_date=_due_date(int(cat_cfg.get("due_days", 14))),
        source="digest",
        dedupe_key=_make_dedupe_key(key_prefix, week_str, env, service, category),
        # Evidence refs are copied so later mutation of ctx cannot leak in.
        evidence_refs=dict(ctx.get("evidence_refs") or {}),
        tags=["auto", f"week:{week_str}", f"rule:{rule.get('name', '?')}"],
        meta={
            "rule_name": rule.get("name", ""),
            "pressure_score": ctx.get("pressure_score"),
            "risk_score": ctx.get("risk_score"),
            "week": week_str,
        },
    )
# ─── Context builder from digest ──────────────────────────────────────────────
def _build_service_context(
service_entry: Dict,
risk_entry: Optional[Dict] = None,
) -> Dict:
"""
Build a unified service context dict from a platform_priority_digest
top_pressure_services entry plus an optional risk_digest service entry.
"""
p_score = service_entry.get("score")
p_band = service_entry.get("band", "low")
requires_review = bool(service_entry.get("requires_arch_review", False))
signals_summary = service_entry.get("signals_summary", [])
comp = service_entry.get("components", {})
followups_overdue = int(comp.get("followups_overdue", 0))
evidence_refs = service_entry.get("evidence_refs") or {}
ctx: Dict[str, Any] = {
"pressure_score": p_score,
"pressure_band": p_band,
"pressure_requires_arch_review": requires_review,
"signals_summary": signals_summary,
"followups_overdue": followups_overdue,
"evidence_refs": dict(evidence_refs),
}
# Merge risk data
if risk_entry:
ctx["risk_score"] = risk_entry.get("score")
ctx["risk_band"] = risk_entry.get("band", "low")
ctx["risk_delta_24h"] = (risk_entry.get("trend") or {}).get("delta_24h")
slo_comp = (risk_entry.get("components") or {}).get("slo") or {}
ctx["slo_violations"] = int(slo_comp.get("violations", 0))
ctx["risk_reasons"] = risk_entry.get("reasons", [])
# Merge evidence_refs from risk
risk_attrs = risk_entry.get("attribution") or {}
risk_erefs = risk_attrs.get("evidence_refs") or {}
for k, v in risk_erefs.items():
if k not in ctx["evidence_refs"]:
ctx["evidence_refs"][k] = v
else:
ctx.setdefault("risk_band", service_entry.get("risk_band", "low"))
ctx.setdefault("risk_score", service_entry.get("risk_score"))
ctx.setdefault("risk_delta_24h", service_entry.get("risk_delta_24h"))
ctx.setdefault("slo_violations", 0)
return ctx
# ─── Main generation function ─────────────────────────────────────────────────
def generate_from_pressure_digest(
    digest_data: Dict,
    env: str = "prod",
    *,
    store: Optional[BacklogStore] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
    risk_digest_data: Optional[Dict] = None,
) -> Dict:
    """
    Generate backlog items from a weekly_platform_priority_digest JSON output.
    Args:
        digest_data: JSON dict from platform_priority_digest (top_pressure_services list)
        env: deployment environment
        store: backlog store (loaded from factory if None)
        policy: backlog_policy (loaded if None)
        week_str: override ISO week (defaults to digest's "week" field or current)
        risk_digest_data: optional daily risk digest JSON to enrich context
    Returns GenerateResult dict: created, updated, skipped, items

    Behaviour notes:
        - At most one item per (service, category) per run; at most
          max_items_per_run items written in total.
        - Per-rule failures are logged and counted in `skipped`; the run
          never aborts on a single bad rule/service.
    """
    if policy is None:
        policy = load_backlog_policy()
    if store is None:
        from backlog_store import get_backlog_store
        store = get_backlog_store()
    gen_cfg = policy.get("generation", {})
    # Feature flag: weekly generation can be disabled wholesale in the policy.
    if not gen_cfg.get("weekly_from_pressure_digest", True):
        return {"created": 0, "updated": 0, "skipped": 0, "items": [],
                "skipped_reason": "weekly_from_pressure_digest disabled in policy"}
    effective_week = week_str or digest_data.get("week") or _now_week()
    max_items = int(policy.get("defaults", {}).get("max_items_per_run", 50))
    rules = gen_cfg.get("rules", [])
    # Build risk_by_service lookup
    risk_by_service: Dict[str, Dict] = {}
    if risk_digest_data:
        for rs in (risk_digest_data.get("top_services") or []):
            svc = rs.get("service", "")
            if svc:
                risk_by_service[svc] = rs
    created = updated = skipped = 0
    items_out: List[Dict] = []
    total_written = 0
    for svc_entry in (digest_data.get("top_pressure_services") or []):
        service = svc_entry.get("service", "")
        if not service:
            continue  # nameless entry — nothing to key a backlog item on
        if total_written >= max_items:
            skipped += 1  # per-run item budget exhausted
            continue
        ctx = _build_service_context(svc_entry, risk_by_service.get(service))
        # Evaluate rules — one item per matched rule
        matched_categories: set = set()
        for rule in rules:
            try:
                if not _match_rule(rule, ctx):
                    continue
                category = rule.get("create", {}).get("category", "")
                if category in matched_categories:
                    continue  # dedupe same category within a service
                matched_categories.add(category)
                item = _build_item_from_rule(service, rule, ctx, policy,
                                             effective_week, env)
                if item is None:
                    continue
                # Upsert keys on dedupe_key, so a re-run within the same
                # week updates rather than duplicates.
                result = store.upsert(item)
                action = result["action"]
                upserted = result["item"]
                # Emit event
                ev_type = "created" if action == "created" else "auto_update"
                store.add_event(BacklogEvent(
                    id=_new_id("ev"),
                    item_id=upserted.id,
                    ts=_now_iso(),
                    type=ev_type,
                    message=f"Auto-generated by weekly digest — rule: {rule.get('name', '?')}",
                    actor="backlog_generator",
                    meta={"week": effective_week, "rule": rule.get("name", "")},
                ))
                if action == "created":
                    created += 1
                else:
                    updated += 1
                total_written += 1
                items_out.append({
                    "id": upserted.id,
                    "service": service,
                    "category": upserted.category,
                    "status": upserted.status,
                    "action": action,
                })
            except Exception as e:
                # One bad rule/service must not abort the whole run.
                logger.warning("backlog_generator: skip rule %s for %s: %s",
                               rule.get("name"), service, e)
                skipped += 1
    return {
        "created": created,
        "updated": updated,
        "skipped": skipped,
        "items": items_out,
        "week": effective_week,
    }
def generate_from_risk_digest(
    risk_digest_data: Dict,
    env: str = "prod",
    *,
    store: Optional[BacklogStore] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
) -> Dict:
    """
    Optional: generate items from a daily risk digest JSON.
    Only active when generation.daily_from_risk_digest=true.

    Fixes vs. previous version (behaviour now mirrors
    generate_from_pressure_digest):
      - nameless service entries are ignored silently instead of inflating
        the `skipped` counter;
      - a components dict with an explicit ``"slo": None`` no longer raises
        AttributeError when reading violations.

    Returns GenerateResult dict: created, updated, skipped, items, week.
    """
    if policy is None:
        policy = load_backlog_policy()
    gen_cfg = policy.get("generation", {})
    if not gen_cfg.get("daily_from_risk_digest", False):
        return {"created": 0, "updated": 0, "skipped": 0, "items": [],
                "skipped_reason": "daily_from_risk_digest disabled in policy"}
    if store is None:
        from backlog_store import get_backlog_store
        store = get_backlog_store()
    # Convert risk digest top_services into pressure-like entries
    effective_week = week_str or _now_week()
    max_items = int(policy.get("defaults", {}).get("max_items_per_run", 50))
    rules = gen_cfg.get("rules", [])
    created = updated = skipped = 0
    items_out: List[Dict] = []
    total_written = 0
    for svc_entry in (risk_digest_data.get("top_services") or []):
        service = svc_entry.get("service", "")
        if not service:
            continue  # nameless entry — silently ignored (parity with pressure path)
        if total_written >= max_items:
            skipped += 1  # per-run item budget exhausted
            continue
        # Build a minimal pressure context from risk data
        components = svc_entry.get("components") or {}
        slo_comp = components.get("slo") or {}
        ctx: Dict = {
            "pressure_score": None,
            "pressure_band": "low",
            "pressure_requires_arch_review": False,
            "signals_summary": [],
            "followups_overdue": 0,
            "risk_score": svc_entry.get("score"),
            "risk_band": svc_entry.get("band", "low"),
            "risk_delta_24h": (svc_entry.get("trend") or {}).get("delta_24h"),
            "slo_violations": slo_comp.get("violations", 0),
            "risk_reasons": svc_entry.get("reasons", []),
            "evidence_refs": (svc_entry.get("attribution") or {}).get("evidence_refs") or {},
        }
        matched_categories: set = set()
        for rule in rules:
            try:
                if not _match_rule(rule, ctx):
                    continue
                category = rule.get("create", {}).get("category", "")
                if category in matched_categories:
                    continue  # one item per category per service
                matched_categories.add(category)
                item = _build_item_from_rule(service, rule, ctx, policy,
                                             effective_week, env)
                if item is None:
                    continue
                result = store.upsert(item)
                action = result["action"]
                upserted = result["item"]
                store.add_event(BacklogEvent(
                    id=_new_id("ev"),
                    item_id=upserted.id,
                    ts=_now_iso(),
                    type="created" if action == "created" else "auto_update",
                    message="Auto-generated from daily risk digest",
                    actor="backlog_generator",
                    meta={"week": effective_week},
                ))
                if action == "created":
                    created += 1
                else:
                    updated += 1
                total_written += 1
                items_out.append({
                    "id": upserted.id, "service": service,
                    "category": upserted.category, "status": upserted.status,
                    "action": action,
                })
            except Exception as e:
                # One bad rule/service must not abort the whole run.
                logger.warning("backlog_generator(risk): skip rule %s for %s: %s",
                               rule.get("name"), service, e)
                skipped += 1
    return {"created": created, "updated": updated, "skipped": skipped,
            "items": items_out, "week": effective_week}

View File

@@ -0,0 +1,705 @@
"""
backlog_store.py — Engineering Backlog Storage Layer.
DAARION.city | deterministic, no LLM.
Backends:
MemoryBacklogStore — in-process (tests + fallback)
JsonlBacklogStore — filesystem append-only JSONL (MVP)
PostgresBacklogStore — Postgres primary (psycopg2 sync)
AutoBacklogStore — Postgres → JSONL → Memory cascade
Factory: get_backlog_store() → respects BACKLOG_BACKEND env var.
BACKLOG_BACKEND: auto | postgres | jsonl | memory | null
"""
from __future__ import annotations
import datetime
import json
import logging
import os
import threading
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Data model ───────────────────────────────────────────────────────────────
# Closed vocabularies for item fields.
# NOTE(review): neither set is referenced in this chunk — presumably used for
# validation elsewhere in the module; confirm before removing.
_VALID_STATUSES = {"open", "in_progress", "blocked", "done", "canceled"}
_VALID_PRIORITIES = {"P0", "P1", "P2", "P3"}
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _new_id(prefix: str = "bl") -> str:
return f"{prefix}_{uuid.uuid4().hex[:12]}"
@dataclass
class BacklogItem:
    """One engineering-backlog entry."""
    id: str
    created_at: str
    updated_at: str
    env: str
    service: str
    # arch_review / refactor / slo_hardening / cleanup_followups / security
    category: str
    title: str
    description: str
    priority: str   # P0..P3
    status: str     # open / in_progress / blocked / done / canceled
    owner: str
    due_date: str   # YYYY-MM-DD
    source: str     # risk | pressure | digest | manual
    dedupe_key: str
    evidence_refs: Dict = field(default_factory=dict)  # alerts, incidents, release_checks, ...
    tags: List[str] = field(default_factory=list)
    meta: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Plain-dict snapshot (asdict recurses into nested dataclasses)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict) -> "BacklogItem":
        """Rebuild an item from a (possibly partial) dict, filling sane defaults."""
        def pick(key, make_default):
            # Lazily build id/timestamp defaults only when the key is missing.
            return d[key] if key in d else make_default()

        return cls(
            id=pick("id", _new_id),
            created_at=pick("created_at", _now_iso),
            updated_at=pick("updated_at", _now_iso),
            env=d.get("env", "prod"),
            service=d.get("service", ""),
            category=d.get("category", ""),
            title=d.get("title", ""),
            description=d.get("description", ""),
            priority=d.get("priority", "P2"),
            status=d.get("status", "open"),
            owner=d.get("owner", "oncall"),
            due_date=d.get("due_date", ""),
            source=d.get("source", "manual"),
            dedupe_key=d.get("dedupe_key", ""),
            evidence_refs=d.get("evidence_refs") or {},
            tags=d.get("tags") or [],
            meta=d.get("meta") or {},
        )
@dataclass
class BacklogEvent:
    """Audit-trail record attached to one backlog item."""
    id: str
    item_id: str
    ts: str
    type: str   # created | status_change | comment | auto_update
    message: str
    actor: str
    meta: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Plain-dict snapshot."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict) -> "BacklogEvent":
        """Rebuild an event from a dict, defaulting any missing field."""
        def pick(key, make_default):
            # Lazily build id/timestamp defaults only when the key is missing.
            return d[key] if key in d else make_default()

        return cls(
            id=pick("id", lambda: _new_id("ev")),
            item_id=d.get("item_id", ""),
            ts=pick("ts", _now_iso),
            type=d.get("type", "comment"),
            message=d.get("message", ""),
            actor=d.get("actor", "system"),
            meta=d.get("meta") or {},
        )
# ─── Abstract base ────────────────────────────────────────────────────────────
class BacklogStore(ABC):
    """Storage contract for backlog items, plus shared upsert/dashboard logic."""

    # ── abstract persistence primitives ───────────────────────────────────
    @abstractmethod
    def create(self, item: BacklogItem) -> BacklogItem:
        """Persist a brand-new item."""

    @abstractmethod
    def get(self, item_id: str) -> Optional[BacklogItem]:
        """Fetch by primary id, or None."""

    @abstractmethod
    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        """Fetch by dedupe key, or None."""

    @abstractmethod
    def update(self, item: BacklogItem) -> BacklogItem:
        """Persist changes to an existing item."""

    @abstractmethod
    def list_items(self, filters: Optional[Dict] = None, limit: int = 50,
                   offset: int = 0) -> List[BacklogItem]:
        """Filtered, paginated listing."""

    @abstractmethod
    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        """Append a history event."""

    @abstractmethod
    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """History for one item."""

    @abstractmethod
    def cleanup(self, retention_days: int = 180) -> int:
        """Purge old closed items; returns the number removed."""

    # ── shared behaviour ──────────────────────────────────────────────────
    def upsert(self, item: BacklogItem) -> Dict:
        """Create or update by dedupe_key. Returns {"action": created|updated, "item": ...}"""
        existing = self.get_by_dedupe_key(item.dedupe_key)
        if existing is None:
            return {"action": "created", "item": self.create(item)}
        # Refresh generated fields; status/owner are operator-controlled and kept.
        existing.title = item.title
        existing.description = item.description
        existing.evidence_refs = item.evidence_refs
        existing.tags = list(set(existing.tags + item.tags))
        existing.meta.update(item.meta or {})
        existing.updated_at = _now_iso()
        return {"action": "updated", "item": self.update(existing)}

    def dashboard(self, env: str = "prod") -> Dict:
        """Return aggregated backlog counts for *env* (first 1000 items)."""
        snapshot = self.list_items({"env": env}, limit=1000)
        today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
        by_status: Dict[str, int] = {}
        by_priority: Dict[str, int] = {}
        by_category: Dict[str, int] = {}
        by_service: Dict[str, int] = {}
        overdue: List[Dict] = []
        for entry in snapshot:
            by_status[entry.status] = by_status.get(entry.status, 0) + 1
            by_priority[entry.priority] = by_priority.get(entry.priority, 0) + 1
            by_category[entry.category] = by_category.get(entry.category, 0) + 1
            by_service[entry.service] = by_service.get(entry.service, 0) + 1
            still_open = entry.status not in ("done", "canceled")
            if still_open and entry.due_date and entry.due_date < today:
                overdue.append({
                    "id": entry.id, "service": entry.service,
                    "title": entry.title, "priority": entry.priority,
                    "due_date": entry.due_date, "owner": entry.owner,
                })
        overdue.sort(key=lambda row: (row["priority"], row["due_date"]))
        ranked = sorted(by_service.items(), key=lambda kv: kv[1], reverse=True)[:10]
        return {
            "env": env,
            "total": len(snapshot),
            "status_counts": by_status,
            "priority_counts": by_priority,
            "category_counts": by_category,
            "overdue": overdue[:20],
            "overdue_count": len(overdue),
            "top_services": [{"service": svc, "count": n} for svc, n in ranked],
        }
# ─── Workflow helper ──────────────────────────────────────────────────────────
def validate_transition(current_status: str, new_status: str,
policy: Optional[Dict] = None) -> bool:
"""Return True if transition is allowed, False otherwise."""
defaults = _builtin_workflow()
if policy is None:
allowed = defaults
else:
allowed = policy.get("workflow", {}).get("allowed_transitions", defaults)
return new_status in allowed.get(current_status, [])
def _builtin_workflow() -> Dict:
return {
"open": ["in_progress", "blocked", "canceled"],
"in_progress": ["blocked", "done", "canceled"],
"blocked": ["open", "in_progress", "canceled"],
"done": [],
"canceled": [],
}
# ─── Memory backend ───────────────────────────────────────────────────────────
class MemoryBacklogStore(BacklogStore):
    """Thread-safe in-process store; used for tests and as a last-resort fallback."""

    def __init__(self) -> None:
        self._items: Dict[str, BacklogItem] = {}
        self._events: List[BacklogEvent] = []
        self._lock = threading.Lock()

    def create(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._items[item.id] = item
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        with self._lock:
            return self._items.get(item_id)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        with self._lock:
            return next(
                (candidate for candidate in self._items.values()
                 if candidate.dedupe_key == key),
                None,
            )

    def update(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._items[item.id] = item
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        criteria = filters or {}
        with self._lock:
            snapshot = list(self._items.values())
        # Filtering/sorting happens outside the lock on a snapshot copy.
        matched = _apply_filters(snapshot, criteria)
        matched.sort(key=lambda it: (it.priority, it.due_date or "9999"))
        return matched[offset: offset + limit]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        with self._lock:
            self._events.append(event)
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        with self._lock:
            matching = [ev for ev in self._events if ev.item_id == item_id]
        return matching[-limit:]

    def cleanup(self, retention_days: int = 180) -> int:
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._lock:
            stale = [
                item_id for item_id, candidate in self._items.items()
                if candidate.status in ("done", "canceled")
                and candidate.updated_at < cutoff
            ]
            for item_id in stale:
                self._items.pop(item_id)
        return len(stale)
# ─── JSONL backend ────────────────────────────────────────────────────────────
# Default on-disk locations, relative to the process working directory.
_JSONL_ITEMS = "ops/backlog/items.jsonl"
_JSONL_EVENTS = "ops/backlog/events.jsonl"
# NOTE(review): not referenced in this chunk — presumably a scan bound for
# loaders; confirm it is enforced elsewhere.
_JSONL_CACHE_MAX = 50_000  # lines to scan
class JsonlBacklogStore(BacklogStore):
    """
    Append-only JSONL filesystem store.
    Last-write-wins: items keyed by id, updates appended (read returns latest).

    Every read re-scans the items file, so reads are O(file size); acceptable
    for the MVP volumes this backend targets.
    """
    def __init__(
        self,
        items_path: str = _JSONL_ITEMS,
        events_path: str = _JSONL_EVENTS,
    ) -> None:
        self._items_path = Path(items_path)
        self._events_path = Path(events_path)
        self._lock = threading.Lock()
        # Create parent directories eagerly so first append cannot fail on them.
        self._items_path.parent.mkdir(parents=True, exist_ok=True)
        self._events_path.parent.mkdir(parents=True, exist_ok=True)

    def _load_items(self) -> Dict[str, BacklogItem]:
        """Scan file, last-write-wins per id."""
        items: Dict[str, BacklogItem] = {}
        if not self._items_path.exists():
            return items
        try:
            with open(self._items_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        # Later lines for the same id overwrite earlier ones.
                        items[d["id"]] = BacklogItem.from_dict(d)
                    except Exception:
                        pass  # skip corrupt/partial lines rather than fail the load
        except Exception as e:
            logger.warning("JsonlBacklogStore: load_items error: %s", e)
        return items

    def _append_item(self, item: BacklogItem) -> None:
        # default=str keeps non-JSON-native values (e.g. dates) serialisable.
        with open(self._items_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(item.to_dict(), default=str) + "\n")

    def create(self, item: BacklogItem) -> BacklogItem:
        """Append the new item; no uniqueness check is performed here."""
        with self._lock:
            self._append_item(item)
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        """Full scan; returns the latest version of the item or None."""
        with self._lock:
            items = self._load_items()
        return items.get(item_id)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        """Full scan; returns the first item whose dedupe_key matches, or None."""
        with self._lock:
            items = self._load_items()
            for it in items.values():
                if it.dedupe_key == key:
                    return it
        return None

    def update(self, item: BacklogItem) -> BacklogItem:
        """Append a newer version of the item (last-write-wins on read)."""
        item.updated_at = _now_iso()
        with self._lock:
            self._append_item(item)
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        """Filtered, sorted (priority, due_date) page of the latest item versions."""
        with self._lock:
            items = list(self._load_items().values())
        items = _apply_filters(items, filters or {})
        # Items without a due date sort last ("9999" > any YYYY-MM-DD).
        items.sort(key=lambda x: (x.priority, x.due_date or "9999"))
        return items[offset: offset + limit]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        """Append one event line; recreates the parent dir if it disappeared."""
        with self._lock:
            if not self._events_path.parent.exists():
                self._events_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self._events_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(event.to_dict(), default=str) + "\n")
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """Scan the events file; returns the last *limit* events for the item.

        NOTE: intentionally lock-free — appends are line-atomic enough for
        this best-effort read path.
        """
        events: List[BacklogEvent] = []
        if not self._events_path.exists():
            return events
        try:
            with open(self._events_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        if d.get("item_id") == item_id:
                            events.append(BacklogEvent.from_dict(d))
                    except Exception:
                        pass  # tolerate corrupt lines
        except Exception as e:
            logger.warning("JsonlBacklogStore: get_events error: %s", e)
        return events[-limit:]

    def cleanup(self, retention_days: int = 180) -> int:
        """Compact the items file, dropping closed items older than the cutoff.

        Returns the number of items removed. This is the only operation that
        rewrites (rather than appends to) the items file.
        """
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._lock:
            items = self._load_items()
            to_keep = {
                iid: it for iid, it in items.items()
                if not (it.status in ("done", "canceled") and it.updated_at < cutoff)
            }
            deleted = len(items) - len(to_keep)
            if deleted:
                # Rewrite the file
                with open(self._items_path, "w", encoding="utf-8") as f:
                    for it in to_keep.values():
                        f.write(json.dumps(it.to_dict(), default=str) + "\n")
        return deleted
# ─── Postgres backend ─────────────────────────────────────────────────────────
class PostgresBacklogStore(BacklogStore):
    """
    Postgres-backed store using psycopg2 (sync).
    Tables: backlog_items, backlog_events (created by migration script).

    Connection handling fix: psycopg2's ``with conn`` block only scopes the
    transaction — it does NOT close the connection — so the previous code
    leaked one connection per call. Every statement now runs through
    _cursor(), which commits on success, rolls back on error, and always
    closes the underlying connection.
    """
    def __init__(self, dsn: Optional[str] = None) -> None:
        # DSN resolution: explicit arg → BACKLOG_POSTGRES_DSN → POSTGRES_DSN → local default.
        self._dsn = dsn or os.environ.get(
            "BACKLOG_POSTGRES_DSN",
            os.environ.get("POSTGRES_DSN", "postgresql://localhost/daarion")
        )
        self._lock = threading.Lock()

    def _conn(self):
        """Open a new raw connection (AutoBacklogStore probes health via this)."""
        import psycopg2
        import psycopg2.extras
        return psycopg2.connect(self._dsn)

    def _cursor(self):
        """Context manager yielding a cursor; commits/rolls back the
        transaction and always closes the connection afterwards."""
        from contextlib import contextmanager

        @contextmanager
        def _managed():
            conn = self._conn()
            try:
                with conn:  # transaction scope only (commit/rollback)
                    with conn.cursor() as cur:
                        yield cur
            finally:
                conn.close()  # `with conn` does not close — do it explicitly

        return _managed()

    def create(self, item: BacklogItem) -> BacklogItem:
        """Insert *item*; an existing row with the same dedupe_key is kept as-is."""
        sql = """
            INSERT INTO backlog_items
            (id, created_at, updated_at, env, service, category, title, description,
             priority, status, owner, due_date, source, dedupe_key,
             evidence_refs, tags, meta)
            VALUES
            (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON CONFLICT (dedupe_key) DO NOTHING
        """
        with self._cursor() as cur:
            cur.execute(sql, (
                item.id, item.created_at, item.updated_at,
                item.env, item.service, item.category,
                item.title, item.description, item.priority,
                item.status, item.owner, item.due_date or None,
                item.source, item.dedupe_key,
                json.dumps(item.evidence_refs),
                json.dumps(item.tags),
                json.dumps(item.meta),
            ))
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        """Fetch one item by primary key, or None."""
        with self._cursor() as cur:
            cur.execute("SELECT * FROM backlog_items WHERE id=%s", (item_id,))
            row = cur.fetchone()
            if row:
                return self._row_to_item(row, cur.description)
        return None

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        """Fetch one item by its dedupe key, or None."""
        with self._cursor() as cur:
            cur.execute("SELECT * FROM backlog_items WHERE dedupe_key=%s", (key,))
            row = cur.fetchone()
            if row:
                return self._row_to_item(row, cur.description)
        return None

    def update(self, item: BacklogItem) -> BacklogItem:
        """Persist the mutable fields of *item*, refreshing updated_at."""
        item.updated_at = _now_iso()
        sql = """
            UPDATE backlog_items SET
              updated_at=%s, title=%s, description=%s, priority=%s,
              status=%s, owner=%s, due_date=%s, evidence_refs=%s, tags=%s, meta=%s
            WHERE id=%s
        """
        with self._cursor() as cur:
            cur.execute(sql, (
                item.updated_at, item.title, item.description,
                item.priority, item.status, item.owner,
                item.due_date or None,
                json.dumps(item.evidence_refs),
                json.dumps(item.tags),
                json.dumps(item.meta),
                item.id,
            ))
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        """Return filtered items ordered by priority, then due date (NULLs last)."""
        filters = filters or {}
        where, params = _pg_where_clause(filters)
        sql = f"""
            SELECT * FROM backlog_items {where}
            ORDER BY priority ASC, due_date ASC NULLS LAST
            LIMIT %s OFFSET %s
        """
        with self._cursor() as cur:
            cur.execute(sql, params + [limit, offset])
            rows = cur.fetchall()
            desc = cur.description
        return [self._row_to_item(r, desc) for r in rows]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        """Insert one event row for an item."""
        sql = """
            INSERT INTO backlog_events (id, item_id, ts, type, message, actor, meta)
            VALUES (%s,%s,%s,%s,%s,%s,%s)
        """
        with self._cursor() as cur:
            cur.execute(sql, (
                event.id, event.item_id, event.ts,
                event.type, event.message, event.actor,
                json.dumps(event.meta),
            ))
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """Return up to *limit* newest events for *item_id*, most recent first."""
        with self._cursor() as cur:
            cur.execute(
                "SELECT * FROM backlog_events WHERE item_id=%s ORDER BY ts DESC LIMIT %s",
                (item_id, limit)
            )
            rows = cur.fetchall()
            desc = cur.description
        return [self._row_to_event(r, desc) for r in rows]

    def cleanup(self, retention_days: int = 180) -> int:
        """Delete done/canceled items older than the retention window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._cursor() as cur:
            cur.execute(
                """DELETE FROM backlog_items
                   WHERE status IN ('done','canceled') AND updated_at < %s""",
                (cutoff,)
            )
            return cur.rowcount

    @staticmethod
    def _row_to_item(row, description) -> BacklogItem:
        """Map a DB row to BacklogItem, decoding the JSON-encoded columns."""
        d = {col.name: val for col, val in zip(description, row)}
        for json_key in ("evidence_refs", "tags", "meta"):
            v = d.get(json_key)
            if isinstance(v, str):
                try:
                    d[json_key] = json.loads(v)
                except Exception:
                    d[json_key] = {} if json_key != "tags" else []
        return BacklogItem.from_dict(d)

    @staticmethod
    def _row_to_event(row, description) -> BacklogEvent:
        """Map a DB row to BacklogEvent, decoding the JSON meta column."""
        d = {col.name: val for col, val in zip(description, row)}
        if isinstance(d.get("meta"), str):
            try:
                d["meta"] = json.loads(d["meta"])
            except Exception:
                d["meta"] = {}
        return BacklogEvent.from_dict(d)
def _pg_where_clause(filters: Dict):
clauses, params = [], []
if filters.get("env"):
clauses.append("env=%s"); params.append(filters["env"])
if filters.get("service"):
clauses.append("service=%s"); params.append(filters["service"])
if filters.get("status"):
if isinstance(filters["status"], list):
ph = ",".join(["%s"] * len(filters["status"]))
clauses.append(f"status IN ({ph})"); params.extend(filters["status"])
else:
clauses.append("status=%s"); params.append(filters["status"])
if filters.get("owner"):
clauses.append("owner=%s"); params.append(filters["owner"])
if filters.get("category"):
clauses.append("category=%s"); params.append(filters["category"])
if filters.get("due_before"):
clauses.append("due_date < %s"); params.append(filters["due_before"])
return ("WHERE " + " AND ".join(clauses)) if clauses else "", params
# ─── Null backend ─────────────────────────────────────────────────────────────
class NullBacklogStore(BacklogStore):
    """No-op backend: accepts writes, returns empty reads (storage disabled)."""

    def create(self, item):
        return item

    def get(self, item_id):
        return None

    def get_by_dedupe_key(self, key):
        return None

    def update(self, item):
        return item

    def list_items(self, filters=None, limit=50, offset=0):
        return []

    def add_event(self, event):
        return event

    def get_events(self, item_id, limit=50):
        return []

    def cleanup(self, retention_days=180):
        return 0
# ─── Auto backend (Postgres → JSONL fallback) ─────────────────────────────────
class AutoBacklogStore(BacklogStore):
    """Postgres primary with JSONL fallback. Retries Postgres after 5 min.

    Every public method delegates to _backend(), which returns the active
    Postgres store when available and otherwise the JSONL store, re-probing
    Postgres at most once per _RETRY_SEC.

    NOTE(review): _backend() does the re-probe without taking self._lock, so
    two threads may call _try_init_pg() concurrently — confirm that is
    acceptable before relying on this under heavy concurrency.
    """
    # Seconds to wait after a Postgres failure before probing it again.
    _RETRY_SEC = 300
    def __init__(
        self,
        postgres_dsn: Optional[str] = None,
        jsonl_items: str = _JSONL_ITEMS,
        jsonl_events: str = _JSONL_EVENTS,
    ) -> None:
        self._pg: Optional[PostgresBacklogStore] = None
        self._jsonl = JsonlBacklogStore(jsonl_items, jsonl_events)
        self._dsn = postgres_dsn
        # Monotonic-ish wall-clock time of the last Postgres failure; None = healthy.
        self._pg_failed_at: Optional[float] = None
        self._lock = threading.Lock()
        self._try_init_pg()
    def _try_init_pg(self) -> None:
        """Probe Postgres; on failure, record the time and fall back to JSONL."""
        try:
            self._pg = PostgresBacklogStore(self._dsn)
            self._pg._conn().close()  # test connection
            self._pg_failed_at = None
            logger.info("AutoBacklogStore: Postgres backend active")
        except Exception as e:
            logger.warning("AutoBacklogStore: Postgres unavailable, using JSONL: %s", e)
            self._pg = None
            import time
            self._pg_failed_at = time.time()
    def _backend(self) -> BacklogStore:
        """Return the active store, re-probing Postgres after the retry window."""
        if self._pg is not None:
            return self._pg
        import time
        if (self._pg_failed_at is None
                or time.time() - self._pg_failed_at >= self._RETRY_SEC):
            self._try_init_pg()
        return self._pg if self._pg is not None else self._jsonl
    # Pure delegation — the active backend is resolved per call.
    def create(self, item): return self._backend().create(item)
    def get(self, item_id): return self._backend().get(item_id)
    def get_by_dedupe_key(self, key): return self._backend().get_by_dedupe_key(key)
    def update(self, item): return self._backend().update(item)
    def list_items(self, filters=None, limit=50, offset=0):
        return self._backend().list_items(filters, limit, offset)
    def add_event(self, event): return self._backend().add_event(event)
    def get_events(self, item_id, limit=50): return self._backend().get_events(item_id, limit)
    def cleanup(self, retention_days=180): return self._backend().cleanup(retention_days)
# ─── Filters helper ───────────────────────────────────────────────────────────
def _apply_filters(items: List[BacklogItem], filters: Dict) -> List[BacklogItem]:
result = []
for it in items:
if filters.get("env") and it.env != filters["env"]:
continue
if filters.get("service") and it.service != filters["service"]:
continue
if filters.get("status"):
statuses = filters["status"] if isinstance(filters["status"], list) else [filters["status"]]
if it.status not in statuses:
continue
if filters.get("owner") and it.owner != filters["owner"]:
continue
if filters.get("category") and it.category != filters["category"]:
continue
if filters.get("due_before") and it.due_date and it.due_date >= filters["due_before"]:
continue
result.append(it)
return result
# ─── Factory ──────────────────────────────────────────────────────────────────
# Process-wide singleton, guarded by _STORE_LOCK.
_STORE_INSTANCE: Optional[BacklogStore] = None
_STORE_LOCK = threading.Lock()
def get_backlog_store() -> BacklogStore:
    """Return the process-wide BacklogStore singleton.

    Backend is chosen once from BACKLOG_BACKEND (memory | jsonl | postgres |
    null); anything else — including the default "auto" — yields
    AutoBacklogStore (Postgres primary with JSONL fallback).
    """
    global _STORE_INSTANCE
    with _STORE_LOCK:
        if _STORE_INSTANCE is None:
            choice = os.environ.get("BACKLOG_BACKEND", "auto").lower()
            factories = {
                "memory": MemoryBacklogStore,
                "jsonl": JsonlBacklogStore,
                "postgres": PostgresBacklogStore,
                "null": NullBacklogStore,
            }
            _STORE_INSTANCE = factories.get(choice, AutoBacklogStore)()
            logger.info("backlog_store: using %s backend",
                        type(_STORE_INSTANCE).__name__)
        return _STORE_INSTANCE
def _reset_store_for_tests() -> None:
    """Drop the cached singleton so tests can force backend re-selection."""
    global _STORE_INSTANCE
    with _STORE_LOCK:
        _STORE_INSTANCE = None

View File

@@ -0,0 +1,595 @@
"""
Cost & Resource Analyzer (FinOps MVP)
Reads audit events from AuditStore and computes:
- Aggregated cost_units by tool/agent/workspace/status
- Top spenders (tools, agents, users)
- Anomalies (cost spikes, error rate spikes)
- Cost model weights
"cost_units" = cost_per_call(tool) + duration_ms * cost_per_ms(tool)
These are relative units, not real dollars.
No payload access — all inputs are aggregation parameters only.
"""
from __future__ import annotations
import datetime
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Config loader ────────────────────────────────────────────────────────────
# Memoized parse of cost_weights.yml; None means "not loaded yet".
_weights_cache: Optional[Dict] = None
# Config lives at <repo>/config/cost_weights.yml; REPO_ROOT overrides the
# default of three directories above this module.
_WEIGHTS_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "cost_weights.yml",
)
def _load_weights() -> Dict:
    """Load and memoize cost_weights.yml; return {} when missing/unparsable.

    A failed load is cached as {} too — call reload_cost_weights() to retry.
    """
    global _weights_cache
    if _weights_cache is not None:
        return _weights_cache
    try:
        import yaml
        with open(_WEIGHTS_PATH, "r") as f:
            _weights_cache = yaml.safe_load(f) or {}
    except Exception as e:
        logger.warning("cost_weights.yml not loaded: %s", e)
        _weights_cache = {}
    return _weights_cache
def reload_cost_weights() -> None:
    """Invalidate the cached weights so the next _load_weights() re-reads disk.

    Primarily used by tests to force a fresh configuration read.
    """
    global _weights_cache
    _weights_cache = None
def get_weights_for_tool(tool: str) -> Tuple[float, float]:
    """Return (cost_per_call, cost_per_ms) for *tool*.

    Per-tool overrides come from the `tools:` section of cost_weights.yml,
    falling back to `defaults:`, then to the hard-coded 1.0 / 0.001.
    """
    cfg = _load_weights()
    fallback = cfg.get("defaults", {})
    override = (cfg.get("tools") or {}).get(tool, {})
    per_call = float(override.get("cost_per_call", fallback.get("cost_per_call", 1.0)))
    per_ms = float(override.get("cost_per_ms", fallback.get("cost_per_ms", 0.001)))
    return per_call, per_ms
def compute_event_cost(event: Dict) -> float:
    """Compute cost_units for a single audit event.

    cost = cost_per_call(tool) + duration_ms * cost_per_ms(tool), rounded to
    4 decimals. Robustness fix: events serialized with duration_ms=null no
    longer crash — a None (or missing) duration counts as 0.
    """
    tool = event.get("tool", "")
    # `or 0` also guards against an explicit None in the event payload.
    duration_ms = float(event.get("duration_ms") or 0)
    cpc, cpm = get_weights_for_tool(tool)
    return round(cpc + duration_ms * cpm, 4)
# ─── Time helpers ─────────────────────────────────────────────────────────────
def _now_utc() -> datetime.datetime:
return datetime.datetime.now(datetime.timezone.utc)
def _iso(dt: datetime.datetime) -> str:
return dt.isoformat()
def _parse_iso(s: str) -> datetime.datetime:
s = s.replace("Z", "+00:00")
try:
return datetime.datetime.fromisoformat(s)
except Exception:
return _now_utc()
def _bucket_hour(ts: str) -> str:
    """Truncate an ISO timestamp to its hour bucket.

    '2026-02-23T10:15:30+00:00' -> '2026-02-23T10:00' (first 13 chars + ':00').
    The shortened form is only used as a grouping/sort key, so the missing
    seconds/offset are intentional.
    """
    return ts[:13] + ":00"
# ─── Aggregation helpers ──────────────────────────────────────────────────────
def _aggregate(
    events: List[Dict],
    group_keys: List[str],
) -> Dict[str, Dict]:
    """
    Aggregate events by composite key (e.g. ["tool"] or ["agent_id", "tool"]).
    Returns {key_str: {count, cost_units, duration_sum, failed_count, ...}}.

    NOTE(review): cost/duration sums are rounded after every accumulation, so
    results depend on event order; any restructure must preserve that.
    """
    # Each new key starts from a zeroed metrics record.
    result: Dict[str, Dict] = defaultdict(lambda: {
        "count": 0,
        "cost_units": 0.0,
        "duration_ms_sum": 0.0,
        "failed_count": 0,
        "denied_count": 0,
        "in_size_sum": 0,
        "out_size_sum": 0,
    })
    for ev in events:
        # Composite key: one part per group field, "unknown" when absent.
        parts = [str(ev.get(k, "unknown")) for k in group_keys]
        key = ":".join(parts)
        cost = compute_event_cost(ev)
        status = ev.get("status", "pass")
        r = result[key]
        r["count"] += 1
        r["cost_units"] = round(r["cost_units"] + cost, 4)
        r["duration_ms_sum"] = round(r["duration_ms_sum"] + float(ev.get("duration_ms", 0)), 2)
        r["in_size_sum"] += int(ev.get("in_size", 0))
        r["out_size_sum"] += int(ev.get("out_size", 0))
        if status in ("failed", "error"):
            r["failed_count"] += 1
        elif status == "denied":
            r["denied_count"] += 1
    # Enrich with averages
    for key, r in result.items():
        n = r["count"] or 1
        r["avg_duration_ms"] = round(r["duration_ms_sum"] / n, 1)
        r["avg_cost_units"] = round(r["cost_units"] / n, 4)
        r["error_rate"] = round(r["failed_count"] / (r["count"] or 1), 4)
    return dict(result)
def _top_n(aggregated: Dict[str, Dict], key_field: str, n: int, sort_by: str = "cost_units") -> List[Dict]:
"""Sort aggregated dict by sort_by and return top N."""
items = [
{"key": k, key_field: k, **v}
for k, v in aggregated.items()
]
items.sort(key=lambda x: x.get(sort_by, 0), reverse=True)
return items[:n]
# ─── Actions ──────────────────────────────────────────────────────────────────
def action_report(
    store,
    time_range: Optional[Dict[str, str]] = None,
    group_by: Optional[List[str]] = None,
    top_n: int = 10,
    include_failed: bool = True,
    include_hourly: bool = False,
) -> Dict[str, Any]:
    """
    Generate aggregated cost report for a time range.

    Args:
        store: AuditStore-like object exposing read(from_ts, to_ts, limit).
        time_range: optional {"from": iso, "to": iso}; defaults to last 7 days.
        group_by: breakdown dimensions (default ["tool"]).
        top_n: rows kept per breakdown.
        include_failed: when False, failed/error events are dropped up front.
        include_hourly: add an hourly count/cost trend.

    Returns:
        totals, breakdowns by group_by keys, top spenders, optional hourly trend.
    """
    now = _now_utc()
    tr = time_range or {}
    from_ts = tr.get("from") or _iso(now - datetime.timedelta(days=7))
    to_ts = tr.get("to") or _iso(now)
    events = store.read(from_ts=from_ts, to_ts=to_ts, limit=200_000)
    if not include_failed:
        events = [e for e in events if e.get("status", "pass") not in ("failed", "error")]
    # Totals
    total_cost = sum(compute_event_cost(e) for e in events)
    total_calls = len(events)
    total_failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    total_denied = sum(1 for e in events if e.get("status") == "denied")
    # Breakdowns
    by_key = group_by or ["tool"]
    breakdowns: Dict[str, List[Dict]] = {}
    for gk in by_key:
        agg = _aggregate(events, [gk])
        breakdowns[gk] = _top_n(agg, gk, top_n)
    # Hourly trend (optional, for last 7d max)
    hourly: List[Dict] = []
    if include_hourly and events:
        hourly_agg: Dict[str, Dict] = defaultdict(lambda: {"count": 0, "cost_units": 0.0})
        for ev in events:
            bucket = _bucket_hour(ev.get("ts", ""))
            hourly_agg[bucket]["count"] += 1
            hourly_agg[bucket]["cost_units"] = round(
                hourly_agg[bucket]["cost_units"] + compute_event_cost(ev), 4
            )
        hourly = [{"hour": k, **v} for k, v in sorted(hourly_agg.items())]
    return {
        "time_range": {"from": from_ts, "to": to_ts},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": total_failed,
            "denied": total_denied,
            "error_rate": round(total_failed / (total_calls or 1), 4),
        },
        "breakdowns": breakdowns,
        **({"hourly": hourly} if include_hourly else {}),
    }
def action_top(
    store,
    window_hours: int = 24,
    top_n: int = 10,
) -> Dict[str, Any]:
    """
    Quick top-N report for tools, agents, users and workspaces over window_hours.
    """
    now = _now_utc()
    window_start = _iso(now - datetime.timedelta(hours=window_hours))
    window_end = _iso(now)
    events = store.read(from_ts=window_start, to_ts=window_end, limit=100_000)
    report: Dict[str, Any] = {
        "window_hours": window_hours,
        "time_range": {"from": window_start, "to": window_end},
        "total_calls": len(events),
    }
    # One ranked breakdown per grouping dimension, in a fixed output order.
    for result_key, group_field in (
        ("top_tools", "tool"),
        ("top_agents", "agent_id"),
        ("top_users", "user_id"),
        ("top_workspaces", "workspace_id"),
    ):
        report[result_key] = _top_n(_aggregate(events, [group_field]), group_field, top_n)
    return report
def action_anomalies(
    store,
    window_minutes: int = 60,
    baseline_hours: int = 24,
    ratio_threshold: Optional[float] = None,
    min_calls: Optional[int] = None,
    tools_filter: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Detect cost/call spikes and elevated error rates.
    Algorithm:
      1. Compute per-tool metrics for window [now-window_minutes, now]
      2. Compute per-tool metrics for baseline [now-baseline_hours, now-window_minutes]
      3. Spike = window_rate / baseline_rate >= ratio_threshold AND calls >= min_calls
      4. Error spike = failed_rate > 10% AND calls >= min_calls

    Thresholds default from cost_weights.yml `anomaly:` (3.0 ratio, 10 calls).

    NOTE(review): the "ratio" field in a cost_spike anomaly is a float
    normally but the string "∞ (no baseline)" when the tool has no baseline
    traffic — consumers must handle both types.
    """
    cfg = _load_weights()
    anomaly_cfg = cfg.get("anomaly", {})
    if ratio_threshold is None:
        ratio_threshold = float(anomaly_cfg.get("spike_ratio_threshold", 3.0))
    if min_calls is None:
        min_calls = int(anomaly_cfg.get("min_calls_threshold", 10))
    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(minutes=window_minutes))
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
    baseline_to = window_from  # non-overlapping
    # Fetch both windows
    window_events = store.read(from_ts=window_from, to_ts=_iso(now), limit=50_000)
    baseline_events = store.read(from_ts=baseline_from, to_ts=baseline_to, limit=200_000)
    if tools_filter:
        window_events = [e for e in window_events if e.get("tool") in tools_filter]
        baseline_events = [e for e in baseline_events if e.get("tool") in tools_filter]
    # Aggregate by tool
    window_by_tool = _aggregate(window_events, ["tool"])
    baseline_by_tool = _aggregate(baseline_events, ["tool"])
    # Normalise baseline to per-minute rate
    baseline_minutes = (baseline_hours * 60) - window_minutes
    baseline_minutes = max(baseline_minutes, 1)
    window_minutes_actual = float(window_minutes)
    anomalies = []
    all_tools = set(window_by_tool.keys()) | set(baseline_by_tool.keys())
    for tool_key in sorted(all_tools):
        w = window_by_tool.get(tool_key, {})
        b = baseline_by_tool.get(tool_key, {})
        w_calls = w.get("count", 0)
        b_calls = b.get("count", 0)
        if w_calls < min_calls:
            continue  # Not enough traffic for meaningful anomaly
        # Per-minute rates
        w_rate = w_calls / window_minutes_actual
        b_rate = b_calls / baseline_minutes if b_calls > 0 else 0.0
        # Cost spike
        w_cost_pm = w.get("cost_units", 0) / window_minutes_actual
        b_cost_pm = b.get("cost_units", 0) / baseline_minutes if b_calls > 0 else 0.0
        # Zero baseline -> infinite ratio (always above any finite threshold).
        call_ratio = (w_rate / b_rate) if b_rate > 0 else float("inf")
        cost_ratio = (w_cost_pm / b_cost_pm) if b_cost_pm > 0 else float("inf")
        if call_ratio >= ratio_threshold or cost_ratio >= ratio_threshold:
            ratio_display = round(max(call_ratio, cost_ratio), 2)
            if ratio_display == float("inf"):
                # Display value switches type to str here (see docstring note).
                ratio_display = "∞ (no baseline)"
            w_cost = w.get("cost_units", 0)
            b_cost = b.get("cost_units", 0)
            anomalies.append({
                "type": "cost_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "baseline": f"prev_{baseline_hours}h",
                "window_calls": w_calls,
                "baseline_calls": b_calls,
                "window_cost_units": round(w_cost, 2),
                "baseline_cost_units": round(b_cost, 2),
                "ratio": ratio_display,
                "recommendation": _spike_recommendation(tool_key, ratio_display, w_calls),
            })
        # Error rate spike
        w_err_rate = w.get("error_rate", 0)
        if w_err_rate > 0.10 and w_calls >= min_calls:
            anomalies.append({
                "type": "error_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "failed_calls": w.get("failed_count", 0),
                "total_calls": w_calls,
                "error_rate": round(w_err_rate, 4),
                "recommendation": f"Investigate failures for '{tool_key}': {w.get('failed_count',0)} failed / {w_calls} calls ({round(w_err_rate*100,1)}% error rate).",
            })
    # De-duplicate tool+type combos (error_spike already separate)
    seen = set()
    unique_anomalies = []
    for a in anomalies:
        key = (a["type"], a.get("tool", ""))
        if key not in seen:
            unique_anomalies.append(a)
            seen.add(key)
    return {
        "anomalies": unique_anomalies,
        "anomaly_count": len(unique_anomalies),
        "window_minutes": window_minutes,
        "baseline_hours": baseline_hours,
        "ratio_threshold": ratio_threshold,
        "min_calls": min_calls,
        "stats": {
            "window_calls": len(window_events),
            "baseline_calls": len(baseline_events),
        },
    }
def action_weights(repo_root: Optional[str] = None) -> Dict[str, Any]:
    """Return the current cost-weights configuration, always re-read from disk.

    Args:
        repo_root: accepted for interface compatibility but currently unused —
            the config path is fixed at import time via _WEIGHTS_PATH.
    """
    # Consistency fix: invalidate the cache via reload_cost_weights() instead
    # of poking the module-global directly (same effect, one code path).
    reload_cost_weights()
    cfg = _load_weights()
    return {
        "defaults": cfg.get("defaults", {}),
        "tools": cfg.get("tools", {}),
        "anomaly": cfg.get("anomaly", {}),
        "config_path": _WEIGHTS_PATH,
    }
# ─── Recommendation templates ─────────────────────────────────────────────────
def _spike_recommendation(tool: str, ratio: Any, calls: int) -> str:
    """Build a human-readable remediation hint for a cost spike on *tool*.

    The tool's `category` in cost_weights.yml selects a tailored template;
    unknown or missing categories fall through to the generic advice.
    """
    category = (_load_weights().get("tools") or {}).get(tool, {}).get("category", "")
    if category == "media":
        return (
            f"'{tool}' cost spike (ratio={ratio}, {calls} calls). "
            "Consider: rate-limit per workspace, queue with priority, review calling agents."
        )
    if category == "release":
        return (
            f"'{tool}' called more frequently than baseline (ratio={ratio}). "
            "Review if release_check is looping or being triggered too often."
        )
    if category == "web":
        return (
            f"'{tool}' spike (ratio={ratio}). Consider: result caching, dedup identical queries."
        )
    return (
        f"'{tool}' cost spike (ratio={ratio}, {calls} calls in window). "
        "Review caller agents and apply rate limits if needed."
    )
# ─── backend=auto store resolver ─────────────────────────────────────────────
def _resolve_store(backend: str = "auto"):
    """
    Return an AuditStore based on backend param.
    backend='auto' (default): uses the globally configured store (which may be
        AutoAuditStore, Postgres, or JSONL).
    backend='jsonl': forces JsonlAuditStore (7-day window max recommended).
    backend='memory': MemoryAuditStore (testing).
    Any unrecognised value falls back to the global store.
    """
    # Imported lazily to avoid a hard dependency at module import time.
    from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
    if backend in ("auto", None, ""):
        return get_audit_store()
    if backend == "jsonl":
        # Fix: dropped the redundant local `import os` / `from pathlib import
        # Path` — both are already module-level imports in this file.
        audit_dir = os.getenv(
            "AUDIT_JSONL_DIR",
            str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
        )
        return JsonlAuditStore(audit_dir)
    if backend == "memory":
        return MemoryAuditStore()
    return get_audit_store()
# ─── Digest action ────────────────────────────────────────────────────────────
def action_digest(
    store,
    window_hours: int = 24,
    baseline_hours: int = 168,  # 7 days
    top_n: int = 10,
    max_markdown_chars: int = 3800,
) -> Dict:
    """
    Daily/weekly cost digest: top tools/agents + anomalies + recommendations.
    Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.

    Args:
        store: AuditStore-like object exposing read(from_ts, to_ts, limit).
        window_hours: reporting window size.
        baseline_hours: baseline span passed to the anomaly detector.
        top_n: rows kept in the structured top_tools/top_agents lists.
        max_markdown_chars: hard cap for the rendered markdown (truncated after).
    """
    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(hours=window_hours))
    window_to = _iso(now)
    # NOTE(review): baseline_from is computed but never used below — the
    # anomaly call passes baseline_hours directly. Dead variable?
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
    # ── Top ──────────────────────────────────────────────────────────────────
    top_data = action_top(store, window_hours=window_hours, top_n=top_n)
    top_tools = top_data.get("top_tools") or []
    top_agents = top_data.get("top_agents") or []
    total_calls = top_data.get("total_calls", 0)
    # ── Anomalies ─────────────────────────────────────────────────────────────
    # Anomaly window = a quarter of the digest window, expressed in minutes.
    anomaly_data = action_anomalies(
        store,
        window_minutes=int(window_hours * 60 / 4),
        baseline_hours=baseline_hours,
        min_calls=5,
    )
    anomalies = anomaly_data.get("anomalies") or []
    # ── Total cost ────────────────────────────────────────────────────────────
    events = store.read(from_ts=window_from, to_ts=window_to, limit=200_000)
    total_cost = sum(compute_event_cost(e) for e in events)
    failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    error_rate = round(failed / max(len(events), 1), 4)
    # ── Recommendations ───────────────────────────────────────────────────────
    recs = []
    for a in anomalies[:5]:
        r = a.get("recommendation", "")
        if r:
            recs.append(r)
    if error_rate > 0.05:
        recs.append(f"High error rate {round(error_rate*100,1)}% — investigate failing tools.")
    if top_tools and top_tools[0].get("cost_units", 0) > 500:
        tool_name = top_tools[0].get("tool", "?")
        recs.append(f"Top spender '{tool_name}' used {top_tools[0]['cost_units']:.0f} cost units — review frequency.")
    # dict.fromkeys dedupes while preserving first-seen order.
    recs = list(dict.fromkeys(recs))[:8]
    # ── Markdown ─────────────────────────────────────────────────────────────
    period_label = f"Last {window_hours}h" if window_hours <= 48 else f"Last {window_hours//24}d"
    lines = [
        f"📊 **Cost Digest** ({period_label})",
        f"Total calls: {total_calls} | Cost units: {total_cost:.0f} | Errors: {round(error_rate*100,1)}%",
        "",
        "**Top Tools:**",
    ]
    for t in top_tools[:5]:
        lines.append(f"  • `{t.get('tool','?')}` — {t.get('cost_units',0):.1f}u, {t.get('count',0)} calls")
    lines.append("")
    lines.append("**Top Agents:**")
    for a in top_agents[:3]:
        lines.append(f"  • `{a.get('agent_id','?')}` — {a.get('cost_units',0):.1f}u, {a.get('count',0)} calls")
    if anomalies:
        lines.append("")
        lines.append(f"⚠️ **{len(anomalies)} Anomaly(ies):**")
        for anm in anomalies[:3]:
            lines.append(f"  • [{anm.get('type','?')}] `{anm.get('tool','?')}` ratio={anm.get('ratio','?')}")
    if recs:
        lines.append("")
        lines.append("💡 **Recommendations:**")
        for r in recs[:5]:
            lines.append(f"  {r[:200]}")
    markdown = "\n".join(lines)
    if len(markdown) > max_markdown_chars:
        markdown = markdown[:max_markdown_chars] + "\n…[truncated]"
    return {
        "period": period_label,
        "window_hours": window_hours,
        "time_range": {"from": window_from, "to": window_to},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": failed,
            "error_rate": error_rate,
        },
        "top_tools": top_tools[:top_n],
        "top_agents": top_agents[:top_n],
        "anomalies": anomalies[:10],
        "anomaly_count": len(anomalies),
        "recommendations": recs,
        "markdown": markdown,
    }
# ─── Main entrypoint ─────────────────────────────────────────────────────────
def analyze_cost_dict(action: str, params: Optional[Dict] = None, store=None) -> Dict:
    """
    Wrapper called by tool_manager handler.
    Returns plain dict for ToolResult.

    Args:
        action: one of "digest", "report", "top", "anomalies", "weights";
            anything else yields an {"error": ...} dict.
        params: per-action keyword arguments (numeric values are coerced).
        store: optional AuditStore override; when None it is resolved from
            params["backend"] (default "auto").
    """
    params = params or {}
    if store is None:
        backend = params.get("backend", "auto")
        store = _resolve_store(backend)
    if action == "digest":
        return action_digest(
            store,
            window_hours=int(params.get("window_hours", 24)),
            baseline_hours=int(params.get("baseline_hours", 168)),
            top_n=int(params.get("top_n", 10)),
            max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
        )
    if action == "report":
        return action_report(
            store,
            time_range=params.get("time_range"),
            group_by=params.get("group_by", ["tool"]),
            top_n=int(params.get("top_n", 10)),
            include_failed=bool(params.get("include_failed", True)),
            include_hourly=bool(params.get("include_hourly", False)),
        )
    if action == "top":
        return action_top(
            store,
            window_hours=int(params.get("window_hours", 24)),
            top_n=int(params.get("top_n", 10)),
        )
    if action == "anomalies":
        return action_anomalies(
            store,
            window_minutes=int(params.get("window_minutes", 60)),
            baseline_hours=int(params.get("baseline_hours", 24)),
            # These three keep their None default so the action applies its own
            # config-driven fallbacks.
            ratio_threshold=params.get("ratio_threshold"),
            min_calls=params.get("min_calls"),
            tools_filter=params.get("tools_filter"),
        )
    if action == "weights":
        return action_weights()
    return {"error": f"Unknown action '{action}'. Valid: digest, report, top, anomalies, weights"}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,968 @@
"""
Dependency & Supply Chain Scanner.
Scans Python and Node.js dependencies for:
1. Known vulnerabilities (via OSV.dev API or offline cache)
2. Outdated packages (lockfile_only mode, using OSV fixed_versions)
3. License policy enforcement (optional, MVP: offline-only)
Ecosystems supported:
Python → poetry.lock, pipfile.lock, requirements*.txt, pyproject.toml
Node → package-lock.json, pnpm-lock.yaml, yarn.lock, package.json
Pass rule: pass=false if any vuln with severity in fail_on (default: CRITICAL, HIGH).
MEDIUM → warning (not blocking by default). UNKNOWN → warning if not in fail_on.
Security:
- Read-only: no file writes except cache update (explicit)
- Evidence masked for secrets
- Payload not logged; only hash + counts
- Max files/deps enforced via limits
- Timeout via deadline
"""
from __future__ import annotations
import csv
import fnmatch
import hashlib
import json
import logging
import os
import re
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ─── Constants ────────────────────────────────────────────────────────────────
# Directory names skipped during dependency-file discovery.
# Contains one multi-segment entry ("docs/consolidation") — exclusion matching
# must handle path fragments, not just single path components.
EXCLUDED_DIRS: FrozenSet[str] = frozenset({
    "node_modules", ".git", "dist", "build", "vendor",
    ".venv", "venv", "venv_models", "sofia_venv",
    "__pycache__", ".pytest_cache", "rollback_backups",
    "docs/consolidation",
})
# OSV.dev batch vulnerability-query endpoint.
OSV_API_URL = "https://api.osv.dev/v1/querybatch"
OSV_BATCH_SIZE = 100  # max per request
OSV_TIMEOUT_SEC = 15.0  # HTTP timeout for OSV requests, in seconds
# OSV ecosystems
ECOSYSTEM_PYPI = "PyPI"
ECOSYSTEM_NPM = "npm"
# Higher value = more severe; used for ranking/threshold comparisons.
SEVERITY_ORDER = {"CRITICAL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1, "UNKNOWN": 0}
# ─── Data Structures ──────────────────────────────────────────────────────────
@dataclass
class Package:
    """One declared dependency discovered in a manifest or lockfile."""
    name: str          # as written in the manifest (original casing)
    version: str       # empty string = unresolved/unpinned
    ecosystem: str     # "PyPI" | "npm"
    source_file: str   # path of the manifest this package came from
    pinned: bool = True  # False for range/unpinned requirement lines
    @property
    def normalized_name(self) -> str:
        """Lowercased name with underscores folded to dashes (PyPI-style)."""
        return self.name.lower().replace("_", "-")
    @property
    def cache_key(self) -> str:
        """Stable lookup key: 'ecosystem:normalized-name:version'."""
        return f"{self.ecosystem}:{self.normalized_name}:{self.version}"
@dataclass
class Vulnerability:
    """One OSV advisory matched against a scanned package version."""
    osv_id: str                # OSV advisory identifier
    ecosystem: str             # "PyPI" | "npm"
    package: str
    version: str
    severity: str  # CRITICAL | HIGH | MEDIUM | LOW | UNKNOWN
    fixed_versions: List[str]  # versions that remediate the advisory
    aliases: List[str]  # CVE-XXXX-XXXX etc.
    evidence: Dict[str, str]   # supporting context (masked per module policy)
    recommendation: str        # suggested remediation text
@dataclass
class OutdatedPackage:
    """A package behind a newer known version (derived from OSV fixed_versions)."""
    ecosystem: str
    package: str
    current: str             # currently pinned version
    latest: Optional[str]    # None when no newer version could be determined
    notes: str
@dataclass
class LicenseFinding:
    """License-policy evaluation result for one package."""
    package: str
    license: str
    policy: str  # "deny" | "warn" | "ok" | "unknown"
    recommendation: str
@dataclass
class ScanResult:
    """Top-level scan outcome returned to the tool handler."""
    pass_: bool  # trailing underscore avoids the `pass` keyword
    summary: str               # one-line human summary
    stats: Dict[str, Any]      # counters (files, deps, findings, ...)
    vulnerabilities: List[Dict]
    outdated: List[Dict]
    licenses: List[Dict]
    recommendations: List[str]
# ─── Helpers ──────────────────────────────────────────────────────────────────
_SECRET_PAT = re.compile(
r'(?i)(api[_-]?key|token|secret|password|bearer|jwt|private[_-]?key)'
r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?'
)
def _redact(text: str) -> str:
return _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***REDACTED***", text or "")
def _is_excluded(path: str) -> bool:
    """Return True when *path* falls under any excluded directory.

    Fix: EXCLUDED_DIRS entries containing '/' (e.g. "docs/consolidation")
    could never equal a single Path component, so that exclusion silently
    never fired; multi-segment entries are now matched as sub-paths.
    """
    parts = Path(path).parts
    if any(p in EXCLUDED_DIRS for p in parts):
        return True
    # Multi-segment exclusions: look for the fragment anywhere in the path.
    posix = "/".join(parts)
    return any(
        "/" in entry and f"/{entry}/" in f"/{posix}/"
        for entry in EXCLUDED_DIRS
    )
def _read_file(path: str, max_bytes: int = 524288) -> str:
try:
size = os.path.getsize(path)
with open(path, "r", errors="replace") as f:
return f.read(min(size, max_bytes))
except Exception:
return ""
def _normalize_pkg_name(name: str) -> str:
"""Normalize: lowercase, underscores → dashes."""
return name.strip().lower().replace("_", "-")
def _compare_versions(v1: str, v2: str) -> int:
"""
Simple version comparison. Returns -1 / 0 / 1.
Handles semver and PEP 440 in a best-effort way.
"""
def _parts(v: str) -> List[int]:
nums = re.findall(r'\d+', v.split("+")[0].split("-")[0])
return [int(x) for x in nums] if nums else [0]
p1, p2 = _parts(v1), _parts(v2)
# Pad to equal length
max_len = max(len(p1), len(p2))
p1 += [0] * (max_len - len(p1))
p2 += [0] * (max_len - len(p2))
if p1 < p2:
return -1
if p1 > p2:
return 1
return 0
# ─── Python Parsers ───────────────────────────────────────────────────────────
def _parse_poetry_lock(content: str, source_file: str) -> List[Package]:
    """Extract pinned packages from poetry.lock `[[package]]` sections."""
    name_re = re.compile(r'^name\s*=\s*"([^"]+)"', re.MULTILINE)
    ver_re = re.compile(r'^version\s*=\s*"([^"]+)"', re.MULTILINE)
    found: List[Package] = []
    # The first split chunk is the preamble before any [[package]] header.
    for chunk in re.split(r'\[\[package\]\]', content)[1:]:
        name_match = name_re.search(chunk)
        version_match = ver_re.search(chunk)
        if name_match and version_match:
            found.append(Package(
                name=name_match.group(1),
                version=version_match.group(1),
                ecosystem=ECOSYSTEM_PYPI,
                source_file=source_file,
                pinned=True,
            ))
    return found
def _parse_pipfile_lock(content: str, source_file: str) -> List[Package]:
    """Extract pinned packages from Pipfile.lock (JSON), default + develop."""
    out: List[Package] = []
    try:
        data = json.loads(content)
        for section in ("default", "develop"):
            for pkg_name, pkg_info in (data.get(section) or {}).items():
                # Pipfile.lock stores versions as "==2.28.0" — strip the pin.
                version = re.sub(r'^==', '', pkg_info.get("version", ""))
                if not version:
                    continue
                out.append(Package(
                    name=pkg_name,
                    version=version,
                    ecosystem=ECOSYSTEM_PYPI,
                    source_file=source_file,
                    pinned=True,
                ))
    except Exception as e:
        logger.debug(f"Could not parse Pipfile.lock: {e}")
    return out
# Pinned requirement line: "name[extras]==version" → captures (name, version).
_REQ_LINE_PAT = re.compile(
    r'^([A-Za-z0-9_\-\.]+)(?:\[.*?\])?\s*==\s*([^\s;#]+)',
    re.MULTILINE,
)
# Range-constrained (unpinned) line: "name>=x", "name~=x", … → captures (name).
_REQ_UNPINNED_PAT = re.compile(
    r'^([A-Za-z0-9_\-\.]+)(?:\[.*?\])?\s*[><!~^]=?\s*[^\s;#]+',
    re.MULTILINE,
)
def _parse_requirements_txt(content: str, source_file: str) -> List[Package]:
    """
    Parse requirements.txt.

    Only pinned (==) lines yield concrete versions; range-constrained lines
    are recorded with an empty version (unresolved, no vuln scan possible).
    First occurrence of a normalized name wins.
    """
    packages: List[Package] = []
    seen: Set[str] = set()

    def _emit(name: str, version: str, pinned: bool) -> None:
        key = _normalize_pkg_name(name)
        if key in seen:
            return
        packages.append(Package(
            name=name, version=version,
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file, pinned=pinned,
        ))
        seen.add(key)

    for m in _REQ_LINE_PAT.finditer(content):
        _emit(m.group(1), m.group(2).strip(), True)
    # Record unpinned deps for reporting only.
    for m in _REQ_UNPINNED_PAT.finditer(content):
        _emit(m.group(1), "", False)
    return packages
def _parse_pyproject_toml(content: str, source_file: str) -> List[Package]:
    """Extract declared deps from pyproject.toml (without resolving versions).

    Handles two layouts:
    - ``[tool.poetry.dependencies]`` — one ``name = spec`` line per dependency.
    - ``[project]`` (PEP 621) — ``dependencies = ["name>=1.0", ...]`` list.

    Fixes the previous implementation, which treated EVERY ``key =`` line of
    the whole ``[project]`` table as a package (emitting bogus entries such as
    "name", "version", "readme") and whose ``[^\\[]*`` section capture stopped
    at the opening bracket of the dependencies list.
    """
    packages: List[Package] = []

    def _add(name: str) -> None:
        name = name.strip()
        # "python" / "python-version" are interpreter constraints, not deps.
        if not name or name.lower() in ("python", "python-version"):
            return
        packages.append(Package(
            name=name, version="",
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file, pinned=False,
        ))

    # Poetry layout: keys of the [tool.poetry.dependencies] table.
    poetry = re.search(r'\[tool\.poetry\.dependencies\]([^\[]*)', content, re.DOTALL)
    if poetry:
        for m in re.finditer(r'^([A-Za-z0-9_\-\.]+)\s*=', poetry.group(1), re.MULTILINE):
            _add(m.group(1))

    # PEP 621 layout: the [project] table runs until the next section header
    # at line start (the dependencies list's own "[" is mid-line).
    project = re.search(r'^\[project\]\s*\n(.*?)(?=\n\[|\Z)', content, re.MULTILINE | re.DOTALL)
    if project:
        dep_list = re.search(r'dependencies\s*=\s*\[(.*?)\]', project.group(1), re.DOTALL)
        if dep_list:
            for spec in re.findall(r'["\']([^"\']+)["\']', dep_list.group(1)):
                # Leading identifier of the requirement spec is the name.
                m = re.match(r'\s*([A-Za-z0-9_\-\.]+)', spec)
                if m:
                    _add(m.group(1))
    return packages
# ─── Node Parsers ─────────────────────────────────────────────────────────────
def _parse_package_lock_json(content: str, source_file: str) -> List[Package]:
    """Parse package-lock.json (npm v1/v2/v3 formats).

    v2/v3 lockfiles list installed packages under a flat "packages" map keyed
    by install path, e.g. "node_modules/a/node_modules/b". The package name is
    everything after the LAST "node_modules/" segment — this keeps scoped
    names ("@scope/pkg") intact and fixes the previous extraction, which
    stripped every "node_modules/" occurrence and produced bogus names like
    "a/b" for nested dependencies.
    """
    packages: List[Package] = []
    try:
        data = json.loads(content)
        # v2/v3: flat packages object keyed by install path
        pkg_map = data.get("packages") or {}
        for path_key, info in pkg_map.items():
            # "" is the root project entry; skip anything not under node_modules.
            if path_key == "" or not path_key.startswith("node_modules/"):
                continue
            # Name = segment after the last "node_modules/" prefix.
            name = path_key.split("node_modules/")[-1]
            version = info.get("version", "")
            if name and version:
                packages.append(Package(
                    name=name, version=version,
                    ecosystem=ECOSYSTEM_NPM,
                    source_file=source_file, pinned=True,
                ))
        # v1 fallback: nested dependencies (top level only)
        if not packages:
            for name, info in (data.get("dependencies") or {}).items():
                version = info.get("version", "")
                if version:
                    packages.append(Package(
                        name=name, version=version,
                        ecosystem=ECOSYSTEM_NPM,
                        source_file=source_file, pinned=True,
                    ))
    except Exception as e:
        logger.debug(f"Could not parse package-lock.json: {e}")
    return packages
def _parse_pnpm_lock(content: str, source_file: str) -> List[Package]:
    """Parse pnpm-lock.yaml packages section (entries shaped '/name@version:')."""
    matches = re.finditer(r'^/([^@\s]+)@([^\s:]+):', content, re.MULTILINE)
    return [
        Package(
            name=m.group(1), version=m.group(2),
            ecosystem=ECOSYSTEM_NPM,
            source_file=source_file, pinned=True,
        )
        for m in matches
    ]
def _parse_yarn_lock(content: str, source_file: str) -> List[Package]:
    """Parse yarn.lock v1: a header line followed by an indented 'version "X"'."""
    block_pat = re.compile(
        r'^"?([^@"\s]+)@[^:]+:\n(?:\s+.*\n)*?\s+version "([^"]+)"',
        re.MULTILINE,
    )
    packages: List[Package] = []
    recorded: Set[str] = set()
    for m in block_pat.finditer(content):
        name, version = m.groups()
        tag = f"{name}@{version}"
        if tag in recorded:
            continue
        recorded.add(tag)
        packages.append(Package(
            name=name, version=version,
            ecosystem=ECOSYSTEM_NPM,
            source_file=source_file, pinned=True,
        ))
    return packages
def _parse_package_json(content: str, source_file: str) -> List[Package]:
    """Declared deps from package.json — no lock file, so versions unresolved."""
    found: List[Package] = []
    try:
        manifest = json.loads(content)
        for section in ("dependencies", "devDependencies"):
            for dep_name in (manifest.get(section) or {}):
                found.append(Package(
                    name=dep_name, version="",
                    ecosystem=ECOSYSTEM_NPM,
                    source_file=source_file, pinned=False,
                ))
    except Exception:
        # Malformed JSON: keep whatever was collected (best-effort).
        pass
    return found
# ─── Dependency Discovery ─────────────────────────────────────────────────────
# Lock files matched exactly by file name.
_PYTHON_MANIFESTS = (
    "poetry.lock", "Pipfile.lock",
)
# NOTE(review): appears unused — _find_and_parse_deps matches *.txt files
# containing "requirements" directly rather than consulting this tuple.
_PYTHON_REQUIREMENTS = ("requirements",)  # matched via endswith
_PYTHON_PYPROJECT = ("pyproject.toml",)
# Node manifests; package.json is skipped when a sibling lock file exists.
_NODE_MANIFESTS = (
    "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "package.json",
)
def _find_and_parse_deps(
    repo_root: str,
    targets: List[str],
    max_files: int,
    deadline: float,
) -> List[Package]:
    """Walk repo and extract all packages from manifest files.

    Args:
        repo_root: absolute path of the repository to walk.
        targets: ecosystems to scan — any of "python", "node".
        max_files: cap on manifest files parsed across the whole walk.
        deadline: time.monotonic() value after which the walk aborts.

    Returns:
        Deduplicated Package list; for duplicates within one ecosystem a
        pinned entry wins over an unpinned one, otherwise first seen wins.
    """
    all_packages: List[Package] = []
    files_scanned = 0
    for dirpath, dirnames, filenames in os.walk(repo_root):
        # Prune excluded/hidden dirs in place so os.walk never descends into them.
        dirnames[:] = [
            d for d in dirnames
            if d not in EXCLUDED_DIRS and not d.startswith(".")
        ]
        if time.monotonic() > deadline:
            logger.warning("dependency_scanner: walk timeout")
            break
        for fname in filenames:
            # NOTE(review): this only breaks the inner loop; once the cap is hit
            # the walk still visits remaining dirs (each breaking immediately).
            if files_scanned >= max_files:
                break
            full = os.path.join(dirpath, fname)
            if _is_excluded(full):
                continue
            rel = os.path.relpath(full, repo_root)
            content = None
            if "python" in targets:
                if fname in _PYTHON_MANIFESTS:
                    content = _read_file(full)
                    if fname == "poetry.lock":
                        all_packages.extend(_parse_poetry_lock(content, rel))
                    elif fname == "Pipfile.lock":
                        all_packages.extend(_parse_pipfile_lock(content, rel))
                    files_scanned += 1
                # Matches requirements.txt, requirements-dev.txt, etc.
                elif fname.endswith(".txt") and "requirements" in fname.lower():
                    content = _read_file(full)
                    all_packages.extend(_parse_requirements_txt(content, rel))
                    files_scanned += 1
                elif fname in _PYTHON_PYPROJECT:
                    content = _read_file(full)
                    all_packages.extend(_parse_pyproject_toml(content, rel))
                    files_scanned += 1
            if "node" in targets:
                if fname in _NODE_MANIFESTS:
                    # Skip package.json if package-lock.json sibling exists
                    if fname == "package.json":
                        lock_exists = (
                            os.path.exists(os.path.join(dirpath, "package-lock.json")) or
                            os.path.exists(os.path.join(dirpath, "yarn.lock")) or
                            os.path.exists(os.path.join(dirpath, "pnpm-lock.yaml"))
                        )
                        if lock_exists:
                            continue
                    content = _read_file(full)
                    if fname == "package-lock.json":
                        all_packages.extend(_parse_package_lock_json(content, rel))
                    elif fname == "pnpm-lock.yaml":
                        all_packages.extend(_parse_pnpm_lock(content, rel))
                    elif fname == "yarn.lock":
                        all_packages.extend(_parse_yarn_lock(content, rel))
                    elif fname == "package.json":
                        all_packages.extend(_parse_package_json(content, rel))
                    files_scanned += 1
    # Deduplicate: prefer pinned over unpinned; first seen wins
    seen: Dict[str, Package] = {}
    for pkg in all_packages:
        key = f"{pkg.ecosystem}:{pkg.normalized_name}"
        if key not in seen or (not seen[key].pinned and pkg.pinned):
            seen[key] = pkg
    return list(seen.values())
# ─── OSV Cache ────────────────────────────────────────────────────────────────
def _load_osv_cache(cache_path: str) -> Dict[str, Any]:
"""Load offline OSV cache from JSON file."""
if not cache_path or not os.path.exists(cache_path):
return {}
try:
with open(cache_path, "r") as f:
data = json.load(f)
return data.get("entries", {})
except Exception as e:
logger.warning(f"Could not load OSV cache {cache_path}: {e}")
return {}
def _save_osv_cache(cache_path: str, entries: Dict[str, Any]):
"""Persist updated cache entries to disk."""
os.makedirs(os.path.dirname(os.path.abspath(cache_path)), exist_ok=True)
existing = {}
if os.path.exists(cache_path):
try:
with open(cache_path, "r") as f:
existing = json.load(f)
except Exception:
pass
existing_entries = existing.get("entries", {})
existing_entries.update(entries)
import datetime
output = {
"version": 1,
"updated_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"entries": existing_entries,
}
with open(cache_path, "w") as f:
json.dump(output, f, indent=2)
# ─── OSV API ──────────────────────────────────────────────────────────────────
def _query_osv_online(
    packages: List[Package],
    new_cache: Dict[str, Any],
    deadline: float,
) -> Dict[str, List[Dict]]:
    """
    Query OSV.dev /v1/querybatch in batches.

    Only pinned packages with a concrete version are queried; others are
    silently skipped. Every successful batch result is also written into
    *new_cache* (mutated in place) so the caller can persist it to disk.
    Failed batches are logged and skipped — partial results are returned.

    Returns {cache_key: [vuln_objects]}.
    """
    try:
        import httpx  # optional dependency; degrade gracefully when absent
    except ImportError:
        logger.warning("httpx not available for OSV online query")
        return {}
    results: Dict[str, List[Dict]] = {}
    batches = [packages[i:i + OSV_BATCH_SIZE] for i in range(0, len(packages), OSV_BATCH_SIZE)]
    for batch in batches:
        if time.monotonic() > deadline:
            break
        queries = []
        batch_keys = []
        for pkg in batch:
            if not pkg.pinned or not pkg.version:
                continue
            queries.append({
                "package": {"name": pkg.normalized_name, "ecosystem": pkg.ecosystem},
                "version": pkg.version,
            })
            batch_keys.append(pkg.cache_key)
        if not queries:
            continue
        try:
            # Clamp the HTTP timeout so one slow batch cannot blow the deadline.
            remaining = max(1.0, deadline - time.monotonic())
            timeout = min(OSV_TIMEOUT_SEC, remaining)
            with httpx.Client(timeout=timeout) as client:
                resp = client.post(OSV_API_URL, json={"queries": queries})
                resp.raise_for_status()
                data = resp.json()
        except Exception as e:
            logger.warning(f"OSV query failed: {e}")
            continue
        # querybatch returns results positionally, matching the query order.
        for key, result in zip(batch_keys, data.get("results", [])):
            vulns = result.get("vulns") or []
            results[key] = vulns
            new_cache[key] = {"vulns": vulns, "cached_at": _now_iso()}
    return results
def _parse_osv_severity(vuln: Dict) -> str:
    """
    Best-effort severity extraction from an OSV vulnerability object.

    Preference order: database_specific.severity, a numeric score found in
    severity[].score (CVSS-style thresholds), then ecosystem_specific.severity.
    Falls back to "UNKNOWN".
    """
    candidate = (vuln.get("database_specific", {}).get("severity") or "").upper()
    if candidate in SEVERITY_ORDER:
        return candidate
    for entry in (vuln.get("severity") or []):
        # CVSS vectors rarely embed a base score; grab any decimal number present.
        num_m = re.search(r'(\d+\.\d+)', entry.get("score", ""))
        if not num_m:
            continue
        score = float(num_m.group(1))
        if score >= 9.0:
            return "CRITICAL"
        if score >= 7.0:
            return "HIGH"
        if score >= 4.0:
            return "MEDIUM"
        if score > 0:
            return "LOW"
    candidate = (vuln.get("ecosystem_specific", {}).get("severity") or "").upper()
    if candidate in SEVERITY_ORDER:
        return candidate
    return "UNKNOWN"
def _extract_fixed_versions(vuln: Dict, pkg_name: str, ecosystem: str) -> List[str]:
    """Collect 'fixed' version events for *pkg_name* from OSV affected ranges."""
    target = _normalize_pkg_name(pkg_name)
    eco = ecosystem.lower()
    collected: List[str] = []
    for affected in (vuln.get("affected") or []):
        meta = affected.get("package", {})
        if (meta.get("ecosystem") or "").lower() != eco:
            continue
        if _normalize_pkg_name(meta.get("name", "")) != target:
            continue
        for rng in (affected.get("ranges") or []):
            collected.extend(
                event["fixed"]
                for event in (rng.get("events") or [])
                if "fixed" in event
            )
    return sorted(set(collected))
def _lookup_vulnerability(
    pkg: Package,
    osv_vulns: List[Dict],
) -> List[Vulnerability]:
    """Convert raw OSV vuln objects for *pkg* into Vulnerability records."""
    findings: List[Vulnerability] = []
    for raw in osv_vulns:
        vuln_id = raw.get("id", "UNKNOWN")
        cves = [a for a in (raw.get("aliases") or []) if a.startswith("CVE")]
        sev = _parse_osv_severity(raw)
        fixes = _extract_fixed_versions(raw, pkg.name, pkg.ecosystem)
        if fixes:
            advice = f"Upgrade {pkg.name} from {pkg.version} to {fixes[0]}"
        else:
            advice = f"No fix available for {pkg.name}@{pkg.version}. Monitor {vuln_id}."
        findings.append(Vulnerability(
            osv_id=vuln_id,
            ecosystem=pkg.ecosystem,
            package=pkg.name,
            version=pkg.version,
            severity=sev,
            fixed_versions=fixes,
            aliases=cves,
            evidence={
                "file": _redact(pkg.source_file),
                "details": f"{pkg.name}=={pkg.version} in {pkg.source_file}",
            },
            recommendation=advice,
        ))
    return findings
# ─── Outdated Analysis ────────────────────────────────────────────────────────
def _analyze_outdated(
    packages: List[Package],
    vuln_results: Dict[str, List[Dict]],
) -> List[OutdatedPackage]:
    """
    Lockfile-only outdated analysis.

    Uses fixed_versions from OSV results as a hint that a newer (security-fix)
    version exists. Emits at most one entry per package.
    """
    results: List[OutdatedPackage] = []
    for pkg in packages:
        if not pkg.pinned or not pkg.version:
            continue
        for vuln in vuln_results.get(pkg.cache_key, []):
            fixes = _extract_fixed_versions(vuln, pkg.name, pkg.ecosystem)
            upgrades = [f for f in fixes if _compare_versions(f, pkg.version) > 0]
            if not upgrades:
                continue
            # Smallest fix version strictly greater than the current one.
            smallest = min(upgrades, key=lambda v: [int(x) for x in re.findall(r'\d+', v)])
            results.append(OutdatedPackage(
                ecosystem=pkg.ecosystem,
                package=pkg.name,
                current=pkg.version,
                latest=smallest,
                notes=f"Security fix available (vuln: {vuln.get('id', '?')})",
            ))
            break  # One entry per package
    return results
# ─── License Policy ───────────────────────────────────────────────────────────
def _apply_license_policy(
packages: List[Package],
policy_cfg: Dict,
) -> List[LicenseFinding]:
"""MVP: license data is rarely in lock files, so most will be UNKNOWN."""
if not policy_cfg.get("enabled", False):
return []
deny_list = {l.upper() for l in (policy_cfg.get("deny") or [])}
warn_list = {l.upper() for l in (policy_cfg.get("warn") or [])}
findings = []
for pkg in packages:
# In MVP there's no way to get license from lockfile without network
license_str = "UNKNOWN"
if license_str == "UNKNOWN":
continue # skip unknown in MVP
policy = "ok"
if license_str.upper() in deny_list:
policy = "deny"
elif license_str.upper() in warn_list:
policy = "warn"
findings.append(LicenseFinding(
package=pkg.name,
license=license_str,
policy=policy,
recommendation=f"Review license {license_str} for {pkg.name}." if policy != "ok" else "",
))
return findings
# ─── Main Scanner ─────────────────────────────────────────────────────────────
def scan_dependencies(
    repo_root: str,
    targets: Optional[List[str]] = None,
    vuln_sources: Optional[Dict] = None,
    license_policy: Optional[Dict] = None,
    severity_thresholds: Optional[Dict] = None,
    outdated_cfg: Optional[Dict] = None,
    limits: Optional[Dict] = None,
    timeout_sec: float = 40.0,
) -> ScanResult:
    """
    Scan repo dependencies for vulnerabilities, outdated packages, license issues.

    Args:
        repo_root: absolute path to repo root
        targets: ["python", "node"] (default: both)
        vuln_sources: {"osv": {"enabled": true, "mode": "online|offline_cache", "cache_path": "..."}}
        license_policy: {"enabled": false, "deny": [...], "warn": [...]}
        severity_thresholds: {"fail_on": ["CRITICAL", "HIGH"], "warn_on": ["MEDIUM"]}
        outdated_cfg: {"enabled": true, "mode": "lockfile_only"}
        limits: {"max_files": 80, "max_deps": 2000, "max_vulns": 500}
        timeout_sec: hard deadline

    Returns:
        ScanResult with pass/fail verdict; pass is False when any finding in a
        fail_on severity (or a denied license) is present.
    """
    deadline = time.monotonic() + timeout_sec
    # Apply defaults for every optional config dict.
    targets = targets or ["python", "node"]
    vuln_sources = vuln_sources or {"osv": {"enabled": True, "mode": "offline_cache",
                                            "cache_path": "ops/cache/osv_cache.json"}}
    license_policy = license_policy or {"enabled": False}
    severity_thresholds = severity_thresholds or {"fail_on": ["CRITICAL", "HIGH"], "warn_on": ["MEDIUM"]}
    outdated_cfg = outdated_cfg or {"enabled": True, "mode": "lockfile_only"}
    limits = limits or {"max_files": 80, "max_deps": 2000, "max_vulns": 500}
    fail_on = {s.upper() for s in (severity_thresholds.get("fail_on") or ["CRITICAL", "HIGH"])}
    warn_on = {s.upper() for s in (severity_thresholds.get("warn_on") or ["MEDIUM"])}
    # ── Step 1: Extract dependencies ─────────────────────────────────────────
    all_packages = _find_and_parse_deps(
        repo_root, targets,
        max_files=limits.get("max_files", 80),
        deadline=deadline,
    )
    # Apply dep count limit
    max_deps = limits.get("max_deps", 2000)
    if len(all_packages) > max_deps:
        logger.warning(f"Dep count {len(all_packages)} > max {max_deps}, truncating")
        all_packages = all_packages[:max_deps]
    # Only pinned packages with a concrete version can be vuln-checked.
    pinned = [p for p in all_packages if p.pinned and p.version]
    unpinned = [p for p in all_packages if not p.pinned or not p.version]
    # ── Step 2: Vulnerability lookup ─────────────────────────────────────────
    osv_cfg = vuln_sources.get("osv", {})
    osv_enabled = osv_cfg.get("enabled", True)
    osv_mode = osv_cfg.get("mode", "offline_cache")
    # Resolve cache path (absolute or relative to repo_root)
    cache_path_raw = osv_cfg.get("cache_path", "ops/cache/osv_cache.json")
    cache_path = (
        cache_path_raw if os.path.isabs(cache_path_raw)
        else os.path.join(repo_root, cache_path_raw)
    )
    cache_entries = _load_osv_cache(cache_path) if osv_enabled else {}
    new_cache: Dict[str, Any] = {}
    # Sentinel convention: a list value means "known result"; None means
    # "cache miss → severity UNKNOWN for that package" (see Step 3).
    vuln_results: Dict[str, List[Dict]] = {}
    if osv_enabled:
        # Populate from cache first
        cache_miss: List[Package] = []
        for pkg in pinned:
            key = pkg.cache_key
            if key in cache_entries:
                vuln_results[key] = (cache_entries[key] or {}).get("vulns", [])
            else:
                cache_miss.append(pkg)
        # Online query for cache misses
        if osv_mode == "online" and cache_miss and time.monotonic() < deadline:
            online_results = _query_osv_online(cache_miss, new_cache, deadline)
            vuln_results.update(online_results)
            # Mark remaining misses as UNKNOWN (no cache entry)
            for pkg in cache_miss:
                if pkg.cache_key not in vuln_results:
                    vuln_results[pkg.cache_key] = None  # type: ignore[assignment]
        else:
            # Offline: cache misses → UNKNOWN
            for pkg in cache_miss:
                vuln_results[pkg.cache_key] = None  # type: ignore[assignment]
        # Persist new cache entries if online mode
        if new_cache and osv_mode == "online":
            try:
                _save_osv_cache(cache_path, new_cache)
            except Exception as e:
                logger.warning(f"Could not save OSV cache: {e}")
    # ── Step 3: Build vulnerability findings ─────────────────────────────────
    all_vulns: List[Vulnerability] = []
    cache_miss_pkgs: List[Package] = []
    for pkg in pinned:
        key = pkg.cache_key
        raw_vulns = vuln_results.get(key)
        if raw_vulns is None:
            cache_miss_pkgs.append(pkg)
            continue
        vulns = _lookup_vulnerability(pkg, raw_vulns)
        all_vulns.extend(vulns)
    # Apply vuln limit
    max_vulns = limits.get("max_vulns", 500)
    all_vulns = all_vulns[:max_vulns]
    # Sort by severity desc
    all_vulns.sort(key=lambda v: SEVERITY_ORDER.get(v.severity, 0), reverse=True)
    # ── Step 4: Outdated ──────────────────────────────────────────────────────
    outdated: List[OutdatedPackage] = []
    if outdated_cfg.get("enabled", True):
        # Filter out the None cache-miss sentinels before handing off.
        outdated = _analyze_outdated(pinned, {
            k: v for k, v in vuln_results.items() if v is not None
        })
    # ── Step 5: License policy ────────────────────────────────────────────────
    licenses = _apply_license_policy(all_packages, license_policy)
    # ── Step 6: Compute pass/fail ─────────────────────────────────────────────
    by_severity: Dict[str, int] = {s: 0 for s in SEVERITY_ORDER}
    for v in all_vulns:
        by_severity[v.severity] = by_severity.get(v.severity, 0) + 1
    blocking_count = sum(by_severity.get(s, 0) for s in fail_on)
    warning_count = sum(by_severity.get(s, 0) for s in warn_on)
    # License denials also block
    denied_licenses = [lf for lf in licenses if lf.policy == "deny"]
    if denied_licenses:
        blocking_count += len(denied_licenses)
    pass_ = blocking_count == 0
    # ── Step 7: Build recommendations ────────────────────────────────────────
    recommendations: List[str] = []
    if blocking_count > 0:
        top_crit = [v for v in all_vulns if v.severity in fail_on][:3]
        for v in top_crit:
            recommendations.append(v.recommendation)
    if warning_count > 0:
        recommendations.append(
            f"{warning_count} MEDIUM severity vulnerabilities found — review and upgrade where possible."
        )
    if cache_miss_pkgs:
        recommendations.append(
            f"{len(cache_miss_pkgs)} packages have no OSV cache entry (severity UNKNOWN). "
            "Run in online mode to populate cache: mode=online."
        )
    if unpinned:
        recommendations.append(
            f"{len(unpinned)} unpinned dependencies detected — cannot check for vulnerabilities. "
            "Pin versions in requirements.txt/lock files."
        )
    # ── Step 8: Summary ───────────────────────────────────────────────────────
    ecosystems_found = sorted({p.ecosystem for p in all_packages})
    # (deadline - timeout_sec) is the start timestamp; elapsed since then.
    elapsed_ms = round((time.monotonic() - (deadline - timeout_sec)) * 1000, 1)
    if pass_:
        summary = (
            f"✅ Dependency scan PASSED. "
            f"{len(pinned)} deps scanned, {len(all_vulns)} vulns found "
            f"({by_severity.get('CRITICAL', 0)} critical, {by_severity.get('HIGH', 0)} high)."
        )
    else:
        summary = (
            f"❌ Dependency scan FAILED. "
            f"{blocking_count} blocking issue(s): "
            f"{by_severity.get('CRITICAL', 0)} CRITICAL, {by_severity.get('HIGH', 0)} HIGH"
            + (f", {len(denied_licenses)} denied licenses" if denied_licenses else "")
            + "."
        )
    stats = {
        "ecosystems": ecosystems_found,
        "files_scanned": len(set(p.source_file for p in all_packages)),
        "deps_total": len(all_packages),
        "deps_pinned": len(pinned),
        "deps_unresolved": len(cache_miss_pkgs),
        "vulns_total": len(all_vulns),
        "by_severity": by_severity,
        "outdated_total": len(outdated),
        "elapsed_ms": elapsed_ms,
    }
    return ScanResult(
        pass_=pass_,
        summary=summary,
        stats=stats,
        vulnerabilities=[_vuln_to_dict(v) for v in all_vulns],
        outdated=[_outdated_to_dict(o) for o in outdated],
        licenses=[_license_to_dict(lf) for lf in licenses],
        recommendations=list(dict.fromkeys(recommendations)),  # dedupe
    )
def scan_dependencies_dict(repo_root: str, **kwargs) -> Dict:
    """Run scan_dependencies and flatten the result into a plain dict for ToolResult."""
    result = scan_dependencies(repo_root, **kwargs)
    payload = {"pass": result.pass_}
    for attr in ("summary", "stats", "vulnerabilities",
                 "outdated", "licenses", "recommendations"):
        payload[attr] = getattr(result, attr)
    return payload
# ─── Serializers ──────────────────────────────────────────────────────────────
def _vuln_to_dict(v: Vulnerability) -> Dict:
    """Serialize a Vulnerability, redacting evidence values on the way out."""
    payload = {"id": v.osv_id}
    for attr in ("ecosystem", "package", "version", "severity",
                 "fixed_versions", "aliases"):
        payload[attr] = getattr(v, attr)
    payload["evidence"] = {key: _redact(value) for key, value in v.evidence.items()}
    payload["recommendation"] = v.recommendation
    return payload
def _outdated_to_dict(o: OutdatedPackage) -> Dict:
return {
"ecosystem": o.ecosystem,
"package": o.package,
"current": o.current,
"latest": o.latest,
"notes": o.notes,
}
def _license_to_dict(lf: LicenseFinding) -> Dict:
return {
"package": lf.package,
"license": lf.license,
"policy": lf.policy,
"recommendation": lf.recommendation,
}
def _now_iso() -> str:
import datetime
return datetime.datetime.now(datetime.timezone.utc).isoformat()

View File

@@ -0,0 +1,898 @@
"""
Drift Analyzer — знаходить розбіжності між "джерелами правди" та "фактом".
4 категорії перевірок (незалежні, кожна повертає findings):
1. services — Service Catalog (inventory_services.csv / 01_SERVICE_CATALOG.md) vs docker-compose*.yml
2. openapi — OpenAPI specs (docs/contracts/*.yaml) vs routes у коді (FastAPI decorators)
3. nats — inventory_nats_topics.csv vs publish/subscribe usage у коді
4. tools — tools_rollout.yml + rbac_tools_matrix.yml vs фактичні handlers у tool_manager.py
Формат findings:
{ category, severity, id, title, evidence: {path, lines, details}, recommended_fix }
Pass rule: pass=false якщо errors > 0. Warnings/infos не валять gate.
"""
import csv
import fnmatch
import hashlib
import json
import logging
import os
import re
import time
import yaml
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ─── Constants ────────────────────────────────────────────────────────────────
# Directory names never descended into during repo walks.
EXCLUDED_DIRS: FrozenSet[str] = frozenset({
    "node_modules", ".git", "dist", "build", "vendor",
    ".venv", "venv", "venv_models", "sofia_venv",
    "__pycache__", ".pytest_cache", "rollback_backups",
    # NOTE(review): this entry contains a slash but exclusion checks compare
    # single path components — confirm it ever matches as intended.
    "docs/consolidation",
})
MAX_FILES_PER_CATEGORY = 300  # cap on files collected per check category
MAX_BYTES_PER_FILE = 262144  # 256KB
TIMEOUT_SEC = 25.0  # Hard deadline per full analysis
# Known tool handlers (must be kept in sync with execute_tool dispatch in tool_manager.py)
# Source: Priority 117 handlers in tool_manager.py
KNOWN_TOOL_HANDLERS: FrozenSet[str] = frozenset({
    "memory_search", "graph_query",
    "web_search", "web_extract",
    "image_generate", "comfy_generate_image", "comfy_generate_video",
    "remember_fact",
    "presentation_create", "presentation_status", "presentation_download",
    "crawl4ai_scrape", "tts_speak", "file_tool",
    "market_data",
    "crm_search_client", "crm_upsert_client", "crm_upsert_site",
    "crm_upsert_window_unit", "crm_create_quote", "crm_update_quote",
    "crm_create_job", "calc_window_quote",
    "docs_render_quote_pdf", "docs_render_invoice_pdf",
    "schedule_propose_slots", "schedule_confirm_slot",
    "repo_tool", "pr_reviewer_tool", "contract_tool",
    "oncall_tool", "observability_tool", "config_linter_tool",
    "threatmodel_tool", "job_orchestrator_tool", "kb_tool",
    "drift_analyzer_tool",  # self-registration
})
# ─── Data Structures ──────────────────────────────────────────────────────────
@dataclass
class Finding:
    """A single drift finding produced by one analyzer category."""
    category: str
    severity: str  # "error" | "warning" | "info"
    id: str
    title: str
    evidence: Dict[str, str] = field(default_factory=dict)
    recommended_fix: str = ""

    def to_dict(self) -> Dict:
        """Serialize to the wire format consumed by the drift report."""
        return {
            key: getattr(self, key)
            for key in ("category", "severity", "id", "title",
                        "evidence", "recommended_fix")
        }
@dataclass
class DriftReport:
    """Top-level result of a full drift analysis run."""
    pass_: bool  # gate verdict; per module docstring: False when errors > 0
    summary: str  # one-line human-readable summary
    stats: Dict[str, Any]  # per-category counters (see the _analyze_* helpers)
    findings: List[Dict]  # Finding.to_dict() payloads
# ─── Utility helpers ──────────────────────────────────────────────────────────
def _is_excluded(path: str) -> bool:
    """True when any component of *path* is in the EXCLUDED_DIRS set."""
    return not EXCLUDED_DIRS.isdisjoint(Path(path).parts)
def _walk_files(root: str, extensions: Tuple[str, ...],
                deadline: float) -> List[str]:
    """
    Walk repo root and collect files with given extensions.
    Respects EXCLUDED_DIRS, MAX_FILES_PER_CATEGORY, TIMEOUT_SEC.

    Args:
        root: directory to walk.
        extensions: suffix tuple accepted by str.endswith.
        deadline: time.monotonic() cutoff; the walk stops once exceeded.

    Returns:
        Absolute file paths, at most MAX_FILES_PER_CATEGORY entries.
    """
    found = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune excluded dirs in-place (affects os.walk recursion)
        dirnames[:] = [
            d for d in dirnames
            if d not in EXCLUDED_DIRS and not d.startswith(".")
        ]
        if time.monotonic() > deadline:
            logger.warning("_walk_files: timeout reached")
            break
        for fname in filenames:
            if fname.endswith(extensions):
                full = os.path.join(dirpath, fname)
                if not _is_excluded(full):
                    found.append(full)
                    # Hard cap: bail out immediately once the limit is reached.
                    if len(found) >= MAX_FILES_PER_CATEGORY:
                        return found
    return found
def _read_file(path: str) -> str:
    """Read *path* as text, truncated to MAX_BYTES_PER_FILE; '' on any error."""
    try:
        oversized = os.path.getsize(path) > MAX_BYTES_PER_FILE
        with open(path, "r", errors="replace") as fh:
            return fh.read(MAX_BYTES_PER_FILE) if oversized else fh.read()
    except Exception:
        return ""
_SECRET_PAT = re.compile(
r'(?i)(api[_-]?key|token|secret|password|bearer|jwt|private[_-]?key)'
r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?'
)
def _redact_evidence(text: str) -> str:
"""Mask potential secrets in evidence strings."""
return _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***REDACTED***", text)
def _rel(path: str, root: str) -> str:
"""Return path relative to root, or absolute if outside."""
try:
return os.path.relpath(path, root)
except ValueError:
return path
# ─── Category 1: Services ─────────────────────────────────────────────────────
def _load_service_catalog(repo_root: str) -> Dict[str, str]:
    """
    Load services from inventory_services.csv.

    Falls back to scraping markdown table rows out of 01_SERVICE_CATALOG.md
    when the CSV is absent. Returns {service_name: status}.
    """
    csv_path = os.path.join(
        repo_root, "docs", "architecture_inventory", "inventory_services.csv"
    )
    services = {}
    if not os.path.exists(csv_path):
        # Fallback: scan 01_SERVICE_CATALOG.md for table rows
        md_path = os.path.join(
            repo_root, "docs", "architecture_inventory", "01_SERVICE_CATALOG.md"
        )
        if os.path.exists(md_path):
            content = _read_file(md_path)
            for line in content.splitlines():
                # Markdown row: "| service-name | DEPLOYED ..." → (name, status).
                m = re.match(r'\|\s*([\w\-]+)\s*\|\s*(DEPLOYED|DEFINED|PLANNED[^\|]*)', line)
                if m:
                    services[m.group(1).strip()] = m.group(2).strip()
        return services
    try:
        with open(csv_path, "r", newline="", errors="replace") as f:
            reader = csv.DictReader(f)
            for row in reader:
                name = (row.get("service") or "").strip()
                status = (row.get("type") or "").strip()  # csv has 'type' not 'status'
                if name:
                    services[name] = status
    except Exception as e:
        logger.warning(f"Could not load inventory_services.csv: {e}")
    return services
def _load_compose_services(repo_root: str, deadline: float) -> Dict[str, str]:
    """
    Parse docker-compose*.yml files and return {service_name: compose_file}.

    Looks at docker-compose*.yml in the repo root plus the conventional
    infra/compose/docker-compose.yml location. Unparseable files are skipped;
    later files overwrite earlier ones for duplicate service names.
    """
    compose_files = []
    for entry in os.listdir(repo_root):
        if fnmatch.fnmatch(entry, "docker-compose*.yml"):
            compose_files.append(os.path.join(repo_root, entry))
    # Also infra subdir
    infra_compose = os.path.join(repo_root, "infra", "compose", "docker-compose.yml")
    if os.path.exists(infra_compose):
        compose_files.append(infra_compose)
    services = {}
    for cf in compose_files:
        if time.monotonic() > deadline:
            break
        try:
            content = _read_file(cf)
            data = yaml.safe_load(content) or {}
            svc_section = data.get("services") or {}
            for svc_name in svc_section:
                services[svc_name] = _rel(cf, repo_root)
        except Exception as e:
            logger.debug(f"Could not parse {cf}: {e}")
    return services
def _analyze_services(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict]:
    """Cross-check the service catalog against docker-compose definitions.

    Emits:
    - DRIFT-SVC-001 (error): catalog says DEPLOYED but no compose file defines it.
    - DRIFT-SVC-002 (warning): compose defines a service the catalog omits.

    Dash/underscore spelling variants are treated as the same service in BOTH
    directions. (The previous implementation computed
    ``svc.replace("-", "_").replace("_", "-")`` — the chained replaces collapse
    to the all-dash variant only, so the underscore spelling was never checked
    for DRIFT-SVC-002.)
    """
    findings = []
    catalog = _load_service_catalog(repo_root)
    compose_svcs = _load_compose_services(repo_root, deadline)
    compose_names = set(compose_svcs.keys())
    catalog_names = set(catalog.keys())

    def _variants(name: str) -> Set[str]:
        # A service may be spelled with dashes in one source and underscores
        # in the other; compare all three spellings.
        return {name, name.replace("-", "_"), name.replace("_", "-")}

    # DEPLOYED in catalog but missing from ALL compose files
    for svc, status in catalog.items():
        if "DEPLOYED" in status.upper() and not _variants(svc).intersection(compose_names):
            findings.append(Finding(
                category="services",
                severity="error",
                id="DRIFT-SVC-001",
                title=f"Service '{svc}' marked DEPLOYED in catalog but absent from all docker-compose files",
                evidence={"path": "docs/architecture_inventory/inventory_services.csv",
                          "details": f"status={status}, not found in compose"},
                recommended_fix=f"Add '{svc}' to appropriate docker-compose*.yml or update catalog status to DEFINED.",
            ))
    # In compose but not mentioned in catalog at all
    for svc, compose_file in compose_svcs.items():
        if not _variants(svc).intersection(catalog_names):
            findings.append(Finding(
                category="services",
                severity="warning",
                id="DRIFT-SVC-002",
                title=f"Service '{svc}' found in compose but not in service catalog",
                evidence={"path": compose_file, "details": f"defined in {compose_file}"},
                recommended_fix=f"Add '{svc}' to inventory_services.csv / 01_SERVICE_CATALOG.md.",
            ))
    stats = {
        "catalog_entries": len(catalog),
        "compose_services": len(compose_svcs),
        "findings": len(findings),
    }
    return findings, stats
# ─── Category 2: OpenAPI ──────────────────────────────────────────────────────
def _load_openapi_paths(repo_root: str, deadline: float) -> Dict[str, Set[str]]:
    """
    Scan docs/contracts/*.openapi.yaml and any openapi*.yaml/yml/json.

    Methods from multiple specs describing the same path are merged.
    Returns {"/path": {"get", "post", ...}}.
    """
    spec_files = []
    contracts_dir = os.path.join(repo_root, "docs", "contracts")
    if os.path.isdir(contracts_dir):
        for f in os.listdir(contracts_dir):
            if f.endswith((".yaml", ".yml", ".json")):
                spec_files.append(os.path.join(contracts_dir, f))
    # Also find any openapi*.yaml in repo root and services
    for dirpath, dirnames, filenames in os.walk(repo_root):
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS and not d.startswith(".")]
        if time.monotonic() > deadline:
            break
        for f in filenames:
            if re.match(r'openapi.*\.(ya?ml|json)$', f, re.IGNORECASE):
                full = os.path.join(dirpath, f)
                if full not in spec_files:
                    spec_files.append(full)
    paths: Dict[str, Set[str]] = {}
    for sf in spec_files:
        if time.monotonic() > deadline:
            break
        try:
            content = _read_file(sf)
            # JSON specs go through json.loads; YAML handled by safe_load.
            data = yaml.safe_load(content) if sf.endswith((".yaml", ".yml")) else json.loads(content)
            if not isinstance(data, dict) or "paths" not in data:
                continue
            for path, methods in (data.get("paths") or {}).items():
                if not isinstance(methods, dict):
                    continue
                # Keep only real HTTP verbs (drops "parameters", "summary", ...).
                methods_set = {
                    m.lower() for m in methods
                    if m.lower() in {"get", "post", "put", "patch", "delete", "head", "options"}
                }
                if path not in paths:
                    paths[path] = set()
                paths[path].update(methods_set)
        except Exception as e:
            logger.debug(f"Could not parse OpenAPI spec {sf}: {e}")
    return paths
# Decorator-style routes: @app.get("/path"), @router.post("/path"), etc.
# Captures (method, path).
_FASTAPI_ROUTE_PAT = re.compile(
    r'@(?:app|router)\.(get|post|put|patch|delete|head|options)\(\s*[\'"]([^\'"]+)[\'"]',
    re.MULTILINE,
)
# Programmatic registration: x.add_api_route("/path", handler, methods=["GET"]).
# Captures (path, raw methods list contents).
_ADD_API_ROUTE_PAT = re.compile(
    r'\.add_api_route\(\s*[\'"]([^\'"]+)[\'"].*?methods\s*=\s*\[([^\]]+)\]',
    re.MULTILINE | re.DOTALL,
)
def _load_code_routes(repo_root: str, deadline: float) -> Dict[str, Set[str]]:
    """
    Scan Python files for FastAPI route decorators.
    Returns {"/path": {"get", "post", ...}}.

    Covers decorator routes (_FASTAPI_ROUTE_PAT) and programmatic
    .add_api_route(...) registrations (_ADD_API_ROUTE_PAT).  Paths are
    normalized by trimming the trailing slash ("" becomes "/").
    Stops early once *deadline* (time.monotonic()) is exceeded.
    """
    py_files = _walk_files(repo_root, (".py",), deadline)
    routes: Dict[str, Set[str]] = {}
    for pf in py_files:
        if time.monotonic() > deadline:
            break
        # Skip vendored/virtualenv code — those routes are not ours.
        if ".venv" in pf or "venv" in pf or "node_modules" in pf:
            continue
        content = _read_file(pf)
        if not content:
            continue
        for method, path in _FASTAPI_ROUTE_PAT.findall(content):
            norm = path.rstrip("/") or "/"
            if norm not in routes:
                routes[norm] = set()
            routes[norm].add(method.lower())
        for path, methods_raw in _ADD_API_ROUTE_PAT.findall(content):
            # methods=["GET", 'POST'] → {"get", "post"} (strip quotes/space).
            methods = {m.strip().strip('"\'').lower() for m in methods_raw.split(",")}
            norm = path.rstrip("/") or "/"
            if norm not in routes:
                routes[norm] = set()
            routes[norm].update(methods)
    return routes
def _normalize_path(path: str) -> str:
"""Normalize OAS path for comparison: remove trailing slash, lowercase."""
return path.rstrip("/").lower() or "/"
# Paths that are infrastructure-level and expected to be missing from OAS specs.
# Add /internal/* and /debug/* patterns if your project uses them.
_OAS_IGNORE_PATH_PREFIXES: Tuple[str, ...] = (
"/healthz", "/readyz", "/livez", "/metrics",
"/internal/", "/debug/", "/__", "/favicon",
)
def _is_oas_ignored(path: str) -> bool:
"""Return True if path is on the OAS ignore allowlist."""
p = path.lower()
return any(p == prefix.rstrip("/") or p.startswith(prefix)
for prefix in _OAS_IGNORE_PATH_PREFIXES)
def _load_openapi_deprecated(repo_root: str) -> Set[str]:
    """
    Return normalized paths marked as 'deprecated: true' in any OAS spec.
    Deprecated endpoints downgrade from error to warning (DRIFT-OAS-001).

    Note: unlike _load_openapi_paths, this walk takes no deadline — it is
    bounded by the number of spec files, assumed to be small.
    """
    deprecated: Set[str] = set()
    spec_files: List[str] = []
    for dirpath, dirnames, filenames in os.walk(repo_root):
        # Prune excluded/hidden dirs in-place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS and not d.startswith(".")]
        for f in filenames:
            if re.match(r'openapi.*\.(ya?ml|json)$', f, re.IGNORECASE):
                spec_files.append(os.path.join(dirpath, f))
    for sf in spec_files:
        try:
            content = _read_file(sf)
            data = yaml.safe_load(content) if sf.endswith((".yaml", ".yml")) else json.loads(content)
            if not isinstance(data, dict) or "paths" not in data:
                continue
            for path, methods in (data.get("paths") or {}).items():
                if not isinstance(methods, dict):
                    continue
                # A path counts as deprecated when ANY of its operations is.
                for method, operation in methods.items():
                    if isinstance(operation, dict) and operation.get("deprecated", False):
                        deprecated.add(_normalize_path(path))
        except Exception:
            # Unparseable spec simply contributes no deprecations.
            pass
    return deprecated
def _analyze_openapi(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict]:
    """
    Compare OpenAPI specs against FastAPI routes discovered in code.

    Emits:
        DRIFT-OAS-001 (error; warning if deprecated) — spec path missing from code
        DRIFT-OAS-003 (warning) — spec declares methods the code route lacks
        DRIFT-OAS-002 (error)   — /v1/ code route missing from every spec

    Returns (findings, stats).  When no specs exist at all, returns no
    findings — there is nothing to compare against.
    """
    findings = []
    spec_paths = _load_openapi_paths(repo_root, deadline)
    code_routes = _load_code_routes(repo_root, deadline)
    if not spec_paths:
        return findings, {"spec_paths": 0, "code_routes": len(code_routes), "findings": 0}
    deprecated_paths = _load_openapi_deprecated(repo_root)
    # Normalize both sides so "/Foo/" and "/foo" compare equal.
    spec_norm: Dict[str, Set[str]] = {
        _normalize_path(p): methods for p, methods in spec_paths.items()
    }
    code_norm: Dict[str, Set[str]] = {
        _normalize_path(p): methods for p, methods in code_routes.items()
    }
    # DRIFT-OAS-001: In spec but not in code
    for path, methods in sorted(spec_norm.items()):
        # Skip infra/health endpoints — they are expected to be absent from OAS
        if _is_oas_ignored(path):
            continue
        if path not in code_norm:
            # Deprecated spec paths → warning only, not blocking
            severity = "warning" if path in deprecated_paths else "error"
            dep_note = " (deprecated in spec)" if path in deprecated_paths else ""
            findings.append(Finding(
                category="openapi",
                severity=severity,
                id="DRIFT-OAS-001",
                title=f"OpenAPI path '{path}'{dep_note} not found in codebase routes",
                evidence={"path": "docs/contracts/",
                          "details": f"methods={sorted(methods)}, missing from FastAPI decorators"},
                recommended_fix=(
                    f"Mark '{path}' as removed in OpenAPI or implement the route."
                    if path in deprecated_paths
                    else f"Implement '{path}' route in code or remove from OpenAPI spec."
                ),
            ))
        else:
            # DRIFT-OAS-003: Method mismatch
            code_methods = code_norm[path]
            missing_in_code = methods - code_methods
            if missing_in_code:
                findings.append(Finding(
                    category="openapi",
                    severity="warning",
                    id="DRIFT-OAS-003",
                    title=f"Method mismatch for path '{path}': spec has {sorted(missing_in_code)}, code missing",
                    evidence={"path": "docs/contracts/",
                              "details": f"spec={sorted(methods)}, code={sorted(code_methods)}"},
                    recommended_fix=f"Add missing HTTP methods to code route for '{path}'.",
                ))
    # DRIFT-OAS-002: In code (/v1/ paths) but not in spec
    # Only versioned /v1/ routes are required to be documented.
    for path, methods in sorted(code_norm.items()):
        # Health/internal endpoints are expected to be absent from OAS
        if _is_oas_ignored(path):
            continue
        if not path.startswith("/v1/"):
            continue
        if path not in spec_norm:
            findings.append(Finding(
                category="openapi",
                severity="error",
                id="DRIFT-OAS-002",
                title=f"Code route '{path}' not documented in any OpenAPI spec",
                evidence={"path": "services/", "details": f"methods={sorted(methods)}"},
                recommended_fix=f"Add '{path}' to OpenAPI spec in docs/contracts/.",
            ))
    stats = {
        "spec_paths": len(spec_paths),
        "code_routes": len(code_routes),
        "findings": len(findings),
    }
    return findings, stats
# ─── Category 3: NATS ─────────────────────────────────────────────────────────
_NATS_WILDCARD_PAT = re.compile(r'\{[^}]+\}|\*|>') # {agent_id}, *, >
def _normalize_nats_subject(subj: str) -> str:
"""Replace wildcards with * for matching. Lowercase."""
return _NATS_WILDCARD_PAT.sub("*", subj.strip()).lower()
def _load_nats_inventory(repo_root: str) -> Optional[List[str]]:
    """
    Load documented NATS subjects from inventory_nats_topics.csv.
    Returns list of normalized subjects, or None if file absent.

    None (as opposed to []) signals "no inventory at all" so the caller can
    skip the NATS category entirely; a read/parse error is treated the same
    way after logging a warning.
    """
    csv_path = os.path.join(
        repo_root, "docs", "architecture_inventory", "inventory_nats_topics.csv"
    )
    if not os.path.exists(csv_path):
        return None
    subjects = []
    try:
        with open(csv_path, "r", newline="", errors="replace") as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Only the "subject" column matters; blanks are dropped.
                subj = (row.get("subject") or "").strip()
                if subj:
                    subjects.append(_normalize_nats_subject(subj))
    except Exception as e:
        logger.warning(f"Could not load nats inventory: {e}")
        return None
    return subjects
# Regexes that pull candidate NATS subjects out of Python source, ordered from
# most specific (publish/subscribe calls) to most speculative (any dotted
# lowercase string literal).  False positives are filtered afterwards by the
# "." requirement and _NATS_SUBJECT_VALIDATE in _load_nats_code_subjects.
_NATS_USAGE_PATTERNS = [
    re.compile(r'(?:nc|nats|js|jetstream)\.publish\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'(?:nc|nats|js|jetstream)\.subscribe\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'nc\.subscribe\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'subject\s*=\s*[\'"]([a-zA-Z0-9._{}*>-]{4,})[\'"]', re.IGNORECASE),
    re.compile(r'SUBJECT\s*=\s*[\'"]([a-zA-Z0-9._{}*>-]{4,})[\'"]'),
    re.compile(r'[\'"]([a-z][a-z0-9_]+\.[a-z][a-z0-9_]+(?:\.[a-zA-Z0-9_{}_.*>-]+){0,4})[\'"]'),
]
# A plausible subject: starts with a letter, ≥3 chars, subject alphabet only.
_NATS_SUBJECT_VALIDATE = re.compile(r'^[a-zA-Z][a-zA-Z0-9._{}*>-]{2,}$')
def _load_nats_code_subjects(repo_root: str, deadline: float) -> Set[str]:
    """Extract NATS subjects from code via regex patterns.

    Returns normalized subjects (wildcards collapsed to '*').  Only files
    containing a NATS-looking call are scanned in full; candidate strings
    must contain a dot and pass _NATS_SUBJECT_VALIDATE.  Stops early at
    *deadline* (time.monotonic()).
    """
    py_files = _walk_files(repo_root, (".py",), deadline)
    found: Set[str] = set()
    for pf in py_files:
        if time.monotonic() > deadline:
            break
        # Skip vendored/virtualenv code.
        if "venv" in pf or "node_modules" in pf:
            continue
        content = _read_file(pf)
        if not content:
            continue
        # Quick pre-filter: must contain at least one NATS-like call pattern
        _NATS_CALL_HINTS = ("nc.", "nats.", "js.", "jetstream.", "subject=", "SUBJECT=", ".publish(", ".subscribe(")
        if not any(hint in content for hint in _NATS_CALL_HINTS):
            continue
        for pat in _NATS_USAGE_PATTERNS:
            for m in pat.finditer(content):
                subj = m.group(1).strip()
                # Basic subject validation (must contain a dot)
                if "." in subj and _NATS_SUBJECT_VALIDATE.match(subj):
                    found.add(_normalize_nats_subject(subj))
    return found
def _nats_subject_matches(code_subj: str, inventory_subjects: List[str]) -> bool:
    """
    Check if a code subject matches any inventory subject (wildcard-aware).
    Supports * (one segment) and > (one or more segments).

    The comparison is symmetric: either side may carry the wildcards.
    """
    code_parts = code_subj.split(".")
    return any(
        _nats_match(code_parts, candidate.split("."))
        or _nats_match(candidate.split("."), code_parts)
        for candidate in inventory_subjects
    )
def _nats_match(a_parts: List[str], b_parts: List[str]) -> bool:
"""Match NATS subject a against pattern b (with * and > wildcards)."""
if not b_parts:
return not a_parts
if b_parts[-1] == ">":
return len(a_parts) >= len(b_parts) - 1
if len(a_parts) != len(b_parts):
return False
for a, b in zip(a_parts, b_parts):
if b == "*" or a == "*":
continue
if a != b:
return False
return True
def _analyze_nats(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict, bool]:
    """Returns (findings, stats, skipped).

    skipped=True (with empty findings) when the inventory CSV is absent —
    the category is then reported as skipped rather than passing/failing.

    Emits:
        DRIFT-NATS-001 (warning) — subject used in code, absent from inventory
        DRIFT-NATS-002 (info)    — documented subject never seen in code
    """
    inventory = _load_nats_inventory(repo_root)
    if inventory is None:
        return [], {"skipped": True}, True
    code_subjects = _load_nats_code_subjects(repo_root, deadline)
    findings = []
    # DRIFT-NATS-001: Used in code but not in inventory
    for subj in sorted(code_subjects):
        if not _nats_subject_matches(subj, inventory):
            findings.append(Finding(
                category="nats",
                severity="warning",
                id="DRIFT-NATS-001",
                title=f"NATS subject '{subj}' used in code but not in inventory",
                evidence={"path": "docs/architecture_inventory/inventory_nats_topics.csv",
                          "details": f"subject '{subj}' not found (wildcard-aware match)"},
                recommended_fix=f"Add '{subj}' to inventory_nats_topics.csv.",
            ))
    # DRIFT-NATS-002: In inventory but not used in code (info — may be legacy)
    for inv_subj in inventory:
        # NOTE(review): inventory subjects come pre-normalized ('>' → '*'),
        # so the ".>" case below looks unreachable — confirm intent.
        if inv_subj.endswith(".*") or inv_subj.endswith(".>"):
            continue  # wildcard subscriptions — skip
        if not _nats_subject_matches(inv_subj, list(code_subjects)):
            findings.append(Finding(
                category="nats",
                severity="info",
                id="DRIFT-NATS-002",
                title=f"Documented NATS subject '{inv_subj}' not found in code (possibly legacy)",
                evidence={"path": "docs/architecture_inventory/inventory_nats_topics.csv",
                          "details": "no matching publish/subscribe call found"},
                recommended_fix="Verify if subject is still active; mark as deprecated in inventory if not.",
            ))
    stats = {
        "inventory_subjects": len(inventory),
        "code_subjects": len(code_subjects),
        "findings": len(findings),
    }
    return findings, stats, False
# ─── Category 4: Tools ────────────────────────────────────────────────────────
def _load_rollout_tools(repo_root: str) -> Set[str]:
    """Extract all tool names mentioned in tools_rollout.yml groups.

    List entries starting with '@' are group references and are expanded
    recursively; every other string entry is a tool name.  Returns an
    empty set when the file is absent or unparseable.

    Fix: group expansion now tracks visited groups — a cyclic reference
    (e.g. a group listing "@itself", directly or transitively) previously
    recursed until RecursionError.  Since results go into a set, expanding
    each group at most once yields the same tool collection.
    """
    rollout_path = os.path.join(repo_root, "config", "tools_rollout.yml")
    tools: Set[str] = set()
    try:
        with open(rollout_path, "r") as f:
            data = yaml.safe_load(f) or {}
    except Exception:
        return tools
    expanded_groups: Set[str] = set()  # cycle guard for @group references
    # Collect all values from group lists (non-@group entries are tool names)
    def _collect(obj):
        if isinstance(obj, list):
            for item in obj:
                if isinstance(item, str) and not item.startswith("@"):
                    tools.add(item)
                elif isinstance(item, str) and item.startswith("@"):
                    group_name = item[1:]
                    if group_name in data and group_name not in expanded_groups:
                        expanded_groups.add(group_name)
                        _collect(data[group_name])
        elif isinstance(obj, dict):
            for v in obj.values():
                _collect(v)
    for key, value in data.items():
        if key not in ("role_map", "agent_roles"):  # these are role configs, not tool lists
            _collect(value)
    # Also scan role_map tool lists (tolerate malformed/non-dict entries).
    role_map = data.get("role_map", {})
    if isinstance(role_map, dict):
        for role_cfg in role_map.values():
            if isinstance(role_cfg, dict):
                _collect(role_cfg.get("tools", []))
    return tools
def _load_rbac_tools(repo_root: str) -> Dict[str, Set[str]]:
    """Load tool→{actions} from rbac_tools_matrix.yml.

    A missing or unparseable matrix yields an empty mapping; a mid-parse
    failure leaves whatever entries were already collected.
    """
    matrix_path = os.path.join(repo_root, "config", "rbac_tools_matrix.yml")
    result: Dict[str, Set[str]] = {}
    try:
        with open(matrix_path, "r") as f:
            data = yaml.safe_load(f) or {}
        tools_section = data.get("tools") or {}
        for tool_name, tool_cfg in tools_section.items():
            action_names = ((tool_cfg.get("actions") or {}).keys())
            result[tool_name] = set(action_names)
    except Exception:
        pass
    return result
def _get_effective_tools_for_roles(repo_root: str) -> Dict[str, Set[str]]:
    """Get effective tools for agent_default and agent_cto roles.

    Imports the router's agent_tools_config at runtime (extending sys.path
    with the router dir and repo root), reloads the rollout config, and
    queries two representative agent ids.  Returns {} (or a partial dict)
    on any failure, after logging a warning.

    Side effect: sys.path is mutated for the lifetime of the process.
    """
    result = {}
    try:
        import sys
        router_path = os.path.join(repo_root, "services", "router")
        if router_path not in sys.path:
            sys.path.insert(0, router_path)
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)
        from agent_tools_config import get_agent_tools, reload_rollout_config
        reload_rollout_config()
        # Use representative agents per role
        # (presumably an unknown agent id resolves to the default role —
        # verify against agent_tools_config.get_agent_tools)
        result["agent_default"] = set(get_agent_tools("brand_new_agent_xyz_test"))
        result["agent_cto"] = set(get_agent_tools("sofiia"))
    except Exception as e:
        logger.warning(f"Could not load effective tools: {e}")
    return result
def _analyze_tools(repo_root: str) -> Tuple[List[Finding], Dict]:
    """
    Cross-check tools_rollout.yml, rbac_tools_matrix.yml, the handler
    registry (KNOWN_TOOL_HANDLERS) and per-role effective tool sets.

    Emits:
        DRIFT-TOOLS-001 (error)         — rolled-out tool with no handler
        DRIFT-TOOLS-002 (error/warning) — handler without an RBAC entry
        DRIFT-TOOLS-003 (warning)       — RBAC entry never granted to a role
                                          (skipped when effective tools
                                          could not be loaded)
    """
    findings = []
    rollout_tools = _load_rollout_tools(repo_root)
    rbac_tools = _load_rbac_tools(repo_root)
    role_tools = _get_effective_tools_for_roles(repo_root)
    # Union of every role's effective tools.
    all_role_tools: Set[str] = set()
    for tools in role_tools.values():
        all_role_tools.update(tools)
    # DRIFT-TOOLS-001: Tool in rollout but no handler in tool_manager.py
    for tool in sorted(rollout_tools):
        if tool not in KNOWN_TOOL_HANDLERS:
            findings.append(Finding(
                category="tools",
                severity="error",
                id="DRIFT-TOOLS-001",
                title=f"Tool '{tool}' in tools_rollout.yml but no handler in tool_manager.py",
                evidence={"path": "config/tools_rollout.yml",
                          "details": f"'{tool}' referenced in rollout groups but missing from KNOWN_TOOL_HANDLERS"},
                recommended_fix=f"Add handler for '{tool}' in tool_manager.py execute_tool dispatch, or remove from rollout.",
            ))
    # DRIFT-TOOLS-002: Handler exists but not in RBAC matrix
    # Severity = error if tool is in rollout/standard_stack (actively used, no RBAC gate)
    # Severity = warning if tool appears experimental / not yet rolled out
    for tool in sorted(KNOWN_TOOL_HANDLERS):
        if tool not in rbac_tools:
            # Escalate to error if tool is actively distributed to agents
            is_rollouted = tool in rollout_tools or tool in all_role_tools
            severity = "error" if is_rollouted else "warning"
            findings.append(Finding(
                category="tools",
                severity=severity,
                id="DRIFT-TOOLS-002",
                title=f"Tool '{tool}' has a handler but is absent from rbac_tools_matrix.yml",
                evidence={"path": "config/rbac_tools_matrix.yml",
                          "details": (
                              f"'{tool}' not found in matrix.tools section. "
                              + ("In rollout → no RBAC gate applied." if is_rollouted
                                 else "Not in rollout (experimental/legacy).")
                          )},
                recommended_fix=f"Add '{tool}' with actions and entitlements to rbac_tools_matrix.yml.",
            ))
    # DRIFT-TOOLS-003: Tool in RBAC matrix but never appears in effective_tools
    if all_role_tools:
        for tool in sorted(rbac_tools.keys()):
            if tool not in all_role_tools:
                findings.append(Finding(
                    category="tools",
                    severity="warning",
                    id="DRIFT-TOOLS-003",
                    title=f"Tool '{tool}' is in RBAC matrix but never appears in effective_tools (dead config?)",
                    evidence={"path": "config/rbac_tools_matrix.yml",
                              "details": f"'{tool}' in matrix but not in any role's effective tool list"},
                    recommended_fix=f"Add '{tool}' to a role in tools_rollout.yml or remove from matrix.",
                ))
    stats = {
        "rollout_tools": len(rollout_tools),
        "rbac_tools": len(rbac_tools),
        "handlers": len(KNOWN_TOOL_HANDLERS),
        "role_tools": {role: len(tools) for role, tools in role_tools.items()},
        "findings": len(findings),
    }
    return findings, stats
# ─── Main Analyzer ────────────────────────────────────────────────────────────
def analyze_drift(
    repo_root: str,
    categories: Optional[List[str]] = None,
    timeout_sec: float = TIMEOUT_SEC,
) -> DriftReport:
    """
    Run drift analysis across requested categories.
    Args:
        repo_root: absolute path to repository root
        categories: subset of ["services", "openapi", "nats", "tools"] (all if None);
            unknown names are silently ignored
        timeout_sec: hard deadline for full analysis
    Returns:
        DriftReport with pass/fail verdict (pass == zero error-severity findings)
    """
    all_categories = {"services", "openapi", "nats", "tools"}
    if categories:
        run_cats = {c for c in categories if c in all_categories}
    else:
        run_cats = all_categories
    deadline = time.monotonic() + timeout_sec
    all_findings: List[Finding] = []
    skipped: List[str] = []
    items_checked: Dict[str, int] = {}
    cat_stats: Dict[str, Any] = {}
    if "services" in run_cats:
        findings, stats = _analyze_services(repo_root, deadline)
        all_findings.extend(findings)
        cat_stats["services"] = stats
        items_checked["services"] = stats.get("catalog_entries", 0) + stats.get("compose_services", 0)
    if "openapi" in run_cats:
        findings, stats = _analyze_openapi(repo_root, deadline)
        all_findings.extend(findings)
        cat_stats["openapi"] = stats
        items_checked["openapi"] = stats.get("spec_paths", 0) + stats.get("code_routes", 0)
    if "nats" in run_cats:
        findings, stats, was_skipped = _analyze_nats(repo_root, deadline)
        if was_skipped:
            # No inventory file → category reported as skipped, not failed.
            skipped.append("nats")
        else:
            all_findings.extend(findings)
            cat_stats["nats"] = stats
            items_checked["nats"] = stats.get("inventory_subjects", 0) + stats.get("code_subjects", 0)
    if "tools" in run_cats:
        # Config-file sized work — _analyze_tools takes no deadline.
        findings, stats = _analyze_tools(repo_root)
        all_findings.extend(findings)
        cat_stats["tools"] = stats
        items_checked["tools"] = stats.get("rollout_tools", 0) + stats.get("rbac_tools", 0)
    # Sort findings: severity desc (error > warning > info), then category, then id
    severity_order = {"error": 0, "warning": 1, "info": 2}
    all_findings.sort(key=lambda f: (severity_order.get(f.severity, 9), f.category, f.id))
    # Redact evidence
    for f in all_findings:
        if f.evidence.get("details"):
            f.evidence["details"] = _redact_evidence(f.evidence["details"])
    errors = sum(1 for f in all_findings if f.severity == "error")
    warnings = sum(1 for f in all_findings if f.severity == "warning")
    infos = sum(1 for f in all_findings if f.severity == "info")
    pass_ = errors == 0
    if pass_:
        summary = f"✅ Drift analysis PASSED. {len(all_findings)} findings ({warnings} warnings, {infos} infos)."
    else:
        summary = (
            f"❌ Drift analysis FAILED. {errors} error(s), {warnings} warning(s). "
            f"Categories checked: {sorted(run_cats - {'nats'} if 'nats' in skipped else run_cats)}."
        )
    if skipped:
        summary += f" Skipped (no inventory): {skipped}."
    # (deadline - timeout_sec) recovers the start timestamp.
    elapsed_ms = round((time.monotonic() - (deadline - timeout_sec)) * 1000, 1)
    return DriftReport(
        pass_=pass_,
        summary=summary,
        stats={
            "errors": errors,
            "warnings": warnings,
            "infos": infos,
            "skipped": skipped,
            "items_checked": items_checked,
            "elapsed_ms": elapsed_ms,
            "by_category": cat_stats,
        },
        findings=[f.to_dict() for f in all_findings],
    )
def analyze_drift_dict(repo_root: str, **kwargs) -> Dict:
    """Convenience wrapper that returns a plain dict (for ToolResult)."""
    report = analyze_drift(repo_root, **kwargs)
    payload: Dict = {"pass": report.pass_}
    payload["summary"] = report.summary
    payload["stats"] = report.stats
    payload["findings"] = report.findings
    return payload

View File

@@ -0,0 +1,106 @@
"""
incident_artifacts.py — File-based artifact storage for incidents.
Layout: ops/incidents/<incident_id>/<filename>
Security:
- Path traversal guard (realpath must stay within base_dir)
- Max 2MB per artifact
- Only allowed formats: json, md, txt
- Atomic writes (temp + rename)
"""
from __future__ import annotations
import base64
import hashlib
import logging
import os
import tempfile
from pathlib import Path
from typing import Dict, Optional
logger = logging.getLogger(__name__)
MAX_ARTIFACT_BYTES = 2 * 1024 * 1024 # 2MB
ALLOWED_FORMATS = {"json", "md", "txt"}
_ARTIFACTS_BASE = os.getenv(
"INCIDENT_ARTIFACTS_DIR",
str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
)
def _base_dir() -> Path:
return Path(os.getenv("INCIDENT_ARTIFACTS_DIR", _ARTIFACTS_BASE))
def _safe_filename(name: str) -> str:
"""Strip path separators and dangerous chars."""
safe = "".join(c for c in name if c.isalnum() or c in (".", "_", "-"))
return safe or "artifact"
def write_artifact(
incident_id: str,
filename: str,
content_bytes: bytes,
*,
base_dir: Optional[str] = None,
) -> Dict:
"""
Write an artifact file atomically.
Returns: {"path": str, "sha256": str, "size_bytes": int}
Raises: ValueError on validation failure, OSError on write failure.
"""
if not incident_id or "/" in incident_id or ".." in incident_id:
raise ValueError(f"Invalid incident_id: {incident_id}")
if len(content_bytes) > MAX_ARTIFACT_BYTES:
raise ValueError(f"Artifact too large: {len(content_bytes)} bytes (max {MAX_ARTIFACT_BYTES})")
safe_name = _safe_filename(filename)
ext = safe_name.rsplit(".", 1)[-1].lower() if "." in safe_name else ""
if ext not in ALLOWED_FORMATS:
raise ValueError(f"Format '{ext}' not allowed. Allowed: {ALLOWED_FORMATS}")
bd = Path(base_dir) if base_dir else _base_dir()
inc_dir = bd / incident_id
inc_dir.mkdir(parents=True, exist_ok=True)
target = inc_dir / safe_name
real_base = bd.resolve()
real_target = target.resolve()
if not str(real_target).startswith(str(real_base)):
raise ValueError("Path traversal detected")
sha = hashlib.sha256(content_bytes).hexdigest()
# Atomic write: temp file → rename
fd, tmp_path = tempfile.mkstemp(dir=str(inc_dir), suffix=f".{ext}.tmp")
try:
os.write(fd, content_bytes)
os.close(fd)
os.replace(tmp_path, str(target))
except Exception:
os.close(fd) if not os.get_inheritable(fd) else None
if os.path.exists(tmp_path):
os.unlink(tmp_path)
raise
rel_path = str(target.relative_to(bd.parent.parent)) if bd.parent.parent.exists() else str(target)
logger.info("Artifact written: %s (%d bytes, sha256=%s…)", rel_path, len(content_bytes), sha[:12])
return {
"path": rel_path,
"sha256": sha,
"size_bytes": len(content_bytes),
}
def decode_content(content_base64: str) -> bytes:
    """Decode base64-encoded content.

    Raises:
        ValueError: when the input is not valid base64.  The original
            decode error is chained as __cause__ for debuggability (B904).
    """
    try:
        return base64.b64decode(content_base64)
    except Exception as exc:
        raise ValueError(f"Invalid base64 content: {exc}") from exc

View File

@@ -0,0 +1,379 @@
"""
incident_escalation.py — Deterministic Incident Escalation Engine.
Actions (exposed via incident_escalation_tool):
evaluate — check active signatures against escalation thresholds
auto_resolve_candidates — find open incidents with no recent alerts
No LLM usage; all logic is policy-driven.
"""
from __future__ import annotations
import datetime
import logging
import os
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Severity ordering ────────────────────────────────────────────────────────
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
def _sev_higher(a: str, b: str) -> bool:
"""Return True if a is more severe (lower P number) than b."""
return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99)
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
"""Return next higher severity, or None if already at/above cap."""
idx = _SEV_ORDER.get(current)
if idx is None or idx == 0:
return None
target = _SEV_NAMES[idx - 1]
if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
return None # would exceed cap
return target
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _plus_hours(hours: int) -> str:
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
# ─── Policy loading ───────────────────────────────────────────────────────────
# Process-wide cache; populated once by load_escalation_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate locations, checked in order: CWD-relative first, then
# repo-root-relative (three levels up from this module).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
def load_escalation_policy() -> Dict:
    """Load (and cache) the escalation policy YAML; fall back to built-ins.

    Returns the first parseable policy from _POLICY_PATHS; a file that
    exists but fails to parse is logged and the next path is tried.  When
    nothing loads, _builtin_defaults() is cached and returned.  Never raises.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if path.exists():
            try:
                with open(path) as f:
                    data = yaml.safe_load(f) or {}
                _POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
def _builtin_defaults() -> Dict:
    """Fallback policy used when incident_escalation_policy.yml is absent.

    Mirrors the YAML schema: escalation thresholds, follow-up template,
    auto-resolve behavior, and alert-loop SLO targets.
    """
    return {
        "defaults": {"window_minutes": 60},
        "escalation": {
            "occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
            "triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
            "severity_cap": "P0",
            "create_followup_on_escalate": True,
            "followup": {
                "priority": "P1", "due_hours": 24, "owner": "oncall",
                "message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
            },
        },
        "auto_resolve": {
            "no_alerts_minutes_for_candidate": 60,
            "close_allowed_severities": ["P2", "P3"],
            "auto_close": False,
            "candidate_event_type": "note",
            "candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
        },
        "alert_loop_slo": {
            "claim_to_ack_p95_seconds": 60,
            "failed_rate_pct": 5,
            "processing_stuck_minutes": 15,
        },
    }
# ─── Escalation thresholds helper ────────────────────────────────────────────
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return target severity if escalation is needed, else None.

    A rule fires when EITHER the 60-minute occurrence count OR the 24-hour
    triage count reaches its policy threshold.  The result is clamped to
    the policy's severity_cap: an escalation that would pass the cap
    returns the cap itself.
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")
    # Build escalation rules in priority order (most → least severe)
    # Each entry: (from_severity, to_severity, occurrence_limit, triage_limit).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]
    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m >= occ_limit or triage_count_24h >= triage_limit:
            # Check cap
            if not _sev_higher(cap, to_sev) and to_sev != cap:
                # to_sev is more severe than cap — not allowed
                if _sev_higher(to_sev, cap):
                    return cap
            return to_sev
    return None
# ─── Core evaluate function ───────────────────────────────────────────────────
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """
    Main escalation evaluation. Returns structured summary.

    For every active alert signature, find a matching open (or mitigating)
    incident and escalate its severity when the policy's occurrence/triage
    thresholds are exceeded.  With dry_run=True, candidates are reported
    but no events are written.

    Args:
        params: {"env": str|None, "window_minutes": int, "limit": int}.
        alert_store: unused here; kept for a uniform store-injection signature.
        sig_state_store: exposes list_active_signatures(window_minutes, limit).
        incident_store: exposes list_incidents(filter, limit) and append_event().
        policy: escalation policy dict; loaded via load_escalation_policy() when None.
        dry_run: when True, compute candidates only (no store writes).

    Returns:
        {"evaluated", "escalated", "followups_created", "candidates",
         "recommendations", "dry_run"}
    """
    if policy is None:
        policy = load_escalation_policy()
    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get("window_minutes",
                                    policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))
    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})
    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )
    # Perf fix: incident fetches are hoisted out of the signature loop.  The
    # previous version re-queried the store once (or twice) PER SIGNATURE for
    # data that does not change during evaluation — O(signatures) round-trips.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None  # fetched lazily, at most once
    def _matching_for(incidents: List[Dict], signature: str) -> List[Dict]:
        """Incidents whose meta signature matches, honoring the env filter."""
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]
    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []
    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)
        # Find open incident with this signature
        matching = _matching_for(open_incidents, signature)
        if not matching:
            # Also check mitigating
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matching_for(mitigating_incidents, signature)
        if not matching:
            evaluated += 1
            continue
        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")
        evaluated += 1
        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed
        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })
        if dry_run:
            continue
        # Append escalation decision event
        esc_msg = (
            f"Escalated {current_sev}→{target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1
        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1
        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )
    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """
    Find open incidents where no alerts have been seen in the last N minutes.
    Returns list of candidate incidents.
    By default dry_run=True — no state changes.

    Args:
        params: {"no_alerts_minutes": int, "env": str|None, "limit": int}.
        sig_state_store: exposes get_state(signature).
        incident_store: exposes list_incidents/append_event/close_incident.
        policy: auto_resolve policy source; loaded from YAML when None.
        dry_run: when False, a candidate note is appended per incident and —
            if the policy enables auto_close — eligible incidents are closed.
    """
    if policy is None:
        policy = load_escalation_policy()
    ar = policy.get("auto_resolve", {})
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )
    now_dt = datetime.datetime.utcnow()
    # ISO strings compare lexicographically, so string >= below acts as time >=.
    no_alert_cutoff = (now_dt - datetime.timedelta(minutes=no_alerts_minutes)).isoformat()
    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]
    candidates: List[Dict] = []
    closed: List[str] = []
    for incident in all_open:
        inc_id = incident["id"]
        signature = incident.get("meta", {}).get("incident_signature")
        if not signature:
            continue  # cannot correlate without a signature
        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue
        last_alert = sig_state.get("last_alert_at") or ""
        if last_alert >= no_alert_cutoff:
            continue  # alert seen recently → not a candidate
        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed
        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            # Empty last_alert ("") → report the configured window itself.
            "minutes_without_alerts": round(
                (now_dt - datetime.datetime.fromisoformat(last_alert)).total_seconds() / 60
                if last_alert else no_alerts_minutes
            ),
            "auto_close_eligible": can_close and auto_close,
        })
        if dry_run:
            continue
        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })
        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)
    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }

View File

@@ -0,0 +1,143 @@
"""
incident_intel_utils.py — Data helpers for Incident Intelligence Layer.
Provides:
- kind extraction from incident (signature, meta, title heuristics)
- normalized key fields dict
- time-proximity helpers
- safe truncation/masking
No external dependencies beyond stdlib.
"""
from __future__ import annotations
import datetime
import re
from typing import Any, Dict, Optional, Tuple
# ─── Kind heuristics ──────────────────────────────────────────────────────────
_TITLE_KIND_PATTERNS = [
    (re.compile(r'\b(latency|slow|timeout|p9[5-9]|p100)\b', re.I), "latency"),
    (re.compile(r'\b(error.?rate|5xx|http.?error|exception)\b', re.I), "error_rate"),
    (re.compile(r'\b(slo.?breach|slo)\b', re.I), "slo_breach"),
    (re.compile(r'\b(oom|out.?of.?memory|memory.?pressure)\b', re.I), "oom"),
    (re.compile(r'\b(disk|storage|volume.?full|inode)\b', re.I), "disk"),
    (re.compile(r'\b(security|intrusion|cve|vuln|unauthorized)\b', re.I), "security"),
    (re.compile(r'\b(deploy|rollout|release|canary)\b', re.I), "deploy"),
    (re.compile(r'\b(crash.?loop|crashloop|restart)\b', re.I), "crashloop"),
    (re.compile(r'\b(queue|lag|consumer|backlog)\b', re.I), "queue"),
    (re.compile(r'\b(network|connectivity|dns|unreachable)\b', re.I), "network"),
]
_KNOWN_KINDS = frozenset([
    "slo_breach", "crashloop", "latency", "error_rate",
    "disk", "oom", "deploy", "security", "custom", "network", "queue",
])
def extract_kind(incident: Dict) -> str:
    """
    Best-effort kind extraction. Priority:
      1. incident.meta.kind (if present and a known kind)
      2. incident.meta.alert_kind (same constraint)
      3. Title heuristics (first matching pattern wins)
      4. 'custom'
    """
    metadata = incident.get("meta") or {}
    # Explicit meta fields win, but only when they name a known kind.
    for field in ("kind", "alert_kind"):
        candidate = metadata.get(field)
        if candidate and candidate in _KNOWN_KINDS:
            return candidate
    # Fall back to title pattern matching; list order defines precedence.
    title_text = incident.get("title", "") or ""
    matched = next(
        (label for pattern, label in _TITLE_KIND_PATTERNS if pattern.search(title_text)),
        None,
    )
    return matched if matched is not None else "custom"
def incident_key_fields(incident: Dict) -> Dict:
    """
    Return a normalized dict of the key fields used for correlation.

    Missing fields get conservative defaults (env=prod, severity=P2,
    status=open); the kind is derived via extract_kind().
    """
    metadata = incident.get("meta") or {}
    fetch = incident.get
    normalized = {
        "id": fetch("id", ""),
        "service": fetch("service", ""),
        "env": fetch("env", "prod"),
        "severity": fetch("severity", "P2"),
        "status": fetch("status", "open"),
        "started_at": fetch("started_at", ""),
        "signature": metadata.get("incident_signature", ""),
        "kind": extract_kind(incident),
    }
    return normalized
# ─── Time helpers ─────────────────────────────────────────────────────────────
def parse_iso(ts: str) -> Optional[datetime.datetime]:
    """
    Parse an ISO-8601 timestamp string into a *naive* datetime (wall-clock
    reading with tzinfo stripped); returns None for empty/invalid input.

    Bug fix: the previous implementation stripped only trailing 'Z' and
    '+HH:MM' offsets, so '-HH:MM' inputs stayed timezone-aware. Mixing the
    resulting aware and naive datetimes made minutes_apart() raise TypeError
    on subtraction. Every result is now normalized to naive.
    """
    if not ts:
        return None
    try:
        # fromisoformat() does not accept a 'Z' suffix before Python 3.11.
        dt = datetime.datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except (ValueError, AttributeError, TypeError):
        return None
    # Keep the wall-clock reading and drop the offset — same result as the
    # old behavior for 'Z' and '+HH:MM' inputs, now consistent for '-HH:MM'.
    return dt.replace(tzinfo=None)
def minutes_apart(ts_a: str, ts_b: str) -> Optional[float]:
    """Absolute gap in minutes between two ISO timestamps; None if either fails to parse."""
    first = parse_iso(ts_a)
    second = parse_iso(ts_b)
    if first is None or second is None:
        return None
    gap_seconds = (first - second).total_seconds()
    return abs(gap_seconds) / 60.0
def incidents_within_minutes(inc_a: Dict, inc_b: Dict, within: float) -> bool:
    """Return True if both incidents have parseable start times at most `within` minutes apart."""
    gap = minutes_apart(inc_a.get("started_at", ""), inc_b.get("started_at", ""))
    if gap is None:
        return False
    return gap <= within
# ─── Text helpers ─────────────────────────────────────────────────────────────
def safe_truncate(text: str, max_chars: int = 200) -> str:
    """
    Truncate text to max_chars, appending an ellipsis when content was cut.

    Bug fix: the previous conditional appended an empty string in *both*
    branches (the ellipsis literal was evidently lost), making it a no-op.
    """
    if not text:
        return ""
    return text[:max_chars] + ("…" if len(text) > max_chars else "")
def mask_signature(sig: str, prefix_len: int = 8) -> str:
    """Show only the first `prefix_len` chars of a SHA-256 signature for readability."""
    return sig[:prefix_len] if sig else ""
def severity_rank(sev: str) -> int:
    """Map a severity label to a sortable rank. Lower = more severe; unknown labels rank last (5)."""
    ranks = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
    return ranks.get(sev, 5)
def format_duration(started_at: str, ended_at: Optional[str]) -> str:
    """
    Human-readable duration string.

    Returns "unknown" when started_at is unparseable, "ongoing" when there is
    no parseable end time, otherwise "Ns" / "Nm" / "N.Nh".
    """
    start = parse_iso(started_at)
    if start is None:
        return "unknown"
    if ended_at:
        end = parse_iso(ended_at)
        if end:
            elapsed = (end - start).total_seconds()
            if elapsed < 60:
                return f"{int(elapsed)}s"
            if elapsed < 3600:
                return f"{int(elapsed / 60)}m"
            return f"{elapsed / 3600:.1f}h"
    return "ongoing"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,690 @@
"""
incident_store.py — Incident Log storage abstraction.
Backends:
- MemoryIncidentStore (testing)
- JsonlIncidentStore (MVP/fallback — ops/incidents/ directory)
- PostgresIncidentStore(production — psycopg2 sync)
- AutoIncidentStore (Postgres primary → JSONL fallback)
All writes are non-fatal: exceptions are logged as warnings.
"""
from __future__ import annotations
import datetime
import hashlib
import json
import logging
import os
import re
import threading
import time
import uuid
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_SECRET_PAT = re.compile(r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+')
def _redact_text(text: str, max_len: int = 4000) -> str:
"""Mask secrets, truncate."""
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
return text[:max_len] if len(text) > max_len else text
def _now_iso() -> str:
return datetime.datetime.now(datetime.timezone.utc).isoformat()
def _generate_incident_id() -> str:
now = datetime.datetime.now(datetime.timezone.utc)
rand = uuid.uuid4().hex[:6]
return f"inc_{now.strftime('%Y%m%d_%H%M')}_{rand}"
# ─── Abstract interface ──────────────────────────────────────────────────────
class IncidentStore(ABC):
    """Abstract incident-log backend: incident CRUD plus per-incident events and artifacts."""
    @abstractmethod
    def create_incident(self, data: Dict) -> Dict:
        """Create and persist a new 'open' incident; returns the stored dict."""
        ...
    @abstractmethod
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident with recent events and artifacts attached, or None."""
        ...
    @abstractmethod
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """List incidents matching optional filters, newest first, up to `limit`."""
        ...
    @abstractmethod
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark the incident closed and record the resolution; None if id unknown."""
        ...
    @abstractmethod
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event to an incident; None if id unknown."""
        ...
    @abstractmethod
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to `limit` events for the incident."""
        ...
    @abstractmethod
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Attach artifact metadata (path/hash/size) to an incident; None if id unknown."""
        ...
    @abstractmethod
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact records for the incident."""
        ...
# ─── In-memory (testing) ─────────────────────────────────────────────────────
class MemoryIncidentStore(IncidentStore):
    """In-memory backend for testing only: plain dicts guarded by a single lock, no persistence."""
    def __init__(self):
        # All maps are keyed by incident id; events/artifacts keep insertion order.
        self._incidents: Dict[str, Dict] = {}
        self._events: Dict[str, List[Dict]] = {}
        self._artifacts: Dict[str, List[Dict]] = {}
        self._lock = threading.Lock()
    def create_incident(self, data: Dict) -> Dict:
        """Create an 'open' incident. Requires data['service'] (KeyError otherwise); other fields defaulted."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            # Title/summary are secret-redacted and length-capped at write time.
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        with self._lock:
            self._incidents[inc_id] = inc
            self._events[inc_id] = []
            self._artifacts[inc_id] = []
        return inc
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident with its last 20 events and all artifacts, or None."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        events = self._events.get(incident_id, [])[-20:]
        artifacts = self._artifacts.get(incident_id, [])
        return {**inc, "events": events, "artifacts": artifacts}
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest created_at first."""
        filters = filters or {}
        result = list(self._incidents.values())
        if filters.get("status"):
            result = [i for i in result if i["status"] == filters["status"]]
        if filters.get("service"):
            result = [i for i in result if i["service"] == filters["service"]]
        if filters.get("env"):
            result = [i for i in result if i["env"] == filters["env"]]
        if filters.get("severity"):
            result = [i for i in result if i["severity"] == filters["severity"]]
        result.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return result[:limit]
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark closed; overwrite summary with the resolution when given; log a status_change event."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        with self._lock:
            inc["status"] = "closed"
            inc["ended_at"] = ended_at
            inc["summary"] = _redact_text(resolution, 2000) if resolution else inc.get("summary")
            inc["updated_at"] = _now_iso()
            # NOTE(review): _redact_text(resolution, 500) here would raise if
            # resolution is None — the JSONL variant guards with `or ''`; confirm
            # callers always pass a string.
            self._events.setdefault(incident_id, []).append({
                "ts": _now_iso(),
                "type": "status_change",
                "message": f"Incident closed: {_redact_text(resolution, 500)}",
                "meta": None,
            })
        return inc
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a redacted event and bump updated_at; None if the incident is unknown."""
        if incident_id not in self._incidents:
            return None
        ev = {
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),
            "meta": meta,
        }
        with self._lock:
            self._events.setdefault(incident_id, []).append(ev)
            self._incidents[incident_id]["updated_at"] = _now_iso()
        return ev
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        # Oldest-first: returns the first `limit` events in insertion order.
        return self._events.get(incident_id, [])[:limit]
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record artifact metadata; None if the incident is unknown."""
        if incident_id not in self._incidents:
            return None
        art = {
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        with self._lock:
            self._artifacts.setdefault(incident_id, []).append(art)
        return art
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        return self._artifacts.get(incident_id, [])
# ─── JSONL (MVP file backend) ────────────────────────────────────────────────
class JsonlIncidentStore(IncidentStore):
    """
    Stores incidents/events/artifacts as separate JSONL files in a directory.
    Layout:
        <base_dir>/incidents.jsonl
        <base_dir>/events.jsonl
        <base_dir>/artifacts.jsonl

    Reads re-scan the whole file on every call (fine for MVP volumes);
    writes are append-only except close_incident, which rewrites
    incidents.jsonl in place.
    """
    def __init__(self, base_dir: str):
        self._dir = Path(base_dir)
        self._dir.mkdir(parents=True, exist_ok=True)
        # Single lock serializes all file writes within this process.
        self._lock = threading.Lock()
    def _incidents_path(self) -> Path:
        return self._dir / "incidents.jsonl"
    def _events_path(self) -> Path:
        return self._dir / "events.jsonl"
    def _artifacts_path(self) -> Path:
        return self._dir / "artifacts.jsonl"
    def _read_jsonl(self, path: Path) -> List[Dict]:
        """Best-effort read: skips malformed lines and returns [] on any I/O error."""
        if not path.exists():
            return []
        items = []
        try:
            with open(path, "r", encoding="utf-8") as fh:
                for line in fh:
                    line = line.strip()
                    if line:
                        try:
                            items.append(json.loads(line))
                        except json.JSONDecodeError:
                            pass
        except Exception:
            pass
        return items
    def _append_jsonl(self, path: Path, record: Dict) -> None:
        # Append one record per line under the write lock.
        with self._lock:
            with open(path, "a", encoding="utf-8") as fh:
                fh.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")
    def _rewrite_jsonl(self, path: Path, items: List[Dict]) -> None:
        # Full-file rewrite (used for in-place updates such as close_incident).
        with self._lock:
            with open(path, "w", encoding="utf-8") as fh:
                for item in items:
                    fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n")
    def create_incident(self, data: Dict) -> Dict:
        """Append a new 'open' incident record. Requires data['service']."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            # Title/summary are secret-redacted and length-capped at write time.
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        self._append_jsonl(self._incidents_path(), inc)
        return inc
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident with its last 20 events and all artifacts, or None."""
        incidents = self._read_jsonl(self._incidents_path())
        inc = next((i for i in incidents if i.get("id") == incident_id), None)
        if not inc:
            return None
        events = [e for e in self._read_jsonl(self._events_path())
                  if e.get("incident_id") == incident_id][-20:]
        artifacts = [a for a in self._read_jsonl(self._artifacts_path())
                     if a.get("incident_id") == incident_id]
        return {**inc, "events": events, "artifacts": artifacts}
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest created_at first."""
        filters = filters or {}
        incidents = self._read_jsonl(self._incidents_path())
        if filters.get("status"):
            incidents = [i for i in incidents if i.get("status") == filters["status"]]
        if filters.get("service"):
            incidents = [i for i in incidents if i.get("service") == filters["service"]]
        if filters.get("env"):
            incidents = [i for i in incidents if i.get("env") == filters["env"]]
        if filters.get("severity"):
            incidents = [i for i in incidents if i.get("severity") == filters["severity"]]
        incidents.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return incidents[:limit]
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark closed (keeps the old summary when resolution is falsy); logs a status_change event."""
        incidents = self._read_jsonl(self._incidents_path())
        found = None
        for inc in incidents:
            if inc.get("id") == incident_id:
                inc["status"] = "closed"
                inc["ended_at"] = ended_at
                if resolution:
                    inc["summary"] = _redact_text(resolution, 2000)
                inc["updated_at"] = _now_iso()
                found = inc
                break
        if not found:
            return None
        # Rewrite the incidents file, then append the audit event.
        self._rewrite_jsonl(self._incidents_path(), incidents)
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return found
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a redacted event; None if the incident is unknown."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        ev = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),
            "meta": meta,
        }
        self._append_jsonl(self._events_path(), ev)
        return ev
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        # Oldest-first: returns the first `limit` matching events in file order.
        events = self._read_jsonl(self._events_path())
        return [e for e in events if e.get("incident_id") == incident_id][:limit]
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record artifact metadata; None if the incident is unknown."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        art = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        self._append_jsonl(self._artifacts_path(), art)
        return art
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        artifacts = self._read_jsonl(self._artifacts_path())
        return [a for a in artifacts if a.get("incident_id") == incident_id]
# ─── Postgres backend ─────────────────────────────────────────────────────────
class PostgresIncidentStore(IncidentStore):
    """
    Production backend using psycopg2 (sync).
    Tables created by ops/scripts/migrate_incidents_postgres.py.

    One autocommit connection per thread (threading.local); psycopg2 is
    imported lazily so the module loads without the driver installed.
    """
    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()
    def _conn(self):
        """Get or create a per-thread autocommit connection."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn
    def create_incident(self, data: Dict) -> Dict:
        """Insert an 'open' incident row. Requires data['service']; returns a slim summary dict."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            """INSERT INTO incidents (id,workspace_id,service,env,severity,status,
               title,summary,started_at,created_by,created_at,updated_at)
               VALUES (%s,%s,%s,%s,%s,'open',%s,%s,%s,%s,%s,%s)""",
            (inc_id, data.get("workspace_id", "default"),
             data["service"], data.get("env", "prod"),
             data.get("severity", "P2"),
             # Title/summary are secret-redacted and length-capped at write time.
             _redact_text(data.get("title", ""), 500),
             _redact_text(data.get("summary", "") or "", 2000),
             data.get("started_at") or now,
             data.get("created_by", "unknown"), now, now),
        )
        cur.close()
        return {"id": inc_id, "status": "open", "service": data["service"],
                "severity": data.get("severity", "P2"),
                "started_at": data.get("started_at") or now,
                "created_at": now}
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Fetch the incident row plus its last 200 events (chronological) and artifacts, or None."""
        cur = self._conn().cursor()
        cur.execute("SELECT id,workspace_id,service,env,severity,status,title,summary,"
                    "started_at,ended_at,created_by,created_at,updated_at "
                    "FROM incidents WHERE id=%s", (incident_id,))
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        cols = [d[0] for d in cur.description]
        # Timestamps are serialized to ISO strings so the dict is JSON-safe.
        inc = {c: (v.isoformat() if isinstance(v, datetime.datetime) else v) for c, v in zip(cols, row)}
        # Events: fetch the newest 200, then reverse into chronological order.
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts DESC LIMIT 200", (incident_id,))
        events = []
        for r in cur.fetchall():
            events.append({"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                           "message": r[2], "meta": r[3]})
        events.reverse()
        # Artifacts in chronological order.
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = []
        for r in cur.fetchall():
            artifacts.append({"ts": r[0].isoformat() if r[0] else "", "kind": r[1],
                              "format": r[2], "path": r[3], "sha256": r[4], "size_bytes": r[5]})
        cur.close()
        return {**inc, "events": events, "artifacts": artifacts}
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """List incidents filtered by status/service/env/severity/window_days, newest first (max 200)."""
        filters = filters or {}
        clauses = []
        params: list = []
        for k in ("status", "service", "env", "severity"):
            if filters.get(k):
                clauses.append(f"{k}=%s")
                params.append(filters[k])
        if filters.get("window_days"):
            # Fix: parameterize the interval arithmetically. The previous form
            # placed %s inside a quoted literal (INTERVAL '%s days'), which
            # psycopg2's parameter binding explicitly does not support.
            clauses.append("created_at >= NOW() - (%s * INTERVAL '1 day')")
            params.append(int(filters["window_days"]))
        where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
        params.append(min(limit, 200))
        cur = self._conn().cursor()
        cur.execute(f"SELECT id,workspace_id,service,env,severity,status,title,summary,"
                    f"started_at,ended_at,created_by,created_at,updated_at "
                    f"FROM incidents {where} ORDER BY created_at DESC LIMIT %s", params)
        cols = [d[0] for d in cur.description]
        rows = []
        for row in cur.fetchall():
            rows.append({c: (v.isoformat() if isinstance(v, datetime.datetime) else v)
                         for c, v in zip(cols, row)})
        cur.close()
        return rows
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """
        Close the incident and log a status_change event; None if id unknown.

        Fix: use COALESCE so an empty resolution keeps the existing summary
        instead of nulling it — consistent with the Memory/JSONL backends.
        """
        cur = self._conn().cursor()
        cur.execute("UPDATE incidents SET status='closed', ended_at=%s, "
                    "summary=COALESCE(%s, summary), updated_at=%s "
                    "WHERE id=%s RETURNING id",
                    (ended_at or _now_iso(), _redact_text(resolution, 2000) if resolution else None,
                     _now_iso(), incident_id))
        if not cur.fetchone():
            cur.close()
            return None
        cur.close()
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return {"id": incident_id, "status": "closed"}
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Insert a redacted event row (meta serialized as JSON text)."""
        now = _now_iso()
        cur = self._conn().cursor()
        meta_json = json.dumps(meta, default=str) if meta else None
        cur.execute("INSERT INTO incident_events (incident_id,ts,type,message,meta) "
                    "VALUES (%s,%s,%s,%s,%s)",
                    (incident_id, now, event_type, _redact_text(message, 4000), meta_json))
        cur.close()
        return {"ts": now, "type": event_type, "message": _redact_text(message, 4000), "meta": meta}
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to `limit` events for the incident, oldest first."""
        cur = self._conn().cursor()
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts LIMIT %s", (incident_id, limit))
        events = [{"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                   "message": r[2], "meta": r[3]} for r in cur.fetchall()]
        cur.close()
        return events
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Insert artifact metadata (path/hash/size) for the incident."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute("INSERT INTO incident_artifacts (incident_id,ts,kind,format,path,sha256,size_bytes) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s)",
                    (incident_id, now, kind, fmt, path, sha256, size_bytes))
        cur.close()
        return {"ts": now, "kind": kind, "format": fmt, "path": path,
                "sha256": sha256, "size_bytes": size_bytes}
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact records for the incident, oldest first."""
        cur = self._conn().cursor()
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = [{"ts": r[0].isoformat() if r[0] else "", "kind": r[1], "format": r[2],
                      "path": r[3], "sha256": r[4], "size_bytes": r[5]} for r in cur.fetchall()]
        cur.close()
        return artifacts
    def close(self):
        """Close this thread's connection if open (other threads' connections are untouched)."""
        conn = getattr(self._local, "conn", None)
        if conn and not conn.closed:
            conn.close()
# ─── Auto backend (Postgres → JSONL fallback) ────────────────────────────────
class AutoIncidentStore(IncidentStore):
    """
    Tries Postgres first; on any failure falls back to JSONL.
    Re-attempts Postgres after RECOVERY_INTERVAL_S (5 min).

    Every public method follows the same delegate pattern: optionally flip
    back to the primary, try Postgres, and on any exception switch to the
    JSONL fallback and serve the call from there.
    """
    _RECOVERY_INTERVAL_S = 300
    def __init__(self, pg_dsn: str, jsonl_dir: str):
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        # Both backends are created lazily on first use.
        self._primary: Optional[PostgresIncidentStore] = None
        self._fallback: Optional[JsonlIncidentStore] = None
        self._using_fallback = False
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()
    def _get_primary(self) -> PostgresIncidentStore:
        """Lazily build the Postgres backend (double-checked locking)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresIncidentStore(self._pg_dsn)
        return self._primary
    def _get_fallback(self) -> JsonlIncidentStore:
        """Lazily build the JSONL backend (double-checked locking)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlIncidentStore(self._jsonl_dir)
        return self._fallback
    def _maybe_recover(self) -> None:
        # After the recovery interval, optimistically retry Postgres; the next
        # failed call will simply switch back to the fallback.
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoIncidentStore: attempting Postgres recovery")
                self._using_fallback = False
                self._fallback_since = 0.0
    def _switch_to_fallback(self, err: Exception) -> None:
        """Record the Postgres failure and start serving from JSONL."""
        logger.warning("AutoIncidentStore: Postgres failed (%s), using JSONL fallback", err)
        self._using_fallback = True
        self._fallback_since = time.monotonic()
    def active_backend(self) -> str:
        """Introspection helper: 'postgres' or 'jsonl_fallback'."""
        return "jsonl_fallback" if self._using_fallback else "postgres"
    # ── Delegate methods ──────────────────────────────────────────────────────
    def create_incident(self, data: Dict) -> Dict:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().create_incident(data)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().create_incident(data)
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().get_incident(incident_id)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().get_incident(incident_id)
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().list_incidents(filters, limit)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().list_incidents(filters, limit)
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().close_incident(incident_id, ended_at, resolution)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().close_incident(incident_id, ended_at, resolution)
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().append_event(incident_id, event_type, message, meta)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().append_event(incident_id, event_type, message, meta)
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().get_events(incident_id, limit)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().get_events(incident_id, limit)
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().add_artifact(incident_id, kind, fmt, path, sha256, size_bytes)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().add_artifact(incident_id, kind, fmt, path, sha256, size_bytes)
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().get_artifacts(incident_id)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().get_artifacts(incident_id)
# ─── Singleton ────────────────────────────────────────────────────────────────
# Process-wide singleton; guarded by _store_lock.
_store: Optional[IncidentStore] = None
_store_lock = threading.Lock()
def get_incident_store() -> IncidentStore:
    """Return the process-wide store, creating it on first use (double-checked locking)."""
    global _store
    if _store is None:
        with _store_lock:
            if _store is None:
                _store = _create_store()
    return _store
def set_incident_store(store: Optional[IncidentStore]) -> None:
    """Override the singleton (tests); pass None to force re-creation on next access."""
    global _store
    with _store_lock:
        _store = store
def _create_store() -> IncidentStore:
    """
    Build the backend selected by INCIDENT_BACKEND.

    Recognized values: memory | postgres | auto | null | jsonl (default).
    'postgres' and 'auto' without a DSN fall through to JSONL.
    """
    backend_name = os.getenv("INCIDENT_BACKEND", "jsonl").lower()
    pg_dsn = os.getenv("DATABASE_URL") or os.getenv("INCIDENT_DATABASE_URL", "")
    jsonl_dir = os.getenv(
        "INCIDENT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
    )
    if backend_name == "memory":
        logger.info("IncidentStore: in-memory (testing only)")
        return MemoryIncidentStore()
    if backend_name == "postgres":
        if pg_dsn:
            # Only a DSN prefix is logged to avoid leaking credentials.
            logger.info("IncidentStore: postgres dsn=%s", pg_dsn[:30])
            return PostgresIncidentStore(pg_dsn)
        logger.warning("INCIDENT_BACKEND=postgres but no DATABASE_URL; falling back to jsonl")
    if backend_name == "auto":
        if pg_dsn:
            logger.info("IncidentStore: auto (postgres→jsonl fallback) dsn=%s", pg_dsn[:30])
            return AutoIncidentStore(pg_dsn=pg_dsn, jsonl_dir=jsonl_dir)
        logger.info("IncidentStore: auto — no DATABASE_URL, using jsonl")
    if backend_name == "null":
        return MemoryIncidentStore()
    # Default: JSONL
    logger.info("IncidentStore: jsonl dir=%s", jsonl_dir)
    return JsonlIncidentStore(jsonl_dir)

View File

@@ -0,0 +1,261 @@
"""
llm_enrichment.py — Optional LLM enrichment for Risk Attribution (strictly bounded).
Design constraints:
- LLM output is explanatory ONLY — never changes scores or decisions.
- Default mode is OFF (llm_mode="off").
- Local mode calls a local HTTP model runner (Ollama-compatible by default).
- Triggers are checked before every call: off if delta < warn OR band not high/critical.
- Input is hard-truncated to llm_max_chars_in.
- Output is hard-truncated to llm_max_chars_out.
- Any error → graceful skip, returns {enabled: false, text: null}.
Hardening guards (new):
- model_allowlist: model must be in allowlist or call is skipped.
- max_calls_per_digest: caller passes a mutable counter dict; stops after limit.
- per_day_dedupe: in-memory key per (date, service, env) prevents duplicate calls.
Usage:
from llm_enrichment import maybe_enrich_attribution
call_counter = {"count": 0}
report["llm_enrichment"] = maybe_enrich_attribution(
attribution_report, risk_report, attr_policy,
call_counter=call_counter,
)
"""
from __future__ import annotations
import datetime
import json
import logging
from typing import Dict, Optional
logger = logging.getLogger(__name__)
# ─── Per-day dedupe store (module-level in-memory) ───────────────────────────
# key: "risk_enrich:{YYYY-MM-DD}:{service}:{env}" → True
_dedupe_store: Dict[str, bool] = {}
def _dedupe_key(service: str, env: str) -> str:
date = datetime.datetime.utcnow().strftime("%Y-%m-%d")
return f"risk_enrich:{date}:{service}:{env}"
def _is_deduped(service: str, env: str) -> bool:
return _dedupe_store.get(_dedupe_key(service, env), False)
def _mark_deduped(service: str, env: str) -> None:
_dedupe_store[_dedupe_key(service, env)] = True
def _clear_dedupe_store() -> None:
"""Test helper to reset per-day dedup state."""
_dedupe_store.clear()
# ─── Trigger guard ────────────────────────────────────────────────────────────
def _should_trigger(risk_report: Dict, attr_policy: Dict) -> bool:
"""
Returns True only if triggers are met:
delta_24h >= risk_delta_warn OR band in band_in
Both conditions are OR — either is enough.
"""
triggers = attr_policy.get("llm_triggers", {})
delta_warn = int(triggers.get("risk_delta_warn", 10))
band_in = set(triggers.get("band_in", ["high", "critical"]))
band = risk_report.get("band", "low")
delta_24h = (risk_report.get("trend") or {}).get("delta_24h")
if band in band_in:
return True
if delta_24h is not None and delta_24h >= delta_warn:
return True
return False
# ─── Prompt builder ───────────────────────────────────────────────────────────
def _build_prompt(
attribution_report: Dict,
risk_report: Dict,
max_chars: int,
) -> str:
"""Build a compact prompt for local LLM enrichment."""
service = attribution_report.get("service", "?")
env = attribution_report.get("env", "prod")
score = risk_report.get("score", 0)
band = risk_report.get("band", "?")
delta = attribution_report.get("delta_24h")
causes = attribution_report.get("causes", [])[:3]
reasons = risk_report.get("reasons", [])[:4]
causes_text = "\n".join(
f" - {c['type']} (score={c['score']}, confidence={c['confidence']}): "
+ "; ".join(c.get("evidence", []))
for c in causes
)
reasons_text = "\n".join(f" - {r}" for r in reasons)
prompt = (
f"You are a platform reliability assistant. Provide a 2-3 sentence human-readable "
f"explanation for a risk spike in service '{service}' (env={env}).\n\n"
f"Risk score: {score} ({band}). "
+ (f"Delta 24h: +{delta}.\n\n" if delta is not None else "\n\n")
+ f"Risk signals:\n{reasons_text}\n\n"
f"Attributed causes:\n{causes_text}\n\n"
f"Write a concise explanation (max 3 sentences). Do NOT include scores or numbers "
f"from above verbatim. Focus on actionable insight."
)
return prompt[:max_chars]
# ─── Local model call ─────────────────────────────────────────────────────────
def _is_model_allowed(model: str, attr_policy: Dict) -> bool:
"""Return True if model is in llm_local.model_allowlist (or list is empty/absent)."""
allowlist = attr_policy.get("llm_local", {}).get("model_allowlist")
if not allowlist:
return True # no restriction configured
return model in allowlist
def _call_local_llm(
    prompt: str,
    attr_policy: Dict,
    max_out: int,
) -> Optional[str]:
    """
    Call an Ollama-compatible local endpoint (llm_local.endpoint/model/timeout_seconds).

    Skips (returns None) when the model is not allowlisted; returns the
    response text hard-truncated to max_out chars, or None on empty output
    or any failure. Never raises.
    """
    llm_cfg = attr_policy.get("llm_local", {})
    endpoint = llm_cfg.get("endpoint", "http://localhost:11434/api/generate")
    model = llm_cfg.get("model", "llama3")
    timeout = int(llm_cfg.get("timeout_seconds", 15))
    if not _is_model_allowed(model, attr_policy):
        logger.warning("llm_enrichment: model '%s' not in allowlist; skipping", model)
        return None
    try:
        import urllib.request
        payload = json.dumps({
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"num_predict": max_out // 4},  # approx token budget (~4 chars/token)
        }).encode()
        req = urllib.request.Request(
            endpoint,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read())
            text = body.get("response", "") or ""
            return text[:max_out] if text else None
    except Exception as e:  # OSError/ConnectionError are Exception subclasses; one clause suffices
        logger.warning("llm_enrichment: local LLM call failed: %s", e)
        return None
# ─── Public interface ─────────────────────────────────────────────────────────
def maybe_enrich_attribution(
    attribution_report: Dict,
    risk_report: Dict,
    attr_policy: Optional[Dict] = None,
    *,
    call_counter: Optional[Dict] = None,
) -> Dict:
    """
    Conditionally enrich attribution_report with LLM text.

    Hardening guards (checked in order):
      1. llm_mode must be "local" (not "off" or "remote")
      2. triggers must be met (delta >= warn OR band in high/critical)
      3. model must be in model_allowlist
      4. max_calls_per_digest not exceeded (via mutable `call_counter` dict)
      5. per-day dedupe: (service, env) pair not already enriched today

    Args:
        attribution_report: report to enrich; reads its "service" / "env".
        risk_report: risk report evaluated against the policy triggers.
        attr_policy: attribution policy; lazily loaded when None.
        call_counter: mutable {"count": int} shared across one digest run;
            incremented here on each successful enrichment.

    Returns:
        {"enabled": True/False, "text": str|None, "mode": str}
        (plus "skipped_reason" when a guard rejected the call).
    Never raises. LLM output does NOT alter scores.
    """
    if attr_policy is None:
        # Lazy policy load; any failure disables enrichment entirely.
        try:
            from risk_attribution import load_attribution_policy
            attr_policy = load_attribution_policy()
        except Exception:
            return {"enabled": False, "text": None, "mode": "off"}
    mode = (attr_policy.get("defaults") or {}).get("llm_mode", "off")
    if mode == "off":
        return {"enabled": False, "text": None, "mode": "off"}
    # Guard: triggers
    if not _should_trigger(risk_report, attr_policy):
        return {"enabled": False, "text": None, "mode": mode,
                "skipped_reason": "triggers not met"}
    service = attribution_report.get("service", "")
    env = attribution_report.get("env", "prod")
    # Guard: model allowlist (checked early so tests can assert without calling LLM)
    if mode == "local":
        llm_local_cfg_early = attr_policy.get("llm_local", {})
        model_cfg = llm_local_cfg_early.get("model", "llama3")
        if not _is_model_allowed(model_cfg, attr_policy):
            logger.warning("llm_enrichment: model '%s' not in allowlist; skipping", model_cfg)
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": f"model '{model_cfg}' not in allowlist"}
    # Guard: per-day dedupe
    llm_local_cfg = attr_policy.get("llm_local", {})
    if llm_local_cfg.get("per_day_dedupe", True):
        if _is_deduped(service, env):
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": "per_day_dedupe: already enriched today"}
    # Guard: max_calls_per_digest
    if call_counter is not None:
        max_calls = int(llm_local_cfg.get("max_calls_per_digest", 3))
        if call_counter.get("count", 0) >= max_calls:
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": f"max_calls_per_digest={max_calls} reached"}
    defaults = attr_policy.get("defaults", {})
    max_in = int(defaults.get("llm_max_chars_in", 3500))
    max_out = int(defaults.get("llm_max_chars_out", 800))
    prompt = _build_prompt(attribution_report, risk_report, max_in)
    if mode == "local":
        try:
            text = _call_local_llm(prompt, attr_policy, max_out)
        except Exception as e:
            # _call_local_llm is designed not to raise, but the "never
            # raises" contract of this function is enforced here too.
            logger.warning("llm_enrichment: local call raised: %s", e)
            text = None
        if text is not None:
            # Update guards on success only: a failed call neither burns a
            # digest-call slot nor marks the (service, env) pair as done.
            _mark_deduped(service, env)
            if call_counter is not None:
                call_counter["count"] = call_counter.get("count", 0) + 1
        return {
            "enabled": text is not None,
            "text": text,
            "mode": "local",
        }
    # mode == "remote" — not implemented; stub for future extensibility
    logger.debug("llm_enrichment: remote mode not implemented; skipping")
    return {"enabled": False, "text": None, "mode": "remote",
            "skipped_reason": "remote not implemented"}

View File

@@ -0,0 +1,340 @@
"""
platform_priority_digest.py — Weekly Platform Priority Digest.
DAARION.city | deterministic, no LLM.
Generates a Markdown + JSON report prioritising services by Architecture Pressure,
optionally correlated with Risk score/delta.
Outputs:
ops/reports/platform/{YYYY-WW}.md
ops/reports/platform/{YYYY-WW}.json
Public API:
weekly_platform_digest(env, ...) -> DigestResult
"""
from __future__ import annotations
import datetime
import json
import logging
import os
from pathlib import Path
from typing import Dict, List, Optional
from architecture_pressure import load_pressure_policy
logger = logging.getLogger(__name__)
# ─── Action templates ─────────────────────────────────────────────────────────
# Markdown snippets rendered into the digest's "Action Recommendations"
# section.  Keys are referenced by name in _build_priority_actions; the
# {placeholders} are filled via str.format.
_ACTION_TEMPLATES = {
    # Emitted when a pressure report has requires_arch_review set.
    "arch_review": (
        "📋 **Schedule architecture review**: '{service}' pressure={score} "
        "({band}). Review structural debt and recurring failure patterns."
    ),
    # Emitted when a service shows >=3 regressions AND >=2 escalations in 30d.
    "refactor_sprint": (
        "🔧 **Allocate refactor sprint**: '{service}' has {regressions} regressions "
        "and {escalations} escalations in 30d — structural instability requires investment."
    ),
    # Emitted when pressure band is critical AND risk band is high/critical.
    "freeze_features": (
        "🚫 **Freeze non-critical features**: '{service}' is critical-pressure + "
        "risk-high. Stabilise before new feature work."
    ),
    # Emitted when >=2 follow-ups are overdue.
    "reduce_backlog": (
        "📌 **Reduce followup backlog**: '{service}' has {overdue} overdue follow-ups. "
        "Address before next release cycle."
    ),
}
def _now_week() -> str:
"""Return ISO week string: YYYY-WNN."""
return datetime.datetime.utcnow().strftime("%Y-W%V")
def _now_date() -> str:
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
def _clamp(text: str, max_chars: int) -> str:
if max_chars and len(text) > max_chars:
return text[:max_chars - 3] + ""
return text
# ─── Action list builder ──────────────────────────────────────────────────────
def _build_priority_actions(pressure_reports: List[Dict], risk_reports: Optional[Dict] = None) -> List[str]:
    """Derive prioritised action strings from pressure reports.

    Each report can contribute up to four actions (arch review, refactor
    sprint, feature freeze, backlog reduction); the overall list is capped
    at 20 entries.
    """
    risk_lookup = risk_reports or {}
    actions: List[str] = []
    for report in pressure_reports:
        svc = report["service"]
        band = report.get("band", "low")
        comp = report.get("components", {})
        if report.get("requires_arch_review"):
            actions.append(_ACTION_TEMPLATES["arch_review"].format(
                service=svc,
                score=report.get("score", 0),
                band=band,
            ))
        regressions = int(comp.get("regressions_30d", 0))
        escalations = int(comp.get("escalations_30d", 0))
        if regressions >= 3 and escalations >= 2:
            actions.append(_ACTION_TEMPLATES["refactor_sprint"].format(
                service=svc,
                regressions=regressions,
                escalations=escalations,
            ))
        # Prefer the live risk report; fall back to the risk band embedded
        # in the pressure report.
        risk_entry = risk_lookup.get(svc, {})
        risk_band = risk_entry.get("band", "low") if risk_entry else report.get("risk_band", "low")
        if band == "critical" and risk_band in ("high", "critical"):
            actions.append(_ACTION_TEMPLATES["freeze_features"].format(service=svc))
        overdue = int(comp.get("followups_overdue", 0))
        if overdue >= 2:
            actions.append(_ACTION_TEMPLATES["reduce_backlog"].format(service=svc, overdue=overdue))
    return actions[:20]  # hard cap on digest length
# ─── Markdown builder ─────────────────────────────────────────────────────────
def _build_markdown(
    week_str: str,
    env: str,
    pressure_reports: List[Dict],
    investment_list: List[Dict],
    actions: List[str],
    band_counts: Dict[str, int],
) -> str:
    """Render the full digest as a Markdown document.

    Args:
        week_str: ISO week label used in the H1 title.
        env: environment name (upper-cased in the title).
        pressure_reports: pressure reports, already sorted/truncated by caller.
        investment_list: rows for the "Investment Priority List" section.
        actions: pre-built action recommendation strings.
        band_counts: {"critical"|"high"|"medium"|"low": count} for the summary table.

    Returns:
        Markdown text (unclamped — the caller applies _clamp).
    """
    # Header + band-summary table.
    lines = [
        f"# Platform Priority Digest — {env.upper()} | {week_str}",
        f"_Generated: {_now_date()} | Deterministic | No LLM_",
        "",
        "## Pressure Band Summary",
        "",
        f"| Band | Services |",
        f"|------|---------|",
        f"| 🔴 Critical | {band_counts.get('critical', 0)} |",
        f"| 🟠 High | {band_counts.get('high', 0)} |",
        f"| 🟡 Medium | {band_counts.get('medium', 0)} |",
        f"| 🟢 Low | {band_counts.get('low', 0)} |",
        "",
    ]
    # Critical pressure: one H3 per service with up to 3 signal summaries.
    critical = [r for r in pressure_reports if r.get("band") == "critical"]
    if critical:
        lines += ["## 🔴 Critical Structural Pressure", ""]
        for r in critical:
            svc = r["service"]
            score = r.get("score", 0)
            summary = "; ".join(r.get("signals_summary", [])[:3])
            arch_flag = " ⚠️ ARCH REVIEW REQUIRED" if r.get("requires_arch_review") else ""
            lines.append(f"### {svc} (score={score}){arch_flag}")
            lines.append(f"> {summary}")
            # Risk correlation
            if r.get("risk_score") is not None:
                lines.append(
                    f"> Risk: {r['risk_score']} ({r.get('risk_band', '?')})"
                    + (f" Δ24h: +{r['risk_delta_24h']}" if r.get("risk_delta_24h") else "")
                )
            lines.append("")
    # High pressure: compact one-bullet-per-service list (first summary only).
    high = [r for r in pressure_reports if r.get("band") == "high"]
    if high:
        lines += ["## 🟠 High Pressure Services", ""]
        for r in high:
            svc = r["service"]
            score = r.get("score", 0)
            summary = (r.get("signals_summary") or [""])[0]
            lines.append(
                f"- **{svc}** (score={score}): {summary}"
            )
        lines.append("")
    # Investment priority list (pre-filtered by the caller).
    if investment_list:
        lines += ["## 📊 Investment Priority List", ""]
        lines.append("Services where Pressure ≥ require_arch_review_at AND risk is elevated:")
        lines.append("")
        for i, item in enumerate(investment_list, 1):
            lines.append(
                f"{i}. **{item['service']}** — Pressure: {item['pressure_score']} "
                f"({item['pressure_band']}) | Risk: {item.get('risk_score', 'N/A')} "
                f"({item.get('risk_band', 'N/A')})"
            )
        lines.append("")
    # Action recommendations
    if actions:
        lines += ["## ✅ Action Recommendations", ""]
        for action in actions:
            lines.append(f"- {action}")
        lines.append("")
    lines += [
        "---",
        "_Generated by DAARION.city Platform Priority Digest (deterministic, no LLM)_",
    ]
    return "\n".join(lines)
# ─── Main digest function ─────────────────────────────────────────────────────
def weekly_platform_digest(
    env: str = "prod",
    *,
    pressure_reports: Optional[List[Dict]] = None,
    risk_reports: Optional[Dict[str, Dict]] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
    output_dir: Optional[str] = None,
    date_str: Optional[str] = None,
    write_files: bool = True,
    auto_followup: bool = True,
    incident_store=None,
) -> Dict:
    """
    Generate Weekly Platform Priority Digest.

    Args:
        pressure_reports: pre-computed pressure reports list (sorted by score desc)
        risk_reports: {service: RiskReport} for side-by-side correlation
        policy: architecture_pressure_policy (loaded if None)
        week_str: ISO week for filenames (defaults to current week)
        output_dir: override output directory
        date_str: override the date stamp (defaults to today, UTC)
        write_files: write .md and .json to disk
        auto_followup: call maybe_create_arch_review_followup for each requiring review
        incident_store: needed for auto_followup

    Returns: DigestResult dict with markdown, json_data, files_written, followups_created.
    """
    if policy is None:
        policy = load_pressure_policy()
    effective_week = week_str or _now_week()
    effective_date = date_str or _now_date()
    cfg_output_dir = policy.get("digest", {}).get("output_dir", "ops/reports/platform")
    effective_output_dir = output_dir or cfg_output_dir
    max_chars = int(policy.get("digest", {}).get("max_chars", 12000))
    top_n = int(policy.get("digest", {}).get("top_n_in_digest", 10))
    # Highest pressure first; the digest only covers the top N services.
    pressure_reports = sorted(pressure_reports or [], key=lambda r: -r.get("score", 0))[:top_n]
    risk_reports = risk_reports or {}
    # Band counts
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in pressure_reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1
    # Investment priority list: requires_arch_review AND (risk high/critical OR delta > 0)
    # NOTE(review): review_at is read but not used below — the threshold
    # appears to be applied upstream when requires_arch_review is set; confirm.
    review_at = int(policy.get("priority_rules", {}).get("require_arch_review_at", 70))
    investment_list = []
    for r in pressure_reports:
        if not r.get("requires_arch_review"):
            continue
        svc = r["service"]
        rr = risk_reports.get(svc, {})
        # Prefer the live risk report; fall back to risk fields embedded in
        # the pressure report itself.
        risk_band = rr.get("band", "low") if rr else r.get("risk_band", "low") or "low"
        risk_delta = (rr.get("trend") or {}).get("delta_24h") if rr else r.get("risk_delta_24h")
        if risk_band in ("high", "critical") or (risk_delta is not None and risk_delta > 0):
            investment_list.append({
                "service": svc,
                "pressure_score": r.get("score"),
                "pressure_band": r.get("band"),
                "risk_score": rr.get("score") if rr else r.get("risk_score"),
                "risk_band": risk_band,
                "risk_delta_24h": risk_delta,
            })
    actions = _build_priority_actions(pressure_reports, risk_reports)
    markdown_raw = _build_markdown(
        week_str=effective_week,
        env=env,
        pressure_reports=pressure_reports,
        investment_list=investment_list,
        actions=actions,
        band_counts=band_counts,
    )
    markdown = _clamp(markdown_raw, max_chars)
    # Machine-readable companion to the Markdown report.
    json_data = {
        "week": effective_week,
        "date": effective_date,
        "env": env,
        "generated_at": datetime.datetime.utcnow().isoformat(),
        "band_counts": band_counts,
        "top_pressure_services": [
            {
                "service": r.get("service"),
                "score": r.get("score"),
                "band": r.get("band"),
                "requires_arch_review": r.get("requires_arch_review"),
                "signals_summary": r.get("signals_summary", [])[:4],
                "components": r.get("components", {}),
                "risk_score": r.get("risk_score"),
                "risk_band": r.get("risk_band"),
                "risk_delta_24h": r.get("risk_delta_24h"),
            }
            for r in pressure_reports
        ],
        "investment_priority_list": investment_list,
        "actions": actions,
    }
    # ── Auto followup creation ────────────────────────────────────────────────
    followups_created = []
    if auto_followup and incident_store is not None:
        # Imported lazily to avoid a hard dependency when followups are off.
        from architecture_pressure import maybe_create_arch_review_followup
        for r in pressure_reports:
            if r.get("requires_arch_review"):
                fu_result = maybe_create_arch_review_followup(
                    r,
                    incident_store=incident_store,
                    policy=policy,
                    week_str=effective_week,
                )
                if fu_result.get("created"):
                    followups_created.append({
                        "service": r["service"],
                        "dedupe_key": fu_result.get("dedupe_key"),
                        "incident_id": fu_result.get("incident_id"),
                    })
    # ── Write files ───────────────────────────────────────────────────────────
    # Best-effort: a write failure is logged and files_written stays empty,
    # but the digest result is still returned to the caller.
    files_written: List[str] = []
    if write_files:
        try:
            out_path = Path(effective_output_dir)
            out_path.mkdir(parents=True, exist_ok=True)
            md_file = out_path / f"{effective_week}.md"
            json_file = out_path / f"{effective_week}.json"
            md_file.write_text(markdown, encoding="utf-8")
            json_file.write_text(json.dumps(json_data, indent=2, default=str), encoding="utf-8")
            files_written = [str(md_file), str(json_file)]
            logger.info("platform_priority_digest: wrote %s and %s", md_file, json_file)
        except Exception as e:
            logger.warning("platform_priority_digest: failed to write files: %s", e)
    return {
        "week": effective_week,
        "env": env,
        "markdown": markdown,
        "json_data": json_data,
        "files_written": files_written,
        "followups_created": followups_created,
        "band_counts": band_counts,
    }

View File

@@ -0,0 +1,419 @@
"""Provider Budget Tracker — real-money token usage accounting.
Tracks:
- Tokens used (input/output) per provider per model
- Estimated USD cost based on published pricing
- Approximate balance (if configured via env var)
- Rolling 24h / 7d / 30d windows
Pricing table: updated Feb 2026 (USD per 1M tokens)
"""
from __future__ import annotations
import json
import logging
import os
import threading
import time
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ── Pricing catalog (USD / 1M tokens) ─────────────────────────────────────────
# provider → model key (exact name or prefix) → {"input": $, "output": $}
# per 1M tokens.  "_default" is the per-provider fallback used by get_price
# when neither an exact nor a prefix match is found.
PRICING: Dict[str, Dict[str, Dict[str, float]]] = {
    # provider → model_pattern → {input, output}
    "anthropic": {
        "claude-sonnet-4-5": {"input": 3.0, "output": 15.0},
        "claude-opus-4-5": {"input": 15.0, "output": 75.0},
        "claude-haiku-3-5": {"input": 0.8, "output": 4.0},
        "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
        "_default": {"input": 3.0, "output": 15.0},
    },
    "grok": {
        "grok-4-1-fast-reasoning": {"input": 5.0, "output": 15.0},
        "grok-3": {"input": 5.0, "output": 25.0},
        "grok-2-1212": {"input": 2.0, "output": 10.0},
        "_default": {"input": 5.0, "output": 15.0},
    },
    "deepseek": {
        "deepseek-chat": {"input": 0.27, "output": 1.10},
        "deepseek-reasoner": {"input": 0.55, "output": 2.19},
        "_default": {"input": 0.27, "output": 1.10},
    },
    "mistral": {
        "mistral-large-latest": {"input": 2.0, "output": 6.0},
        "mistral-small-latest": {"input": 0.2, "output": 0.6},
        "_default": {"input": 2.0, "output": 6.0},
    },
    "openai": {
        "gpt-4o": {"input": 2.5, "output": 10.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "_default": {"input": 2.5, "output": 10.0},
    },
    "glm": {
        "glm-4-plus": {"input": 0.05, "output": 0.05},
        "glm-4-flash": {"input": 0.0, "output": 0.0},  # free tier
        "glm-4.7-flash": {"input": 0.0, "output": 0.0},
        "glm-z1-plus": {"input": 0.07, "output": 0.07},
        "_default": {"input": 0.05, "output": 0.05},
    },
    # Local models cost nothing.
    "ollama": {
        "_default": {"input": 0.0, "output": 0.0},
    },
}
def get_price(provider: str, model: str) -> Dict[str, float]:
    """Resolve {input, output} USD/1M-token pricing for a provider/model pair.

    Lookup order: exact model key, then first prefix match in catalog order,
    then the provider's "_default".  An unknown provider falls back to the
    anthropic table.
    """
    table = PRICING.get(provider.lower(), PRICING.get("anthropic"))
    # exact match
    if model in table:
        return table[model]
    # prefix match (first hit in catalog order wins)
    for pattern, price in table.items():
        if pattern != "_default" and model.startswith(pattern):
            return price
    return table.get("_default", {"input": 3.0, "output": 15.0})
def calc_cost_usd(provider: str, model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimated USD cost of one call, using the pricing catalog (per 1M tokens)."""
    rate = get_price(provider, model)
    in_cost = input_tokens * rate["input"]
    out_cost = output_tokens * rate["output"]
    return (in_cost + out_cost) / 1_000_000
# ── Usage record ──────────────────────────────────────────────────────────────
@dataclass
class UsageRecord:
    """One LLM call's accounting entry, persisted as a JSON line in usage.jsonl."""
    ts: float                      # unix timestamp (time.time()) when recorded
    provider: str                  # e.g. "anthropic", "openai", "ollama"
    model: str                     # provider-specific model id
    agent: str                     # name of the calling agent
    input_tokens: int
    output_tokens: int
    cost_usd: float                # precomputed via calc_cost_usd
    latency_ms: int = 0            # 0 means "not measured"
    task_type: str = ""
    fallback_used: bool = False    # True when a fallback provider served the call
# ── Storage ────────────────────────────────────────────────────────────────────
# Append-only JSONL usage log + JSON limits file, kept under ~/.sofiia/budget
# (overridable via the BUDGET_DATA_DIR env var).  _lock serialises all file
# access within this process.
_BUDGET_DIR = Path(os.getenv("BUDGET_DATA_DIR", os.path.expanduser("~/.sofiia/budget")))
_USAGE_FILE = _BUDGET_DIR / "usage.jsonl"
_LIMITS_FILE = _BUDGET_DIR / "limits.json"
_lock = threading.Lock()
def _ensure_dir() -> None:
    """Create the budget data directory (and parents) if missing."""
    _BUDGET_DIR.mkdir(parents=True, exist_ok=True)
def _append_usage(rec: UsageRecord) -> None:
    """Append one usage record to the JSONL log (serialised under the lock)."""
    _ensure_dir()
    line = json.dumps(asdict(rec))
    with _lock, open(_USAGE_FILE, "a", encoding="utf-8") as fh:
        fh.write(line + "\n")
def _load_usage(since_ts: float = 0.0) -> List[UsageRecord]:
    """Load usage records with ts >= since_ts.

    Blank and unparseable lines are skipped silently; a file-level read
    failure is logged and yields whatever was parsed so far.
    """
    if not _USAGE_FILE.exists():
        return []
    out: List[UsageRecord] = []
    with _lock:
        try:
            with open(_USAGE_FILE, "r", encoding="utf-8") as fh:
                for raw in fh:
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        data = json.loads(raw)
                        if data.get("ts", 0) >= since_ts:
                            out.append(UsageRecord(**data))
                    except Exception:
                        # Corrupt line: skip, keep reading.
                        continue
        except Exception as e:
            logger.warning("budget: failed to load usage: %s", e)
    return out
# ── Manual balance config ──────────────────────────────────────────────────────
def _load_limits() -> Dict[str, Any]:
    """Read limits.json ({provider: {monthly_limit_usd, topup_balance_usd}}).

    Returns {} when the file is missing or unreadable.
    """
    try:
        with open(_LIMITS_FILE, "r") as fh:
            return json.load(fh)
    except Exception:
        return {}
def _save_limits(data: Dict[str, Any]) -> None:
    """Persist the limits mapping to limits.json (under the module lock)."""
    _ensure_dir()
    with _lock, open(_LIMITS_FILE, "w") as fh:
        json.dump(data, fh, indent=2)
# ── Public API ─────────────────────────────────────────────────────────────────
def track_usage(
    provider: str,
    model: str,
    agent: str,
    input_tokens: int,
    output_tokens: int,
    latency_ms: int = 0,
    task_type: str = "",
    fallback_used: bool = False,
) -> float:
    """Record token usage and return cost in USD.

    Computes the cost from the pricing catalog, appends a UsageRecord to the
    JSONL log, and returns the computed cost so callers can accumulate it.
    """
    cost = calc_cost_usd(provider, model, input_tokens, output_tokens)
    rec = UsageRecord(
        ts=time.time(),
        provider=provider,
        model=model,
        agent=agent,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost_usd=cost,
        latency_ms=latency_ms,
        task_type=task_type,
        fallback_used=fallback_used,
    )
    _append_usage(rec)
    logger.debug(
        "💰 tracked: provider=%s model=%s tokens=%d+%d cost=$%.5f",
        provider, model, input_tokens, output_tokens, cost,
    )
    return cost
@dataclass
class ProviderStats:
    """Aggregated usage for one provider over a query window."""
    provider: str
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_cost_usd: float = 0.0
    call_count: int = 0
    avg_latency_ms: float = 0.0    # mean latency (ms); see _aggregate_records
    top_models: List[Dict[str, Any]] = field(default_factory=list)  # top-3 models by cost
    # Configured limits (from limits.json)
    monthly_limit_usd: Optional[float] = None
    topup_balance_usd: Optional[float] = None
    estimated_remaining_usd: Optional[float] = None  # topup balance minus window cost
def get_stats(window_hours: int = 720) -> Dict[str, ProviderStats]:
    """
    Aggregate usage stats per provider for the given time window.
    Default window = 720h = 30 days.

    Limits from limits.json are folded into each provider's stats:
    monthly_limit_usd, topup_balance_usd, and — when a top-up balance is
    configured — estimated_remaining_usd (balance minus window cost).
    """
    cutoff = time.time() - window_hours * 3600
    stats = _aggregate_records(_load_usage(cutoff))
    limits = _load_limits()
    for provider, st in stats.items():
        cfg = limits.get(provider, {})
        if "monthly_limit_usd" in cfg:
            st.monthly_limit_usd = cfg["monthly_limit_usd"]
        if "topup_balance_usd" in cfg:
            st.topup_balance_usd = cfg["topup_balance_usd"]
            st.estimated_remaining_usd = round(cfg["topup_balance_usd"] - st.total_cost_usd, 4)
    return stats
def get_dashboard_data() -> Dict[str, Any]:
    """
    Returns structured data for the budget dashboard UI.
    Includes 24h, 7d, 30d windows.
    Single file read + in-memory filtering for all three windows.
    """
    now = time.time()
    ts_30d = now - 720 * 3600
    ts_7d = now - 168 * 3600
    ts_24h = now - 24 * 3600
    # One disk read (30d superset), then narrow in memory.
    all_records = _load_usage(since_ts=ts_30d)
    records_7d = [r for r in all_records if r.ts >= ts_7d]
    records_24h = [r for r in records_7d if r.ts >= ts_24h]
    stats_30d = _aggregate_records(all_records)
    stats_7d = _aggregate_records(records_7d)
    stats_24h = _aggregate_records(records_24h)
    limits = _load_limits()
    # Apply limits to 30d stats
    for p, s in stats_30d.items():
        lim = limits.get(p, {})
        if "monthly_limit_usd" in lim:
            s.monthly_limit_usd = lim["monthly_limit_usd"]
        if "topup_balance_usd" in lim:
            s.topup_balance_usd = lim["topup_balance_usd"]
            s.estimated_remaining_usd = round(lim["topup_balance_usd"] - s.total_cost_usd, 4)
    # Every catalog provider (except local ollama) plus any provider that
    # actually has usage recorded.
    all_providers = sorted({
        *(k for k in PRICING if k != "ollama"),
        *stats_30d.keys(),
    })
    providers_data = []
    for p in all_providers:
        # Fall back to zeroed stats for providers with no usage in a window.
        s30 = stats_30d.get(p, ProviderStats(provider=p))
        s7 = stats_7d.get(p, ProviderStats(provider=p))
        s24 = stats_24h.get(p, ProviderStats(provider=p))
        plim = limits.get(p, {})
        providers_data.append({
            "provider": p,
            "display_name": _provider_display_name(p),
            "icon": _provider_icon(p),
            # "available" = the provider's API-key env var is set and non-blank.
            "available": bool(os.getenv(_provider_env_key(p), "").strip()),
            "cost_24h": round(s24.total_cost_usd, 5),
            "cost_7d": round(s7.total_cost_usd, 5),
            "cost_30d": round(s30.total_cost_usd, 5),
            "calls_24h": s24.call_count,
            "calls_30d": s30.call_count,
            "tokens_24h": s24.total_input_tokens + s24.total_output_tokens,
            "tokens_30d": s30.total_input_tokens + s30.total_output_tokens,
            "avg_latency_ms": round(s30.avg_latency_ms),
            "monthly_limit_usd": s30.monthly_limit_usd,
            "topup_balance_usd": plim.get("topup_balance_usd"),
            "estimated_remaining_usd": s30.estimated_remaining_usd,
            "top_models": s30.top_models,
        })
    total_24h = sum(s.total_cost_usd for s in stats_24h.values())
    total_7d = sum(s.total_cost_usd for s in stats_7d.values())
    total_30d = sum(s.total_cost_usd for s in stats_30d.values())
    return {
        "providers": providers_data,
        "summary": {
            "total_cost_24h": round(total_24h, 5),
            "total_cost_7d": round(total_7d, 5),
            "total_cost_30d": round(total_30d, 5),
            "total_calls_30d": sum(s.call_count for s in stats_30d.values()),
        },
        "generated_at": now,
    }
def _aggregate_records(records: List[UsageRecord]) -> Dict[str, ProviderStats]:
    """Aggregate a list of records into per-provider stats.

    ``avg_latency_ms`` is the mean over records that reported a latency;
    zero/absent latencies are excluded.  (The previous incremental formula
    multiplied the running mean by the *total* call count while only
    updating on latency-bearing records, which skewed the average whenever
    zero-latency records were interleaved.)
    """
    by_provider: Dict[str, ProviderStats] = {}
    latency_sum: Dict[str, float] = defaultdict(float)
    latency_n: Dict[str, int] = defaultdict(int)
    model_usage: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(
        lambda: defaultdict(lambda: {"calls": 0, "cost": 0.0, "tokens": 0})
    )
    for rec in records:
        p = rec.provider
        if p not in by_provider:
            by_provider[p] = ProviderStats(provider=p)
        s = by_provider[p]
        s.total_input_tokens += rec.input_tokens
        s.total_output_tokens += rec.output_tokens
        s.total_cost_usd += rec.cost_usd
        s.call_count += 1
        if rec.latency_ms:
            latency_sum[p] += rec.latency_ms
            latency_n[p] += 1
        mu = model_usage[p][rec.model]
        mu["calls"] += 1
        mu["cost"] += rec.cost_usd
        mu["tokens"] += rec.input_tokens + rec.output_tokens
    for p, s in by_provider.items():
        if latency_n[p]:
            s.avg_latency_ms = latency_sum[p] / latency_n[p]
        # Top 3 models by spend for the dashboard.
        top = sorted(model_usage[p].items(), key=lambda x: x[1]["cost"], reverse=True)[:3]
        s.top_models = [{"model": k, **v} for k, v in top]
    return by_provider
def rotate_usage_log(max_age_days: int = 90) -> int:
    """Remove records older than max_age_days. Returns count of removed lines.

    Rewrites the JSONL file in place under the module lock.  Unparseable
    lines are dropped and counted as removed.  On failure the error is
    logged and 0-or-partial progress is reported; note the rewrite is not
    atomic (a crash mid-write could truncate the log).
    """
    if not _USAGE_FILE.exists():
        return 0
    cutoff = time.time() - max_age_days * 86400
    kept = []
    removed = 0
    with _lock:
        try:
            with open(_USAGE_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        if d.get("ts", 0) >= cutoff:
                            kept.append(line)
                        else:
                            removed += 1
                    except Exception:
                        # Corrupt line: drop it, count as removed.
                        removed += 1
            with open(_USAGE_FILE, "w", encoding="utf-8") as f:
                for line in kept:
                    f.write(line + "\n")
        except Exception as e:
            logger.warning("budget: rotate failed: %s", e)
    if removed:
        logger.info("budget: rotated %d old records (>%dd)", removed, max_age_days)
    return removed
def set_provider_limit(provider: str, monthly_limit_usd: Optional[float] = None, topup_balance_usd: Optional[float] = None) -> None:
    """Configure budget limits for a provider.

    Only the supplied (non-None) fields are updated; existing values for
    other fields and other providers are preserved.
    """
    limits = _load_limits()
    entry = limits.setdefault(provider, {})
    if monthly_limit_usd is not None:
        entry["monthly_limit_usd"] = monthly_limit_usd
    if topup_balance_usd is not None:
        entry["topup_balance_usd"] = topup_balance_usd
    _save_limits(limits)
    logger.info("budget: set limits for %s: %s", provider, entry)
def _provider_display_name(p: str) -> str:
return {
"anthropic": "Anthropic Claude",
"grok": "xAI Grok",
"deepseek": "DeepSeek",
"mistral": "Mistral AI",
"openai": "OpenAI",
"glm": "GLM / Z.AI",
"ollama": "Local (Ollama)",
}.get(p, p.title())
def _provider_icon(p: str) -> str:
return {
"anthropic": "🟣",
"grok": "",
"deepseek": "🔵",
"mistral": "🌊",
"openai": "🟢",
"glm": "🐉",
"ollama": "🖥️",
}.get(p, "🤖")
def _provider_env_key(p: str) -> str:
return {
"anthropic": "ANTHROPIC_API_KEY",
"grok": "GROK_API_KEY",
"deepseek": "DEEPSEEK_API_KEY",
"mistral": "MISTRAL_API_KEY",
"openai": "OPENAI_API_KEY",
"glm": "GLM5_API_KEY",
}.get(p, f"{p.upper()}_API_KEY")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,731 @@
"""
risk_attribution.py — Change Impact Attribution Engine (deterministic, no LLM by default).
Given a service + env, explains WHY risk spiked by correlating signals:
deploy activity, dependency scan findings, drift errors, incident storms,
SLO violations, overdue follow-ups, alert-loop degradation.
New in this revision:
- Change Timeline: ordered event stream (deploy, incident, slo, followup, …)
- Evidence refs: alert_ref[], incident_id[], release_check_run_id, artifact paths
- Per-cause refs (clickthrough IDs for UI)
Provides:
load_attribution_policy() -> Dict
compute_attribution(service, env, ...) -> AttributionReport (includes timeline + evidence_refs)
build_timeline(events, policy) -> List[TimelineItem]
fetch_signals_from_stores(service, env, ...) -> SignalsData
LLM enrichment is separate (llm_enrichment.py) and off by default.
"""
from __future__ import annotations
import datetime
import logging
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
# Process-wide cache of the parsed policy YAML; None means "not loaded yet".
_ATTR_POLICY_CACHE: Optional[Dict] = None
# Checked in order: CWD-relative config first, then the config directory
# resolved relative to this file's location.
_ATTR_POLICY_SEARCH_PATHS = [
    Path("config/risk_attribution_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_attribution_policy.yml",
]
def load_attribution_policy() -> Dict:
    """Load and cache the risk-attribution policy.

    Tries each search path in order; a read/parse failure is logged and the
    next path is tried.  When no path yields a policy, built-in defaults
    are used.  The result is cached for the process lifetime (reset via
    _reload_attribution_policy).
    """
    global _ATTR_POLICY_CACHE
    if _ATTR_POLICY_CACHE is not None:
        return _ATTR_POLICY_CACHE
    for p in _ATTR_POLICY_SEARCH_PATHS:
        if p.exists():
            try:
                with open(p) as f:
                    # An empty YAML file parses to None → normalise to {}.
                    data = yaml.safe_load(f) or {}
                _ATTR_POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load risk_attribution_policy from %s: %s", p, e)
    _ATTR_POLICY_CACHE = _builtin_attr_defaults()
    return _ATTR_POLICY_CACHE
def _reload_attribution_policy() -> None:
    """Drop the cached policy so the next load_attribution_policy() re-reads disk."""
    global _ATTR_POLICY_CACHE
    _ATTR_POLICY_CACHE = None
def _builtin_attr_defaults() -> Dict:
return {
"defaults": {"lookback_hours": 24, "max_causes": 5, "llm_mode": "off",
"llm_max_chars_in": 3500, "llm_max_chars_out": 800},
"llm_triggers": {"risk_delta_warn": 10, "risk_delta_fail": 20,
"band_in": ["high", "critical"]},
"weights": {"deploy": 30, "dependency": 25, "drift": 25, "incident_storm": 20,
"slo_violation": 15, "followups_overdue": 10, "alert_loop_degraded": 10},
"signals": {
"deploy": {"kinds": ["deploy", "deployment", "rollout", "canary"]},
"dependency": {"release_gate_names": ["dependency_scan", "deps"]},
"drift": {"release_gate_names": ["drift", "config_drift"]},
"incident_storm": {"thresholds": {"occurrences_60m_warn": 10,
"escalations_24h_warn": 2}},
"slo": {"require_active_violation": True},
},
"output": {"confidence_bands": {"high": 60, "medium": 35}},
"timeline": {
"enabled": True,
"lookback_hours": 24,
"max_items": 30,
"include_types": ["deploy", "dependency", "drift", "incident", "slo",
"followup", "alert_loop", "release_gate"],
"time_bucket_minutes": 5,
},
"evidence_linking": {"enabled": True, "max_refs_per_cause": 10},
"llm_local": {
"endpoint": "http://localhost:11434/api/generate",
"model": "llama3",
"timeout_seconds": 15,
"model_allowlist": ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"],
"max_calls_per_digest": 3,
"per_day_dedupe": True,
},
}
# ─── Confidence ───────────────────────────────────────────────────────────────
def _score_to_confidence(score: int, policy: Dict) -> str:
bands = policy.get("output", {}).get("confidence_bands", {})
high_t = int(bands.get("high", 60))
med_t = int(bands.get("medium", 35))
if score >= high_t:
return "high"
if score >= med_t:
return "medium"
return "low"
# ─── Signal detection helpers (now also return refs) ──────────────────────────
def _cap_refs(refs: List[Any], max_refs: int) -> List[Any]:
    """Return at most the first max_refs evidence refs (new list, input untouched)."""
    return refs[:max_refs]
def _detect_deploy(
alerts: List[Dict],
cutoff_iso: str,
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
"""Returns (score, evidence_list, refs)."""
kinds = set(policy.get("signals", {}).get("deploy", {}).get(
"kinds", ["deploy", "deployment", "rollout", "canary"]
))
deploy_alerts = [
a for a in alerts
if a.get("kind", "").lower() in kinds and a.get("created_at", "") >= cutoff_iso
]
if not deploy_alerts:
return 0, [], []
weight = int(policy.get("weights", {}).get("deploy", 30))
last_seen = max(a.get("created_at", "") for a in deploy_alerts)
evidence = [
f"deploy alerts: {len(deploy_alerts)} in last 24h",
f"last seen: {last_seen[:16] if last_seen else 'unknown'}",
]
refs = _cap_refs(
[{"alert_ref": a["alert_ref"], "kind": a.get("kind", "deploy"),
"ts": a.get("created_at", "")}
for a in deploy_alerts if a.get("alert_ref")],
max_refs,
)
return weight, evidence, refs
def _detect_dependency(
release_gate_results: List[Dict],
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
gate_names = set(policy.get("signals", {}).get("dependency", {}).get(
"release_gate_names", ["dependency_scan", "deps"]
))
failing = [
g for g in release_gate_results
if g.get("gate") in gate_names and g.get("status") in ("fail", "warn")
]
if not failing:
return 0, [], []
weight = int(policy.get("weights", {}).get("dependency", 25))
evidence = [f"dependency_scan gate: {g['gate']} = {g['status']}" for g in failing[:3]]
refs = _cap_refs(
[{"release_check_run_id": g.get("run_id"), "gate": g["gate"],
"artifact": g.get("artifact")}
for g in failing if g.get("run_id") or g.get("artifact")],
max_refs,
)
return weight, evidence, refs
def _detect_drift(
release_gate_results: List[Dict],
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
gate_names = set(policy.get("signals", {}).get("drift", {}).get(
"release_gate_names", ["drift", "config_drift"]
))
failing = [
g for g in release_gate_results
if g.get("gate") in gate_names and g.get("status") in ("fail", "warn")
]
if not failing:
return 0, [], []
weight = int(policy.get("weights", {}).get("drift", 25))
evidence = [f"drift gate: {g['gate']} = {g['status']}" for g in failing[:3]]
refs = _cap_refs(
[{"release_check_run_id": g.get("run_id"), "gate": g["gate"],
"artifact": g.get("artifact")}
for g in failing if g.get("run_id") or g.get("artifact")],
max_refs,
)
return weight, evidence, refs
def _detect_incident_storm(
occurrences_60m: int,
escalations_24h: int,
policy: Dict,
incident_ids: Optional[List[str]] = None,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
storm_cfg = policy.get("signals", {}).get("incident_storm", {}).get("thresholds", {})
occ_warn = int(storm_cfg.get("occurrences_60m_warn", 10))
esc_warn = int(storm_cfg.get("escalations_24h_warn", 2))
triggered = (occurrences_60m >= occ_warn) or (escalations_24h >= esc_warn)
if not triggered:
return 0, [], []
weight = int(policy.get("weights", {}).get("incident_storm", 20))
evidence = []
if occurrences_60m >= occ_warn:
evidence.append(f"occurrences_60m={occurrences_60m} (≥{occ_warn})")
if escalations_24h >= esc_warn:
evidence.append(f"escalations_24h={escalations_24h} (≥{esc_warn})")
refs = _cap_refs(
[{"incident_id": iid} for iid in (incident_ids or [])],
max_refs,
)
return weight, evidence, refs
def _detect_slo(
slo_violations: int,
policy: Dict,
slo_metrics: Optional[List[str]] = None,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
require_active = policy.get("signals", {}).get("slo", {}).get("require_active_violation", True)
if require_active and slo_violations == 0:
return 0, [], []
if slo_violations == 0:
return 0, [], []
weight = int(policy.get("weights", {}).get("slo_violation", 15))
evidence = [f"active SLO violations: {slo_violations}"]
refs = _cap_refs(
[{"metric": m} for m in (slo_metrics or [])],
max_refs,
)
return weight, evidence, refs
def _detect_followups_overdue(
overdue_count: int,
policy: Dict,
followup_refs: Optional[List[Dict]] = None,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
if overdue_count == 0:
return 0, [], []
weight = int(policy.get("weights", {}).get("followups_overdue", 10))
evidence = [f"overdue follow-ups: {overdue_count}"]
refs = _cap_refs(followup_refs or [], max_refs)
return weight, evidence, refs
def _detect_alert_loop_degraded(
loop_slo_violations: int,
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
if loop_slo_violations == 0:
return 0, [], []
weight = int(policy.get("weights", {}).get("alert_loop_degraded", 10))
evidence = [f"alert-loop SLO violations: {loop_slo_violations}"]
refs: List[Dict] = []
return weight, evidence, refs
# ─── Timeline builder ────────────────────────────────────────────────────────
def _bucket_key(ts_iso: str, bucket_minutes: int) -> str:
"""Round timestamp down to the nearest bucket boundary."""
try:
dt = datetime.datetime.fromisoformat(ts_iso.rstrip("Z"))
total_mins = dt.hour * 60 + dt.minute
bucket_start = (total_mins // bucket_minutes) * bucket_minutes
return f"{dt.strftime('%Y-%m-%d')}T{bucket_start // 60:02d}:{bucket_start % 60:02d}"
except Exception:
return ts_iso[:13] # fallback: truncate to hour
def build_timeline(
    raw_events: List[Dict],
    policy: Optional[Dict] = None,
) -> List[Dict]:
    """
    Build an ordered Change Timeline from raw event dicts.
    raw_events is a list of:
        {ts, type, label, refs, ...}
    Returns newest-first list, bucketed and capped at max_items.
    Multiple same-type events in the same time bucket are coalesced into
    one "xN" item; the label shown is that of the newest event in the
    bucket (input is sorted newest-first before coalescing).
    Config comes from policy["timeline"]: enabled, max_items,
    time_bucket_minutes, include_types (empty list = allow all types).
    """
    if policy is None:
        policy = load_attribution_policy()
    tl_cfg = policy.get("timeline", {})
    if not tl_cfg.get("enabled", True):
        return []
    max_items = int(tl_cfg.get("max_items", 30))
    bucket_minutes = int(tl_cfg.get("time_bucket_minutes", 5))
    include_types = set(tl_cfg.get("include_types", []))
    # Filter by allowed types (an empty include_types allows everything)
    filtered = [
        e for e in raw_events
        if not include_types or e.get("type") in include_types
    ]
    # Sort newest-first
    filtered.sort(key=lambda e: e.get("ts", ""), reverse=True)
    # Bucket coalescing: same type + same bucket → single item with count
    seen: Dict[str, Dict] = {}  # key → accumulated item
    order: List[str] = []  # preserve insertion order
    for ev in filtered:
        bk = _bucket_key(ev.get("ts", ""), bucket_minutes)
        key = f"{ev.get('type', 'unknown')}:{bk}"
        if key not in seen:
            # First event of this (type, bucket): refs are normalized to a
            # list of (k, v) tuples when given as a dict, so later merging
            # can extend uniformly; converted back to a dict at the end.
            seen[key] = {
                "ts": ev.get("ts", ""),
                "type": ev.get("type", "unknown"),
                "label": ev.get("label", ""),
                "refs": list(ev.get("refs", {}).items() if isinstance(ev.get("refs"), dict)
                             else ev.get("refs", [])),
                "_count": 1,
                "_latest_ts": ev.get("ts", ""),
            }
            order.append(key)
        else:
            seen[key]["_count"] += 1
            # Keep latest ts
            if ev.get("ts", "") > seen[key]["_latest_ts"]:
                seen[key]["_latest_ts"] = ev.get("ts", "")
                seen[key]["ts"] = ev.get("ts", "")
            # Merge refs (up to 5 per bucket)
            new_refs = (list(ev.get("refs", {}).items()) if isinstance(ev.get("refs"), dict)
                        else ev.get("refs", []))
            if len(seen[key]["refs"]) < 5:
                seen[key]["refs"].extend(new_refs[:5 - len(seen[key]["refs"])])
    # Build final items: strip the private accumulators, annotate counts,
    # and restore refs to dict form when they were tuple pairs.
    items = []
    for key in order:
        item = seen[key]
        count = item.pop("_count", 1)
        item.pop("_latest_ts", None)
        if count > 1:
            item["label"] = f"{item['label']} (×{count})"
        # Convert refs back to dict if needed
        if isinstance(item["refs"], list) and item["refs"] and isinstance(item["refs"][0], tuple):
            item["refs"] = dict(item["refs"])
        items.append(item)
    return items[:max_items]
def _make_timeline_events_from_alerts(
alerts: List[Dict],
deploy_kinds: set,
cutoff_iso: str,
) -> List[Dict]:
"""Convert alert records to raw timeline events."""
events = []
for a in alerts:
if a.get("created_at", "") < cutoff_iso:
continue
kind = a.get("kind", "").lower()
ev_type = "deploy" if kind in deploy_kinds else "alert"
refs = {}
if a.get("alert_ref"):
refs["alert_ref"] = a["alert_ref"]
if a.get("service"):
refs["service"] = a["service"]
events.append({
"ts": a.get("created_at", ""),
"type": ev_type,
"label": f"Alert: {kind}" + (f" ({a.get('title', '')})"
if a.get("title") else ""),
"refs": refs,
})
return events
def _make_timeline_events_from_incidents(
incidents: List[Dict],
events_by_id: Dict[str, List[Dict]],
cutoff_iso: str,
) -> List[Dict]:
"""Convert incident + escalation events to raw timeline events."""
timeline_events = []
for inc in incidents:
inc_id = inc.get("id", "")
started = inc.get("started_at") or inc.get("created_at", "")
if started >= cutoff_iso:
timeline_events.append({
"ts": started,
"type": "incident",
"label": f"Incident started: {inc.get('title', inc_id)[:80]}",
"refs": {"incident_id": inc_id},
})
for ev in events_by_id.get(inc_id, []):
if (ev.get("type") == "decision"
and "Escalat" in (ev.get("message") or "")
and ev.get("ts", "") >= cutoff_iso):
timeline_events.append({
"ts": ev["ts"],
"type": "incident",
"label": f"Incident escalated: {inc_id}",
"refs": {"incident_id": inc_id,
"event_type": ev.get("type", "")},
})
return timeline_events
def _make_timeline_events_from_gates(
release_gate_results: List[Dict],
) -> List[Dict]:
"""Convert release gate results to raw timeline events."""
events = []
for g in release_gate_results:
if g.get("status") not in ("fail", "warn"):
continue
gate_type = "dependency" if "dep" in g.get("gate", "").lower() else "release_gate"
if "drift" in g.get("gate", "").lower():
gate_type = "drift"
refs: Dict = {}
if g.get("run_id"):
refs["release_check_run_id"] = g["run_id"]
if g.get("artifact"):
refs["artifact"] = g["artifact"]
events.append({
"ts": g.get("ts", datetime.datetime.utcnow().isoformat()),
"type": gate_type,
"label": f"Gate {g['gate']} = {g['status']}",
"refs": refs,
})
return events
# ─── Evidence refs builder ────────────────────────────────────────────────────
def build_evidence_refs(
    alerts_24h: List[Dict],
    incidents_24h: List[Dict],
    release_gate_results: List[Dict],
    followup_refs: Optional[List[Dict]] = None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Collect top-level evidence references — alert refs, incident ids,
    release-check run ids, artifacts, and follow-up refs — each capped at
    policy.evidence_linking.max_refs_per_cause.
    """
    if policy is None:
        policy = load_attribution_policy()
    cap = int(policy.get("evidence_linking", {}).get("max_refs_per_cause", 10))
    alert_refs = _cap_refs(
        [a["alert_ref"] for a in alerts_24h if a.get("alert_ref")], cap
    )
    incident_ids = _cap_refs(
        list({inc.get("id", "") for inc in incidents_24h if inc.get("id")}), cap
    )
    run_ids = _cap_refs(
        list({g.get("run_id") for g in release_gate_results if g.get("run_id")}), cap
    )
    artifact_refs = _cap_refs(
        list({g.get("artifact") for g in release_gate_results if g.get("artifact")}), cap
    )
    followups = _cap_refs(
        [ref for ref in (followup_refs or []) if ref], cap
    )
    return {
        "alerts": alert_refs,
        "incidents": incident_ids,
        "release_checks": [r for r in run_ids if r],
        "artifacts": [a for a in artifact_refs if a],
        "followups": followups,
    }
# ─── Summary builder ──────────────────────────────────────────────────────────
_TYPE_LABELS = {
"deploy": "deploy activity",
"dependency": "dependency change",
"drift": "config/infrastructure drift",
"incident_storm": "incident storm",
"slo_violation": "SLO violation",
"followups_overdue": "overdue follow-ups",
"alert_loop_degraded": "alert-loop degradation",
}
def _build_summary(causes: List[Dict]) -> str:
if not causes:
return "No significant attribution signals detected."
labels = [_TYPE_LABELS.get(c["type"], c["type"]) for c in causes[:3]]
return "Likely causes: " + " + ".join(labels) + "."
# ─── Main attribution function ────────────────────────────────────────────────
def compute_attribution(
    service: str,
    env: str,
    *,
    risk_report: Optional[Dict] = None,
    # Signals (pre-fetched)
    alerts_24h: Optional[List[Dict]] = None,
    occurrences_60m: int = 0,
    escalations_24h: int = 0,
    release_gate_results: Optional[List[Dict]] = None,
    slo_violations: int = 0,
    slo_metrics: Optional[List[str]] = None,
    overdue_followup_count: int = 0,
    followup_refs: Optional[List[Dict]] = None,
    loop_slo_violations: int = 0,
    # For evidence + timeline
    incidents_24h: Optional[List[Dict]] = None,
    incident_events: Optional[Dict[str, List[Dict]]] = None,
    window_hours: int = 24,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Deterministic attribution: causes with evidence, refs, timeline, evidence_refs.
    All signal arguments default to safe empty values.
    Never raises (returns minimal report on any error).

    Pipeline: (1) normalize inputs and backfill zero-valued signals from
    risk_report.components; (2) run each _detect_* scorer and keep the
    non-zero candidates; (3) sort by score, cap at policy.defaults.max_causes
    and attach a confidence label; (4) build the change timeline and
    evidence_refs blocks.  LLM enrichment is always returned disabled here;
    a separate enrichment step may fill it in.
    """
    if policy is None:
        policy = load_attribution_policy()
    # Window lower bound as a naive-UTC ISO string; signal timestamps are
    # compared lexicographically against it.
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(hours=window_hours)
    ).isoformat()
    max_causes = int(policy.get("defaults", {}).get("max_causes", 5))
    max_refs = int(policy.get("evidence_linking", {}).get("max_refs_per_cause", 10))
    risk_report = risk_report or {}
    alerts_24h = alerts_24h or []
    release_gate_results = release_gate_results or []
    incidents_24h = incidents_24h or []
    incident_events = incident_events or {}
    # Extract from risk_report.components when not explicitly provided
    if slo_violations == 0 and risk_report:
        slo_violations = (risk_report.get("components", {}).get("slo") or {}).get("violations", 0)
    if overdue_followup_count == 0 and risk_report:
        fu = risk_report.get("components", {}).get("followups") or {}
        overdue_followup_count = fu.get("P0", 0) + fu.get("P1", 0) + fu.get("other", 0)
    if loop_slo_violations == 0 and risk_report:
        loop_slo_violations = (
            risk_report.get("components", {}).get("alerts_loop") or {}
        ).get("violations", 0)
    incident_ids = [inc.get("id", "") for inc in incidents_24h if inc.get("id")]
    # ── Score each signal (now with refs) ────────────────────────────────────
    candidates: List[Dict] = []
    score, evid, refs = _detect_deploy(alerts_24h, cutoff, policy, max_refs)
    if score:
        candidates.append({"type": "deploy", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_dependency(release_gate_results, policy, max_refs)
    if score:
        candidates.append({"type": "dependency", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_drift(release_gate_results, policy, max_refs)
    if score:
        candidates.append({"type": "drift", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_incident_storm(
        occurrences_60m, escalations_24h, policy, incident_ids, max_refs
    )
    if score:
        candidates.append({"type": "incident_storm", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_slo(slo_violations, policy, slo_metrics, max_refs)
    if score:
        candidates.append({"type": "slo_violation", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_followups_overdue(
        overdue_followup_count, policy, followup_refs, max_refs
    )
    if score:
        candidates.append({"type": "followups_overdue", "score": score,
                           "evidence": evid, "refs": refs})
    score, evid, refs = _detect_alert_loop_degraded(loop_slo_violations, policy, max_refs)
    if score:
        candidates.append({"type": "alert_loop_degraded", "score": score,
                           "evidence": evid, "refs": refs})
    # Sort desc, cap, add confidence
    candidates.sort(key=lambda c: -c["score"])
    causes = candidates[:max_causes]
    for c in causes:
        c["confidence"] = _score_to_confidence(c["score"], policy)
    delta_24h = (risk_report.get("trend") or {}).get("delta_24h")
    summary = _build_summary(causes)
    # ── Timeline ──────────────────────────────────────────────────────────────
    tl_cfg = policy.get("timeline", {})
    deploy_kinds = set(policy.get("signals", {}).get("deploy", {}).get(
        "kinds", ["deploy", "deployment", "rollout", "canary"]
    ))
    raw_events: List[Dict] = []
    raw_events.extend(_make_timeline_events_from_alerts(alerts_24h, deploy_kinds, cutoff))
    raw_events.extend(_make_timeline_events_from_incidents(incidents_24h, incident_events, cutoff))
    raw_events.extend(_make_timeline_events_from_gates(release_gate_results))
    timeline = build_timeline(raw_events, policy) if tl_cfg.get("enabled", True) else []
    # ── Evidence refs ─────────────────────────────────────────────────────────
    evidence_refs: Dict = {}
    if policy.get("evidence_linking", {}).get("enabled", True):
        evidence_refs = build_evidence_refs(
            alerts_24h, incidents_24h, release_gate_results,
            followup_refs=followup_refs, policy=policy,
        )
    return {
        "service": service,
        "env": env,
        "window_hours": window_hours,
        "delta_24h": delta_24h,
        "causes": causes,
        "summary": summary,
        "timeline": timeline,
        # Placeholder: enrichment is off by default; a later stage may populate it.
        "evidence_refs": evidence_refs,
        "llm_enrichment": {"enabled": False, "text": None},
    }
# ─── Signal fetcher (for wiring in tool_manager/risk_engine) ─────────────────
def fetch_signals_from_stores(
    service: str,
    env: str,
    window_hours: int = 24,
    *,
    alert_store=None,
    incident_store=None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Fetches raw signals from existing stores.
    Returns a dict ready to unpack into compute_attribution().
    Always non-fatal per store: each store access is wrapped in its own
    try/except and failures are logged as warnings, leaving that signal
    at its empty default.
    """
    if policy is None:
        policy = load_attribution_policy()
    # Window lower bound (naive UTC, compared lexicographically to stored ts)
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(hours=window_hours)
    ).isoformat()
    # ── Deploy + other alerts ─────────────────────────────────────────────────
    # Alerts with no "service" field are kept (global alerts); otherwise the
    # service must match.
    alerts_24h: List[Dict] = []
    try:
        if alert_store is not None:
            all_alerts = alert_store.list_alerts(limit=200)
            alerts_24h = [
                a for a in all_alerts
                if a.get("created_at", "") >= cutoff
                and (not a.get("service") or a.get("service") == service)
            ]
    except Exception as e:
        logger.warning("attribution fetch alerts failed: %s", e)
    # ── Incidents in window + event maps ──────────────────────────────────────
    incidents_24h: List[Dict] = []
    incident_events: Dict[str, List[Dict]] = {}
    occurrences_60m = 0
    escalations_24h = 0
    try:
        if incident_store is not None:
            # NOTE(review): cutoff_60m is computed but never used — the 60m
            # windowing is delegated to top_signatures(window_minutes=60)
            # below.  Confirm before removing.
            cutoff_60m = (
                datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
            ).isoformat()
            # Count alert occurrences from alert_store top_signatures
            if alert_store is not None:
                try:
                    sigs = alert_store.top_signatures(window_minutes=60, limit=20)
                    occurrences_60m = sum(s.get("occurrences", 0) for s in sigs)
                except Exception:
                    pass
            incs = incident_store.list_incidents({"service": service}, limit=30)
            for inc in incs:
                inc_id = inc.get("id", "")
                inc_started = inc.get("started_at") or inc.get("created_at", "")
                try:
                    events = incident_store.get_events(inc_id, limit=50)
                    incident_events[inc_id] = events
                    # Escalations: decision events mentioning "Escalat…"
                    # inside the window.
                    for ev in events:
                        if (ev.get("type") == "decision"
                            and "Escalat" in (ev.get("message") or "")
                            and ev.get("ts", "") >= cutoff):
                            escalations_24h += 1
                except Exception:
                    pass
                # Include incident if started within window
                if inc_started >= cutoff:
                    incidents_24h.append(inc)
    except Exception as e:
        logger.warning("attribution fetch incident signals failed: %s", e)
    return {
        "alerts_24h": alerts_24h,
        "occurrences_60m": occurrences_60m,
        "escalations_24h": escalations_24h,
        "incidents_24h": incidents_24h,
        "incident_events": incident_events,
        "release_gate_results": [],  # caller can inject if persisted
    }

View File

@@ -0,0 +1,341 @@
"""
risk_digest.py — Daily Risk Digest generator (deterministic, no LLM).
Produces:
ops/reports/risk/YYYY-MM-DD.json
ops/reports/risk/YYYY-MM-DD.md
Content:
- Top risky services (score desc)
- Top regressions (delta_24h desc)
- SLO violation summary
- Deterministic action list based on risk state
"""
from __future__ import annotations
import datetime
import json
import logging
import math
import os
from pathlib import Path
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# Message templates for the deterministic action list, keyed by the risk
# condition detected in _build_action_list.  Placeholders ({service},
# {score}, {delta}, {count}) are filled via str.format; the text is
# user-facing markdown rendered verbatim in the digest.
_ACTION_TEMPLATES = {
    "regression_fail": "🚨 **Regression detected**: {service} score +{delta} in 24h. Freeze deployments; inspect recent incidents/followups immediately.",
    "regression_warn": "⚠️ **Score rising**: {service} +{delta} in 24h. Review open incidents and overdue follow-ups.",
    "critical_band": "🔴 **Critical risk**: {service} (score {score}). Oncall review required within 2h.",
    "high_band": "🟠 **High risk**: {service} (score {score}). Coordinate with oncall before next release.",
    "overdue_followups": "📋 **Overdue follow-ups**: {service} has {count} overdue follow-up(s). Close them to reduce risk score.",
    "slo_violation": "📉 **SLO violation**: {service} has {count} active SLO violation(s). Avoid deploying until clear.",
}
def _now_date() -> str:
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
def _clamp(text: str, max_chars: int) -> str:
if len(text) <= max_chars:
return text
truncated = text[:max_chars]
return truncated + "\n\n_[digest truncated to policy max_chars]_"
def _build_action_list(reports: List[Dict]) -> List[str]:
    """Build the deterministic action list from the top-10 reports.

    Per service the order is: regression (fail beats warn), band
    (critical/high), overdue follow-ups, SLO violations.  Output is
    capped at 20 actions.
    """
    actions: List[str] = []
    for report in reports[:10]:
        svc = report.get("service", "?")
        components = report.get("components", {})
        trend = report.get("trend") or {}
        delta = trend.get("delta_24h")
        regression = trend.get("regression", {})
        # 1. Regression actions — only for a positive 24h delta.
        if delta is not None and delta > 0:
            if regression.get("fail"):
                actions.append(_ACTION_TEMPLATES["regression_fail"].format(
                    service=svc, delta=delta))
            elif regression.get("warn"):
                actions.append(_ACTION_TEMPLATES["regression_warn"].format(
                    service=svc, delta=delta))
        # 2. Band actions.
        band = report.get("band", "low")
        if band == "critical":
            actions.append(_ACTION_TEMPLATES["critical_band"].format(
                service=svc, score=report.get("score", 0)))
        elif band == "high":
            actions.append(_ACTION_TEMPLATES["high_band"].format(
                service=svc, score=report.get("score", 0)))
        # 3. Overdue follow-ups (all priorities combined).
        followups = components.get("followups") or {}
        overdue_total = (
            followups.get("P0", 0) + followups.get("P1", 0) + followups.get("other", 0)
        )
        if overdue_total:
            actions.append(_ACTION_TEMPLATES["overdue_followups"].format(
                service=svc, count=overdue_total))
        # 4. SLO violations.
        slo_violations = (components.get("slo") or {}).get("violations", 0)
        if slo_violations:
            actions.append(_ACTION_TEMPLATES["slo_violation"].format(
                service=svc, count=slo_violations))
    return actions[:20]  # cap
def _build_markdown(
    date_str: str,
    env: str,
    reports: List[Dict],
    top_regressions: List[Dict],
    improving: List[Dict],
    actions: List[str],
    band_counts: Dict,
) -> str:
    """Render the digest as a markdown document.

    Sections, in order: band summary table, top risky services table,
    top regressions, likely causes (attribution) for the top regressing
    services, change timeline previews, improving services, and the
    action list.  Output is deterministic given the inputs; the caller
    clamps it to the policy's max length.
    """
    lines = [
        f"# Risk Digest — {date_str} ({env})",
        "",
        f"Generated: {datetime.datetime.utcnow().isoformat()} UTC",
        "",
        "## Band Summary",
        "",
        "| Band | Count |",
        "|------|-------|",
    ]
    for band in ("critical", "high", "medium", "low"):
        lines.append(f"| {band} | {band_counts.get(band, 0)} |")
    lines += [
        "",
        "## Top Risky Services",
        "",
        "| Service | Score | Band | Δ24h | Δ7d |",
        "|---------|-------|------|------|-----|",
    ]
    for r in reports:
        t = r.get("trend") or {}
        d24 = t.get("delta_24h")
        d7 = t.get("delta_7d")
        # Positive deltas get an explicit "+"; missing deltas render as "—"
        d24_str = (f"+{d24}" if d24 and d24 > 0 else str(d24)) if d24 is not None else "—"
        d7_str = (f"+{d7}" if d7 and d7 > 0 else str(d7)) if d7 is not None else "—"
        lines.append(
            f"| {r['service']} | {r.get('score', 0)} | {r.get('band', '?')} "
            f"| {d24_str} | {d7_str} |"
        )
    if top_regressions:
        lines += ["", "## Top Regressions (Δ24h)", ""]
        for item in top_regressions:
            delta = item.get("delta_24h", 0)
            lines.append(f"- **{item['service']}**: +{delta} points in 24h")
    # ── Likely Causes (Attribution) ───────────────────────────────────────────
    # Only regressing services that carry a non-empty attribution block.
    regressions_with_attribution = [
        r for r in reports
        if (r.get("trend") or {}).get("delta_24h") is not None
        and r["trend"]["delta_24h"] > 0
        and r.get("attribution") is not None
        and r["attribution"].get("causes")
    ]
    regressions_with_attribution = sorted(
        regressions_with_attribution,
        key=lambda r: -(r.get("trend") or {}).get("delta_24h", 0),
    )[:5]
    if regressions_with_attribution:
        lines += ["", "## Likely Causes (Top Regressions)", ""]
        for r in regressions_with_attribution:
            svc = r["service"]
            attr = r["attribution"]
            delta = r["trend"]["delta_24h"]
            summary = attr.get("summary", "")
            lines.append(f"### {svc} (+{delta} pts)")
            if summary:
                lines.append(f"> {summary}")
            causes = attr.get("causes", [])[:2]
            for c in causes:
                evid = "; ".join(c.get("evidence", []))
                lines.append(
                    f"- **{c['type']}** (confidence: {c.get('confidence', '?')}): {evid}"
                )
            # LLM text if available
            llm = attr.get("llm_enrichment") or {}
            if llm.get("enabled") and llm.get("text"):
                lines += ["", f"  _LLM insight_: {llm['text'][:400]}"]
            lines.append("")
    # ── Change Timeline (Top Regressions) ────────────────────────────────────
    regressions_with_timeline = [
        r for r in regressions_with_attribution
        if r.get("attribution") and r["attribution"].get("timeline")
    ]
    if regressions_with_timeline:
        lines += ["", "## Change Timeline (Top Regressions)", ""]
        for r in regressions_with_timeline:
            svc = r["service"]
            timeline = r["attribution"]["timeline"][:5]  # top 5 per service
            lines.append(f"### {svc}")
            for item in timeline:
                # ts trimmed to "YYYY-MM-DDTHH:MM" for compact display
                ts = (item.get("ts") or "")[:16]
                label = item.get("label", "")
                ev_type = item.get("type", "")
                lines.append(f"- `{ts}` [{ev_type}] {label}")
            lines.append("")
    if improving:
        lines += ["", "## Improving Services (Δ7d)", ""]
        for item in improving:
            delta = item.get("delta_7d", 0)
            lines.append(f"- **{item['service']}**: {delta} points over 7d")
    if actions:
        lines += ["", "## Action List", ""]
        for action in actions:
            lines.append(f"- {action}")
    lines += ["", "---", "_Generated by DAARION.city Risk Digest (deterministic, no LLM by default)_"]
    return "\n".join(lines)
def daily_digest(
    env: str = "prod",
    *,
    service_reports: Optional[List[Dict]] = None,
    policy: Optional[Dict] = None,
    date_str: Optional[str] = None,
    output_dir: Optional[str] = None,
    write_files: bool = True,
) -> Dict:
    """
    Build and optionally persist the daily risk digest.
    service_reports — pre-fetched+enriched list of RiskReports (with trend).
    Returns {json_path, md_path, json_data, markdown, date, env}

    The digest is deterministic: reports are sorted by score, the top N
    (policy.digest.top_n) are summarized, regressions/improvements are
    derived from trend deltas, and the markdown is clamped to
    policy.digest.markdown_max_chars.  File writes are best-effort: a
    failure is logged and the paths come back as None while the in-memory
    digest is still returned.
    """
    # Local import avoids a module-level import cycle with risk_engine.
    # NOTE(review): compute_risk_dashboard is imported but unused here —
    # confirm before removing.
    from risk_engine import load_risk_policy, compute_risk_dashboard
    if policy is None:
        policy = load_risk_policy()
    digest_cfg = policy.get("digest", {})
    top_n = int(digest_cfg.get("top_n", 10))
    max_chars = int(digest_cfg.get("markdown_max_chars", 8000))
    cfg_output_dir = digest_cfg.get("output_dir", "ops/reports/risk")
    effective_output_dir = output_dir or cfg_output_dir
    effective_date = date_str or _now_date()
    # Highest-risk first, capped at top_n
    reports = sorted(service_reports or [], key=lambda r: -r.get("score", 0))[:top_n]
    # Band counts
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1
    # Top regressions: positive 24h delta, largest first, max 5
    top_regressions = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_24h") is not None
         and r["trend"]["delta_24h"] > 0],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]
    top_regressions_out = [
        {"service": r["service"], "delta_24h": r["trend"]["delta_24h"],
         "attribution_causes": [
             {"type": c["type"], "score": c["score"],
              "confidence": c.get("confidence", "low"),
              "evidence": c.get("evidence", [])[:2],
              "refs": c.get("refs", [])[:3]}
             for c in (r.get("attribution") or {}).get("causes", [])[:2]
         ],
         "timeline_preview": (r.get("attribution") or {}).get("timeline", [])[:3],
         }
        for r in top_regressions
    ]
    # Improving services: negative 7d delta, most-improved first, max 5
    improving = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_7d") is not None
         and r["trend"]["delta_7d"] < 0],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]
    improving_out = [
        {"service": r["service"], "delta_7d": r["trend"]["delta_7d"]}
        for r in improving
    ]
    actions = _build_action_list(reports)
    markdown_raw = _build_markdown(
        date_str=effective_date,
        env=env,
        reports=reports,
        top_regressions=top_regressions_out,
        improving=improving_out,
        actions=actions,
        band_counts=band_counts,
    )
    markdown = _clamp(markdown_raw, max_chars)
    # Machine-readable counterpart of the markdown digest
    json_data = {
        "date": effective_date,
        "env": env,
        "generated_at": datetime.datetime.utcnow().isoformat(),
        "band_counts": band_counts,
        "top_services": [
            {
                "service": r.get("service"),
                "score": r.get("score"),
                "band": r.get("band"),
                "delta_24h": (r.get("trend") or {}).get("delta_24h"),
                "delta_7d": (r.get("trend") or {}).get("delta_7d"),
                "regression": (r.get("trend") or {}).get("regression"),
                "reasons": r.get("reasons", [])[:5],
                "attribution_summary": (r.get("attribution") or {}).get("summary"),
                "top_causes": [
                    {"type": c["type"], "score": c["score"],
                     "confidence": c.get("confidence", "low"),
                     "evidence": c.get("evidence", [])[:2],
                     "refs": c.get("refs", [])[:3]}
                    for c in (r.get("attribution") or {}).get("causes", [])[:2]
                ],
                "timeline_preview": (r.get("attribution") or {}).get("timeline", [])[:3],
                "evidence_refs": (r.get("attribution") or {}).get("evidence_refs", {}),
            }
            for r in reports
        ],
        "top_regressions": top_regressions_out,
        "improving_services": improving_out,
        "actions": actions,
    }
    json_path: Optional[str] = None
    md_path: Optional[str] = None
    if write_files:
        try:
            out = Path(effective_output_dir)
            out.mkdir(parents=True, exist_ok=True)
            json_path = str(out / f"{effective_date}.json")
            md_path = str(out / f"{effective_date}.md")
            with open(json_path, "w") as f:
                json.dump(json_data, f, indent=2)
            with open(md_path, "w") as f:
                f.write(markdown)
            logger.info("Risk digest written: %s, %s", json_path, md_path)
        except Exception as e:
            # Best-effort persistence: the digest is still returned in memory
            logger.warning("Risk digest write failed: %s", e)
            json_path = md_path = None
    return {
        "date": effective_date,
        "env": env,
        "json_path": json_path,
        "md_path": md_path,
        "json_data": json_data,
        "markdown": markdown,
    }

View File

@@ -0,0 +1,710 @@
"""
risk_engine.py — Service Risk Index Engine (deterministic, no LLM).
Provides:
compute_service_risk(service, env, ...) -> RiskReport
compute_risk_dashboard(env, top_n, ...) -> Dashboard
compute_trend(series) -> TrendReport
enrich_risk_report_with_trend(report, history_store, policy) -> report (mutated)
snapshot_all_services(env, compute_fn, history_store, policy) -> SnapshotResult
All inputs come from existing stores and tools.
The engine never calls external services directly — callers inject store references.
"""
from __future__ import annotations
import datetime
import logging
import math
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
# Process-wide cache of the parsed policy; populated on first load and
# cleared via _reload_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy-file locations, checked in order; the second entry
# resolves relative to this source file's grandparent directory.
_POLICY_SEARCH_PATHS = [
    Path("config/risk_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_policy.yml",
]
def load_risk_policy() -> Dict:
    """Load and cache the risk policy, falling back to built-in defaults.

    The first readable file in _POLICY_SEARCH_PATHS wins; parse errors
    are logged and the search continues.  When no file loads, the
    built-in defaults are cached and returned.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is None:
        for candidate in _POLICY_SEARCH_PATHS:
            if not candidate.exists():
                continue
            try:
                with open(candidate) as fh:
                    _POLICY_CACHE = yaml.safe_load(fh) or {}
                return _POLICY_CACHE
            except Exception as exc:
                logger.warning("Failed to load risk_policy from %s: %s", candidate, exc)
        logger.warning("risk_policy.yml not found; using built-in defaults")
        _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
def _builtin_defaults() -> Dict:
return {
"defaults": {"window_hours": 24, "recurrence_windows_days": [7, 30],
"slo_window_minutes": 60},
"thresholds": {
"bands": {"low_max": 20, "medium_max": 50, "high_max": 80},
"risk_watch": {"warn_at": 50, "fail_at": 80},
},
"weights": {
"open_incidents": {"P0": 50, "P1": 25, "P2": 10, "P3": 5},
"recurrence": {
"signature_warn_7d": 10, "signature_high_7d": 20,
"kind_warn_7d": 8, "kind_high_7d": 15,
"signature_high_30d": 10, "kind_high_30d": 8,
},
"followups": {"overdue_P0": 20, "overdue_P1": 12, "overdue_other": 6},
"slo": {"violation": 10},
"alerts_loop": {"slo_violation": 10},
"escalation": {"escalations_24h": {"warn": 5, "high": 12}},
},
"service_overrides": {},
"p0_services": ["gateway", "router"],
}
def _reload_policy() -> None:
    """Drop the cached policy so the next load_risk_policy() re-reads disk."""
    global _POLICY_CACHE
    _POLICY_CACHE = None
# ─── Band classification ──────────────────────────────────────────────────────
def score_to_band(score: int, policy: Dict) -> str:
    """Map a numeric risk score to its band name using policy thresholds.

    Upper bounds are inclusive; anything above high_max is "critical".
    """
    bands = policy.get("thresholds", {}).get("bands", {})
    ladder = (
        ("low", int(bands.get("low_max", 20))),
        ("medium", int(bands.get("medium_max", 50))),
        ("high", int(bands.get("high_max", 80))),
    )
    for band_name, upper_bound in ladder:
        if score <= upper_bound:
            return band_name
    return "critical"
def get_service_thresholds(service: str, policy: Dict) -> Dict:
    """Resolve warn/fail risk-watch thresholds, honoring per-service overrides.

    Precedence: service_overrides[service].risk_watch, then
    thresholds.risk_watch, then the built-in 50/80 defaults.
    """
    base = policy.get("thresholds", {}).get("risk_watch", {})
    override = policy.get("service_overrides", {}).get(service, {}).get("risk_watch", {})

    def _pick(key: str, fallback: int) -> int:
        return int(override.get(key, base.get(key, fallback)))

    return {"warn_at": _pick("warn_at", 50), "fail_at": _pick("fail_at", 80)}
# ─── Individual scoring components ───────────────────────────────────────────
def _score_open_incidents(
open_incidents: List[Dict],
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score open incidents by severity."""
w = weights.get("open_incidents", {})
counts: Dict[str, int] = {"P0": 0, "P1": 0, "P2": 0, "P3": 0}
points = 0
for inc in open_incidents:
sev = inc.get("severity", "P3")
if sev in counts:
counts[sev] += 1
pts = int(w.get(sev, 0))
points += pts
reasons = []
if counts["P0"]:
reasons.append(f"Open P0 incident(s): {counts['P0']}")
if counts["P1"]:
reasons.append(f"Open P1 incident(s): {counts['P1']}")
if counts["P2"]:
reasons.append(f"Open P2 incident(s): {counts['P2']}")
return points, {**counts, "points": points}, reasons
def _score_recurrence(
recurrence_data: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score from recurrence detection stats."""
w = weights.get("recurrence", {})
high_rec = recurrence_data.get("high_recurrence", {})
warn_rec = recurrence_data.get("warn_recurrence", {})
high_sigs_7d = len(high_rec.get("signatures", []))
high_kinds_7d = len(high_rec.get("kinds", []))
warn_sigs_7d = len(warn_rec.get("signatures", []))
warn_kinds_7d = len(warn_rec.get("kinds", []))
# Note: 30d data comes from separate call; keep it optional
high_sigs_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("signatures", []))
high_kinds_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("kinds", []))
points = (
high_sigs_7d * int(w.get("signature_high_7d", 20))
+ warn_sigs_7d * int(w.get("signature_warn_7d", 10))
+ high_kinds_7d * int(w.get("kind_high_7d", 15))
+ warn_kinds_7d * int(w.get("kind_warn_7d", 8))
+ high_sigs_30d * int(w.get("signature_high_30d", 10))
+ high_kinds_30d * int(w.get("kind_high_30d", 8))
)
component = {
"high_signatures_7d": high_sigs_7d,
"warn_signatures_7d": warn_sigs_7d,
"high_kinds_7d": high_kinds_7d,
"warn_kinds_7d": warn_kinds_7d,
"high_signatures_30d": high_sigs_30d,
"high_kinds_30d": high_kinds_30d,
"points": points,
}
reasons = []
if high_sigs_7d:
reasons.append(f"High recurrence signatures (7d): {high_sigs_7d}")
if high_kinds_7d:
reasons.append(f"High recurrence kinds (7d): {high_kinds_7d}")
if warn_sigs_7d:
reasons.append(f"Warn recurrence signatures (7d): {warn_sigs_7d}")
return points, component, reasons
def _score_followups(
followups_data: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score overdue follow-ups by priority."""
w = weights.get("followups", {})
overdue = followups_data.get("overdue_followups", [])
counts: Dict[str, int] = {"P0": 0, "P1": 0, "other": 0}
points = 0
for fu in overdue:
prio = fu.get("priority", "other")
if prio == "P0":
counts["P0"] += 1
points += int(w.get("overdue_P0", 20))
elif prio == "P1":
counts["P1"] += 1
points += int(w.get("overdue_P1", 12))
else:
counts["other"] += 1
points += int(w.get("overdue_other", 6))
reasons = []
if counts["P0"]:
reasons.append(f"Overdue follow-ups (P0): {counts['P0']}")
if counts["P1"]:
reasons.append(f"Overdue follow-ups (P1): {counts['P1']}")
if counts["other"]:
reasons.append(f"Overdue follow-ups (other): {counts['other']}")
return points, {**counts, "points": points}, reasons
def _score_slo(
slo_data: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score SLO violations."""
w = weights.get("slo", {})
violations = slo_data.get("violations", [])
skipped = slo_data.get("skipped", False)
if skipped:
return 0, {"violations": 0, "skipped": True, "points": 0}, []
count = len(violations)
points = count * int(w.get("violation", 10))
reasons = []
if count:
reasons.append(f"Active SLO violation(s) in window: {count}")
return points, {"violations": count, "skipped": False, "points": points}, reasons
def _score_alerts_loop(
loop_slo: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score alert-loop SLO violations (self-monitoring)."""
w = weights.get("alerts_loop", {})
violations = loop_slo.get("violations", [])
count = len(violations)
points = count * int(w.get("slo_violation", 10))
reasons = []
if count:
reasons.append(f"Alert-loop SLO violation(s): {count}")
return points, {"violations": count, "points": points}, reasons
def _score_escalations(
escalation_count: int,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score escalations in last 24h."""
esc_w = weights.get("escalation", {}).get("escalations_24h", {})
warn_pts = int(esc_w.get("warn", 5))
high_pts = int(esc_w.get("high", 12))
if escalation_count >= 3:
points = high_pts
elif escalation_count >= 1:
points = warn_pts
else:
points = 0
reasons = []
if escalation_count:
reasons.append(f"Escalations in last 24h: {escalation_count}")
return points, {"count_24h": escalation_count, "points": points}, reasons
# ─── Main scoring function ────────────────────────────────────────────────────
def compute_service_risk(
    service: str,
    env: str = "prod",
    *,
    open_incidents: Optional[List[Dict]] = None,
    recurrence_7d: Optional[Dict] = None,
    recurrence_30d: Optional[Dict] = None,
    followups_data: Optional[Dict] = None,
    slo_data: Optional[Dict] = None,
    alerts_loop_slo: Optional[Dict] = None,
    escalation_count_24h: int = 0,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute the risk score report for a service.

    Callers fetch the raw signal dicts themselves and pass them in; every
    argument defaults to an empty/safe value so scoring never crashes on
    missing data. Returns the full RiskReport dict (score, band, per-component
    breakdown, reasons, recommendations).
    """
    if policy is None:
        policy = load_risk_policy()
    weights = policy.get("weights", _builtin_defaults()["weights"])

    # Fold the 30d recurrence buckets into the 7d dict under *_30d keys.
    recurrence = dict(recurrence_7d or {})
    if recurrence_30d:
        recurrence["high_recurrence_30d"] = recurrence_30d.get("high_recurrence", {})
        recurrence["warn_recurrence_30d"] = recurrence_30d.get("warn_recurrence", {})

    # Each scorer yields (points, component-dict, reasons); keep this order —
    # it fixes both the reason ordering and the component unpacking below.
    scored = [
        _score_open_incidents(open_incidents or [], weights),
        _score_recurrence(recurrence, weights),
        _score_followups(followups_data or {}, weights),
        _score_slo(slo_data or {}, weights),
        _score_alerts_loop(alerts_loop_slo or {}, weights),
        _score_escalations(escalation_count_24h, weights),
    ]
    total = max(0, sum(points for points, _, _ in scored))
    all_reasons: List[str] = []
    for _, _, notes in scored:
        all_reasons.extend(notes)
    comp_inc, comp_rec, comp_fu, comp_slo, comp_loop, comp_esc = (
        component for _, component, _ in scored
    )

    band = score_to_band(total, policy)
    # Deterministic recommendations derived from band + key components.
    recs = _build_recommendations(band, comp_inc, comp_rec, comp_fu, comp_slo)
    return {
        "service": service,
        "env": env,
        "score": total,
        "band": band,
        "thresholds": get_service_thresholds(service, policy),
        "components": {
            "open_incidents": comp_inc,
            "recurrence": comp_rec,
            "followups": comp_fu,
            "slo": comp_slo,
            "alerts_loop": comp_loop,
            "escalations": comp_esc,
        },
        "reasons": all_reasons,
        "recommendations": recs,
        "updated_at": datetime.datetime.utcnow().isoformat(),
    }
def _build_recommendations(
band: str,
comp_inc: Dict,
comp_rec: Dict,
comp_fu: Dict,
comp_slo: Dict,
) -> List[str]:
recs = []
if comp_inc.get("P0", 0) or comp_inc.get("P1", 0):
recs.append("Prioritize open P0/P1 incidents before deploying.")
if comp_rec.get("high_signatures_7d", 0) or comp_rec.get("high_kinds_7d", 0):
recs.append("Investigate recurring failure patterns (high recurrence buckets).")
if comp_fu.get("P0", 0) or comp_fu.get("P1", 0):
recs.append("Prioritize follow-up closure for recurring bucket(s).")
if comp_slo.get("violations", 0):
recs.append("Avoid risky deploys until SLO violation clears.")
if band in ("high", "critical"):
recs.append("Service is high-risk — coordinate with oncall before release.")
return recs[:6]
# ─── Dashboard ────────────────────────────────────────────────────────────────
# ─── Trend computation ────────────────────────────────────────────────────────
def compute_trend(
    series: List,  # List[RiskSnapshot] — most-recent first
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute trend metrics from a list of RiskSnapshot objects (or dicts).

    `series` must be ordered most-recent first; elements may be RiskSnapshot
    dataclasses or plain dicts with "ts"/"score" keys.

    Returns:
        delta_24h, delta_7d, slope_per_day, volatility, regression{warn, fail}
        (deltas/slope/volatility are None when not computable).
    """
    if policy is None:
        policy = load_risk_policy()
    trend_cfg = policy.get("trend", {})
    reg = trend_cfg.get("regression_threshold", {})
    warn_24h = int(reg.get("delta_24h_warn", 10))
    fail_24h = int(reg.get("delta_24h_fail", 20))
    warn_7d = int(reg.get("delta_7d_warn", 15))
    fail_7d = int(reg.get("delta_7d_fail", 30))
    if not series:
        return _empty_trend()

    # Normalise: accept both RiskSnapshot dataclasses and plain dicts
    def _score(s) -> int:
        return int(s.score if hasattr(s, "score") else s["score"])

    def _ts(s) -> str:
        return s.ts if hasattr(s, "ts") else s["ts"]

    now = datetime.datetime.utcnow()
    latest_score = _score(series[0])

    # ── delta_24h: latest minus first snapshot at/before the 24h cutoff ──────
    cutoff_24h = (now - datetime.timedelta(hours=24)).isoformat()
    base_24h = _find_baseline(series, cutoff_24h, _ts)
    delta_24h = (latest_score - _score(base_24h)) if base_24h is not None else None

    # ── delta_7d ──────────────────────────────────────────────────────────────
    cutoff_7d = (now - datetime.timedelta(hours=168)).isoformat()
    base_7d = _find_baseline(series, cutoff_7d, _ts)
    delta_7d = (latest_score - _score(base_7d)) if base_7d is not None else None

    # ── slope (simple linear regression over all available points) ────────────
    slope_per_day: Optional[float] = None
    if len(series) >= 2:
        # ages[i] = hours before `now`; re-express as hours since the oldest
        # point so x grows with time (0 = oldest, max = newest).
        ages = [(now - _parse_ts(_ts(s))).total_seconds() / 3600.0 for s in series]
        # Fix: hoist max(ages) — it was recomputed inside the comprehension
        # for every element, making this step accidentally O(n^2).
        oldest_age = max(ages)
        hours_from_oldest = [oldest_age - age for age in ages]
        scores = [_score(s) for s in series]
        slope_per_day = _linear_slope(hours_from_oldest, scores) * 24  # per day

    # ── volatility (stddev of daily last-score-per-day over 7d) ──────────────
    volatility: Optional[float] = None
    daily_scores = _daily_latest_scores(series, days=7, _ts_fn=_ts, _score_fn=_score)
    if len(daily_scores) >= 2:
        mean = sum(daily_scores) / len(daily_scores)
        variance = sum((x - mean) ** 2 for x in daily_scores) / len(daily_scores)
        volatility = round(math.sqrt(variance), 2)

    # ── regression flags (warn/fail when either window regressed enough) ─────
    reg_warn = (
        (delta_24h is not None and delta_24h >= warn_24h)
        or (delta_7d is not None and delta_7d >= warn_7d)
    )
    reg_fail = (
        (delta_24h is not None and delta_24h >= fail_24h)
        or (delta_7d is not None and delta_7d >= fail_7d)
    )
    return {
        "delta_24h": delta_24h,
        "delta_7d": delta_7d,
        "slope_per_day": round(slope_per_day, 2) if slope_per_day is not None else None,
        "volatility": volatility,
        "regression": {"warn": reg_warn, "fail": reg_fail},
    }
def _empty_trend() -> Dict:
return {
"delta_24h": None, "delta_7d": None,
"slope_per_day": None, "volatility": None,
"regression": {"warn": False, "fail": False},
}
def _find_baseline(series, cutoff_iso: str, ts_fn):
"""Return the first element whose ts <= cutoff (series is newest-first)."""
for s in series:
if ts_fn(s) <= cutoff_iso:
return s
return None
def _parse_ts(ts_str: str) -> datetime.datetime:
ts_str = ts_str.rstrip("Z")
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
try:
return datetime.datetime.strptime(ts_str, fmt)
except ValueError:
continue
return datetime.datetime.utcnow()
def _linear_slope(xs: List[float], ys: List[float]) -> float:
"""Simple least-squares slope (score per hour)."""
n = len(xs)
if n < 2:
return 0.0
x_mean = sum(xs) / n
y_mean = sum(ys) / n
num = sum((xs[i] - x_mean) * (ys[i] - y_mean) for i in range(n))
den = sum((xs[i] - x_mean) ** 2 for i in range(n))
return num / den if den != 0 else 0.0
def _daily_latest_scores(series, days: int, _ts_fn, _score_fn) -> List[float]:
"""Collect the latest score for each calendar day over last `days` days."""
now = datetime.datetime.utcnow()
day_scores: Dict[str, int] = {}
cutoff = (now - datetime.timedelta(days=days)).isoformat()
for s in series:
ts = _ts_fn(s)
if ts < cutoff:
break
day_key = ts[:10] # YYYY-MM-DD
if day_key not in day_scores: # series is newest-first, so first = latest
day_scores[day_key] = _score_fn(s)
return list(day_scores.values())
def enrich_risk_report_with_trend(
    report: Dict,
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
) -> Dict:
    """Add a `trend` key to `report` in place and return it.

    Non-fatal: any failure logs a warning and sets `trend` to None.
    """
    try:
        if policy is None:
            policy = load_risk_policy()
        window_hours = int(
            policy.get("trend", {}).get("volatility_window_hours", 168)
        )
        series = history_store.get_series(
            report.get("service", ""),
            report.get("env", "prod"),
            hours=window_hours,
            limit=500,
        )
        report["trend"] = compute_trend(series, policy=policy)
    except Exception as exc:
        logger.warning("enrich_risk_report_with_trend failed for %s: %s",
                       report.get("service"), exc)
        report["trend"] = None
    return report
def enrich_risk_report_with_attribution(
    report: Dict,
    *,
    alert_store=None,
    incident_store=None,
    attr_policy: Optional[Dict] = None,
) -> Dict:
    """Add an `attribution` key to `report` in place and return it.

    Non-fatal: any failure logs a warning and sets `attribution` to None.
    LLM enrichment is delegated to `maybe_enrich_attribution`, which is
    bounded and off unless policy.llm_mode enables it.
    """
    try:
        # Imported lazily so a missing attribution module degrades gracefully.
        from llm_enrichment import maybe_enrich_attribution
        from risk_attribution import (
            compute_attribution, fetch_signals_from_stores, load_attribution_policy,
        )

        policy = attr_policy if attr_policy is not None else load_attribution_policy()
        service = report.get("service", "")
        env = report.get("env", "prod")
        lookback = int((policy.get("defaults") or {}).get("lookback_hours", 24))

        # Fetch raw signals for the configured lookback window.
        signals = fetch_signals_from_stores(
            service, env,
            window_hours=lookback,
            alert_store=alert_store,
            incident_store=incident_store,
            policy=policy,
        )
        attribution = compute_attribution(
            service, env,
            risk_report=report,
            policy=policy,
            **signals,
        )
        # Optional, bounded LLM pass (off by default).
        attribution["llm_enrichment"] = maybe_enrich_attribution(
            attribution, report, policy
        )
        report["attribution"] = attribution
    except Exception as exc:
        logger.warning("enrich_risk_report_with_attribution failed for %s: %s",
                       report.get("service"), exc)
        report["attribution"] = None
    return report
# ─── Snapshot writer ──────────────────────────────────────────────────────────
def snapshot_all_services(
    env: str,
    compute_fn,  # Callable[[str, str], Dict] — returns RiskReport for (service, env)
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
    known_services: Optional[List[str]] = None,
) -> Dict:
    """
    Compute and persist a RiskSnapshot for every known service.

    `compute_fn(service, env)` must return a RiskReport dict. The service
    list is capped by policy.history.max_services_per_run. Failures are
    logged per service and never abort the run.

    Returns {written, skipped, errors, services, env, ts}.
    """
    if policy is None:
        policy = load_risk_policy()
    from risk_history_store import RiskSnapshot

    cap = int(policy.get("history", {}).get("max_services_per_run", 50))
    targets = (known_services or [])[:cap]

    written = 0
    errors = 0
    succeeded: List[str] = []
    for service in targets:
        try:
            report = compute_fn(service, env)
            snapshot = RiskSnapshot(
                ts=datetime.datetime.utcnow().isoformat(),
                service=service,
                env=env,
                score=int(report.get("score", 0)),
                band=report.get("band", "low"),
                components=report.get("components", {}),
                reasons=report.get("reasons", []),
            )
            history_store.write_snapshot([snapshot])
            written += 1
            succeeded.append(service)
        except Exception as exc:
            logger.warning("snapshot_all_services: error for %s/%s: %s", service, env, exc)
            errors += 1
    return {
        "written": written,
        # Nothing is skipped today; field reserved for future skip logic.
        "skipped": 0,
        "errors": errors,
        "services": succeeded,
        "env": env,
        "ts": datetime.datetime.utcnow().isoformat(),
    }
def compute_risk_dashboard(
    env: str = "prod",
    top_n: int = 10,
    *,
    service_reports: Optional[List[Dict]] = None,
    history_store=None,  # Optional[RiskHistoryStore] — if provided, enrich with trend
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Build the risk dashboard from pre-computed service reports.

    Reports are sorted by score (desc) and truncated to `top_n` before any
    summarisation, so band counts and trend summaries cover the top-N only.
    When `history_store` is given, each kept report is enriched with trend.
    """
    if policy is None:
        policy = load_risk_policy()

    ranked = sorted(service_reports or [], key=lambda rep: -rep.get("score", 0))
    reports = ranked[:top_n]

    if history_store is not None:
        for rep in reports:
            enrich_risk_report_with_trend(rep, history_store, policy)

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for rep in reports:
        band = rep.get("band", "low")
        band_counts[band] = band_counts.get(band, 0) + 1

    # P0 services currently in a risky band.
    p0_services = set(policy.get("p0_services", []))
    critical_p0 = [
        rep for rep in reports
        if rep["service"] in p0_services and rep["band"] in ("high", "critical")
    ]

    def _trend_metric(rep: Dict, key: str):
        # Trend may be absent or None on un-enriched reports.
        return (rep.get("trend") or {}).get(key)

    # Top regressions: largest positive 24h delta.
    regressing = [
        rep for rep in reports
        if _trend_metric(rep, "delta_24h") is not None and rep["trend"]["delta_24h"] > 0
    ]
    regressing.sort(key=lambda rep: -rep["trend"]["delta_24h"])
    top_regressions = regressing[:5]

    # Improving services: most negative 7d delta.
    recovering = [
        rep for rep in reports
        if _trend_metric(rep, "delta_7d") is not None and rep["trend"]["delta_7d"] < 0
    ]
    recovering.sort(key=lambda rep: rep["trend"]["delta_7d"])
    improving = recovering[:5]

    # Regression summaries, with top-2 causes when attribution is present.
    top_regression_summaries = []
    for rep in top_regressions:
        summary: Dict = {
            "service": rep["service"],
            "delta_24h": rep["trend"]["delta_24h"],
        }
        attribution = rep.get("attribution")
        if attribution and attribution.get("causes"):
            summary["causes"] = attribution["causes"][:2]
            summary["attribution_summary"] = attribution.get("summary", "")
        top_regression_summaries.append(summary)

    now_iso = datetime.datetime.utcnow().isoformat()
    return {
        "env": env,
        "generated_at": now_iso,
        "history_updated_at": now_iso,
        "total_services": len(reports),
        "band_counts": band_counts,
        "critical_p0_services": [rep["service"] for rep in critical_p0],
        "top_regressions": top_regression_summaries,
        "improving_services": [
            {"service": rep["service"], "delta_7d": rep["trend"]["delta_7d"]}
            for rep in improving
        ],
        "services": reports,
    }

View File

@@ -0,0 +1,409 @@
"""
risk_history_store.py — Storage layer for Risk Score snapshots.
Provides:
RiskSnapshot — dataclass for a single point-in-time risk record
RiskHistoryStore — abstract base
MemoryRiskHistoryStore — in-process (tests + fallback)
NullRiskHistoryStore — no-op (disabled)
PostgresRiskHistoryStore — Postgres primary (psycopg2 sync)
AutoRiskHistoryStore — Postgres → Memory fallback
Factory: get_risk_history_store() → AutoRiskHistoryStore by default
"""
from __future__ import annotations
import datetime
import json
import logging
import os
import threading
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Data model ───────────────────────────────────────────────────────────────
@dataclass
class RiskSnapshot:
    """One point-in-time risk record for a (service, env) pair."""
    ts: str  # ISO-8601 UTC
    service: str
    env: str
    score: int  # total risk points (non-negative)
    band: str  # one of low/medium/high/critical (see band counters downstream)
    components: Dict = field(default_factory=dict)  # per-component score breakdown
    reasons: List[str] = field(default_factory=list)  # human-readable reasons
    def to_dict(self) -> Dict:
        """Plain-dict form (used for dashboards / JSON serialization)."""
        return asdict(self)
    @staticmethod
    def from_dict(d: Dict) -> "RiskSnapshot":
        """Rebuild from a dict; env/band default to "prod"/"low" when absent."""
        return RiskSnapshot(
            ts=d["ts"], service=d["service"], env=d.get("env", "prod"),
            score=int(d["score"]), band=d.get("band", "low"),
            components=d.get("components", {}),
            reasons=d.get("reasons", []),
        )
# ─── Abstract base ────────────────────────────────────────────────────────────
class RiskHistoryStore(ABC):
    """Abstract storage interface for RiskSnapshot history.

    Concrete backends: Memory (tests/fallback), Null (disabled),
    Postgres (production), Auto (Postgres with memory fallback).
    """
    @abstractmethod
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written."""
    @abstractmethod
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env."""
    @abstractmethod
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots in descending time order within last `hours` hours."""
    def get_delta(self, service: str, env: str, hours: int = 24) -> Optional[int]:
        """
        latest.score - closest-to-(now-hours) score.
        Returns None if no baseline is available.
        """
        # Fetch twice the window so a baseline just before the cutoff is included.
        series = self.get_series(service, env, hours=hours * 2, limit=500)
        if not series:
            return None
        latest = series[0]
        cutoff_ts = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        # Find snapshot closest to cutoff (first one before or at cutoff)
        # NOTE: ISO-8601 timestamps compare chronologically as strings.
        baseline = None
        for snap in series:
            if snap.ts <= cutoff_ts:
                baseline = snap
                break
        if baseline is None:
            return None
        return latest.score - baseline.score
    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Return latest snapshot for each service in env, sorted by score desc."""
        # Optional capability: concrete backends override; default is unsupported.
        raise NotImplementedError
    @abstractmethod
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete records older than retention_days; returns count deleted."""
# ─── Memory backend (tests + fallback) ────────────────────────────────────────
class MemoryRiskHistoryStore(RiskHistoryStore):
    """Thread-safe in-process backend (tests, and the Auto fallback buffer)."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # (service, env) → snapshots kept newest-first by ts
        self._data: Dict = defaultdict(list)

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Append each record and keep its series sorted newest-first."""
        with self._lock:
            for snap in records:
                bucket = self._data[(snap.service, snap.env)]
                bucket.append(snap)
                bucket.sort(key=lambda r: r.ts, reverse=True)
        return len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Newest snapshot for service/env, or None when unseen."""
        with self._lock:
            bucket = self._data.get((service, env), [])
            if not bucket:
                return None
            return bucket[0]

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots newer than the cutoff, newest-first, capped at `limit`."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            kept = [
                snap for snap in self._data.get((service, env), [])
                if snap.ts >= horizon
            ]
            return kept[:limit]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in env, highest score first."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            newest: Dict[str, RiskSnapshot] = {}
            for (service, snap_env), snaps in self._data.items():
                if snap_env != env:
                    continue
                in_window = [s for s in snaps if s.ts >= horizon]
                if in_window:
                    newest[service] = in_window[0]  # series is newest-first
            ranked = sorted(
                (s.to_dict() for s in newest.values()),
                key=lambda row: -row["score"],
            )
            return ranked[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Drop snapshots older than retention; return how many were removed."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        removed = 0
        with self._lock:
            for key, snaps in list(self._data.items()):
                fresh = [s for s in snaps if s.ts >= horizon]
                removed += len(snaps) - len(fresh)
                self._data[key] = fresh
        return removed
# ─── Null backend ──────────────────────────────────────────────────────────────
class NullRiskHistoryStore(RiskHistoryStore):
    """No-op backend: all writes are discarded and every read is empty.

    Selected via RISK_HISTORY_BACKEND=null when history is disabled.
    """
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Discard `records`; nothing is ever persisted."""
        return 0
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Always None — there is no history."""
        return None
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Always empty."""
        return []
    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        # Fix: without this override the base class raises NotImplementedError,
        # contradicting "all reads return empty" and crashing dashboard callers
        # when history is disabled.
        return []
    def cleanup(self, retention_days: int = 90) -> int:
        """Nothing stored, nothing to delete."""
        return 0
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresRiskHistoryStore(RiskHistoryStore):
    """
    Production Postgres backend (psycopg2 sync, per-thread connection).
    Schema created by ops/scripts/migrate_risk_history_postgres.py.
    """
    def __init__(self, dsn: str) -> None:
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread
    def _conn(self):
        # Lazily (re)connect per thread; autocommit makes each statement atomic.
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Upsert each record on (ts, service, env); returns how many succeeded."""
        if not records:
            return 0
        cur = self._conn().cursor()
        written = 0
        for rec in records:
            try:
                cur.execute(
                    """INSERT INTO risk_history (ts, service, env, score, band, components, reasons)
                       VALUES (%s, %s, %s, %s, %s, %s, %s)
                       ON CONFLICT (ts, service, env) DO UPDATE
                       SET score=EXCLUDED.score, band=EXCLUDED.band,
                           components=EXCLUDED.components, reasons=EXCLUDED.reasons""",
                    (rec.ts, rec.service, rec.env, rec.score, rec.band,
                     json.dumps(rec.components), json.dumps(rec.reasons)),
                )
                written += 1
            except Exception as e:
                # Per-record best effort: one bad row must not drop the batch.
                logger.warning("risk_history write failed for %s/%s: %s", rec.service, rec.env, e)
        cur.close()
        return written
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None if absent."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
            "WHERE service=%s AND env=%s ORDER BY ts DESC LIMIT 1",
            (service, env),
        )
        row = cur.fetchone()
        cur.close()
        if not row:
            return None
        return self._row_to_snap(row)
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots within the last `hours`, newest-first, capped at `limit`."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        cur = self._conn().cursor()
        cur.execute(
            "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
            "WHERE service=%s AND env=%s AND ts >= %s ORDER BY ts DESC LIMIT %s",
            (service, env, cutoff, limit),
        )
        rows = cur.fetchall()
        cur.close()
        return [self._row_to_snap(r) for r in rows]
    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in env, highest score first."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        cur = self._conn().cursor()
        # Latest snapshot per service in env within window
        # (DISTINCT ON keeps the first row per service of the ts DESC ordering).
        cur.execute(
            """SELECT DISTINCT ON (service)
                   ts, service, env, score, band, components, reasons
               FROM risk_history
               WHERE env=%s AND ts >= %s
               ORDER BY service, ts DESC""",
            (env, cutoff),
        )
        rows = cur.fetchall()
        cur.close()
        snaps = [self._row_to_snap(r).to_dict() for r in rows]
        return sorted(snaps, key=lambda r: -r["score"])[:top_n]
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete rows older than retention_days; returns the deleted count."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        cur = self._conn().cursor()
        cur.execute("DELETE FROM risk_history WHERE ts < %s", (cutoff,))
        deleted = cur.rowcount
        cur.close()
        return deleted
    @staticmethod
    def _row_to_snap(row) -> RiskSnapshot:
        """Normalize a DB row (timestamps → ISO strings, JSON text → objects)."""
        ts, service, env, score, band, components, reasons = row
        if isinstance(ts, datetime.datetime):
            ts = ts.isoformat()
        if isinstance(components, str):
            components = json.loads(components)
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        return RiskSnapshot(
            ts=ts, service=service, env=env,
            score=int(score), band=band,
            components=components or {},
            reasons=reasons or [],
        )
# ─── Auto backend ─────────────────────────────────────────────────────────────
class AutoRiskHistoryStore(RiskHistoryStore):
    """
    Postgres-first store with a transparent in-memory fallback.

    Every write is mirrored into the memory buffer so reads keep working
    (within this process) while Postgres is down; reads always try
    Postgres first and fall back to the buffer on failure.
    """
    def __init__(self, pg_dsn: str) -> None:
        self._pg = PostgresRiskHistoryStore(pg_dsn)
        self._mem = MemoryRiskHistoryStore()
        self._pg_ok = True  # tracked so the outage is logged only once

    def _try_pg(self, method: str, *args, **kwargs):
        """Invoke `method` on the Postgres store; returns (ok, result)."""
        try:
            outcome = getattr(self._pg, method)(*args, **kwargs)
        except Exception as exc:
            if self._pg_ok:  # log only on the healthy→broken transition
                logger.warning("AutoRiskHistoryStore: Postgres unavailable (%s), using memory", exc)
            self._pg_ok = False
            return False, None
        self._pg_ok = True
        return True, outcome

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        pg_ok, count = self._try_pg("write_snapshot", records)
        # Mirror into memory regardless, so fallback reads stay fresh.
        self._mem.write_snapshot(records)
        return count if pg_ok else len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        pg_ok, snap = self._try_pg("get_latest", service, env)
        return snap if pg_ok else self._mem.get_latest(service, env)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        pg_ok, series = self._try_pg("get_series", service, env, hours, limit)
        return series if pg_ok else self._mem.get_series(service, env, hours, limit)

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        pg_ok, rows = self._try_pg("dashboard_series", env, hours, top_n)
        return rows if pg_ok else self._mem.dashboard_series(env, hours, top_n)

    def cleanup(self, retention_days: int = 90) -> int:
        pg_ok, deleted = self._try_pg("cleanup", retention_days)
        self._mem.cleanup(retention_days)
        return deleted if pg_ok else 0
# ─── Singleton factory ────────────────────────────────────────────────────────
# Process-wide singleton, lazily constructed under _store_lock.
_store: Optional[RiskHistoryStore] = None
_store_lock = threading.Lock()

def get_risk_history_store() -> RiskHistoryStore:
    """Return the shared store, building it on first use (double-checked lock)."""
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
def set_risk_history_store(store: Optional[RiskHistoryStore]) -> None:
    """Override the singleton (e.g. in tests); pass None to force re-creation
    on the next get_risk_history_store() call."""
    global _store
    with _store_lock:
        _store = store
def _create_store() -> RiskHistoryStore:
    """Pick a backend from RISK_HISTORY_BACKEND (memory | null | postgres | auto).

    The DSN comes from RISK_DATABASE_URL, falling back to DATABASE_URL.
    Unknown backend values behave like "auto".
    """
    backend = os.getenv("RISK_HISTORY_BACKEND", "auto").lower()
    dsn = os.getenv("RISK_DATABASE_URL") or os.getenv("DATABASE_URL") or ""

    if backend == "memory":
        logger.info("RiskHistoryStore: in-memory")
        return MemoryRiskHistoryStore()

    if backend == "null":
        logger.info("RiskHistoryStore: null (disabled)")
        return NullRiskHistoryStore()

    if backend == "postgres":
        if not dsn:
            logger.warning("RISK_HISTORY_BACKEND=postgres but no DATABASE_URL; falling back to memory")
            return MemoryRiskHistoryStore()
        logger.info("RiskHistoryStore: postgres dsn=%s", dsn[:30])
        return PostgresRiskHistoryStore(dsn)

    # Default: auto — Postgres with a memory fallback when a DSN is present.
    if not dsn:
        logger.info("RiskHistoryStore: auto — no DATABASE_URL, using memory")
        return MemoryRiskHistoryStore()
    logger.info("RiskHistoryStore: auto (postgres→memory fallback) dsn=%s", dsn[:30])
    return AutoRiskHistoryStore(pg_dsn=dsn)

View File

@@ -0,0 +1,376 @@
"""
signature_state_store.py — Cooldown tracking per incident signature.
Prevents triage from running too frequently for the same failure type.
A "signature" is the same one computed by alert_routing.compute_incident_signature.
Backends:
- MemorySignatureStateStore (tests / single-process)
- PostgresSignatureStateStore (production)
- AutoSignatureStateStore (Postgres → Memory fallback)
Table: incident_signature_state
signature text PK, last_triage_at timestamptz, last_alert_at timestamptz,
triage_count_24h int, updated_at timestamptz
DDL: ops/scripts/migrate_alerts_postgres.py
"""
from __future__ import annotations
import datetime
import logging
import os
import threading
import time
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
DEFAULT_COOLDOWN_MINUTES = 15
def _now_dt() -> datetime.datetime:
    """Current UTC time as a naive datetime (module-wide timestamp convention)."""
    return datetime.datetime.utcnow()
def _now_iso() -> str:
    """Current UTC time as an ISO-8601 string (naive, no timezone suffix)."""
    return datetime.datetime.utcnow().isoformat()
# ─── Abstract ─────────────────────────────────────────────────────────────────
class SignatureStateStore(ABC):
    """Abstract per-signature triage cooldown / occurrence tracker.

    Backends: Memory (tests/single-process), Postgres (production),
    Auto (Postgres with memory fallback).
    """
    @abstractmethod
    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True if cooldown has passed (triage may proceed)."""
    @abstractmethod
    def mark_alert_seen(self, signature: str) -> None:
        """Record that an alert with this signature was observed.
        Also updates occurrences_60m rolling bucket."""
    @abstractmethod
    def mark_triage_run(self, signature: str) -> None:
        """Record that triage was executed for this signature."""
    @abstractmethod
    def get_state(self, signature: str) -> Optional[Dict]:
        """Return raw state dict or None."""
    @abstractmethod
    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return signatures seen in last window_minutes, ordered by occurrences_60m desc."""
# ─── Memory backend ────────────────────────────────────────────────────────────
class MemorySignatureStateStore(SignatureStateStore):
    """Thread-safe in-process backend (tests / single-process deployments)."""

    BUCKET_MINUTES = 60  # rolling window for occurrences_60m

    def __init__(self):
        self._lock = threading.Lock()
        self._states: Dict[str, Dict] = {}

    def _update_bucket(self, state: Dict, now: str) -> None:
        """Advance the 60-min rolling occurrence bucket in place."""
        window_start = (
            _now_dt() - datetime.timedelta(minutes=self.BUCKET_MINUTES)
        ).isoformat()
        started = state.get("occurrences_60m_bucket_start") or ""
        if started < window_start:
            # Bucket expired — restart counting from this observation.
            state["occurrences_60m"] = 1
            state["occurrences_60m_bucket_start"] = now
        else:
            state["occurrences_60m"] = state.get("occurrences_60m", 0) + 1

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """True when no triage has run within the cooldown window."""
        with self._lock:
            entry = self._states.get(signature)
            if entry is None:
                return True
            last_run = entry.get("last_triage_at")
            if not last_run:
                return True
            threshold = (
                _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
            ).isoformat()
            return last_run < threshold

    def mark_alert_seen(self, signature: str) -> None:
        """Record an alert observation and bump the rolling 60m counter."""
        stamp = _now_iso()
        with self._lock:
            entry = self._states.get(signature)
            if entry is None:
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": None,
                    "last_alert_at": stamp,
                    "triage_count_24h": 0,
                    "occurrences_60m": 1,
                    "occurrences_60m_bucket_start": stamp,
                    "updated_at": stamp,
                }
                return
            entry["last_alert_at"] = stamp
            entry["updated_at"] = stamp
            self._update_bucket(entry, stamp)

    def mark_triage_run(self, signature: str) -> None:
        """Record a triage execution; the 24h counter restarts after a day idle."""
        stamp = _now_iso()
        day_ago = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        with self._lock:
            entry = self._states.get(signature)
            if entry is None:
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": stamp,
                    "last_alert_at": stamp,
                    "triage_count_24h": 1,
                    "occurrences_60m": 0,
                    "occurrences_60m_bucket_start": stamp,
                    "updated_at": stamp,
                }
                return
            previous = entry.get("last_triage_at") or ""
            entry["triage_count_24h"] = (
                1 if previous < day_ago else entry.get("triage_count_24h", 0) + 1
            )
            entry["last_triage_at"] = stamp
            entry["updated_at"] = stamp

    def get_state(self, signature: str) -> Optional[Dict]:
        """Shallow copy of the stored state, or None if unseen."""
        with self._lock:
            entry = self._states.get(signature)
            return dict(entry) if entry else None

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Signatures alerted within the window, busiest (occurrences) first."""
        since = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        with self._lock:
            recent = [
                dict(entry) for entry in self._states.values()
                if (entry.get("last_alert_at") or "") >= since
            ]
        recent.sort(key=lambda e: e.get("occurrences_60m", 0), reverse=True)
        return recent[:limit]
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresSignatureStateStore(SignatureStateStore):
    """Production backend: one autocommit psycopg2 connection per thread.

    Table DDL lives in ops/scripts/migrate_alerts_postgres.py.
    """
    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread
    def _conn(self):
        # Lazily (re)connect per thread; autocommit keeps each upsert atomic.
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn
    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """True when this signature has never been triaged or the cooldown passed."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT last_triage_at FROM incident_signature_state WHERE signature=%s",
            (signature,),
        )
        row = cur.fetchone()
        cur.close()
        if not row or row[0] is None:
            return True
        cutoff = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
        last = row[0]
        # The driver may return tz-aware timestamps; compare as naive UTC.
        if hasattr(last, "tzinfo") and last.tzinfo:
            last = last.replace(tzinfo=None)
        return last < cutoff
    def mark_alert_seen(self, signature: str) -> None:
        """Upsert: bump last_alert_at and the rolling 60-minute occurrence bucket."""
        now = _now_iso()
        cutoff_60m = (_now_dt() - datetime.timedelta(minutes=60)).isoformat()
        cur = self._conn().cursor()
        # The CASE arms restart the bucket when its start is missing/expired,
        # otherwise increment in place (mirrors MemorySignatureStateStore).
        cur.execute(
            """INSERT INTO incident_signature_state
               (signature, last_alert_at, triage_count_24h, updated_at,
                occurrences_60m, occurrences_60m_bucket_start)
               VALUES (%s, %s, 0, %s, 1, %s)
               ON CONFLICT (signature) DO UPDATE
               SET last_alert_at=EXCLUDED.last_alert_at,
                   updated_at=EXCLUDED.updated_at,
                   occurrences_60m = CASE
                       WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                            OR incident_signature_state.occurrences_60m_bucket_start < %s
                       THEN 1
                       ELSE incident_signature_state.occurrences_60m + 1
                   END,
                   occurrences_60m_bucket_start = CASE
                       WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                            OR incident_signature_state.occurrences_60m_bucket_start < %s
                       THEN EXCLUDED.occurrences_60m_bucket_start
                       ELSE incident_signature_state.occurrences_60m_bucket_start
                   END""",
            (signature, now, now, now, cutoff_60m, cutoff_60m),
        )
        cur.close()
    def mark_triage_run(self, signature: str) -> None:
        """Upsert: stamp last_triage_at; the 24h counter resets after a day idle."""
        now = _now_iso()
        cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        cur = self._conn().cursor()
        # CASE restarts triage_count_24h when the previous run is older than 24h.
        cur.execute(
            """INSERT INTO incident_signature_state
               (signature, last_triage_at, last_alert_at, triage_count_24h, updated_at,
                occurrences_60m, occurrences_60m_bucket_start)
               VALUES (%s, %s, %s, 1, %s, 0, %s)
               ON CONFLICT (signature) DO UPDATE
               SET last_triage_at=EXCLUDED.last_triage_at,
                   triage_count_24h = CASE
                       WHEN incident_signature_state.last_triage_at IS NULL
                            OR incident_signature_state.last_triage_at < %s
                       THEN 1
                       ELSE incident_signature_state.triage_count_24h + 1
                   END,
                   updated_at=EXCLUDED.updated_at""",
            (signature, now, now, now, now, cutoff_24h),
        )
        cur.close()
    def get_state(self, signature: str) -> Optional[Dict]:
        """Return the row as a dict (timestamps ISO-formatted), or None."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state WHERE signature=%s",
            (signature,),
        )
        row = cur.fetchone()
        cur.close()
        if not row:
            return None
        sig, lta, laa, cnt, upd, occ60, occ_start = row
        return {
            "signature": sig,
            "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta,
            "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa,
            "triage_count_24h": cnt,
            "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd,
            "occurrences_60m": occ60 or 0,
            "occurrences_60m_bucket_start": (
                occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start
            ),
        }
    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Signatures alerted within the window, ordered by occurrences_60m desc."""
        cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state "
            "WHERE last_alert_at >= %s "
            "ORDER BY occurrences_60m DESC NULLS LAST LIMIT %s",
            (cutoff, limit),
        )
        rows = []
        for row in cur.fetchall():
            sig, lta, laa, cnt, upd, occ60, occ_start = row
            rows.append({
                "signature": sig,
                "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta,
                "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa,
                "triage_count_24h": cnt,
                "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd,
                "occurrences_60m": occ60 or 0,
                "occurrences_60m_bucket_start": (
                    occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start
                ),
            })
        cur.close()
        return rows
# ─── Auto backend ──────────────────────────────────────────────────────────────
class AutoSignatureStateStore(SignatureStateStore):
    """Postgres-first store that transparently degrades to in-memory.

    Any exception from the Postgres backend flips delegation to the memory
    fallback; after ``_RECOVERY_S`` seconds the primary backend is retried.
    """

    _RECOVERY_S = 300

    def __init__(self, pg_dsn: str):
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresSignatureStateStore] = None
        self._fallback = MemorySignatureStateStore()
        self._using_fallback = False
        self._since: float = 0.0
        self._lock = threading.Lock()

    def _get_primary(self) -> PostgresSignatureStateStore:
        """Lazily build the Postgres store (double-checked locking)."""
        if self._primary is None:
            with self._lock:
                if self._primary is None:
                    self._primary = PostgresSignatureStateStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self):
        """Leave fallback mode once the recovery window has elapsed."""
        elapsed = time.monotonic() - self._since
        if self._using_fallback and elapsed >= self._RECOVERY_S:
            self._using_fallback = False

    def _delegate(self, method: str, *args, **kwargs):
        """Invoke *method* on the primary, demoting to the fallback on failure."""
        self._maybe_recover()
        if self._using_fallback:
            return getattr(self._fallback, method)(*args, **kwargs)
        try:
            return getattr(self._get_primary(), method)(*args, **kwargs)
        except Exception as exc:
            logger.warning("AutoSignatureStateStore Postgres failed: %s", exc)
            self._using_fallback = True
            self._since = time.monotonic()
        return getattr(self._fallback, method)(*args, **kwargs)

    def should_run_triage(self, signature, cooldown_minutes=DEFAULT_COOLDOWN_MINUTES):
        return self._delegate("should_run_triage", signature, cooldown_minutes)

    def mark_alert_seen(self, signature):
        self._delegate("mark_alert_seen", signature)

    def mark_triage_run(self, signature):
        self._delegate("mark_triage_run", signature)

    def get_state(self, signature):
        return self._delegate("get_state", signature)

    def list_active_signatures(self, window_minutes=60, limit=100):
        return self._delegate("list_active_signatures", window_minutes, limit)
# ─── Singleton ────────────────────────────────────────────────────────────────
_sig_store: Optional[SignatureStateStore] = None
_sig_lock = threading.Lock()


def get_signature_state_store() -> SignatureStateStore:
    """Return the process-wide signature store, creating it on first use."""
    global _sig_store
    if _sig_store is not None:
        return _sig_store
    with _sig_lock:
        # Re-check under the lock so concurrent callers create only one store.
        if _sig_store is None:
            _sig_store = _create_sig_store()
    return _sig_store
def set_signature_state_store(store: Optional[SignatureStateStore]) -> None:
    """Replace the process-wide store (for tests / dependency injection).

    Passing ``None`` clears the singleton so the next
    ``get_signature_state_store()`` call re-creates it from the environment.
    """
    global _sig_store
    with _sig_lock:
        _sig_store = store
def _create_sig_store() -> SignatureStateStore:
    """Build a store from ALERT_BACKEND / DATABASE_URL environment settings.

    "postgres" and "auto" backends require a DSN; anything else (or a missing
    DSN) falls back to the in-memory store.
    """
    backend = os.getenv("ALERT_BACKEND", "memory").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL", "")
    if dsn:
        if backend == "postgres":
            return PostgresSignatureStateStore(dsn)
        if backend == "auto":
            return AutoSignatureStateStore(dsn)
    return MemorySignatureStateStore()

View File

@@ -0,0 +1,767 @@
"""Sofiia Smart Auto-Router — Cursor-style model selection for Sofiia agent.
Classifies incoming prompt by task type and selects the best available model,
balancing capability, speed, cost, and provider availability.
Full model catalog includes:
- Cloud: Anthropic Claude, xAI Grok, DeepSeek, Mistral AI, GLM-5 (Z.AI)
- Local Ollama (NODA2/MacBook): qwen3.5:35b-a3b, qwen3:14b, glm-4.7-flash:32k,
deepseek-r1:70b, deepseek-coder:33b, gemma3, mistral-nemo:12b,
starcoder2:3b, phi3, llava:13b
Task taxonomy (inspired by Cursor Auto mode):
code_gen, code_review, code_debug, code_refactor,
architecture, devops, security, analysis, quick_answer, creative, reasoning,
math_code, vision, chatbot
"""
from __future__ import annotations
import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ── Task taxonomy ──────────────────────────────────────────────────────────────
# Each pattern group uses multi-word or context-aware patterns to reduce false
# positives. Single common words (system, design, check, list, graph, tree) are
# avoided unless paired with a qualifier.
#
# FIX: several Ukrainian patterns contained mangled escapes such as r"\апиши"
# or r"\ому" — the original "\b" word boundary plus the word's first letter had
# been corrupted to a bare backslash. Those either matched as unanchored
# substrings ("тому" falsely triggered the reasoning pattern "ому") or could
# never match ("\ейкстр\b" never matches "Дейкстра"). The "\b<letter>"
# prefixes are restored below.
TASK_PATTERNS: List[Tuple[str, List[str], float]] = [
    # (task_type, patterns, base_weight) — weight scales final score
    ("code_gen", [
        r"\bнапиши\s+(функці|код|клас|скрипт|модуль|endpoint|api)",
        r"\bреалізуй\b", r"\bcreate\s+(function|class|module|endpoint|api|component)",
        r"\bimplement\b", r"\bgenerate\s+code\b", r"\bгенеруй\s+код\b",
        r"\bфункці[юя]\s+для\b", r"\bклас\s+для\b", r"\bнапиши\s+код\b",
        r"\bwrite\s+a?\s*(function|class|module|script|endpoint)\b",
        r"\bcontroller\b", r"\bendpoint\s+(для|for)\b",
    ], 1.0),
    ("code_debug", [
        r"\bвиправ\b", r"\bбаг\b", r"\bпомилк[аи]\b", r"\btraceback\b",
        r"\bexception\b", r"\bfailed\b", r"\bcrash(es|ed)?\b", r"\bне\s+працю",
        r"\bдебаг\b", r"\bdebug\b", r"\bfix\s+(the\s+)?(bug|error|issue|crash)\b",
        r"\bsyntax\s*error\b", r"\btype\s*error\b", r"\battribute\s*error\b",
        r"\bruntime\s*error\b", r"\bvalue\s*error\b",
    ], 1.0),
    ("code_review", [
        r"\breview\s+(the\s+)?(code|pr|pull\s+request|diff)\b",
        r"\bаудит\s+(код|сервіс|систем)\b", r"\baudit\s+(code|service)\b",
        r"\bперевір\w*\s+(код|якість)\b", r"\bcode\s+quality\b",
        r"\bcode\s+review\b", r"\brev'ю\b",
    ], 1.0),
    ("code_refactor", [
        r"\bрефактор\b", r"\brefactor\b",
        r"\bоптимізу[йї]\s+(код|функці|клас)\b", r"\boptimize\s+(the\s+)?(code|function|class)\b",
        r"\bclean\s+up\s+(the\s+)?code\b", r"\bpolish\s+(the\s+)?code\b",
        r"\bspeed\s+up\b", r"\bimprove\s+(the\s+)?code\b",
    ], 1.0),
    ("architecture", [
        r"\bархітектур\w+\b", r"\barchitecture\b",
        r"\bспроєктуй\b", r"\bsystem\s+design\b",
        r"\bmicroservice\s+(architect|design|pattern)\b",
        r"\bdatabase\s+design\b", r"\bapi\s+design\b",
        r"\bscalab(le|ility)\b", r"\bscaling\s+strateg\b",
        r"\bdesign\s+pattern\b", r"\bsystem\s+structure\b",
    ], 1.0),
    ("devops", [
        r"\bdeploy\b", r"\bdocker\s*(file|compose|-compose|ize)?\b",
        r"\bkubernetes\b", r"\bk8s\b", r"\bci[\s/]cd\b",
        r"\bpipeline\b", r"\bnginx\b", r"\bcaddy\b",
        r"\bнода\d?\b", r"\bnoda\d?\b", r"\bcontainer\s+(start|stop|restart|build|image)\b",
        r"\bдеплой\b", r"\bssh\s+(to|into|root|connect)\b",
        r"\bhelm\b", r"\bterraform\b", r"\binfrastructure\b",
        r"\bdocker\s+compose\s+up\b",
    ], 1.0),
    ("security", [
        r"\bvulnerability\b", r"\bCVE-\d+\b", r"\bsecurity\s+(audit|review|issue|scan)\b",
        r"\bauth(entication|orization)\b", r"\bencrypt(ion)?\b",
        r"\bRBAC\b", r"\bpermission\s+(model|system)\b",
        r"\bбезпек\w+\b", r"\bpentest\b", r"\b(sql|xss|csrf)\s*injection\b",
        r"\bthreat\s+model\b",
    ], 1.0),
    ("reasoning", [
        r"\bчому\s+\w+\b", r"\bwhy\s+(does|is|do|did|should|would)\b",
        r"\bpros\s+and\s+cons\b", r"\btrade[\s-]?off\b",
        r"\bпорівняй\b", r"\bcompare\s+\w+\s+(vs|and|with|to)\b",
        r"\bяк\s+краще\b", r"\bперевага\b", r"\bнедолік\b",
        r"\bdecision\s+(between|about)\b",
        r"\bversus\b", r"\b\w+\s+vs\s+\w+\b",
    ], 1.0),
    ("analysis", [
        r"\bпроаналізуй\b", r"\bаналіз\s+\w+\b",
        r"\banalyze\s+\w+\b", r"\binvestigate\b",
        r"\bexplain\s+(how|why|what)\b", r"\bsummariz(e|ation)\b",
        # Root match (no trailing \b): covers "дослідження", "досліджує", …
        r"\bдослідж", r"\bпоясни\s+(як|чому|що)\b",
        r"\bhow\s+does\s+\w+\s+work\b",
    ], 1.0),
    ("creative", [
        r"\bнапиши\s+(текст|стат|пост|лист|опис)\b",
        r"\bwrite\s+a\s+(blog|article|post|email|description|letter)\b",
        r"\bdraft\s+(a\s+)?(doc|email|message|proposal)\b",
        r"\breadme\b", r"\bchangelog\b", r"\bdocumentation\b",
    ], 1.0),
    ("quick_answer", [
        r"\bщо\s+таке\b", r"\bwhat\s+is\s+(a|an|the)?\b",
        r"\bhow\s+to\s+\w+\b", r"\bdefinition\s+of\b",
        r"\bшвидко\b", r"\bсинтаксис\s+\w+\b",
        r"\bgive\s+me\s+an?\s+example\b", r"\bexample\s+of\b",
    ], 0.9),
    ("vision", [
        r"\bзображен\w+\b", r"\bфото\b", r"\bimage\s+(analysis|recognition|detect)\b",
        r"\bскріншот\b", r"\bscreenshot\b",
        r"\bвізуальн\w+\s+аналіз\b", r"\bвідео\s+(аналіз|розпізна)\b",
    ], 1.0),
    ("math_code", [
        r"\bалгоритм\s+\w+\b", r"\balgorithm\s+(for|to)\b",
        r"\bсортуван\w+\b", r"\bsort(ing)?\s+algorithm\b",
        r"\bdynamic\s+programming\b", r"\bgraph\s+(algorithm|traversal|search)\b",
        r"\bmatrix\s+(mult|inver|decomp)\b",
        r"\bcalculate\s+\w+\b", r"\bcompute\s+\w+\b",
        # Root match: "Дейкстра"/"Дейкстри" never match with a trailing \b.
        r"\bformula\s+(for|to)\b", r"\bДейкстр", r"\bDijkstra\b",
    ], 1.0),
    # Chatbot / conversational — greetings, small talk, acknowledgements
    ("chatbot", [
        r"^(привіт|вітаю|добрий|доброго|hi|hello|hey)\b",
        r"^(дякую|спасибі|thank|thanks)\b",
        r"^(ок|добре|зрозумів|зрозуміло|so?|ok|yes|no|ні|так)\s*[,!.]?\s*$",
        r"\bяк\s+(справи|діла|ся маєш)\b", r"\bhow\s+are\s+you\b",
    ], 0.8),
]
# Pre-compile patterns once for performance
_COMPILED_PATTERNS: Optional[List[Tuple[str, List[re.Pattern], float]]] = None


def _get_compiled_patterns() -> List[Tuple[str, List[re.Pattern], float]]:
    """Compile TASK_PATTERNS once (case-insensitive) and memoize the result."""
    global _COMPILED_PATTERNS
    if _COMPILED_PATTERNS is None:
        _COMPILED_PATTERNS = [
            (name, [re.compile(expr, re.IGNORECASE) for expr in exprs], weight)
            for name, exprs, weight in TASK_PATTERNS
        ]
    return _COMPILED_PATTERNS
# ── Model catalog ──────────────────────────────────────────────────────────────
@dataclass
class ModelSpec:
    """Catalog entry describing one routable model (cloud API or local Ollama)."""

    profile_name: str        # unique key used in TASK_MODEL_PRIORITY lists
    provider: str            # e.g. "anthropic", "grok", "ollama"
    model_id: str            # provider-side model identifier / Ollama tag
    api_key_env: str = ""    # env var holding the API key (unused for local)
    strengths: List[str] = field(default_factory=list)  # task types this model suits
    cost_tier: int = 1  # 0=free(local), 1=cheap, 2=mid, 3=expensive
    speed_tier: int = 1  # 1=fast, 2=medium, 3=slow
    context_k: int = 8  # context window in thousands
    local: bool = False      # True → served by local Ollama daemon
    max_tokens: int = 4096   # generation cap passed to the provider
    vram_gb: float = 0.0     # approximate VRAM footprint (local models only)
    description: str = ""

    @property
    def available(self) -> bool:
        """Local models: currently listed by Ollama; cloud: API key env is set."""
        if self.local:
            return _is_ollama_model_available(self.model_id)
        return bool(os.getenv(self.api_key_env, "").strip())

    @property
    def has_credits(self) -> bool:
        """False while the provider is marked budget-exhausted (ProviderBudget)."""
        return ProviderBudget.is_available(self.provider)
# ── Ollama model availability cache ───────────────────────────────────────────
_ollama_available_models: Optional[List[str]] = None
_ollama_cache_ts: float = 0.0
_OLLAMA_CACHE_TTL = 60.0


def _is_ollama_model_available(model_id: str) -> bool:
    """True when the Ollama daemon lists *model_id* (exact tag or same base name).

    The tag list is cached for ``_OLLAMA_CACHE_TTL`` seconds and refreshed
    synchronously when stale.
    """
    global _ollama_available_models, _ollama_cache_ts
    if _ollama_available_models is None or (time.time() - _ollama_cache_ts) > _OLLAMA_CACHE_TTL:
        _refresh_ollama_models_sync()
    known = _ollama_available_models
    if not known:
        return False
    wanted = model_id.lower()
    wanted_base = wanted.split(":")[0]
    return any(
        name.lower() == wanted or name.lower().split(":")[0] == wanted_base
        for name in known
    )
def _refresh_ollama_models_sync() -> None:
    """Blocking refresh of the Ollama tag cache (2s timeout; errors → [])."""
    global _ollama_available_models, _ollama_cache_ts
    import json as _json
    import urllib.request
    base_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
    try:
        with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp:
            payload = _json.loads(resp.read())
        names = [entry["name"] for entry in payload.get("models", [])]
    except Exception:
        # Unreachable daemon or bad payload: cache an empty list so callers
        # fall back to cloud models until the TTL expires.
        names = []
    _ollama_available_models = names
    _ollama_cache_ts = time.time()
async def refresh_ollama_models_async() -> List[str]:
    """Async variant of the tag refresh (httpx, 2s timeout).

    On failure the prior cache is kept (or [] if none) and — unlike the sync
    refresh — the cache timestamp is NOT advanced, so a sync retry may follow.
    """
    global _ollama_available_models, _ollama_cache_ts
    try:
        import httpx
        base_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        async with httpx.AsyncClient(timeout=2.0) as client:
            resp = await client.get(f"{base_url}/api/tags")
            payload = resp.json()
            _ollama_available_models = [entry["name"] for entry in payload.get("models", [])]
            _ollama_cache_ts = time.time()
            return _ollama_available_models
    except Exception:
        _ollama_available_models = _ollama_available_models or []
        return _ollama_available_models
# ── Full model catalog ─────────────────────────────────────────────────────────
# Legend (see ModelSpec): cost_tier 0=free(local)…3=expensive; speed_tier
# 1=fast…3=slow; context_k = context window in thousands of tokens; vram_gb
# applies to local (Ollama) models only. "NODA2" in descriptions refers to the
# local Ollama host. Availability is computed at read time from env keys /
# the Ollama tag list, never hard-coded here.
SOFIIA_MODEL_CATALOG: List[ModelSpec] = [
    # ── Anthropic Claude ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_claude_sonnet",
        provider="anthropic", model_id="claude-sonnet-4-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "architecture", "security", "reasoning"],
        cost_tier=2, speed_tier=2, context_k=200, max_tokens=8192,
        description="Claude Sonnet 4.5 — найкращий для коду та архітектури",
    ),
    ModelSpec(
        profile_name="cloud_claude_haiku",
        provider="anthropic", model_id="claude-haiku-3-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["quick_answer", "code_review", "creative", "analysis", "chatbot"],
        cost_tier=1, speed_tier=1, context_k=200, max_tokens=4096,
        description="Claude Haiku 3.5 — швидкий та дешевий",
    ),
    # ── xAI Grok ─────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_grok",
        provider="grok", model_id="grok-4-1-fast-reasoning",
        api_key_env="GROK_API_KEY",
        strengths=["reasoning", "architecture", "analysis", "code_gen"],
        cost_tier=2, speed_tier=1, context_k=2000, max_tokens=8192,
        description="Grok 4.1 Fast — 2M контекст, кращий для reasoning",
    ),
    # ── DeepSeek API ─────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_deepseek",
        provider="deepseek", model_id="deepseek-chat",
        api_key_env="DEEPSEEK_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "devops", "quick_answer"],
        cost_tier=1, speed_tier=2, context_k=64, max_tokens=4096,
        description="DeepSeek Chat — дешевий і добре знає код/devops",
    ),
    # ── GLM-5 / Z.AI (API) ───────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_glm5",
        provider="glm", model_id="glm-4-plus",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "analysis", "code_gen", "chatbot"],
        cost_tier=1, speed_tier=1, context_k=128, max_tokens=4096,
        description="GLM-4 Plus (Z.AI) — швидкий, дешевий, гарно знає українську/CJK",
    ),
    ModelSpec(
        profile_name="cloud_glm5_flash",
        provider="glm", model_id="glm-4-flash",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "chatbot"],
        cost_tier=0, speed_tier=1, context_k=128, max_tokens=2048,
        description="GLM-4 Flash (Z.AI) — безкоштовний, найшвидший",
    ),
    # ── Mistral AI (API) ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_mistral",
        provider="mistral", model_id="mistral-large-latest",
        api_key_env="MISTRAL_API_KEY",
        strengths=["analysis", "creative", "reasoning", "architecture"],
        cost_tier=2, speed_tier=2, context_k=128, max_tokens=4096,
        description="Mistral Large — добрий для аналізу та creative",
    ),
    # ── Local: qwen3.5:35b-a3b (FLAGSHIP) ────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen35_35b",
        provider="ollama", model_id="qwen3.5:35b-a3b",
        strengths=["code_gen", "code_debug", "code_refactor", "reasoning", "architecture",
                   "analysis", "devops", "security", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=32, max_tokens=4096,
        local=True, vram_gb=24.0,
        description="Qwen3.5 35B MoE (NODA2) — флагман локально, якість ≈ cloud",
    ),
    # ── Local: qwen3:14b ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen3_14b",
        provider="ollama", model_id="qwen3:14b",
        strengths=["code_gen", "code_debug", "quick_answer", "devops", "analysis", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=32, max_tokens=2048,
        local=True, vram_gb=10.0,
        description="Qwen3 14B (NODA2) — швидкий локальний загальний",
    ),
    # ── Local: glm-4.7-flash:32k ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_glm47_32k",
        provider="ollama", model_id="glm-4.7-flash:32k",
        strengths=["quick_answer", "creative", "analysis", "code_review", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=32, max_tokens=2048,
        local=True, vram_gb=20.0,
        description="GLM-4.7 Flash 32K (NODA2) — локальний GLM, великий контекст",
    ),
    # ── Local: deepseek-r1:70b ────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_r1_70b",
        provider="ollama", model_id="deepseek-r1:70b",
        strengths=["reasoning", "math_code", "architecture", "analysis"],
        cost_tier=0, speed_tier=3, context_k=64, max_tokens=4096,
        local=True, vram_gb=48.0,
        description="DeepSeek-R1 70B (NODA2) — локальний reasoning як o1",
    ),
    # ── Local: deepseek-coder:33b ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_coder_33b",
        provider="ollama", model_id="deepseek-coder:33b",
        strengths=["code_gen", "code_debug", "code_refactor", "math_code"],
        cost_tier=0, speed_tier=2, context_k=16, max_tokens=2048,
        local=True, vram_gb=20.0,
        description="DeepSeek Coder 33B (NODA2) — спеціаліст по коду",
    ),
    # ── Local: gemma3:latest ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gemma3",
        provider="ollama", model_id="gemma3:latest",
        strengths=["quick_answer", "analysis", "creative", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=8, max_tokens=2048,
        local=True, vram_gb=8.0,
        description="Gemma3 (NODA2) — Google's ефективна модель",
    ),
    # ── Local: mistral-nemo:12b ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_mistral_nemo",
        provider="ollama", model_id="mistral-nemo:12b",
        strengths=["creative", "quick_answer", "analysis", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=128, max_tokens=2048,
        local=True, vram_gb=8.0,
        description="Mistral Nemo 12B (NODA2) — 128K контекст локально",
    ),
    # ── Local: starcoder2:3b ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_starcoder2",
        provider="ollama", model_id="starcoder2:3b",
        strengths=["code_gen", "code_review"],
        cost_tier=0, speed_tier=1, context_k=16, max_tokens=2048,
        local=True, vram_gb=2.0,
        description="StarCoder2 3B (NODA2) — мікро-модель для code completion",
    ),
    # ── Local: phi3:latest ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_phi3",
        provider="ollama", model_id="phi3:latest",
        strengths=["quick_answer", "analysis", "chatbot"],
        cost_tier=0, speed_tier=1, context_k=128, max_tokens=2048,
        local=True, vram_gb=4.0,
        description="Phi-3 (NODA2) — Microsoft мала ефективна модель",
    ),
    # ── Local: llava:13b (vision) ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_llava_13b",
        provider="ollama", model_id="llava:13b",
        strengths=["vision"],
        cost_tier=0, speed_tier=2, context_k=4, max_tokens=2048,
        local=True, vram_gb=10.0,
        description="LLaVA 13B (NODA2) — vision модель для зображень",
    ),
    # ── Local: gpt-oss:latest ─────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gpt_oss",
        provider="ollama", model_id="gpt-oss:latest",
        strengths=["code_gen", "quick_answer"],
        cost_tier=0, speed_tier=2, context_k=8, max_tokens=2048,
        local=True, vram_gb=8.0,
        description="GPT-OSS (NODA2) — відкрита OSS GPT-like модель",
    ),
]
# ── Task → preferred model matrix ─────────────────────────────────────────────
# Keys are task types from TASK_PATTERNS (plus "unknown"); values are ordered
# lists of ModelSpec.profile_name. List position is the dominant scoring
# signal in select_model_auto — earlier entries win unless unavailable or
# budget-exhausted.
TASK_MODEL_PRIORITY: Dict[str, List[str]] = {
    # Principle: local-first for tasks where local quality is sufficient.
    # Cloud only when the task genuinely needs it (complex code, deep reasoning,
    # very long context, security audits).
    #
    # qwen3.5:35b-a3b is the flagship local — MoE with cloud-level quality.
    # It should be preferred over cloud APIs for most routine tasks.
    "code_gen": [
        "local_qwen35_35b", "cloud_claude_sonnet", "local_deepseek_coder_33b",
        "cloud_deepseek", "local_qwen3_14b", "cloud_grok",
    ],
    "code_debug": [
        "local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
        "cloud_deepseek", "local_qwen3_14b",
    ],
    "code_review": [
        "local_qwen35_35b", "cloud_claude_haiku", "local_deepseek_coder_33b",
        "cloud_claude_sonnet", "cloud_deepseek",
    ],
    "code_refactor": [
        "local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
        "cloud_deepseek", "local_qwen3_14b",
    ],
    "math_code": [
        "local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
        "cloud_claude_sonnet", "local_deepseek_coder_33b",
    ],
    "architecture": [
        "local_qwen35_35b", "cloud_grok", "cloud_claude_sonnet",
        "local_deepseek_r1_70b", "cloud_mistral",
    ],
    "devops": [
        "local_qwen35_35b", "local_qwen3_14b", "cloud_deepseek",
        "cloud_claude_sonnet", "local_glm47_32k",
    ],
    # Security audits are the one task that is cloud-first by design.
    "security": [
        "cloud_claude_sonnet", "local_qwen35_35b", "cloud_grok", "cloud_mistral",
    ],
    "reasoning": [
        "local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
        "cloud_claude_sonnet", "cloud_mistral",
    ],
    "analysis": [
        "local_qwen35_35b", "local_glm47_32k", "cloud_grok",
        "cloud_claude_haiku", "local_mistral_nemo", "cloud_mistral",
    ],
    "creative": [
        "local_qwen35_35b", "local_mistral_nemo", "cloud_claude_haiku",
        "local_glm47_32k", "cloud_mistral",
    ],
    "quick_answer": [
        "local_qwen3_14b", "local_qwen35_35b", "local_phi3",
        "local_gemma3", "cloud_deepseek", "cloud_glm5_flash",
    ],
    "chatbot": [
        "local_qwen3_14b", "local_qwen35_35b", "local_gemma3",
        "local_phi3", "local_mistral_nemo",
    ],
    "vision": [
        "local_llava_13b",
    ],
    "unknown": [
        "local_qwen35_35b", "local_qwen3_14b", "cloud_claude_sonnet",
        "cloud_grok", "cloud_deepseek",
    ],
}
# ── Budget integration ─────────────────────────────────────────────────────────
class ProviderBudget:
    """In-memory budget gate: marks providers exhausted until TTL expires."""

    # provider name → epoch seconds at which it was marked exhausted
    _exhausted: Dict[str, float] = {}
    _exhausted_ttl: int = 3600

    @classmethod
    def mark_exhausted(cls, provider: str) -> None:
        """Record *provider* as out of budget as of now."""
        cls._exhausted[provider] = time.time()
        logger.warning("💸 Provider %s marked as budget-exhausted", provider)

    @classmethod
    def is_available(cls, provider: str) -> bool:
        """True unless *provider* was marked exhausted within the TTL.

        Expired marks are lazily removed on lookup.
        """
        marked_at = cls._exhausted.get(provider)
        if marked_at is None:
            return True
        if time.time() - marked_at <= cls._exhausted_ttl:
            return False
        cls._exhausted.pop(provider, None)
        return True

    @classmethod
    def reset(cls, provider: str) -> None:
        """Clear any exhaustion mark for *provider* (no-op if absent)."""
        cls._exhausted.pop(provider, None)
# ── Task classification ────────────────────────────────────────────────────────
@dataclass
class ClassificationResult:
    """Outcome of classify_task_detailed()."""

    task_type: str                   # best-scoring task label ("unknown"/"chatbot" fallbacks)
    confidence: float                # 0..1, rounded to 3 decimals by the classifier
    all_scores: Dict[str, float]     # top (≤5) raw scores keyed by task type
    ambiguous: bool = False          # True when runner-up scores within 30% of the best
    runner_up: Optional[str] = None  # second-best task label when ambiguous
def classify_task(prompt: str, context_len: int = 0) -> Tuple[str, float]:
    """Classify prompt into a task type. Returns (task_type, confidence)."""
    detailed = classify_task_detailed(prompt, context_len)
    return (detailed.task_type, detailed.confidence)
def classify_task_detailed(prompt: str, context_len: int = 0) -> ClassificationResult:
    """Classify *prompt* with ambiguity detection and per-task scores.

    Empty prompts default to "chatbot"; prompts matching nothing default to
    "unknown". Confidence is scaled down for very short prompts and floored
    at 0.5 for long conversations.
    """
    if not prompt or not prompt.strip():
        return ClassificationResult("chatbot", 0.5, {}, ambiguous=False)
    text = prompt.strip()

    # Score each task type by the fraction of its patterns that match,
    # scaled by the group's base weight.
    scores: Dict[str, float] = {}
    for name, patterns, weight in _get_compiled_patterns():
        matched = [p for p in patterns if p.search(text)]
        if matched:
            scores[name] = (len(matched) / len(patterns)) * weight
    if not scores:
        return ClassificationResult("unknown", 0.3, {}, ambiguous=False)

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    top_task, top_score = ranked[0]
    confidence = min(top_score * 10, 1.0)

    # Short prompts carry fewer signals → scale confidence down.
    n_words = len(text.split())
    if n_words <= 3:
        confidence *= 0.6
    elif n_words <= 8:
        confidence *= 0.85

    # Ambiguous when the runner-up scores within 30% of the winner.
    ambiguous, runner_up = False, None
    if len(ranked) >= 2:
        second_score = ranked[1][1]
        if second_score > 0 and second_score / top_score > 0.7:
            ambiguous, runner_up = True, ranked[1][0]

    # Long conversations: floor confidence (influences scoring downstream,
    # not the chosen label).
    if context_len > 50:
        confidence = max(confidence, 0.5)

    return ClassificationResult(
        task_type=top_task,
        confidence=round(confidence, 3),
        all_scores={k: round(v, 4) for k, v in ranked[:5]},
        ambiguous=ambiguous,
        runner_up=runner_up,
    )
def _prompt_complexity(prompt: str) -> str:
    """Estimate prompt complexity: "simple", "medium", or "complex".

    Heuristic on word count, line count and ``` code-fence markers.
    """
    n_words = len(prompt.split())
    n_lines = prompt.count("\n")
    n_fences = prompt.count("```")
    if n_words < 20 and n_lines < 3 and n_fences == 0:
        return "simple"
    if n_words > 200 or n_fences >= 2 or n_lines > 20:
        return "complex"
    return "medium"
# ── Main selection function ────────────────────────────────────────────────────
@dataclass
class AutoRouteResult:
    """Final routing decision returned by select_model_auto()."""

    profile_name: str    # chosen ModelSpec.profile_name
    model_id: str        # provider-side model identifier
    provider: str        # e.g. "anthropic", "ollama"; "unknown" if unresolved
    task_type: str       # classified task label
    confidence: float    # classification confidence, 0..1
    complexity: str      # "simple" | "medium" | "complex"
    reason: str          # human-readable " | "-joined selection rationale
    fallback_used: bool = False  # True when winner was not in the top-2 priorities
    all_candidates: List[str] = field(default_factory=list)  # top ≤5 scored profiles
    ambiguous: bool = False          # propagated from ClassificationResult
    runner_up: Optional[str] = None  # propagated from ClassificationResult
    all_scores: Dict[str, float] = field(default_factory=dict)  # classifier scores
all_scores: Dict[str, float] = field(default_factory=dict)
def select_model_auto(
    prompt: str,
    force_fast: bool = False,
    force_capable: bool = False,
    prefer_local: bool = False,
    prefer_cheap: bool = False,
    budget_aware: bool = True,
    context_messages_len: int = 0,
) -> AutoRouteResult:
    """
    Cursor-style auto model selection for Sofiia.

    Logic:
        1. Classify task type from prompt (with ambiguity detection)
        2. Estimate complexity (simple/medium/complex)
        3. Apply modifiers (force_fast, force_capable, prefer_local, prefer_cheap)
        4. Score candidates from priority list factoring availability, budget, speed, cost
        5. For long conversations, prefer large-context models

    Args:
        prompt: user prompt to classify and route.
        force_fast: collapse non-code tasks to quick_answer and penalize slow models.
        force_capable: reward larger context windows in scoring.
        prefer_local: restrict to local (Ollama) candidates when any exist,
            and penalize/discount accordingly in scoring.
        prefer_cheap: discount low cost tiers; simple chat-like prompts also
            collapse to quick_answer.
        budget_aware: heavily penalize providers marked budget-exhausted.
        context_messages_len: number of prior conversation messages.

    Returns:
        AutoRouteResult describing the chosen profile and the rationale.
    """
    classification = classify_task_detailed(prompt, context_messages_len)
    task_type = classification.task_type
    confidence = classification.confidence
    complexity = _prompt_complexity(prompt)
    effective_task = task_type
    # Modifier overrides (parentheses fix for operator precedence)
    if force_fast and task_type not in ("code_gen", "code_debug", "math_code"):
        effective_task = "quick_answer"
    if (prefer_cheap or complexity == "simple") and task_type in ("quick_answer", "creative", "chatbot"):
        effective_task = "quick_answer"
    priority_list = TASK_MODEL_PRIORITY.get(effective_task, TASK_MODEL_PRIORITY["unknown"])
    catalog_map = {m.profile_name: m for m in SOFIIA_MODEL_CATALOG}
    candidates = [p for p in priority_list if p in catalog_map]
    if prefer_local:
        local_cands = [p for p in candidates if catalog_map[p].local]
        if local_cands:
            candidates = local_cands

    def _score(profile_name: str) -> float:
        # Lower score wins. Unavailability and budget exhaustion are large
        # additive penalties rather than hard filters, so something is always
        # selectable.
        spec = catalog_map[profile_name]
        score = 0.0
        if not spec.available:
            score += 1000
        if budget_aware and not spec.has_credits:
            score += 500
        # Priority-list position is the strongest signal
        try:
            pos = priority_list.index(profile_name)
            score += pos * 20
        except ValueError:
            score += 200
        if prefer_local and not spec.local:
            score += 200
        if force_fast:
            score += spec.speed_tier * 15
        if prefer_cheap or prefer_local:
            score -= spec.cost_tier * 20
        else:
            score += spec.cost_tier * 2
        if force_capable:
            score -= spec.context_k / 100
        # Complex prompts punish small context windows.
        if complexity == "complex" and spec.context_k < 32:
            score += 40
        # Long conversation bonus for large-context models
        if context_messages_len > 30 and spec.context_k >= 128:
            score -= 15
        elif context_messages_len > 50 and spec.context_k < 32:
            score += 25
        return score

    scored = sorted([c for c in candidates if c in catalog_map], key=_score)
    if not scored:
        # Last-resort fallbacks when the priority list yielded nothing.
        for fallback in ["local_qwen35_35b", "local_qwen3_14b", "local_phi3"]:
            if fallback in catalog_map:
                scored = [fallback]
                break
    best = scored[0] if scored else "local_qwen3_14b"
    spec = catalog_map.get(best)
    # "fallback" here means: winner was not one of the top-2 priority entries.
    fallback_used = best not in priority_list[:2]
    reasons: List[str] = [f"task={task_type} ({confidence:.0%})", f"complexity={complexity}"]
    if classification.ambiguous:
        reasons.append(f"ambiguous (runner_up={classification.runner_up})")
    if force_fast:
        reasons.append("force_fast")
    if prefer_local:
        reasons.append("prefer_local")
    if prefer_cheap:
        reasons.append("prefer_cheap")
    if force_capable:
        reasons.append("force_capable")
    if context_messages_len > 30:
        reasons.append(f"long_conversation({context_messages_len})")
    if fallback_used:
        reasons.append("fallback (top unavailable)")
    return AutoRouteResult(
        profile_name=best,
        model_id=spec.model_id if spec else best,
        provider=spec.provider if spec else "unknown",
        task_type=task_type,
        confidence=confidence,
        complexity=complexity,
        reason=" | ".join(reasons),
        fallback_used=fallback_used,
        all_candidates=scored[:5],
        ambiguous=classification.ambiguous,
        runner_up=classification.runner_up,
        all_scores=classification.all_scores,
    )
def explain_selection(result: AutoRouteResult) -> str:
    """Human-readable explanation of model selection (for debug/UI)."""
    parts = [
        f"Auto-selected **{result.model_id}** ({result.provider})",
        f"Task: `{result.task_type}` | Complexity: `{result.complexity}` | "
        f"Confidence: {result.confidence:.0%}",
        f"Reason: {result.reason}",
    ]
    if result.ambiguous:
        parts.append(f"Ambiguous: runner-up was `{result.runner_up}`")
    if result.all_scores:
        # Show at most the first three classifier scores.
        shown = list(result.all_scores.items())[:3]
        parts.append("Scores: " + ", ".join(f"{name}={val:.3f}" for name, val in shown))
    return "\n".join(parts)
def get_full_catalog() -> List[Dict[str, Any]]:
    """Return full model catalog with availability status for dashboard."""
    entries: List[Dict[str, Any]] = []
    for spec in SOFIIA_MODEL_CATALOG:
        entries.append({
            "profile_name": spec.profile_name,
            "provider": spec.provider,
            "model_id": spec.model_id,
            "description": spec.description,
            "strengths": spec.strengths,
            "cost_tier": spec.cost_tier,
            "speed_tier": spec.speed_tier,
            "context_k": spec.context_k,
            "local": spec.local,
            "vram_gb": spec.vram_gb,
            # Computed properties: env keys / Ollama tags and budget state.
            "available": spec.available,
            "has_credits": spec.has_credits,
        })
    return entries

View File

@@ -0,0 +1,473 @@
"""
Tool Governance: RBAC enforcement, Safety Middleware, Audit.
Applies to ALL /v1/tools/* dispatch.
Components:
1. RBAC Matrix enforcement deny without entitlement
2. Tool Safety Middleware limits, redaction, allowlist, audit
3. Audit events structured per-call events (no payload, only metadata)
Usage (in tool_manager.py execute_tool):
from tool_governance import ToolGovernance
governance = ToolGovernance()
# Pre-call
check = governance.pre_call(tool_name, action, agent_id, user_id, workspace_id, input_text)
if not check.allowed:
return ToolResult(success=False, error=check.reason)
# Execute actual tool handler ...
result = await _actual_handler(args)
# Post-call
governance.post_call(check.call_ctx, result, duration_ms)
"""
import hashlib
import ipaddress
import json
import logging
import os
import re
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Config Paths ─────────────────────────────────────────────────────────────
# Governance config is resolved relative to this module: three parent
# directories up, then "config/" (assumed repo layout — confirm if moved).
_CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
_RBAC_PATH = _CONFIG_DIR / "rbac_tools_matrix.yml"  # RBAC entitlement matrix
_LIMITS_PATH = _CONFIG_DIR / "tool_limits.yml"  # per-tool limits
_ALLOWLIST_PATH = _CONFIG_DIR / "network_allowlist.yml"  # outbound URL allowlist
# ─── Data Classes ─────────────────────────────────────────────────────────────
@dataclass
class CallContext:
    """Per-invocation metadata captured at pre_call and echoed by post_call."""

    req_id: str        # unique id for this tool invocation
    tool: str          # tool name being dispatched
    action: str        # action within the tool
    agent_id: str      # calling agent identity
    user_id: str       # end-user identity
    workspace_id: str  # tenant/workspace scope
    ts_start: float    # start timestamp (presumably time.time(); set in pre_call)
    input_hash: str    # hash of the input text (metadata only — no payload kept)
    input_chars: int   # input length in characters
    limits_applied: Dict[str, Any] = field(default_factory=dict)  # limits enforced for this call
@dataclass
class PreCallResult:
    """Verdict of the pre-call governance check."""

    allowed: bool  # False → caller must not execute the tool
    reason: str = ""  # denial reason (surfaced in the ToolResult error)
    call_ctx: Optional[CallContext] = None  # context to pass into post_call when allowed
@dataclass
class AuditEvent:
    """Structured per-call audit record (metadata only — never the payload)."""

    ts: str            # event timestamp (ISO string)
    req_id: str        # matches CallContext.req_id
    tool: str
    action: str
    workspace_id: str
    user_id: str
    agent_id: str
    status: str  # "pass" | "deny" | "error"
    duration_ms: float          # wall-clock duration of the tool call
    limits_applied: Dict[str, Any]  # limits that were in force
    input_hash: str             # hash of the input, not the input itself
    input_chars: int
    output_size_bytes: int      # size of the tool output (content not stored)
# ─── YAML Loader (lazy, cached) ───────────────────────────────────────────────
_yaml_cache: Dict[str, Any] = {}


def _load_yaml(path: Path) -> dict:
    """Load a YAML file once and cache the parsed dict keyed by str(path).

    Any failure (missing file, bad YAML, PyYAML not installed) is logged and
    cached as {} so repeated lookups stay cheap and governance keeps running.

    Fixes: explicit UTF-8 encoding on open (was platform-default) and lazy
    %-style logging args (was an eagerly-built f-string).
    """
    key = str(path)
    if key not in _yaml_cache:
        try:
            import yaml  # local import: PyYAML is treated as optional here
            with open(path, "r", encoding="utf-8") as f:
                _yaml_cache[key] = yaml.safe_load(f) or {}
        except Exception as e:
            logger.warning("Could not load %s: %s", path, e)
            _yaml_cache[key] = {}
    return _yaml_cache[key]


def _reload_yaml_cache():
    """Force reload all yaml caches (for tests / hot-reload)."""
    _yaml_cache.clear()
# ─── Secret Redaction ─────────────────────────────────────────────────────────
# Patterns applied in order; each masks a <label><separator><value> occurrence.
_SECRET_PATTERNS = [
    # API keys / tokens
    re.compile(
        r'(?i)(api[_-]?key|token|secret|password|passwd|pwd|auth|bearer|jwt|'
        r'oauth|private[_-]?key|sk-|ghp_|xoxb-|AKIA|client_secret)'
        r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?',
        re.MULTILINE,
    ),
    # Generic high-entropy strings after known labels
    re.compile(
        r'(?i)(credential|access[_-]?key|refresh[_-]?token|signing[_-]?key)'
        r'[\s=:]+[\'"`]?([a-zA-Z0-9/+]{20,}={0,2})[\'"`]?',
        re.MULTILINE,
    ),
]
def _mask_secret(match) -> str:
    """Collapse a matched secret to '<label>=***REDACTED***' (value is dropped)."""
    return f"{match.group(1)}=***REDACTED***"
def redact(text: str) -> str:
    """Mask secret values in text. Always enabled by default."""
    if not text:
        return text
    masked = text
    for pattern in _SECRET_PATTERNS:
        masked = pattern.sub(_mask_secret, masked)
    return masked
# ─── Network Allowlist Check ──────────────────────────────────────────────────
# RFC1918, loopback, link-local and unique-local ranges treated as "private".
_PRIVATE_RANGES = [
    ipaddress.ip_network(cidr)
    for cidr in (
        "10.0.0.0/8",
        "172.16.0.0/12",
        "192.168.0.0/16",
        "127.0.0.0/8",
        "169.254.0.0/16",
        "::1/128",
        "fc00::/7",
    )
]
def _is_private_ip(host: str) -> bool:
    """Return True when *host* is a literal IP inside a private/loopback range.

    Non-IP hostnames return False (they are vetted by the allowlist instead).
    """
    try:
        parsed = ipaddress.ip_address(host)
    except ValueError:
        return False
    return any(parsed in network for network in _PRIVATE_RANGES)
def check_url_allowed(tool: str, url: str) -> Tuple[bool, str]:
    """
    Check if a URL is allowed for a given tool per network_allowlist.yml.
    Returns (allowed, reason).
    """
    import urllib.parse
    parts = urllib.parse.urlparse(url)
    host = parts.hostname or ""
    scheme = parts.scheme or "https"
    tool_rules = _load_yaml(_ALLOWLIST_PATH).get(tool, {})
    if not tool_rules:
        # No config: deny by default (safe default)
        return False, f"No allowlist config for tool '{tool}'"
    # Scheme must be explicitly permitted (https-only unless configured otherwise).
    if scheme not in tool_rules.get("schemes", ["https"]):
        return False, f"Scheme '{scheme}' not allowed for tool '{tool}'"
    # Wide-open public access, optionally still blocking private ranges.
    if tool_rules.get("allow_any_public"):
        if tool_rules.get("block_private_ranges") and _is_private_ip(host):
            return False, f"Private IP blocked: {host}"
        return True, ""
    # Otherwise the host must appear in the explicit allowlist.
    if host in tool_rules.get("hosts", []):
        return True, ""
    return False, f"Host '{host}' not in allowlist for tool '{tool}'"
# ─── RBAC Matrix ──────────────────────────────────────────────────────────────
def _get_agent_role(agent_id: str) -> str:
    """Resolve agent role (delegates to agent_tools_config)."""
    try:
        import agent_tools_config
        return agent_tools_config.get_agent_role(agent_id)
    except Exception:
        # Config module missing or lookup failed: fall back to the default role.
        return "agent_default"
def _get_role_entitlements(role: str) -> List[str]:
    """Get entitlements for a role from RBAC matrix (falls back to agent_default)."""
    table = _load_yaml(_RBAC_PATH).get("role_entitlements", {})
    if role in table:
        return table[role]
    return table.get("agent_default", [])
def _get_required_entitlements(tool: str, action: str) -> List[str]:
    """Get required entitlements for tool+action from matrix."""
    matrix = _load_yaml(_RBAC_PATH)
    actions = matrix.get("tools", {}).get(tool, {}).get("actions", {})
    # Prefer the exact action entry; fall back to the tool's "_default" entry.
    selected = actions.get(action)
    if not selected:
        selected = actions.get("_default", {})
    if not selected:
        return []
    return selected.get("entitlements", [])
def check_rbac(agent_id: str, tool: str, action: str) -> Tuple[bool, str]:
    """
    Check RBAC: agent role → entitlements → required entitlements for tool+action.
    Returns (allowed, reason).
    """
    role = _get_agent_role(agent_id)
    granted = set(_get_role_entitlements(role))
    required = _get_required_entitlements(tool, action)
    # Tools/actions with no declared entitlements are open to every role.
    if not required:
        return True, ""
    missing = [e for e in required if e not in granted]
    if not missing:
        return True, ""
    return False, f"Missing entitlements: {missing} (agent={agent_id}, role={role})"
# ─── Limits ───────────────────────────────────────────────────────────────────
def _get_limits(tool: str) -> Dict[str, Any]:
    """Get effective limits for a tool (per-tool overrides merged with defaults)."""
    cfg = _load_yaml(_LIMITS_PATH)
    # Hard-coded fallbacks used when tool_limits.yml has no `defaults` section.
    base = cfg.get("defaults", {
        "timeout_ms": 30000,
        "max_chars_in": 200000,
        "max_bytes_out": 524288,
        "rate_limit_rpm": 60,
        "concurrency": 5,
    })
    effective = dict(base)
    effective.update(cfg.get("tools", {}).get(tool, {}))
    return effective
def check_input_limits(tool: str, input_text: str) -> Tuple[bool, str, Dict]:
    """
    Enforce max_chars_in limit.
    Returns (ok, reason, limits_applied).
    """
    limits = _get_limits(tool)
    cap = limits.get("max_chars_in", 200000)
    size = len(input_text) if input_text else 0
    if size <= cap:
        return True, "", limits
    return False, f"Input too large: {size} chars (max {cap} for {tool})", limits
# ─── Audit ────────────────────────────────────────────────────────────────────
def _emit_audit(event: AuditEvent):
    """
    Emit structured audit event.
    1. Writes to logger (structured, no payload).
    2. Persists to AuditStore (JSONL/Postgres/Memory) for FinOps analysis.
    Persistence is non-fatal: errors are logged as warnings without interrupting tool execution.
    """
    import datetime
    # Keep the caller-supplied timestamp when present; otherwise stamp UTC now.
    when = event.ts or datetime.datetime.now(datetime.timezone.utc).isoformat()
    payload = {
        "ts": when,
        "req_id": event.req_id,
        "tool": event.tool,
        "action": event.action,
        "workspace_id": event.workspace_id,
        "user_id": event.user_id,
        "agent_id": event.agent_id,
        "status": event.status,
        "duration_ms": round(event.duration_ms, 2),
        "limits_applied": event.limits_applied,
        "input_hash": event.input_hash,
        "input_chars": event.input_chars,
        "output_size_bytes": event.output_size_bytes,
    }
    logger.info(f"TOOL_AUDIT {json.dumps(payload)}")
    # Persist to audit store (non-fatal)
    try:
        from audit_store import get_audit_store
        get_audit_store().write(event)
    except Exception as store_err:
        logger.warning("audit_store.write failed (non-fatal): %s", store_err)
# ─── Main Governance Class ────────────────────────────────────────────────────
class ToolGovernance:
"""
Single entry point for tool governance.
Call pre_call() before executing any tool.
Call post_call() after execution to emit audit event.
"""
def __init__(self, *, enable_rbac: bool = True, enable_redaction: bool = True,
enable_limits: bool = True, enable_audit: bool = True,
enable_allowlist: bool = True):
self.enable_rbac = enable_rbac
self.enable_redaction = enable_redaction
self.enable_limits = enable_limits
self.enable_audit = enable_audit
self.enable_allowlist = enable_allowlist
def pre_call(
self,
tool: str,
action: str,
agent_id: str,
user_id: str = "unknown",
workspace_id: str = "unknown",
input_text: str = "",
) -> PreCallResult:
"""
Run all pre-call checks. Returns PreCallResult.
If allowed=False, caller must return error immediately.
"""
req_id = str(uuid.uuid4())[:12]
ts_start = time.monotonic()
# 1. RBAC check
if self.enable_rbac:
ok, reason = check_rbac(agent_id, tool, action)
if not ok:
if self.enable_audit:
_emit_audit(AuditEvent(
ts=_now_iso(), req_id=req_id, tool=tool, action=action,
workspace_id=workspace_id, user_id=user_id, agent_id=agent_id,
status="deny", duration_ms=0,
limits_applied={}, input_hash="", input_chars=0, output_size_bytes=0,
))
return PreCallResult(allowed=False, reason=f"RBAC denied: {reason}")
# 2. Input limits
limits_applied = {}
if self.enable_limits and input_text:
ok, reason, limits_applied = check_input_limits(tool, input_text)
if not ok:
if self.enable_audit:
_emit_audit(AuditEvent(
ts=_now_iso(), req_id=req_id, tool=tool, action=action,
workspace_id=workspace_id, user_id=user_id, agent_id=agent_id,
status="deny", duration_ms=0,
limits_applied=limits_applied,
input_hash="", input_chars=len(input_text), output_size_bytes=0,
))
return PreCallResult(allowed=False, reason=f"Limits exceeded: {reason}")
elif not limits_applied:
limits_applied = _get_limits(tool)
# Build call context
input_hash = hashlib.sha256(input_text.encode()).hexdigest()[:16] if input_text else ""
ctx = CallContext(
req_id=req_id,
tool=tool,
action=action,
agent_id=agent_id,
user_id=user_id,
workspace_id=workspace_id,
ts_start=ts_start,
input_hash=input_hash,
input_chars=len(input_text) if input_text else 0,
limits_applied=limits_applied,
)
return PreCallResult(allowed=True, call_ctx=ctx)
def post_call(self, ctx: CallContext, result_value: Any, error: Optional[str] = None):
"""
Emit audit event after tool execution.
result_value: raw result data (used only for size calculation, not logged).
"""
if not self.enable_audit or ctx is None:
return
duration_ms = (time.monotonic() - ctx.ts_start) * 1000
status = "error" if error else "pass"
# Calculate output size (bytes) without logging content
try:
out_bytes = len(json.dumps(result_value).encode()) if result_value is not None else 0
except Exception:
out_bytes = 0
_emit_audit(AuditEvent(
ts=_now_iso(),
req_id=ctx.req_id,
tool=ctx.tool,
action=ctx.action,
workspace_id=ctx.workspace_id,
user_id=ctx.user_id,
agent_id=ctx.agent_id,
status=status,
duration_ms=duration_ms,
limits_applied=ctx.limits_applied,
input_hash=ctx.input_hash,
input_chars=ctx.input_chars,
output_size_bytes=out_bytes,
))
def apply_redaction(self, text: str) -> str:
"""Apply secret redaction if enabled."""
if not self.enable_redaction:
return text
return redact(text)
def check_url(self, tool: str, url: str) -> Tuple[bool, str]:
"""Check URL against allowlist if enabled."""
if not self.enable_allowlist:
return True, ""
return check_url_allowed(tool, url)
def get_timeout_ms(self, tool: str) -> int:
"""Get configured timeout for a tool."""
limits = _get_limits(tool)
return limits.get("timeout_ms", 30000)
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _now_iso() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    from datetime import datetime, timezone
    return datetime.now(timezone.utc).isoformat()
# ─── Module-level singleton ───────────────────────────────────────────────────
# Lazily-created process-wide singleton; tests may swap it via reset_governance().
_governance: Optional[ToolGovernance] = None
def get_governance() -> ToolGovernance:
    """Get the shared ToolGovernance singleton."""
    global _governance
    instance = _governance
    if instance is None:
        instance = ToolGovernance()
        _governance = instance
    return instance
def reset_governance(instance: Optional[ToolGovernance] = None):
    """Reset singleton (for testing); pass an instance to inject a replacement."""
    global _governance
    _governance = instance