feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
379
services/router/incident_escalation.py
Normal file
379
services/router/incident_escalation.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""
|
||||
incident_escalation.py — Deterministic Incident Escalation Engine.
|
||||
|
||||
Actions (exposed via incident_escalation_tool):
|
||||
evaluate — check active signatures against escalation thresholds
|
||||
auto_resolve_candidates — find open incidents with no recent alerts
|
||||
|
||||
No LLM usage; all logic is policy-driven.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)

# ─── Severity ordering ────────────────────────────────────────────────────────

# Numeric rank per severity label; a LOWER rank means MORE severe (P0 worst).
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
# Labels indexed by rank — _SEV_NAMES[rank] is the inverse of _SEV_ORDER.
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
|
||||
|
||||
|
||||
def _sev_higher(a: str, b: str) -> bool:
    """True when severity *a* outranks *b* (smaller P number = more severe).

    Unknown labels rank below everything (sentinel rank 99).
    """
    rank_a = _SEV_ORDER.get(a, 99)
    rank_b = _SEV_ORDER.get(b, 99)
    return rank_a < rank_b
|
||||
|
||||
|
||||
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
    """Return the next-more-severe level, or None when escalation is impossible.

    None is returned when *current* is unknown, already P0, or when the next
    level would outrank *cap*.
    """
    rank = _SEV_ORDER.get(current)
    if not rank:  # unknown label (None) or already most severe (rank 0)
        return None
    target = _SEV_NAMES[rank - 1]
    # An unknown cap defaults to rank 0, i.e. is treated as the severest cap.
    if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
        return None  # would exceed cap
    return target
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.datetime.utcnow().isoformat()
|
||||
|
||||
|
||||
def _plus_hours(hours: int) -> str:
|
||||
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
|
||||
|
||||
|
||||
# ─── Policy loading ───────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide cache: the policy file is parsed at most once per process
# (populated and read by load_escalation_policy()).
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy locations, tried in order: CWD-relative first, then a path
# three levels above this module (presumably the repo root — TODO confirm).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
|
||||
|
||||
|
||||
def load_escalation_policy() -> Dict:
    """Load the escalation policy, caching the result for the process lifetime.

    Tries each path in _POLICY_PATHS in order; on any read/parse failure the
    error is logged and the next path is attempted. Falls back to
    _builtin_defaults() when no policy file can be loaded.

    Returns:
        The parsed policy mapping (possibly the built-in defaults).
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if path.exists():
            try:
                # Explicit encoding: config files are UTF-8 regardless of the
                # host locale (open() without encoding= is locale-dependent).
                with open(path, encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
                _POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
||||
|
||||
|
||||
def _builtin_defaults() -> Dict:
|
||||
return {
|
||||
"defaults": {"window_minutes": 60},
|
||||
"escalation": {
|
||||
"occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
|
||||
"triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
|
||||
"severity_cap": "P0",
|
||||
"create_followup_on_escalate": True,
|
||||
"followup": {
|
||||
"priority": "P1", "due_hours": 24, "owner": "oncall",
|
||||
"message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
|
||||
},
|
||||
},
|
||||
"auto_resolve": {
|
||||
"no_alerts_minutes_for_candidate": 60,
|
||||
"close_allowed_severities": ["P2", "P3"],
|
||||
"auto_close": False,
|
||||
"candidate_event_type": "note",
|
||||
"candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
|
||||
},
|
||||
"alert_loop_slo": {
|
||||
"claim_to_ack_p95_seconds": 60,
|
||||
"failed_rate_pct": 5,
|
||||
"processing_stuck_minutes": 15,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ─── Escalation thresholds helper ────────────────────────────────────────────
|
||||
|
||||
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return the severity to escalate to, or None when no escalation applies.

    A rule fires when either the 60-minute occurrence count or the 24-hour
    triage count reaches its policy threshold. The target is clamped to the
    policy's severity_cap.

    Bug fix vs. the original: when the cap equalled the current severity
    (e.g. cap="P1" for a P1 incident), the old cap check returned the cap
    itself, causing callers to record a spurious "P1 → P1" escalation event.
    We now return None unless the clamped target strictly outranks the
    current severity. (The original also contained an unreachable inner
    branch, removed here.)

    Args:
        current_severity: Incident severity label ("P0".."P3"/"INFO").
        occurrences_60m: Alert occurrences for the signature in the window.
        triage_count_24h: Triage events for the signature in 24 hours.
        policy: Escalation policy mapping (see _builtin_defaults()).
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")

    # Escalation rules in priority order (most → least severe).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]

    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m >= occ_limit or triage_count_24h >= triage_limit:
            # Clamp the target at the policy cap.
            if _sev_higher(to_sev, cap):
                to_sev = cap
            # Only escalate if the clamped target strictly outranks the
            # current severity; otherwise escalation would be a no-op.
            if not _sev_higher(to_sev, current_severity):
                return None
            return to_sev
    return None
|
||||
|
||||
|
||||
# ─── Core evaluate function ───────────────────────────────────────────────────
|
||||
|
||||
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """Evaluate active alert signatures against the escalation thresholds.

    For each active signature, the matching open (or mitigating) incident is
    located and, when the policy thresholds are met, an escalation "decision"
    event — plus an optional "followup" event — is appended to the incident.
    With dry_run=True only the candidate list is produced; nothing is written.

    Args:
        params: Optional filters — "env", "window_minutes", "limit".
        alert_store: Unused here; kept for tool-signature parity.
        sig_state_store: Provides list_active_signatures().
        incident_store: Provides list_incidents() / append_event().
        policy: Escalation policy; loaded via load_escalation_policy() if None.
        dry_run: When True, evaluate only — no incident events are appended.

    Returns:
        Summary dict with evaluated/escalated/followups_created counts,
        candidates, recommendations and the dry_run flag.
    """
    if policy is None:
        policy = load_escalation_policy()

    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get("window_minutes",
                                    policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))

    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})

    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    # Perf fix: the incident lists are loop-invariant — the original fetched
    # them once per active signature (O(signatures × incidents) store calls).
    # Fetch open incidents once; fetch mitigating ones lazily, at most once.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None

    def _matches(incidents: List[Dict], signature: str) -> List[Dict]:
        # Store ordering is preserved: the first match wins below.
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []

    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)

        # Find the open incident with this signature; fall back to mitigating.
        matching = _matches(open_incidents, signature)
        if not matching:
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matches(mitigating_incidents, signature)

        evaluated += 1
        if not matching:
            continue

        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")

        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })

        if dry_run:
            continue

        # Append escalation decision event
        esc_msg = (
            f"Escalated {current_sev} → {target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1

        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1

        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )

    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
|
||||
|
||||
|
||||
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
|
||||
|
||||
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """Find open incidents whose signature has seen no alerts for N minutes.

    Each candidate gets a note event appended to its incident (unless
    dry_run), and is closed when policy allows auto_close and its severity is
    in close_allowed_severities. By default dry_run=True — no state changes.

    Args:
        params: Optional overrides — "no_alerts_minutes", "env", "limit".
        sig_state_store: Provides get_state(signature).
        incident_store: Provides list_incidents()/append_event()/close_incident().
        policy: Policy mapping; loaded via load_escalation_policy() if None.
        dry_run: When True, only report candidates — no events, no closes.

    Returns:
        Dict with candidates, closed incident ids, counts, and the settings used.
    """
    if policy is None:
        policy = load_escalation_policy()

    ar = policy.get("auto_resolve", {})
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )

    now_dt = datetime.datetime.utcnow()
    # ISO strings sort lexicographically, so string comparison works below.
    no_alert_cutoff = (now_dt - datetime.timedelta(minutes=no_alerts_minutes)).isoformat()

    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]

    candidates: List[Dict] = []
    closed: List[str] = []

    for incident in all_open:
        inc_id = incident["id"]
        signature = incident.get("meta", {}).get("incident_signature")
        if not signature:
            continue

        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue

        last_alert = sig_state.get("last_alert_at") or ""
        if last_alert >= no_alert_cutoff:
            continue  # alert seen recently → not a candidate

        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed

        # Robustness fix: the original called fromisoformat() unguarded — a
        # malformed last_alert_at raised ValueError, and a tz-aware timestamp
        # subtracted from naive now_dt raised TypeError, aborting the whole
        # scan. Normalize and fall back to the policy window instead.
        minutes_without = no_alerts_minutes
        if last_alert:
            try:
                last_dt = datetime.datetime.fromisoformat(last_alert)
                if last_dt.tzinfo is not None:
                    last_dt = last_dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
                minutes_without = round((now_dt - last_dt).total_seconds() / 60)
            except ValueError:
                logger.warning(
                    "Unparseable last_alert_at %r for incident %s", last_alert, inc_id
                )

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            "minutes_without_alerts": minutes_without,
            "auto_close_eligible": can_close and auto_close,
        })

        if dry_run:
            continue

        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })

        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)

    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }
|
||||
Reference in New Issue
Block a user