feat(platform): add new services, tools, tests and crews modules

New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
2026-03-03 07:14:14 -08:00
parent e9dedffa48
commit 129e4ea1fc
241 changed files with 69349 additions and 0 deletions
--- a/services/sofiia-supervisor/app/alert_routing.py
+++ b/services/sofiia-supervisor/app/alert_routing.py
@@ -0,0 +1,203 @@
+"""
+alert_routing.py — Alert routing policy loader and matcher.
+
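+Loads config/alert_routing_policy.yml and provides:
+  - load_policy(path)                 → raw policy dict (built-in defaults if the file is unavailable)
+  - match_alert(alert)                → actions dict of the first matching rule, or a digest-only fallback
+  - compute_incident_signature(alert) → 32-hex-character signature for incident deduplication
+  - is_llm_allowed(action)            → whether the policy permits an LLM call for that action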
+"""
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+# Module-level logger; load_policy() reports a missing or unreadable policy file through it.
+logger = logging.getLogger(__name__)
+
+def _find_policy_path() -> Path:
+    """Locate config/alert_routing_policy.yml in an ancestor directory of this file.
+
+    Ancestor directories are probed nearest-first and the first one that
+    contains the file wins. When none does, the container path
+    /app/config/alert_routing_policy.yml is returned without checking that it
+    exists; load_policy() copes with a missing file by using built-in defaults.
+    """
+    relative = Path("config") / "alert_routing_policy.yml"
+    ancestors = Path(__file__).resolve().parents
+    found = (folder / relative for folder in ancestors if (folder / relative).exists())
+    return next(found, Path("/app/config/alert_routing_policy.yml"))
+
+# Policy file location, resolved once at import time; load_policy() reads
+# from it when the caller passes no explicit path.
+_POLICY_PATH = _find_policy_path()
+
+
+def load_policy(path: Optional[Path] = None) -> Dict:
+    """Load the alert routing policy from YAML and return it as a dict.
+
+    Nothing is cached: every call re-reads the file, so callers that invoke
+    this per alert should cache the result themselves.
+
+    Args:
+        path: Policy file to read. Defaults to the module-level _POLICY_PATH.
+
+    Returns:
+        The parsed mapping, or {} when the document is empty. The built-in
+        defaults are returned instead when the file is missing, cannot be read
+        or parsed, or its top level is not a mapping. This function never
+        raises for those conditions; it logs and falls back.
+    """
+    p = path or _POLICY_PATH
+    try:
+        # Explicit encoding: without it the decode depends on the host locale.
+        with open(p, encoding="utf-8") as f:
+            data = yaml.safe_load(f) or {}
+    except FileNotFoundError:
+        logger.warning("alert_routing_policy.yml not found at %s — using built-in defaults", p)
+        return _builtin_defaults()
+    except Exception as e:
+        # Deliberately broad: a broken policy file must not take the router down.
+        logger.error("Failed to load alert routing policy: %s", e)
+        return _builtin_defaults()
+
+    if not isinstance(data, dict):
+        # A list or scalar at the top level would break every caller that
+        # does policy.get(...), so treat it like an unreadable file.
+        logger.error(
+            "Failed to load alert routing policy: top level of %s is %s, expected a mapping",
+            p,
+            type(data).__name__,
+        )
+        return _builtin_defaults()
+    return data
+
+
+def _builtin_defaults() -> Dict:
+    """Build the policy that load_policy() falls back to when no file can be loaded.
+
+    A fresh dict is constructed on every call, so callers may mutate the
+    result without affecting later calls.
+    """
+    defaults_section = {
+        "poll_interval_seconds": 300,
+        "max_alerts_per_run": 20,
+        "only_unacked": True,
+        "max_incidents_per_run": 5,
+        "max_triages_per_run": 5,
+        "dedupe_window_minutes_default": 120,
+        "ack_note_prefix": "alert_triage_loop",
+        "llm_mode": "off",
+        "llm_on": {"triage": False, "postmortem": False},
+    }
+
+    # First rule: prod P0/P1 alerts open an incident and are triaged automatically.
+    urgent_prod_rule = {
+        "match": {"env_in": ["prod"], "severity_in": ["P0", "P1"]},
+        "actions": {
+            "auto_incident": True,
+            "auto_triage": True,
+            "triage_mode": "deterministic",
+            "incident_severity_cap": "P1",
+            "dedupe_window_minutes": 120,
+            "attach_alert_artifact": True,
+            "ack": True,
+        },
+    }
+
+    # Second rule: lower severities are acknowledged and left to the digest.
+    low_severity_rule = {
+        "match": {"severity_in": ["P2", "P3", "INFO"]},
+        "actions": {"auto_incident": False, "digest_only": True, "ack": True},
+    }
+
+    return {
+        "defaults": defaults_section,
+        "routing": [urgent_prod_rule, low_severity_rule],
+    }
+
+
+def _normalize_kind(kind: str, kind_map: Dict[str, List[str]]) -> str:
+    """Resolve an alert kind to its canonical name.
+
+    Args:
+        kind: Kind reported by the alert.
+        kind_map: Mapping of canonical kind → list of aliases. May be empty
+            or None, in which case `kind` is returned unchanged.
+
+    Returns:
+        The canonical name of the first entry (in mapping order) whose name
+        or alias list contains `kind`; otherwise `kind` itself.
+    """
+    if not kind_map:
+        return kind
+    for canonical, aliases in kind_map.items():
+        if kind == canonical:
+            return canonical
+        if aliases is None:
+            # A YAML key written with no value ("latency:") loads as None.
+            continue
+        if isinstance(aliases, str):
+            # A single alias written as a scalar: compare it whole, because
+            # `in` on a string would match any substring of it.
+            aliases = [aliases]
+        if kind in aliases:
+            return canonical
+    return kind
+
+
+def match_alert(alert: Dict, policy: Optional[Dict] = None) -> Dict:
+    """Return the actions of the first routing rule that matches an alert.
+
+    Args:
+        alert: Alert dict. Missing fields default to kind "custom",
+            env "prod" and severity "P2".
+        policy: Parsed policy. When None it is loaded with load_policy(),
+            which re-reads the file on every call.
+
+    Returns:
+        A new dict of actions. For a matched rule, the rule's own actions are
+        completed with defaults for auto_incident, auto_triage, digest_only,
+        ack, triage_mode, incident_severity_cap and dedupe_window_minutes.
+        When no rule matches, a digest-only fallback is returned. In both
+        cases "_normalized_kind" carries the canonical kind used for matching.
+    """
+    if policy is None:
+        policy = load_policy()
+
+    # `or` rather than .get(key, default): a YAML key present with no value
+    # loads as None, which .get would pass through.
+    kind_map = policy.get("kind_map") or {}
+    routing = policy.get("routing") or []
+    defaults_cfg = policy.get("defaults") or {}
+    severity_caps = policy.get("severity_caps") or {}
+    default_window = defaults_cfg.get("dedupe_window_minutes_default", 120)
+
+    normalized_kind = _normalize_kind(alert.get("kind", "custom"), kind_map)
+    env = alert.get("env", "prod")
+    severity = alert.get("severity", "P2")
+
+    for rule in routing:
+        m = rule.get("match") or {}
+        if not _rule_matches(m, env=env, severity=severity, kind=normalized_kind):
+            continue
+        # Copy so the caller's policy dict is never mutated.
+        actions = dict(rule.get("actions") or {})
+        # Inject defaults for missing action fields
+        actions.setdefault("auto_incident", False)
+        actions.setdefault("auto_triage", False)
+        actions.setdefault("digest_only", False)
+        actions.setdefault("ack", True)
+        actions.setdefault("triage_mode", "deterministic")
+        actions.setdefault("incident_severity_cap", severity_caps.get(normalized_kind, "P1"))
+        actions.setdefault("dedupe_window_minutes", default_window)
+        actions["_normalized_kind"] = normalized_kind
+        return actions
+
+    # No match → safe fallback. auto_triage is included so the result has the
+    # same keys a matched rule is guaranteed to have.
+    return {
+        "auto_incident": False,
+        "auto_triage": False,
+        "digest_only": True,
+        "ack": True,
+        "triage_mode": "deterministic",
+        "incident_severity_cap": "P2",
+        "dedupe_window_minutes": default_window,
+        "_normalized_kind": normalized_kind,
+    }
+
+
+def _rule_matches(match: Dict, env: str, severity: str, kind: str) -> bool:
+    """Tell whether an alert satisfies every condition present in `match`.
+
+    Only the keys env_in, severity_in and kind_in are consulted; a key that
+    is absent places no constraint, so an empty `match` accepts everything.
+    """
+    observed = (("env_in", env), ("severity_in", severity), ("kind_in", kind))
+    return all(key not in match or value in match[key] for key, value in observed)
+
+
+# ─── Incident Signature ────────────────────────────────────────────────────────
+
+def compute_incident_signature(
+    alert: Dict,
+    policy: Optional[Dict] = None,
+) -> str:
+    """Compute the deduplication signature of an alert.
+
+    The signature is the first 32 hex characters of the SHA-256 of
+    "service|env[|kind][|fingerprint][|node]". The optional components are
+    switched by `policy["signature"]`: use_kind (default on),
+    use_fingerprint (default on) and use_node_label (default off).
+
+    Args:
+        alert: Alert dict. service, env and kind default to "unknown",
+            "prod" and "custom"; fingerprint and node are read from
+            alert["labels"] and default to "".
+        policy: Parsed policy. When None it is loaded with load_policy().
+
+    Returns:
+        A 32-character lowercase hex string.
+    """
+    if policy is None:
+        policy = load_policy()
+
+    def _part(value: object, default: str) -> str:
+        # Fields that are present but None, or not strings (e.g. a numeric
+        # fingerprint), would make "|".join raise. Strings pass through
+        # unchanged, so existing signatures stay stable.
+        return default if value is None else str(value)
+
+    # `or {}`: a YAML key present with no value loads as None.
+    sig_cfg = policy.get("signature") or {}
+    kind_map = policy.get("kind_map") or {}
+    labels = alert.get("labels") or {}
+
+    raw_kind = alert.get("kind")
+    kind = _normalize_kind("custom" if raw_kind is None else raw_kind, kind_map)
+
+    parts = [
+        _part(alert.get("service"), "unknown"),
+        _part(alert.get("env"), "prod"),
+    ]
+
+    if sig_cfg.get("use_kind", True):
+        parts.append(_part(kind, "custom"))
+
+    if sig_cfg.get("use_fingerprint", True):
+        parts.append(_part(labels.get("fingerprint"), ""))
+
+    if sig_cfg.get("use_node_label", False):
+        parts.append(_part(labels.get("node"), ""))
+
+    # NOTE(review): components containing "|" can collide ("a|b","c" vs
+    # "a","b|c"). Left as-is because changing the encoding would change every
+    # signature already produced.
+    raw = "|".join(parts)
+    return hashlib.sha256(raw.encode()).hexdigest()[:32]
+
+
+def is_llm_allowed(action: str, policy: Optional[Dict] = None) -> bool:
+    """Guard for LLM calls: allow only if LLM use is on globally and for `action`.
+
+    Args:
+        action: Key looked up in `defaults.llm_on`. The built-in defaults
+            define "triage" and "postmortem".
+        policy: Parsed policy. When None it is loaded with load_policy().
+
+    Returns:
+        True only when `defaults.llm_mode` is not off and
+        `defaults.llm_on[action]` is truthy. Anything missing or null
+        resolves to False, so the guard fails closed.
+    """
+    if policy is None:
+        policy = load_policy()
+    # `or {}`: a YAML key present with no value loads as None.
+    defaults = policy.get("defaults") or {}
+    llm_mode = defaults.get("llm_mode", "off")
+    # PyYAML resolves an unquoted `off` to the boolean False (YAML 1.1), so
+    # comparing against the string "off" alone would let `llm_mode: off`
+    # through. Treat every falsy value, and "off" in any case, as disabled.
+    if not llm_mode or str(llm_mode).strip().lower() == "off":
+        return False
+    llm_on = defaults.get("llm_on") or {}
+    return bool(llm_on.get(action, False))