Files
microdao-daarion/services/sofiia-supervisor/app/alert_routing.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

204 lines
6.7 KiB
Python

"""
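alert_routing.py — Alert routing policy loader and matcher.
Loads config/alert_routing_policy.yml and provides:
- load_policy(path) → raw policy dict (built-in defaults when the file is missing or unreadable)
- match_alert(alert) → actions dict of the first matching rule (digest-only fallback when no rule matches)
- compute_incident_signature(alert) → 32-hex-character signature used for incident deduplication
- is_llm_allowed(action) → guard to check before any LLM call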
"""
from __future__ import annotations
import hashlib
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import yaml
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _find_policy_path() -> Path:
    """Locate config/alert_routing_policy.yml by searching the ancestors of this file.

    Returns:
        The first existing ``<ancestor>/config/alert_routing_policy.yml``,
        checked from the nearest ancestor directory outwards. When none
        exists, the fixed path ``/app/config/alert_routing_policy.yml`` is
        returned; that file may be absent, in which case load_policy() falls
        back to the built-in defaults.
    """
    relative = Path("config") / "alert_routing_policy.yml"
    module_file = Path(__file__).resolve()
    existing = (
        ancestor / relative
        for ancestor in module_file.parents
        if (ancestor / relative).exists()
    )
    # Safe fallback path for container/local runs when nothing was found.
    return next(existing, Path("/app/config/alert_routing_policy.yml"))
# Resolved once at import time; load_policy() reads this path when no explicit path is given.
_POLICY_PATH = _find_policy_path()
def load_policy(path: Optional[Path] = None) -> Dict:
    """Load and return the raw YAML policy dict. Caches nothing (caller may cache).

    Args:
        path: Policy file to read. Defaults to the module-level _POLICY_PATH.

    Returns:
        The parsed policy mapping. An empty document yields ``{}``. When the
        file is missing, cannot be read or parsed, or its top-level node is
        not a mapping, the built-in defaults are returned instead and the
        problem is logged; this function does not raise for those cases.
    """
    p = path or _POLICY_PATH
    try:
        # Explicit UTF-8: without it the file is decoded with the locale's
        # default encoding, which differs between hosts and containers.
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logger.warning("alert_routing_policy.yml not found at %s — using built-in defaults", p)
        return _builtin_defaults()
    except Exception as e:
        logger.error("Failed to load alert routing policy: %s", e)
        return _builtin_defaults()
    if not isinstance(data, dict):
        # A top-level list or scalar would make every caller fail on `.get`;
        # treat it like any other unusable policy file.
        logger.error(
            "Failed to load alert routing policy: %s",
            f"top-level YAML node in {p} is {type(data).__name__}, expected a mapping",
        )
        return _builtin_defaults()
    return data
def _builtin_defaults() -> Dict:
    """Return the hard-coded policy that load_policy() uses when the YAML file is unusable.

    A fresh dict is built on every call, so callers may mutate the result.
    """
    # Values for the policy's "defaults" section.
    loop_defaults = {
        "poll_interval_seconds": 300,
        "max_alerts_per_run": 20,
        "only_unacked": True,
        "max_incidents_per_run": 5,
        "max_triages_per_run": 5,
        "dedupe_window_minutes_default": 120,
        "ack_note_prefix": "alert_triage_loop",
        "llm_mode": "off",
        "llm_on": {"triage": False, "postmortem": False},
    }

    # First rule: prod alerts with severity P0/P1 open an incident and are triaged.
    prod_critical_rule = {
        "match": {"env_in": ["prod"], "severity_in": ["P0", "P1"]},
        "actions": {
            "auto_incident": True,
            "auto_triage": True,
            "triage_mode": "deterministic",
            "incident_severity_cap": "P1",
            "dedupe_window_minutes": 120,
            "attach_alert_artifact": True,
            "ack": True,
        },
    }

    # Second rule: P2/P3/INFO alerts in any env are digest-only.
    low_severity_rule = {
        "match": {"severity_in": ["P2", "P3", "INFO"]},
        "actions": {"auto_incident": False, "digest_only": True, "ack": True},
    }

    # Rule order matters: match_alert() returns the first rule that matches.
    return {
        "defaults": loop_defaults,
        "routing": [prod_critical_rule, low_severity_rule],
    }
def _normalize_kind(kind: str, kind_map: Dict[str, List[str]]) -> str:
    """Resolve kind aliases to canonical name.

    Args:
        kind: Alert kind as reported by the alert.
        kind_map: Mapping of canonical kind -> list of aliases. May be empty
            or None, in which case ``kind`` is returned unchanged.

    Returns:
        The canonical name of the first entry (in mapping order) whose key
        equals ``kind`` or whose alias list contains it; otherwise ``kind``.
    """
    if not kind_map:
        return kind
    for canonical, aliases in kind_map.items():
        if kind == canonical:
            return canonical
        if aliases is None:
            # A YAML key with an empty value loads as None: no aliases.
            continue
        if isinstance(aliases, str):
            # A scalar alias (`canonical: alias`) must be compared as a whole;
            # `kind in "alias"` would be a substring test.
            aliases = (aliases,)
        if kind in aliases:
            return canonical
    return kind
def match_alert(alert: Dict, policy: Optional[Dict] = None) -> Dict:
    """
    Find the first matching routing rule for an alert and return its actions.
    Falls back to digest_only if no rule matches.

    Args:
        alert: Alert dict. Reads "kind" (default "custom"), "env" (default
            "prod") and "severity" (default "P2").
        policy: Policy dict; loaded with load_policy() when None.

    Returns:
        A new dict of actions. For a matched rule, the rule's own actions with
        defaults filled in for any missing field; otherwise the digest-only
        fallback. Both carry "_normalized_kind".
    """
    if policy is None:
        policy = load_policy()

    # A YAML key with an empty value loads as None, and `.get(key, default)`
    # returns that None, so every section is normalised with `or`.
    kind_map = policy.get("kind_map") or {}
    routing = policy.get("routing") or []
    defaults_cfg = policy.get("defaults") or {}
    severity_caps = policy.get("severity_caps") or {}
    default_dedupe = defaults_cfg.get("dedupe_window_minutes_default", 120)

    normalized_kind = _normalize_kind(alert.get("kind", "custom"), kind_map)
    env = alert.get("env", "prod")
    severity = alert.get("severity", "P2")

    for rule in routing:
        m = rule.get("match") or {}
        if not _rule_matches(m, env=env, severity=severity, kind=normalized_kind):
            continue
        # Copy so the policy dict itself is never mutated.
        actions = dict(rule.get("actions") or {})
        # Inject defaults for missing action fields
        actions.setdefault("auto_incident", False)
        actions.setdefault("auto_triage", False)
        actions.setdefault("digest_only", False)
        actions.setdefault("ack", True)
        actions.setdefault("triage_mode", "deterministic")
        actions.setdefault(
            "incident_severity_cap",
            severity_caps.get(normalized_kind, "P1"),
        )
        actions.setdefault("dedupe_window_minutes", default_dedupe)
        actions["_normalized_kind"] = normalized_kind
        return actions

    # No match → safe fallback
    return {
        "auto_incident": False,
        "digest_only": True,
        "ack": True,
        "triage_mode": "deterministic",
        "incident_severity_cap": "P2",
        "dedupe_window_minutes": default_dedupe,
        "_normalized_kind": normalized_kind,
    }
def _rule_matches(match: Dict, env: str, severity: str, kind: str) -> bool:
    """Check an alert's env/severity/kind against a rule's ``match`` section.

    Each of "env_in", "severity_in" and "kind_in" is optional; a condition
    that is absent from ``match`` places no constraint. The rule matches only
    when every condition that is present contains the corresponding value.
    """
    observed = (
        ("env_in", env),
        ("severity_in", severity),
        ("kind_in", kind),
    )
    for condition, value in observed:
        if condition in match and value not in match[condition]:
            return False
    return True
# ─── Incident Signature ────────────────────────────────────────────────────────
def compute_incident_signature(
    alert: Dict,
    policy: Optional[Dict] = None,
) -> str:
    """
    Compute an incident signature for deduplication.
    Components controlled by `policy.signature`.

    The signature is the first 32 hex characters of the SHA-256 of the
    "|"-joined components: service, env, then optionally the normalized kind
    (`use_kind`, default on), `labels.fingerprint` (`use_fingerprint`,
    default on) and `labels.node` (`use_node_label`, default off).

    Args:
        alert: Alert dict. Reads "service", "env", "kind" and "labels".
        policy: Policy dict; loaded with load_policy() when None.

    Returns:
        A 32-character lowercase hex string.
    """
    if policy is None:
        policy = load_policy()

    # A YAML key with an empty value loads as None; normalise with `or`.
    sig_cfg = policy.get("signature") or {}
    kind_map = policy.get("kind_map") or {}

    labels = alert.get("labels") or {}
    if not isinstance(labels, dict):
        labels = {}

    def _as_text(value: Any, default: str) -> str:
        # str.join() rejects non-strings. An explicit null takes the same
        # default as a missing key; other non-strings are stringified.
        # String values pass through untouched, so their signatures stay the same.
        if value is None:
            return default
        return value if isinstance(value, str) else str(value)

    kind = _normalize_kind(alert.get("kind", "custom"), kind_map)

    parts = [
        _as_text(alert.get("service"), "unknown"),
        _as_text(alert.get("env"), "prod"),
    ]
    if sig_cfg.get("use_kind", True):
        parts.append(_as_text(kind, "custom"))
    if sig_cfg.get("use_fingerprint", True):
        parts.append(_as_text(labels.get("fingerprint"), ""))
    if sig_cfg.get("use_node_label", False):
        parts.append(_as_text(labels.get("node"), ""))

    raw = "|".join(parts)
    return hashlib.sha256(raw.encode()).hexdigest()[:32]
def is_llm_allowed(action: str, policy: Optional[Dict] = None) -> bool:
    """
    Return True only if global llm_mode is not off AND the specific action is enabled.
    Used to guard any LLM call.

    Args:
        action: Key looked up in `defaults.llm_on` (the built-in defaults
            define "triage" and "postmortem").
        policy: Policy dict; loaded with load_policy() when None.

    Returns:
        False when `defaults.llm_mode` is missing, null, boolean False, or the
        string "off" (case-insensitive); otherwise the truthiness of
        `defaults.llm_on[action]` (False when absent).
    """
    if policy is None:
        policy = load_policy()

    # A YAML key with an empty value loads as None; normalise with `or`.
    defaults = policy.get("defaults") or {}
    llm_mode = defaults.get("llm_mode", "off")

    # YAML 1.1 loaders (PyYAML included) parse an unquoted `off` as boolean
    # False, so `llm_mode: off` in the policy file never equals the string
    # "off". Treat False/None as off too, so the guard fails closed.
    if llm_mode is None or llm_mode is False:
        return False
    if isinstance(llm_mode, str) and llm_mode.strip().lower() == "off":
        return False

    llm_on = defaults.get("llm_on") or {}
    return bool(llm_on.get(action, False))