feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules

Made-with: Cursor
This commit is contained in:
203
services/sofiia-supervisor/app/alert_routing.py
Normal file
203
services/sofiia-supervisor/app/alert_routing.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""
|
||||
alert_routing.py — Alert routing policy loader and matcher.
|
||||
|
||||
Loads config/alert_routing_policy.yml and provides:
|
||||
- match_alert(alert) → matched rule actions dict
|
||||
- default_actions() → fallback actions when no rule matches
|
||||
- Policy dataclass for easy access to defaults/limits
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import yaml
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _find_policy_path() -> Path:
    """Locate ``config/alert_routing_policy.yml`` in an ancestor directory.

    Every parent directory of this file's resolved path is probed, nearest
    first, and the first existing candidate is returned.

    Returns:
        The discovered policy path, or ``/app/config/alert_routing_policy.yml``
        when no ancestor contains the file. That fallback may not exist on
        disk; ``load_policy()`` then uses the built-in defaults.
    """
    ancestors = Path(__file__).resolve().parents
    candidates = (
        folder / "config" / "alert_routing_policy.yml" for folder in ancestors
    )
    existing = (path for path in candidates if path.exists())
    # Lazy generators: probing stops at the first hit, nearest ancestor first.
    return next(existing, Path("/app/config/alert_routing_policy.yml"))
|
||||
|
||||
# Resolved once at import time; load_policy() reads this path when the caller
# does not pass an explicit one.
_POLICY_PATH = _find_policy_path()
|
||||
|
||||
|
||||
def load_policy(path: Optional[Path] = None) -> Dict:
    """Load the alert routing policy from YAML and return it as a dict.

    Nothing is cached here: every call re-reads the file (caller may cache).

    Args:
        path: Policy file to read. When omitted (or falsy), the module-level
            ``_POLICY_PATH`` is used.

    Returns:
        The parsed policy mapping. An empty document yields ``{}``. When the
        file is missing, cannot be read or parsed, or its top level is not a
        mapping, the built-in defaults are returned instead. This function
        does not raise for load failures; they are logged.
    """
    p = path or _POLICY_PATH
    try:
        # Explicit encoding: the platform default is not guaranteed to be
        # UTF-8, and policy files may contain non-ASCII text.
        with open(p, encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        logger.warning("alert_routing_policy.yml not found at %s — using built-in defaults", p)
        return _builtin_defaults()
    except Exception as e:
        # Deliberate best-effort boundary: any read/parse failure degrades to
        # the built-in policy rather than breaking the caller.
        logger.error("Failed to load alert routing policy: %s", e)
        return _builtin_defaults()

    if not data:
        # Empty file / empty document.
        return {}
    if not isinstance(data, dict):
        # Callers use .get() on the result; a list or scalar top level would
        # fail later with an unrelated-looking AttributeError.
        logger.error(
            "Alert routing policy at %s is not a mapping (got %s) — using built-in defaults",
            p,
            type(data).__name__,
        )
        return _builtin_defaults()
    return data
|
||||
|
||||
|
||||
def _builtin_defaults() -> Dict:
    """Build the hard-coded policy used when the YAML policy cannot be loaded.

    Returns:
        A freshly constructed dict with two keys: ``"defaults"`` (loop limits
        and LLM switches) and ``"routing"`` (an ordered list of two rules).
    """
    fallback_defaults = {
        "poll_interval_seconds": 300,
        "max_alerts_per_run": 20,
        "only_unacked": True,
        "max_incidents_per_run": 5,
        "max_triages_per_run": 5,
        "dedupe_window_minutes_default": 120,
        "ack_note_prefix": "alert_triage_loop",
        "llm_mode": "off",
        "llm_on": {"triage": False, "postmortem": False},
    }

    # Rule 1: prod P0/P1 alerts open an incident and are triaged.
    prod_high_severity_rule = {
        "match": {"env_in": ["prod"], "severity_in": ["P0", "P1"]},
        "actions": {
            "auto_incident": True,
            "auto_triage": True,
            "triage_mode": "deterministic",
            "incident_severity_cap": "P1",
            "dedupe_window_minutes": 120,
            "attach_alert_artifact": True,
            "ack": True,
        },
    }

    # Rule 2: P2/P3/INFO alerts in any env are digest-only.
    low_severity_rule = {
        "match": {"severity_in": ["P2", "P3", "INFO"]},
        "actions": {"auto_incident": False, "digest_only": True, "ack": True},
    }

    return {
        "defaults": fallback_defaults,
        "routing": [prod_high_severity_rule, low_severity_rule],
    }
|
||||
|
||||
|
||||
def _normalize_kind(kind: str, kind_map: Dict[str, List[str]]) -> str:
    """Resolve a kind alias to its canonical name.

    Args:
        kind: Alert kind to resolve.
        kind_map: Mapping of canonical kind name -> aliases. May be empty or
            ``None``. An entry's aliases may be a list, a single string, or
            ``None`` (an empty value in YAML).

    Returns:
        The first canonical name, in mapping order, that equals ``kind`` or
        lists it among its aliases; ``kind`` unchanged when nothing matches.
    """
    if not kind_map:
        return kind
    for canonical, aliases in kind_map.items():
        if kind == canonical:
            return canonical
        if aliases is None:
            # `canonical:` with no value parses as None; nothing to match.
            continue
        if isinstance(aliases, str):
            # A scalar alias must match exactly. A bare `in` on a string
            # would be a substring test ("time" in "timeouts").
            aliases = (aliases,)
        if kind in aliases:
            return canonical
    return kind
|
||||
|
||||
|
||||
def match_alert(alert: Dict, policy: Optional[Dict] = None) -> Dict:
    """Return the actions of the first routing rule that matches ``alert``.

    The alert's ``kind`` (default ``"custom"``) is normalized through the
    policy's ``kind_map``. ``env`` defaults to ``"prod"`` and ``severity`` to
    ``"P2"`` when absent. Rules are evaluated in policy order and the first
    match wins.

    Args:
        alert: Alert dict; the keys ``kind``, ``env`` and ``severity`` are read.
        policy: Policy dict. Loaded via ``load_policy()`` when ``None``.

    Returns:
        A new actions dict (the rule's own dict is not mutated). For a matched
        rule, missing fields are filled in: ``auto_incident``, ``auto_triage``,
        ``digest_only``, ``ack``, ``triage_mode``, ``incident_severity_cap``
        and ``dedupe_window_minutes``. When no rule matches, a digest-only
        fallback is returned. Both shapes carry ``_normalized_kind``.
    """
    if policy is None:
        policy = load_policy()

    # `or` fallbacks: a key present with an empty YAML value parses as None,
    # which .get(key, default) would hand back verbatim.
    kind_map = policy.get("kind_map") or {}
    routing = policy.get("routing") or []
    defaults_cfg = policy.get("defaults") or {}
    severity_caps = policy.get("severity_caps") or {}
    default_dedupe = defaults_cfg.get("dedupe_window_minutes_default", 120)

    normalized_kind = _normalize_kind(alert.get("kind", "custom"), kind_map)
    env = alert.get("env", "prod")
    severity = alert.get("severity", "P2")

    for rule in routing:
        conditions = rule.get("match") or {}
        if not _rule_matches(conditions, env=env, severity=severity, kind=normalized_kind):
            continue

        # Copy so the policy's rule is never mutated by the defaults below.
        actions = dict(rule.get("actions") or {})
        actions.setdefault("auto_incident", False)
        actions.setdefault("auto_triage", False)
        actions.setdefault("digest_only", False)
        actions.setdefault("ack", True)
        actions.setdefault("triage_mode", "deterministic")
        actions.setdefault(
            "incident_severity_cap",
            severity_caps.get(normalized_kind, "P1"),
        )
        actions.setdefault("dedupe_window_minutes", default_dedupe)
        actions["_normalized_kind"] = normalized_kind
        return actions

    # No match → safe fallback. Includes auto_triage so that both return
    # shapes expose the same boolean action keys.
    return {
        "auto_incident": False,
        "auto_triage": False,
        "digest_only": True,
        "ack": True,
        "triage_mode": "deterministic",
        "incident_severity_cap": "P2",
        "dedupe_window_minutes": default_dedupe,
        "_normalized_kind": normalized_kind,
    }
|
||||
|
||||
|
||||
def _rule_matches(match: Dict, env: str, severity: str, kind: str) -> bool:
    """Check an alert's env/severity/kind against a rule's ``match`` section.

    Only the conditions present in ``match`` are evaluated (``env_in``,
    ``severity_in``, ``kind_in``); each requires the corresponding value to be
    a member of the listed collection. An empty ``match`` matches everything.

    Returns:
        True when every present condition is satisfied, False otherwise.
    """
    constraints = (
        ("env_in", env),
        ("severity_in", severity),
        ("kind_in", kind),
    )
    # Absent condition → passes; present condition → membership required.
    return all(
        key not in match or value in match[key]
        for key, value in constraints
    )
|
||||
|
||||
|
||||
# ─── Incident Signature ────────────────────────────────────────────────────────
|
||||
|
||||
def compute_incident_signature(
    alert: Dict,
    policy: Optional[Dict] = None,
) -> str:
    """Compute an incident signature for deduplication.

    The signature is the first 32 hex characters of the SHA-256 of
    ``"|"``-joined components. ``service`` and ``env`` are always included.
    The policy's ``signature`` section controls the rest: ``use_kind``
    (default True) adds the normalized kind, ``use_fingerprint`` (default
    True) adds ``labels["fingerprint"]``, and ``use_node_label`` (default
    False) adds ``labels["node"]``.

    Missing or ``None`` values fall back to ``"unknown"`` (service),
    ``"prod"`` (env), ``"custom"`` (kind) and ``""`` (labels). Non-string
    values are converted with ``str()`` so joining cannot raise ``TypeError``.
    String inputs produce the same signature as before.

    Args:
        alert: Alert dict; the keys ``service``, ``env``, ``kind`` and
            ``labels`` are read.
        policy: Policy dict. Loaded via ``load_policy()`` when ``None``.

    Returns:
        A 32-character lowercase hex string.
    """
    if policy is None:
        policy = load_policy()

    # `or {}`: a section present with an empty YAML value parses as None.
    sig_cfg = policy.get("signature") or {}
    kind_map = policy.get("kind_map") or {}
    labels = alert.get("labels") or {}

    def _part(value: Any, default: str) -> str:
        # None counts as "missing"; anything else is stringified for join().
        return default if value is None else str(value)

    raw_kind = alert.get("kind", "custom")
    if raw_kind is None:
        raw_kind = "custom"
    kind = _normalize_kind(raw_kind, kind_map)

    parts = [
        _part(alert.get("service"), "unknown"),
        _part(alert.get("env"), "prod"),
    ]

    if sig_cfg.get("use_kind", True):
        parts.append(_part(kind, "custom"))

    if sig_cfg.get("use_fingerprint", True):
        parts.append(_part(labels.get("fingerprint"), ""))

    if sig_cfg.get("use_node_label", False):
        parts.append(_part(labels.get("node"), ""))

    raw = "|".join(parts)
    return hashlib.sha256(raw.encode()).hexdigest()[:32]
|
||||
|
||||
|
||||
def is_llm_allowed(action: str, policy: Optional[Dict] = None) -> bool:
    """Return True only if LLM use is globally on AND enabled for ``action``.

    Used to guard any LLM call, so it fails closed. The global switch
    ``defaults.llm_mode`` counts as off when it is missing, falsy, or the
    string ``"off"`` (case-insensitive, surrounding whitespace ignored).

    Args:
        action: Key looked up in ``defaults.llm_on``.
        policy: Policy dict. Loaded via ``load_policy()`` when ``None``.

    Returns:
        ``bool(defaults.llm_on[action])`` when the global mode is on,
        otherwise False.
    """
    if policy is None:
        policy = load_policy()

    defaults = policy.get("defaults") or {}
    llm_mode = defaults.get("llm_mode", "off")

    # PyYAML resolves an unquoted `off` to the boolean False (YAML 1.1), so a
    # policy written as `llm_mode: off` arrives here as False, not "off".
    # A plain == "off" check would let that through and fail open.
    if not llm_mode or str(llm_mode).strip().lower() == "off":
        return False

    llm_on = defaults.get("llm_on") or {}
    return bool(llm_on.get(action, False))
|
||||
Reference in New Issue
Block a user