""" alert_routing.py — Alert routing policy loader and matcher. Loads config/alert_routing_policy.yml and provides: - match_alert(alert) → matched rule actions dict - default_actions() → fallback actions when no rule matches - Policy dataclass for easy access to defaults/limits """ from __future__ import annotations import hashlib import logging import re from pathlib import Path from typing import Any, Dict, List, Optional import yaml logger = logging.getLogger(__name__) def _find_policy_path() -> Path: """Walk up from this file to find config/alert_routing_policy.yml.""" here = Path(__file__).resolve() for parent in here.parents: candidate = parent / "config" / "alert_routing_policy.yml" if candidate.exists(): return candidate # Safe fallback path for container/local runs; file may be absent and # load_policy() will fall back to built-in defaults. return Path("/app/config/alert_routing_policy.yml") _POLICY_PATH = _find_policy_path() def load_policy(path: Optional[Path] = None) -> Dict: """Load and return raw YAML policy dict. Caches nothing (caller may cache).""" p = path or _POLICY_PATH try: with open(p) as f: return yaml.safe_load(f) or {} except FileNotFoundError: logger.warning("alert_routing_policy.yml not found at %s — using built-in defaults", p) return _builtin_defaults() except Exception as e: logger.error("Failed to load alert routing policy: %s", e) return _builtin_defaults() def _builtin_defaults() -> Dict: return { "defaults": { "poll_interval_seconds": 300, "max_alerts_per_run": 20, "only_unacked": True, "max_incidents_per_run": 5, "max_triages_per_run": 5, "dedupe_window_minutes_default": 120, "ack_note_prefix": "alert_triage_loop", "llm_mode": "off", "llm_on": {"triage": False, "postmortem": False}, }, "routing": [ { "match": {"env_in": ["prod"], "severity_in": ["P0", "P1"]}, "actions": { "auto_incident": True, "auto_triage": True, "triage_mode": "deterministic", "incident_severity_cap": "P1", "dedupe_window_minutes": 120, "attach_alert_artifact": True, "ack": True, }, }, { "match": {"severity_in": ["P2", "P3", "INFO"]}, "actions": {"auto_incident": False, "digest_only": True, "ack": True}, }, ], } def _normalize_kind(kind: str, kind_map: Dict[str, List[str]]) -> str: """Resolve kind aliases to canonical name.""" if not kind_map: return kind for canonical, aliases in kind_map.items(): if kind in aliases or kind == canonical: return canonical return kind def match_alert(alert: Dict, policy: Optional[Dict] = None) -> Dict: """ Find the first matching routing rule for an alert and return its actions. Falls back to digest_only if no rule matches. """ if policy is None: policy = load_policy() kind_map = policy.get("kind_map", {}) routing = policy.get("routing", []) defaults_cfg = policy.get("defaults", {}) normalized_kind = _normalize_kind(alert.get("kind", "custom"), kind_map) env = alert.get("env", "prod") severity = alert.get("severity", "P2") for rule in routing: m = rule.get("match", {}) if not _rule_matches(m, env=env, severity=severity, kind=normalized_kind): continue actions = dict(rule.get("actions", {})) # Inject defaults for missing action fields actions.setdefault("auto_incident", False) actions.setdefault("auto_triage", False) actions.setdefault("digest_only", False) actions.setdefault("ack", True) actions.setdefault("triage_mode", "deterministic") actions.setdefault( "incident_severity_cap", policy.get("severity_caps", {}).get(normalized_kind, "P1"), ) actions.setdefault( "dedupe_window_minutes", defaults_cfg.get("dedupe_window_minutes_default", 120), ) actions["_normalized_kind"] = normalized_kind return actions # No match → safe fallback return { "auto_incident": False, "digest_only": True, "ack": True, "triage_mode": "deterministic", "incident_severity_cap": "P2", "dedupe_window_minutes": defaults_cfg.get("dedupe_window_minutes_default", 120), "_normalized_kind": normalized_kind, } def _rule_matches(match: Dict, env: str, severity: str, kind: str) -> bool: """Return True if all match conditions are satisfied.""" if "env_in" in match and env not in match["env_in"]: return False if "severity_in" in match and severity not in match["severity_in"]: return False if "kind_in" in match and kind not in match["kind_in"]: return False return True # ─── Incident Signature ──────────────────────────────────────────────────────── def compute_incident_signature( alert: Dict, policy: Optional[Dict] = None, ) -> str: """ Compute an incident signature for deduplication. Components controlled by `policy.signature`. """ if policy is None: policy = load_policy() sig_cfg = policy.get("signature", {}) kind_map = policy.get("kind_map", {}) service = alert.get("service", "unknown") env = alert.get("env", "prod") kind = _normalize_kind(alert.get("kind", "custom"), kind_map) parts = [service, env] if sig_cfg.get("use_kind", True): parts.append(kind) if sig_cfg.get("use_fingerprint", True): fp = (alert.get("labels") or {}).get("fingerprint", "") parts.append(fp) if sig_cfg.get("use_node_label", False): node = (alert.get("labels") or {}).get("node", "") parts.append(node) raw = "|".join(parts) return hashlib.sha256(raw.encode()).hexdigest()[:32] def is_llm_allowed(action: str, policy: Optional[Dict] = None) -> bool: """ Return True only if global llm_mode != off AND the specific action is enabled. Used to guard any LLM call. """ if policy is None: policy = load_policy() defaults = policy.get("defaults", {}) llm_mode = defaults.get("llm_mode", "off") if llm_mode == "off": return False llm_on = defaults.get("llm_on", {}) return bool(llm_on.get(action, False))