""" incident_intelligence.py — Incident Intelligence Layer (deterministic, no LLM). Functions: correlate_incident(incident_id, policy, store) -> related[] detect_recurrence(window_days, policy, store) -> stats weekly_digest(policy, store) -> {json, markdown} Policy: config/incident_intelligence_policy.yml """ from __future__ import annotations import datetime import json import logging import os import re import textwrap import yaml from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from incident_intel_utils import ( extract_kind, incident_key_fields, incidents_within_minutes, mask_signature, safe_truncate, severity_rank, format_duration, parse_iso, ) logger = logging.getLogger(__name__) # ─── Policy ─────────────────────────────────────────────────────────────────── _POLICY_CACHE: Optional[Dict] = None _POLICY_SEARCH_PATHS = [ Path("config/incident_intelligence_policy.yml"), Path(__file__).resolve().parent.parent.parent / "config" / "incident_intelligence_policy.yml", ] def load_intel_policy() -> Dict: global _POLICY_CACHE if _POLICY_CACHE is not None: return _POLICY_CACHE for p in _POLICY_SEARCH_PATHS: if p.exists(): try: with open(p) as f: data = yaml.safe_load(f) or {} _POLICY_CACHE = data return data except Exception as e: logger.warning("Failed to load intel policy from %s: %s", p, e) logger.warning("incident_intelligence_policy.yml not found; using defaults") _POLICY_CACHE = _builtin_defaults() return _POLICY_CACHE def _builtin_defaults() -> Dict: return { "correlation": { "lookback_days": 30, "max_related": 10, "min_score": 20, "rules": [ {"name": "same_signature", "weight": 100, "match": {"signature": True}}, {"name": "same_service_and_kind", "weight": 60, "match": {"same_service": True, "same_kind": True}}, {"name": "same_service_time_cluster", "weight": 40, "match": {"same_service": True, "within_minutes": 180}}, {"name": "same_kind_cross_service", "weight": 30, "match": {"same_kind": True, "within_minutes": 120}}, ], }, "recurrence": { "windows_days": [7, 30], "thresholds": { "signature": {"warn": 3, "high": 6}, "kind": {"warn": 5, "high": 10}, }, "top_n": 15, "recommendations": { "signature_high": "Create permanent fix: add regression test + SLO guard", "signature_warn": "Review root cause history; consider monitoring threshold", "kind_high": "Systemic issue with kind={kind}: review architecture", "kind_warn": "Recurring kind={kind}: validate alert thresholds", }, }, "digest": { "weekly_day": "Mon", "include_closed": True, "include_open": True, "output_dir": "ops/reports/incidents", "markdown_max_chars": 8000, "top_incidents": 20, }, } # ─── Helpers ────────────────────────────────────────────────────────────────── def _now_iso() -> str: return datetime.datetime.utcnow().isoformat() def _lookback_cutoff(days: int) -> str: return (datetime.datetime.utcnow() - datetime.timedelta(days=days)).isoformat() def _incidents_in_window(store, days: int, limit: int = 1000) -> List[Dict]: """Load all incidents (open+closed) in last N days.""" cutoff = _lookback_cutoff(days) all_incs: List[Dict] = [] for status in ("open", "mitigating", "closed", "resolved"): batch = store.list_incidents({"status": status}, limit=limit) all_incs.extend(i for i in batch if i.get("started_at", "") >= cutoff) seen = set() unique = [] for i in all_incs: if i["id"] not in seen: seen.add(i["id"]) unique.append(i) return unique # ─── 1. Correlation ─────────────────────────────────────────────────────────── def correlate_incident( incident_id: str, policy: Optional[Dict] = None, store = None, append_note: bool = False, ) -> List[Dict]: """ Find related incidents for a given incident_id using scored matching. Returns list sorted by score desc (highest relevance first). """ if policy is None: policy = load_intel_policy() if store is None: from incident_store import get_incident_store store = get_incident_store() corr_cfg = policy.get("correlation", {}) lookback_days = int(corr_cfg.get("lookback_days", 30)) max_related = int(corr_cfg.get("max_related", 10)) min_score = int(corr_cfg.get("min_score", 20)) rules = corr_cfg.get("rules", []) target_raw = store.get_incident(incident_id) if not target_raw: return [] target = incident_key_fields(target_raw) candidates = _incidents_in_window(store, lookback_days, limit=500) scored: List[Dict] = [] for cand_raw in candidates: cid = cand_raw.get("id", "") if cid == incident_id: continue cand = incident_key_fields(cand_raw) score, reasons = _score_pair(target, cand, target_raw, cand_raw, rules) if score >= min_score: scored.append({ "incident_id": cid, "score": score, "reasons": reasons, "service": cand["service"], "kind": cand["kind"], "severity": cand["severity"], "status": cand["status"], "started_at": cand["started_at"], "signature": mask_signature(cand["signature"]), }) scored.sort(key=lambda x: (-x["score"], x["started_at"])) related = scored[:max_related] # Optionally append correlation note to incident timeline if append_note and related: note_parts = [f"`{r['incident_id']}` score={r['score']} ({', '.join(r['reasons'])})" for r in related[:5]] note = "Related incidents: " + "; ".join(note_parts) try: store.append_event(incident_id, "note", safe_truncate(note, 2000), meta={"auto_created": True, "related_count": len(related)}) except Exception as e: logger.warning("correlate_incident: append_note failed: %s", e) return related def _score_pair( target: Dict, cand: Dict, target_raw: Dict, cand_raw: Dict, rules: List[Dict], ) -> Tuple[int, List[str]]: """Compute correlation score and matching reasons for a candidate pair.""" score = 0 reasons: List[str] = [] t_sig = target.get("signature", "") c_sig = cand.get("signature", "") t_svc = target.get("service", "") c_svc = cand.get("service", "") t_kind = target.get("kind", "") c_kind = cand.get("kind", "") t_start = target.get("started_at", "") c_start = cand.get("started_at", "") for rule in rules: m = rule.get("match", {}) w = int(rule.get("weight", 0)) name = rule.get("name", "rule") # Signature-only rule: only fires when signatures are equal; skip otherwise. if "signature" in m: if t_sig and c_sig and t_sig == c_sig: score += w reasons.append(name) continue # never fall through to combined-conditions for this rule # Combined conditions (same_service / same_kind / within_minutes) within = m.get("within_minutes") matched = True if m.get("same_service"): if t_svc != c_svc: matched = False if m.get("same_kind"): if t_kind != c_kind or not t_kind or t_kind == "custom": matched = False if within is not None: if not incidents_within_minutes(target_raw, cand_raw, within): matched = False if matched: score += w reasons.append(name) return score, reasons # ─── 2. Recurrence Detection ────────────────────────────────────────────────── def detect_recurrence( window_days: int = 7, policy: Optional[Dict] = None, store = None, ) -> Dict: """ Analyze incident frequency for given window. Returns frequency tables and threshold classifications. """ if policy is None: policy = load_intel_policy() if store is None: from incident_store import get_incident_store store = get_incident_store() rec_cfg = policy.get("recurrence", {}) thresholds = rec_cfg.get("thresholds", {}) sig_thresh = thresholds.get("signature", {"warn": 3, "high": 6}) kind_thresh = thresholds.get("kind", {"warn": 5, "high": 10}) top_n = int(rec_cfg.get("top_n", 15)) incidents = _incidents_in_window(store, window_days) # Frequency tables sig_count: Dict[str, Dict] = {} # signature → {count, services, last_seen, severity_min} kind_count: Dict[str, Dict] = {} # kind → {count, services} svc_count: Dict[str, int] = defaultdict(int) sev_count: Dict[str, int] = defaultdict(int) open_count = 0 closed_count = 0 for inc in incidents: fields = incident_key_fields(inc) sig = fields["signature"] kind = fields["kind"] svc = fields["service"] sev = fields["severity"] status = fields["status"] started_at = fields["started_at"] svc_count[svc] += 1 sev_count[sev] += 1 if status in ("open", "mitigating"): open_count += 1 else: closed_count += 1 if sig: if sig not in sig_count: sig_count[sig] = {"count": 0, "services": set(), "last_seen": "", "severity_min": sev} sig_count[sig]["count"] += 1 sig_count[sig]["services"].add(svc) if started_at > sig_count[sig]["last_seen"]: sig_count[sig]["last_seen"] = started_at if severity_rank(sev) < severity_rank(sig_count[sig]["severity_min"]): sig_count[sig]["severity_min"] = sev if kind and kind != "custom": if kind not in kind_count: kind_count[kind] = {"count": 0, "services": set()} kind_count[kind]["count"] += 1 kind_count[kind]["services"].add(svc) # Serialize sets top_sigs = sorted( [{"signature": k, "count": v["count"], "services": sorted(v["services"]), "last_seen": v["last_seen"], "severity_min": v["severity_min"]} for k, v in sig_count.items()], key=lambda x: -x["count"], )[:top_n] top_kinds = sorted( [{"kind": k, "count": v["count"], "services": sorted(v["services"])} for k, v in kind_count.items()], key=lambda x: -x["count"], )[:top_n] top_services = sorted( [{"service": k, "count": v} for k, v in svc_count.items()], key=lambda x: -x["count"], )[:top_n] # Classify high recurrence high_sigs = [s for s in top_sigs if s["count"] >= sig_thresh.get("high", 6)] warn_sigs = [s for s in top_sigs if sig_thresh.get("warn", 3) <= s["count"] < sig_thresh.get("high", 6)] high_kinds = [k for k in top_kinds if k["count"] >= kind_thresh.get("high", 10)] warn_kinds = [k for k in top_kinds if kind_thresh.get("warn", 5) <= k["count"] < kind_thresh.get("high", 10)] return { "window_days": window_days, "total_incidents": len(incidents), "open_count": open_count, "closed_count": closed_count, "severity_distribution": dict(sev_count), "top_signatures": top_sigs, "top_kinds": top_kinds, "top_services": top_services, "high_recurrence": { "signatures": high_sigs, "kinds": high_kinds, }, "warn_recurrence": { "signatures": warn_sigs, "kinds": warn_kinds, }, } # ─── 3. Weekly Digest ───────────────────────────────────────────────────────── def weekly_digest( policy: Optional[Dict] = None, store = None, save_artifacts: bool = True, ) -> Dict: """ Generate weekly incident digest: markdown + JSON. Saves to output_dir if save_artifacts=True. Returns {markdown, json_data, artifact_paths}. """ if policy is None: policy = load_intel_policy() if store is None: from incident_store import get_incident_store store = get_incident_store() digest_cfg = policy.get("digest", {}) max_chars = int(digest_cfg.get("markdown_max_chars", 8000)) top_n_inc = int(digest_cfg.get("top_incidents", 20)) output_dir = digest_cfg.get("output_dir", "ops/reports/incidents") now = datetime.datetime.utcnow() week_str = now.strftime("%Y-W%W") ts_str = now.strftime("%Y-%m-%d %H:%M UTC") # ── Collect data ────────────────────────────────────────────────────────── rec_7d = detect_recurrence(window_days=7, policy=policy, store=store) rec_30d = detect_recurrence(window_days=30, policy=policy, store=store) # Open incidents open_incs = store.list_incidents({"status": "open"}, limit=100) mitigating = store.list_incidents({"status": "mitigating"}, limit=50) all_open = open_incs + mitigating all_open.sort(key=lambda i: severity_rank(i.get("severity", "P3"))) # Last 7d incidents (sorted by severity then started_at) recent = _incidents_in_window(store, 7, limit=top_n_inc * 2) recent.sort(key=lambda i: (severity_rank(i.get("severity", "P3")), i.get("started_at", ""))) recent = recent[:top_n_inc] # ── Root-cause buckets ──────────────────────────────────────────────────── all_30d = _incidents_in_window(store, 30, limit=1000) buckets = build_root_cause_buckets(all_30d, policy=policy, windows=[7, 30]) buck_cfg = policy.get("buckets", {}) sig_high_thresh = int(policy.get("recurrence", {}).get("thresholds", {}) .get("signature", {}).get("high", 6)) kind_high_thresh = int(policy.get("recurrence", {}).get("thresholds", {}) .get("kind", {}).get("high", 10)) high_buckets = [ b for b in buckets if b["counts"]["7d"] >= sig_high_thresh or any(k in {ki["kind"] for ki in rec_7d.get("high_recurrence", {}).get("kinds", [])} for k in b.get("kinds", [])) ] # ── Auto follow-ups ─────────────────────────────────────────────────────── autofollowup_result: Dict = {"created": [], "skipped": []} if policy.get("autofollowups", {}).get("enabled", True): try: autofollowup_result = create_autofollowups( buckets=buckets, rec_7d=rec_7d, policy=policy, store=store, week_str=week_str, ) except Exception as e: logger.warning("weekly_digest: autofollowups error (non-fatal): %s", e) # ── Build recommendations ───────────────────────────────────────────────── recs = _build_recommendations(rec_7d, rec_30d, policy) # ── Build JSON payload ──────────────────────────────────────────────────── json_data = { "generated_at": _now_iso(), "week": week_str, "open_incidents_count": len(all_open), "recent_7d_count": rec_7d["total_incidents"], "recent_30d_count": rec_30d["total_incidents"], "recurrence_7d": rec_7d, "recurrence_30d": rec_30d, "open_incidents": [_inc_summary(i) for i in all_open[:20]], "recent_incidents": [_inc_summary(i) for i in recent], "recommendations": recs, "buckets": { "top": buckets, "high": high_buckets, }, "autofollowups": autofollowup_result, } # ── Build Markdown ──────────────────────────────────────────────────────── md = _build_markdown( week_str=week_str, ts_str=ts_str, all_open=all_open, recent=recent, rec_7d=rec_7d, rec_30d=rec_30d, recs=recs, buckets=buckets, autofollowup_result=autofollowup_result, ) if len(md) > max_chars: md = md[:max_chars - 80] + f"\n\n… *(digest truncated at {max_chars} chars)*" # ── Save artifacts ──────────────────────────────────────────────────────── artifact_paths: List[str] = [] if save_artifacts: artifact_paths = _save_digest_artifacts(output_dir, week_str, json_data, md) return { "markdown": md, "json_data": json_data, "artifact_paths": artifact_paths, "week": week_str, } def _inc_summary(inc: Dict) -> Dict: return { "id": inc.get("id", ""), "service": inc.get("service", ""), "env": inc.get("env", "prod"), "severity": inc.get("severity", "P2"), "status": inc.get("status", ""), "kind": extract_kind(inc), "title": safe_truncate(inc.get("title", ""), 120), "started_at": inc.get("started_at", ""), "duration": format_duration( inc.get("started_at", ""), inc.get("ended_at") ), "signature": mask_signature( (inc.get("meta") or {}).get("incident_signature", "") ), } def _build_recommendations(rec_7d: Dict, rec_30d: Dict, policy: Dict) -> List[Dict]: recs_cfg = policy.get("recurrence", {}).get("recommendations", {}) recs: List[Dict] = [] for sig_item in rec_7d.get("high_recurrence", {}).get("signatures", []): sig_short = mask_signature(sig_item["signature"]) msg_tmpl = recs_cfg.get("signature_high", "Create permanent fix for signature {sig}") recs.append({ "level": "high", "category": "signature", "target": sig_short, "services": sig_item.get("services", []), "count_7d": sig_item.get("count", 0), "message": msg_tmpl.format(sig=sig_short, **sig_item), }) for sig_item in rec_7d.get("warn_recurrence", {}).get("signatures", []): sig_short = mask_signature(sig_item["signature"]) msg_tmpl = recs_cfg.get("signature_warn", "Review root cause for signature {sig}") recs.append({ "level": "warn", "category": "signature", "target": sig_short, "services": sig_item.get("services", []), "count_7d": sig_item.get("count", 0), "message": msg_tmpl.format(sig=sig_short, **sig_item), }) for kind_item in rec_7d.get("high_recurrence", {}).get("kinds", []): kind = kind_item.get("kind", "?") msg_tmpl = recs_cfg.get("kind_high", "Systemic issue with {kind}") recs.append({ "level": "high", "category": "kind", "target": kind, "services": kind_item.get("services", []), "count_7d": kind_item.get("count", 0), "message": msg_tmpl.format(kind=kind), }) for kind_item in rec_7d.get("warn_recurrence", {}).get("kinds", []): kind = kind_item.get("kind", "?") msg_tmpl = recs_cfg.get("kind_warn", "Recurring {kind}") recs.append({ "level": "warn", "category": "kind", "target": kind, "services": kind_item.get("services", []), "count_7d": kind_item.get("count", 0), "message": msg_tmpl.format(kind=kind), }) return recs def _build_markdown( week_str: str, ts_str: str, all_open: List[Dict], recent: List[Dict], rec_7d: Dict, rec_30d: Dict, recs: List[Dict], buckets: Optional[List[Dict]] = None, autofollowup_result: Optional[Dict] = None, ) -> str: lines = [ f"# Weekly Incident Digest — {week_str}", f"*Generated: {ts_str}*", "", "---", "", f"## Summary", f"| Metric | Value |", f"|--------|-------|", f"| Open incidents | {len(all_open)} |", f"| Incidents (7d) | {rec_7d['total_incidents']} |", f"| Incidents (30d) | {rec_30d['total_incidents']} |", "", ] # ── Open incidents ───────────────────────────────────────────────────────── if all_open: lines.append("## 🔴 Open Incidents") for inc in all_open[:10]: sev = inc.get("severity", "?") svc = inc.get("service", "?") title = safe_truncate(inc.get("title", ""), 80) dur = format_duration(inc.get("started_at", ""), inc.get("ended_at")) kind = extract_kind(inc) lines.append(f"- **[{sev}]** `{inc.get('id','?')}` {svc} — {title} *(kind: {kind}, {dur})*") if len(all_open) > 10: lines.append(f"- … and {len(all_open) - 10} more open incidents") lines.append("") # ── Recent 7d ───────────────────────────────────────────────────────────── if recent: lines.append("## 📋 Recent Incidents (7 days)") for inc in recent[:15]: sev = inc.get("severity", "?") status = inc.get("status", "?") svc = inc.get("service", "?") title = safe_truncate(inc.get("title", ""), 80) dur = format_duration(inc.get("started_at", ""), inc.get("ended_at")) kind = extract_kind(inc) lines.append(f"- **[{sev}/{status}]** {svc} — {title} *(kind: {kind}, {dur})*") lines.append("") # ── Recurrence: 7d ──────────────────────────────────────────────────────── lines.append("## 🔁 Recurrence (7 days)") if rec_7d["top_signatures"]: lines.append("### Top Signatures") for item in rec_7d["top_signatures"][:8]: sig_s = mask_signature(item["signature"]) svcs = ", ".join(item.get("services", [])[:3]) lines.append(f"- `{sig_s}` — {item['count']}x ({svcs})") if rec_7d["top_kinds"]: lines.append("### Top Kinds") for item in rec_7d["top_kinds"][:8]: svcs = ", ".join(item.get("services", [])[:3]) lines.append(f"- `{item['kind']}` — {item['count']}x ({svcs})") lines.append("") # ── Recurrence: 30d ─────────────────────────────────────────────────────── if rec_30d["total_incidents"] > rec_7d["total_incidents"]: lines.append("## 📈 Recurrence (30 days)") if rec_30d["top_signatures"][:5]: for item in rec_30d["top_signatures"][:5]: sig_s = mask_signature(item["signature"]) lines.append(f"- `{sig_s}` — {item['count']}x") lines.append("") # ── Root-Cause Buckets ──────────────────────────────────────────────────── if buckets: lines.append("## 🪣 Top Root-Cause Buckets (7d/30d)") for b in buckets[:8]: bkey = b["bucket_key"] c7 = b["counts"]["7d"] c30 = b["counts"]["30d"] c_open = b["counts"]["open"] svcs = ", ".join(b.get("services", [])[:3]) last = b.get("last_seen", "")[:10] lines.append(f"### `{bkey}`") lines.append(f"*7d: {c7} incidents | 30d: {c30} | open: {c_open} | last: {last} | svcs: {svcs}*") recs_b = b.get("recommendations", [])[:3] for rec in recs_b: lines.append(f" - {rec}") lines.append("") # ── Auto Follow-ups summary ──────────────────────────────────────────────── if autofollowup_result: created = autofollowup_result.get("created", []) skipped = autofollowup_result.get("skipped", []) if created: lines.append("## 🔗 Auto Follow-ups Created") for fu in created[:10]: lines.append( f"- `{fu['bucket_key']}` → incident `{fu['incident_id']}` " f"({fu['priority']}, due {fu['due_date']})" ) lines.append("") elif skipped: lines.append(f"*Auto follow-ups: {len(skipped)} skipped (no high recurrence or already exists)*\n") # ── Recommendations ─────────────────────────────────────────────────────── if recs: lines.append("## 💡 Recommendations") for r in recs[:10]: icon = "🔴" if r["level"] == "high" else "🟡" svcs = ", ".join(r.get("services", [])[:3]) lines.append(f"{icon} **[{r['category']}:{r['target']}]** {r['message']} *(count_7d={r['count_7d']}, svcs: {svcs})*") lines.append("") return "\n".join(lines) def _save_digest_artifacts( output_dir: str, week_str: str, json_data: Dict, md: str, ) -> List[str]: """Atomic write of digest artifacts. Returns list of written paths.""" paths: List[str] = [] try: out = Path(output_dir) / "weekly" out.mkdir(parents=True, exist_ok=True) json_path = out / f"{week_str}.json" md_path = out / f"{week_str}.md" # Atomic write via temp file import tempfile for content, dest in [(json.dumps(json_data, indent=2, default=str), json_path), (md, md_path)]: tmp_fd, tmp_path = tempfile.mkstemp(dir=out, suffix=".tmp") try: with os.fdopen(tmp_fd, "w") as f: f.write(content) os.replace(tmp_path, dest) paths.append(str(dest)) except Exception: try: os.unlink(tmp_path) except OSError: pass raise except Exception as e: logger.error("Failed to save digest artifacts: %s", e) return paths # ─── Root-Cause Buckets ─────────────────────────────────────────────────────── _KIND_RECS: Dict[str, List[str]] = { "error_rate": [ "Add regression test for API contract & error mapping", "Review recent deploy diffs and dependency changes", "Add/adjust SLO thresholds & alert routing", ], "slo_breach": [ "Add regression test for API contract & error mapping", "Review recent deploy diffs and dependency changes", "Add/adjust SLO thresholds & alert routing", ], "latency": [ "Check p95 vs saturation; add perf budget", "Investigate DB/queue contention", ], "oom": [ "Add memory profiling; set container limits; audit for leaks", ], "crashloop": [ "Add memory profiling; set container limits; audit for leaks", "Check liveness/readiness probe configuration", ], "disk": [ "Add retention/cleanup automation; verify volume provisioning", ], "security": [ "Run dependency scanner + rotate secrets; verify gateway allowlists", ], "queue": [ "Check consumer lag and partition count; add dead-letter queue", ], "network": [ "Audit DNS configuration; verify network policies and ACLs", ], } _DEFAULT_KIND_RECS = [ "Review incident timeline and add regression test", "Check deployment diffs for correlated changes", ] def bucket_recommendations(bucket: Dict) -> List[str]: """Deterministic per-bucket recommendations based on kind + open_count.""" kinds = bucket.get("kinds", set()) recs: List[str] = [] seen: set = set() for kind in kinds: for r in _KIND_RECS.get(kind, []): if r not in seen: recs.append(r) seen.add(r) if not recs: recs = list(_DEFAULT_KIND_RECS) # Actionable warning if there are open incidents if bucket.get("counts", {}).get("open", 0) > 0: recs.append("⚠ Do not deploy risky changes until open incidents are mitigated") return recs[:5] def build_root_cause_buckets( incidents: List[Dict], policy: Optional[Dict] = None, windows: Optional[List[int]] = None, ) -> List[Dict]: """ Cluster incidents into root-cause buckets by service|kind or signature_prefix. Returns top_n buckets sorted by (count_7d desc, count_30d desc, last_seen desc). Only buckets meeting min_count thresholds are returned. """ if policy is None: policy = load_intel_policy() if windows is None: windows = [7, 30] buck_cfg = policy.get("buckets", {}) mode = buck_cfg.get("mode", "service_kind") prefix_len = int(buck_cfg.get("signature_prefix_len", 12)) top_n = int(buck_cfg.get("top_n", 10)) min_count = buck_cfg.get("min_count", {"7": 3, "30": 6}) # normalize keys to int min_7 = int(min_count.get(7, min_count.get("7", 3))) min_30 = int(min_count.get(30, min_count.get("30", 6))) now = datetime.datetime.utcnow() cutoffs: Dict[int, str] = { w: (now - datetime.timedelta(days=w)).isoformat() for w in windows } # Build bucket map buckets: Dict[str, Dict] = {} for inc in incidents: fields = incident_key_fields(inc) sig = fields["signature"] kind = fields["kind"] svc = fields["service"] started = fields["started_at"] status = fields["status"] sev = fields["severity"] if mode == "signature_prefix" and sig: bkey = sig[:prefix_len] else: bkey = f"{svc}|{kind}" if bkey not in buckets: buckets[bkey] = { "bucket_key": bkey, "counts": {"open": 0}, "last_seen": "", "first_seen": started, "services": set(), "kinds": set(), "sig_counts": defaultdict(int), "sev_mix": defaultdict(int), "sample_incidents": [], } b = buckets[bkey] b["services"].add(svc) b["kinds"].add(kind) if sig: b["sig_counts"][sig] += 1 if started > b["last_seen"]: b["last_seen"] = started if started < b["first_seen"] or not b["first_seen"]: b["first_seen"] = started b["sev_mix"][sev] += 1 if status in ("open", "mitigating"): b["counts"]["open"] += 1 # Count by window for w, cutoff in cutoffs.items(): key_w = f"{w}d" if started >= cutoff: b["counts"][key_w] = b["counts"].get(key_w, 0) + 1 # Keep up to 5 sample incidents if len(b["sample_incidents"]) < 5: b["sample_incidents"].append({ "id": inc.get("id", ""), "started_at": started, "status": status, "title": safe_truncate(inc.get("title", ""), 80), }) # Serialize and filter result = [] for bkey, b in buckets.items(): count_7d = b["counts"].get("7d", 0) count_30d = b["counts"].get("30d", 0) if count_7d < min_7 and count_30d < min_30: continue top_sigs = sorted( [{"signature": mask_signature(s), "count": c} for s, c in b["sig_counts"].items()], key=lambda x: -x["count"], )[:5] recs_data = bucket_recommendations({ "kinds": b["kinds"], "counts": b["counts"], }) result.append({ "bucket_key": bkey, "counts": { "7d": count_7d, "30d": count_30d, "open": b["counts"]["open"], }, "last_seen": b["last_seen"], "first_seen": b["first_seen"], "services": sorted(b["services"]), "kinds": sorted(b["kinds"]), "top_signatures": top_sigs, "severity_mix": dict(b["sev_mix"]), "sample_incidents": sorted( b["sample_incidents"], key=lambda x: x["started_at"], reverse=True )[:5], "recommendations": recs_data, }) # Sort: count_7d desc, then count_30d desc, then last_seen desc result.sort(key=lambda x: (-x["counts"]["7d"], -x["counts"]["30d"], x["last_seen"]), reverse=False) result.sort(key=lambda x: (-x["counts"]["7d"], -x["counts"]["30d"])) return result[:top_n] # ─── Auto Follow-ups ───────────────────────────────────────────────────────── def _followup_dedupe_key(policy: Dict, week_str: str, bucket_key: str) -> str: prefix = policy.get("autofollowups", {}).get("dedupe_key_prefix", "intel_recur") return f"{prefix}:{week_str}:{bucket_key}" def create_autofollowups( buckets: List[Dict], rec_7d: Dict, policy: Optional[Dict] = None, store = None, week_str: Optional[str] = None, ) -> Dict: """ For each high-recurrence bucket, append a follow-up event to the most recent open incident in that bucket (deterministic dedupe by week+bucket_key). Returns {created: [...], skipped: [...]} """ if policy is None: policy = load_intel_policy() if store is None: from incident_store import get_incident_store store = get_incident_store() if week_str is None: week_str = datetime.datetime.utcnow().strftime("%Y-W%W") af_cfg = policy.get("autofollowups", {}) if not af_cfg.get("enabled", True): return {"created": [], "skipped": [{"reason": "disabled"}]} only_when_high = bool(af_cfg.get("only_when_high", True)) owner = af_cfg.get("owner", "oncall") priority = af_cfg.get("priority", "P1") due_days = int(af_cfg.get("due_days", 7)) # Determine threshold thresholds rec_cfg = policy.get("recurrence", {}) sig_high_thresh = int(rec_cfg.get("thresholds", {}).get("signature", {}).get("high", 6)) kind_high_thresh = int(rec_cfg.get("thresholds", {}).get("kind", {}).get("high", 10)) # Which signatures/kinds are "high" high_sigs = {s["signature"] for s in rec_7d.get("high_recurrence", {}).get("signatures", [])} high_kinds = {k["kind"] for k in rec_7d.get("high_recurrence", {}).get("kinds", [])} created: List[Dict] = [] skipped: List[Dict] = [] due_date = (datetime.datetime.utcnow() + datetime.timedelta(days=due_days)).isoformat()[:10] for bucket in buckets: bkey = bucket["bucket_key"] count_7d = bucket["counts"]["7d"] # Check if this bucket is "high" is_high = False # by signature match for ts in bucket.get("top_signatures", []): if ts.get("signature", "") in high_sigs: is_high = True break # by kind match if not is_high: for kind in bucket.get("kinds", []): if kind in high_kinds: is_high = True break # by count threshold if not is_high and count_7d >= sig_high_thresh: is_high = True if only_when_high and not is_high: skipped.append({"bucket_key": bkey, "reason": "not_high", "count_7d": count_7d}) continue dedupe_key = _followup_dedupe_key(policy, week_str, bkey) # Find anchor incident: most recent sample samples = sorted( bucket.get("sample_incidents", []), key=lambda x: x.get("started_at", ""), reverse=True, ) if not samples: skipped.append({"bucket_key": bkey, "reason": "no_incidents"}) continue anchor_id = samples[0]["id"] if not anchor_id: skipped.append({"bucket_key": bkey, "reason": "no_anchor_id"}) continue # Dedupe check: does anchor already have a followup with this key? try: existing_events = store.get_events(anchor_id, limit=100) for ev in existing_events: ev_meta = ev.get("meta") or {} if ev_meta.get("dedupe_key") == dedupe_key: skipped.append({ "bucket_key": bkey, "reason": "already_exists", "incident_id": anchor_id, "dedupe_key": dedupe_key, }) break else: # Create follow-up event msg = ( f"[intel] Recurrence high: {bkey} " f"(7d={count_7d}, 30d={bucket['counts']['30d']}, " f"kinds={','.join(sorted(bucket.get('kinds', []))[:3])})" ) store.append_event( anchor_id, "followup", safe_truncate(msg, 2000), meta={ "title": f"[intel] Recurrence high: {bkey}", "owner": owner, "priority": priority, "due_date": due_date, "dedupe_key": dedupe_key, "auto_created": True, "bucket_key": bkey, "count_7d": count_7d, }, ) created.append({ "bucket_key": bkey, "incident_id": anchor_id, "dedupe_key": dedupe_key, "priority": priority, "due_date": due_date, }) except Exception as e: logger.warning("autofollowup failed for bucket %s: %s", bkey, e) skipped.append({"bucket_key": bkey, "reason": f"error: {e}"}) return {"created": created, "skipped": skipped} # ─── Recurrence-Watch helper (for release gate) ─────────────────────────────── def recurrence_for_service( service: str, window_days: int = 7, policy: Optional[Dict] = None, store = None, ) -> Dict: """ Focused recurrence analysis for a single service. Returns the same shape as detect_recurrence but filtered to service. """ if policy is None: policy = load_intel_policy() if store is None: from incident_store import get_incident_store store = get_incident_store() rec_cfg = policy.get("recurrence", {}) thresholds = rec_cfg.get("thresholds", {}) sig_thresh = thresholds.get("signature", {"warn": 3, "high": 6}) kind_thresh = thresholds.get("kind", {"warn": 5, "high": 10}) top_n = int(rec_cfg.get("top_n", 15)) incidents = _incidents_in_window(store, window_days) if service: incidents = [i for i in incidents if i.get("service", "") == service] # Frequency tables for this service only sig_count: Dict[str, Dict] = {} kind_count: Dict[str, Dict] = {} from collections import defaultdict as _defdict sev_count: Dict[str, int] = _defdict(int) for inc in incidents: fields = incident_key_fields(inc) sig = fields["signature"] kind = fields["kind"] sev = fields["severity"] started_at = fields["started_at"] sev_count[sev] += 1 if sig: if sig not in sig_count: sig_count[sig] = {"count": 0, "services": {service}, "last_seen": "", "severity_min": sev} sig_count[sig]["count"] += 1 if started_at > sig_count[sig]["last_seen"]: sig_count[sig]["last_seen"] = started_at if kind and kind != "custom": if kind not in kind_count: kind_count[kind] = {"count": 0, "services": {service}} kind_count[kind]["count"] += 1 top_sigs = sorted( [{"signature": k, "count": v["count"], "services": sorted(v["services"]), "last_seen": v["last_seen"], "severity_min": v["severity_min"]} for k, v in sig_count.items()], key=lambda x: -x["count"], )[:top_n] top_kinds = sorted( [{"kind": k, "count": v["count"], "services": sorted(v["services"])} for k, v in kind_count.items()], key=lambda x: -x["count"], )[:top_n] high_sigs = [s for s in top_sigs if s["count"] >= sig_thresh.get("high", 6)] warn_sigs = [s for s in top_sigs if sig_thresh.get("warn", 3) <= s["count"] < sig_thresh.get("high", 6)] high_kinds = [k for k in top_kinds if k["count"] >= kind_thresh.get("high", 10)] warn_kinds = [k for k in top_kinds if kind_thresh.get("warn", 5) <= k["count"] < kind_thresh.get("high", 10)] # Determine max severity seen in high-recurrence bucket max_sev = "P3" for inc in incidents: sev = inc.get("severity", "P3") if severity_rank(sev) < severity_rank(max_sev): max_sev = sev return { "service": service, "window_days": window_days, "total_incidents": len(incidents), "severity_distribution": dict(sev_count), "max_severity_seen": max_sev, "top_signatures": top_sigs, "top_kinds": top_kinds, "high_recurrence": {"signatures": high_sigs, "kinds": high_kinds}, "warn_recurrence": {"signatures": warn_sigs, "kinds": warn_kinds}, }