""" incident_store.py — Incident Log storage abstraction. Backends: - MemoryIncidentStore (testing) - JsonlIncidentStore (MVP/fallback — ops/incidents/ directory) - PostgresIncidentStore(production — psycopg2 sync) - AutoIncidentStore (Postgres primary → JSONL fallback) All writes are non-fatal: exceptions are logged as warnings. """ from __future__ import annotations import datetime import hashlib import json import logging import os import re import threading import time import uuid from abc import ABC, abstractmethod from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) _SECRET_PAT = re.compile(r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+') def _redact_text(text: str, max_len: int = 4000) -> str: """Mask secrets, truncate.""" text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text) return text[:max_len] if len(text) > max_len else text def _now_iso() -> str: return datetime.datetime.now(datetime.timezone.utc).isoformat() def _generate_incident_id() -> str: now = datetime.datetime.now(datetime.timezone.utc) rand = uuid.uuid4().hex[:6] return f"inc_{now.strftime('%Y%m%d_%H%M')}_{rand}" # ─── Abstract interface ────────────────────────────────────────────────────── class IncidentStore(ABC): @abstractmethod def create_incident(self, data: Dict) -> Dict: ... @abstractmethod def get_incident(self, incident_id: str) -> Optional[Dict]: ... @abstractmethod def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: ... @abstractmethod def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]: ... @abstractmethod def append_event(self, incident_id: str, event_type: str, message: str, meta: Optional[Dict] = None) -> Optional[Dict]: ... @abstractmethod def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]: ... @abstractmethod def add_artifact(self, incident_id: str, kind: str, fmt: str, path: str, sha256: str, size_bytes: int) -> Optional[Dict]: ... @abstractmethod def get_artifacts(self, incident_id: str) -> List[Dict]: ... # ─── In-memory (testing) ───────────────────────────────────────────────────── class MemoryIncidentStore(IncidentStore): def __init__(self): self._incidents: Dict[str, Dict] = {} self._events: Dict[str, List[Dict]] = {} self._artifacts: Dict[str, List[Dict]] = {} self._lock = threading.Lock() def create_incident(self, data: Dict) -> Dict: inc_id = data.get("id") or _generate_incident_id() now = _now_iso() inc = { "id": inc_id, "workspace_id": data.get("workspace_id", "default"), "service": data["service"], "env": data.get("env", "prod"), "severity": data.get("severity", "P2"), "status": "open", "title": _redact_text(data.get("title", ""), 500), "summary": _redact_text(data.get("summary", "") or "", 2000), "started_at": data.get("started_at", now), "ended_at": None, "created_by": data.get("created_by", "unknown"), "created_at": now, "updated_at": now, "meta": data.get("meta") or {}, } with self._lock: self._incidents[inc_id] = inc self._events[inc_id] = [] self._artifacts[inc_id] = [] return inc def get_incident(self, incident_id: str) -> Optional[Dict]: inc = self._incidents.get(incident_id) if not inc: return None events = self._events.get(incident_id, [])[-20:] artifacts = self._artifacts.get(incident_id, []) return {**inc, "events": events, "artifacts": artifacts} def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: filters = filters or {} result = list(self._incidents.values()) if filters.get("status"): result = [i for i in result if i["status"] == filters["status"]] if filters.get("service"): result = [i for i in result if i["service"] == filters["service"]] if filters.get("env"): result = [i for i in result if i["env"] == filters["env"]] if filters.get("severity"): result = [i for i in result if i["severity"] == filters["severity"]] result.sort(key=lambda x: x.get("created_at", ""), reverse=True) return result[:limit] def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]: inc = self._incidents.get(incident_id) if not inc: return None with self._lock: inc["status"] = "closed" inc["ended_at"] = ended_at inc["summary"] = _redact_text(resolution, 2000) if resolution else inc.get("summary") inc["updated_at"] = _now_iso() self._events.setdefault(incident_id, []).append({ "ts": _now_iso(), "type": "status_change", "message": f"Incident closed: {_redact_text(resolution, 500)}", "meta": None, }) return inc def append_event(self, incident_id: str, event_type: str, message: str, meta: Optional[Dict] = None) -> Optional[Dict]: if incident_id not in self._incidents: return None ev = { "ts": _now_iso(), "type": event_type, "message": _redact_text(message, 4000), "meta": meta, } with self._lock: self._events.setdefault(incident_id, []).append(ev) self._incidents[incident_id]["updated_at"] = _now_iso() return ev def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]: return self._events.get(incident_id, [])[:limit] def add_artifact(self, incident_id: str, kind: str, fmt: str, path: str, sha256: str, size_bytes: int) -> Optional[Dict]: if incident_id not in self._incidents: return None art = { "ts": _now_iso(), "kind": kind, "format": fmt, "path": path, "sha256": sha256, "size_bytes": size_bytes, } with self._lock: self._artifacts.setdefault(incident_id, []).append(art) return art def get_artifacts(self, incident_id: str) -> List[Dict]: return self._artifacts.get(incident_id, []) # ─── JSONL (MVP file backend) ──────────────────────────────────────────────── class JsonlIncidentStore(IncidentStore): """ Stores incidents/events/artifacts as separate JSONL files in a directory. Layout: /incidents.jsonl /events.jsonl /artifacts.jsonl """ def __init__(self, base_dir: str): self._dir = Path(base_dir) self._dir.mkdir(parents=True, exist_ok=True) self._lock = threading.Lock() def _incidents_path(self) -> Path: return self._dir / "incidents.jsonl" def _events_path(self) -> Path: return self._dir / "events.jsonl" def _artifacts_path(self) -> Path: return self._dir / "artifacts.jsonl" def _read_jsonl(self, path: Path) -> List[Dict]: if not path.exists(): return [] items = [] try: with open(path, "r", encoding="utf-8") as fh: for line in fh: line = line.strip() if line: try: items.append(json.loads(line)) except json.JSONDecodeError: pass except Exception: pass return items def _append_jsonl(self, path: Path, record: Dict) -> None: with self._lock: with open(path, "a", encoding="utf-8") as fh: fh.write(json.dumps(record, ensure_ascii=False, default=str) + "\n") def _rewrite_jsonl(self, path: Path, items: List[Dict]) -> None: with self._lock: with open(path, "w", encoding="utf-8") as fh: for item in items: fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n") def create_incident(self, data: Dict) -> Dict: inc_id = data.get("id") or _generate_incident_id() now = _now_iso() inc = { "id": inc_id, "workspace_id": data.get("workspace_id", "default"), "service": data["service"], "env": data.get("env", "prod"), "severity": data.get("severity", "P2"), "status": "open", "title": _redact_text(data.get("title", ""), 500), "summary": _redact_text(data.get("summary", "") or "", 2000), "started_at": data.get("started_at", now), "ended_at": None, "created_by": data.get("created_by", "unknown"), "created_at": now, "updated_at": now, "meta": data.get("meta") or {}, } self._append_jsonl(self._incidents_path(), inc) return inc def get_incident(self, incident_id: str) -> Optional[Dict]: incidents = self._read_jsonl(self._incidents_path()) inc = next((i for i in incidents if i.get("id") == incident_id), None) if not inc: return None events = [e for e in self._read_jsonl(self._events_path()) if e.get("incident_id") == incident_id][-20:] artifacts = [a for a in self._read_jsonl(self._artifacts_path()) if a.get("incident_id") == incident_id] return {**inc, "events": events, "artifacts": artifacts} def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: filters = filters or {} incidents = self._read_jsonl(self._incidents_path()) if filters.get("status"): incidents = [i for i in incidents if i.get("status") == filters["status"]] if filters.get("service"): incidents = [i for i in incidents if i.get("service") == filters["service"]] if filters.get("env"): incidents = [i for i in incidents if i.get("env") == filters["env"]] if filters.get("severity"): incidents = [i for i in incidents if i.get("severity") == filters["severity"]] incidents.sort(key=lambda x: x.get("created_at", ""), reverse=True) return incidents[:limit] def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]: incidents = self._read_jsonl(self._incidents_path()) found = None for inc in incidents: if inc.get("id") == incident_id: inc["status"] = "closed" inc["ended_at"] = ended_at if resolution: inc["summary"] = _redact_text(resolution, 2000) inc["updated_at"] = _now_iso() found = inc break if not found: return None self._rewrite_jsonl(self._incidents_path(), incidents) self.append_event(incident_id, "status_change", f"Incident closed: {_redact_text(resolution or '', 500)}") return found def append_event(self, incident_id: str, event_type: str, message: str, meta: Optional[Dict] = None) -> Optional[Dict]: incidents = self._read_jsonl(self._incidents_path()) if not any(i.get("id") == incident_id for i in incidents): return None ev = { "incident_id": incident_id, "ts": _now_iso(), "type": event_type, "message": _redact_text(message, 4000), "meta": meta, } self._append_jsonl(self._events_path(), ev) return ev def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]: events = self._read_jsonl(self._events_path()) return [e for e in events if e.get("incident_id") == incident_id][:limit] def add_artifact(self, incident_id: str, kind: str, fmt: str, path: str, sha256: str, size_bytes: int) -> Optional[Dict]: incidents = self._read_jsonl(self._incidents_path()) if not any(i.get("id") == incident_id for i in incidents): return None art = { "incident_id": incident_id, "ts": _now_iso(), "kind": kind, "format": fmt, "path": path, "sha256": sha256, "size_bytes": size_bytes, } self._append_jsonl(self._artifacts_path(), art) return art def get_artifacts(self, incident_id: str) -> List[Dict]: artifacts = self._read_jsonl(self._artifacts_path()) return [a for a in artifacts if a.get("incident_id") == incident_id] # ─── Postgres backend ───────────────────────────────────────────────────────── class PostgresIncidentStore(IncidentStore): """ Production backend using psycopg2 (sync). Tables created by ops/scripts/migrate_incidents_postgres.py. """ def __init__(self, dsn: str): self._dsn = dsn self._local = threading.local() def _conn(self): """Get or create a per-thread connection.""" conn = getattr(self._local, "conn", None) if conn is None or conn.closed: import psycopg2 # type: ignore conn = psycopg2.connect(self._dsn) conn.autocommit = True self._local.conn = conn return conn def create_incident(self, data: Dict) -> Dict: inc_id = data.get("id") or _generate_incident_id() now = _now_iso() cur = self._conn().cursor() cur.execute( """INSERT INTO incidents (id,workspace_id,service,env,severity,status, title,summary,started_at,created_by,created_at,updated_at) VALUES (%s,%s,%s,%s,%s,'open',%s,%s,%s,%s,%s,%s)""", (inc_id, data.get("workspace_id", "default"), data["service"], data.get("env", "prod"), data.get("severity", "P2"), _redact_text(data.get("title", ""), 500), _redact_text(data.get("summary", "") or "", 2000), data.get("started_at") or now, data.get("created_by", "unknown"), now, now), ) cur.close() return {"id": inc_id, "status": "open", "service": data["service"], "severity": data.get("severity", "P2"), "started_at": data.get("started_at") or now, "created_at": now} def get_incident(self, incident_id: str) -> Optional[Dict]: cur = self._conn().cursor() cur.execute("SELECT id,workspace_id,service,env,severity,status,title,summary," "started_at,ended_at,created_by,created_at,updated_at " "FROM incidents WHERE id=%s", (incident_id,)) row = cur.fetchone() if not row: cur.close() return None cols = [d[0] for d in cur.description] inc = {c: (v.isoformat() if isinstance(v, datetime.datetime) else v) for c, v in zip(cols, row)} # Events cur.execute("SELECT ts,type,message,meta FROM incident_events " "WHERE incident_id=%s ORDER BY ts DESC LIMIT 200", (incident_id,)) events = [] for r in cur.fetchall(): events.append({"ts": r[0].isoformat() if r[0] else "", "type": r[1], "message": r[2], "meta": r[3]}) events.reverse() # Artifacts cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts " "WHERE incident_id=%s ORDER BY ts", (incident_id,)) artifacts = [] for r in cur.fetchall(): artifacts.append({"ts": r[0].isoformat() if r[0] else "", "kind": r[1], "format": r[2], "path": r[3], "sha256": r[4], "size_bytes": r[5]}) cur.close() return {**inc, "events": events, "artifacts": artifacts} def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: filters = filters or {} clauses = [] params: list = [] for k in ("status", "service", "env", "severity"): if filters.get(k): clauses.append(f"{k}=%s") params.append(filters[k]) if filters.get("window_days"): clauses.append("created_at >= NOW() - INTERVAL '%s days'") params.append(int(filters["window_days"])) where = ("WHERE " + " AND ".join(clauses)) if clauses else "" params.append(min(limit, 200)) cur = self._conn().cursor() cur.execute(f"SELECT id,workspace_id,service,env,severity,status,title,summary," f"started_at,ended_at,created_by,created_at,updated_at " f"FROM incidents {where} ORDER BY created_at DESC LIMIT %s", params) cols = [d[0] for d in cur.description] rows = [] for row in cur.fetchall(): rows.append({c: (v.isoformat() if isinstance(v, datetime.datetime) else v) for c, v in zip(cols, row)}) cur.close() return rows def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]: cur = self._conn().cursor() cur.execute("UPDATE incidents SET status='closed', ended_at=%s, summary=%s, updated_at=%s " "WHERE id=%s RETURNING id", (ended_at or _now_iso(), _redact_text(resolution, 2000) if resolution else None, _now_iso(), incident_id)) if not cur.fetchone(): cur.close() return None cur.close() self.append_event(incident_id, "status_change", f"Incident closed: {_redact_text(resolution or '', 500)}") return {"id": incident_id, "status": "closed"} def append_event(self, incident_id: str, event_type: str, message: str, meta: Optional[Dict] = None) -> Optional[Dict]: now = _now_iso() cur = self._conn().cursor() meta_json = json.dumps(meta, default=str) if meta else None cur.execute("INSERT INTO incident_events (incident_id,ts,type,message,meta) " "VALUES (%s,%s,%s,%s,%s)", (incident_id, now, event_type, _redact_text(message, 4000), meta_json)) cur.close() return {"ts": now, "type": event_type, "message": _redact_text(message, 4000), "meta": meta} def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]: cur = self._conn().cursor() cur.execute("SELECT ts,type,message,meta FROM incident_events " "WHERE incident_id=%s ORDER BY ts LIMIT %s", (incident_id, limit)) events = [{"ts": r[0].isoformat() if r[0] else "", "type": r[1], "message": r[2], "meta": r[3]} for r in cur.fetchall()] cur.close() return events def add_artifact(self, incident_id: str, kind: str, fmt: str, path: str, sha256: str, size_bytes: int) -> Optional[Dict]: now = _now_iso() cur = self._conn().cursor() cur.execute("INSERT INTO incident_artifacts (incident_id,ts,kind,format,path,sha256,size_bytes) " "VALUES (%s,%s,%s,%s,%s,%s,%s)", (incident_id, now, kind, fmt, path, sha256, size_bytes)) cur.close() return {"ts": now, "kind": kind, "format": fmt, "path": path, "sha256": sha256, "size_bytes": size_bytes} def get_artifacts(self, incident_id: str) -> List[Dict]: cur = self._conn().cursor() cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts " "WHERE incident_id=%s ORDER BY ts", (incident_id,)) artifacts = [{"ts": r[0].isoformat() if r[0] else "", "kind": r[1], "format": r[2], "path": r[3], "sha256": r[4], "size_bytes": r[5]} for r in cur.fetchall()] cur.close() return artifacts def close(self): conn = getattr(self._local, "conn", None) if conn and not conn.closed: conn.close() # ─── Auto backend (Postgres → JSONL fallback) ──────────────────────────────── class AutoIncidentStore(IncidentStore): """ Tries Postgres first; on any failure falls back to JSONL. Re-attempts Postgres after RECOVERY_INTERVAL_S (5 min). """ _RECOVERY_INTERVAL_S = 300 def __init__(self, pg_dsn: str, jsonl_dir: str): self._pg_dsn = pg_dsn self._jsonl_dir = jsonl_dir self._primary: Optional[PostgresIncidentStore] = None self._fallback: Optional[JsonlIncidentStore] = None self._using_fallback = False self._fallback_since: float = 0.0 self._init_lock = threading.Lock() def _get_primary(self) -> PostgresIncidentStore: if self._primary is None: with self._init_lock: if self._primary is None: self._primary = PostgresIncidentStore(self._pg_dsn) return self._primary def _get_fallback(self) -> JsonlIncidentStore: if self._fallback is None: with self._init_lock: if self._fallback is None: self._fallback = JsonlIncidentStore(self._jsonl_dir) return self._fallback def _maybe_recover(self) -> None: if self._using_fallback and self._fallback_since > 0: if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S: logger.info("AutoIncidentStore: attempting Postgres recovery") self._using_fallback = False self._fallback_since = 0.0 def _switch_to_fallback(self, err: Exception) -> None: logger.warning("AutoIncidentStore: Postgres failed (%s), using JSONL fallback", err) self._using_fallback = True self._fallback_since = time.monotonic() def active_backend(self) -> str: return "jsonl_fallback" if self._using_fallback else "postgres" # ── Delegate methods ────────────────────────────────────────────────────── def create_incident(self, data: Dict) -> Dict: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().create_incident(data) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().create_incident(data) def get_incident(self, incident_id: str) -> Optional[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().get_incident(incident_id) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().get_incident(incident_id) def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().list_incidents(filters, limit) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().list_incidents(filters, limit) def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().close_incident(incident_id, ended_at, resolution) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().close_incident(incident_id, ended_at, resolution) def append_event(self, incident_id: str, event_type: str, message: str, meta: Optional[Dict] = None) -> Optional[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().append_event(incident_id, event_type, message, meta) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().append_event(incident_id, event_type, message, meta) def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().get_events(incident_id, limit) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().get_events(incident_id, limit) def add_artifact(self, incident_id: str, kind: str, fmt: str, path: str, sha256: str, size_bytes: int) -> Optional[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().add_artifact(incident_id, kind, fmt, path, sha256, size_bytes) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().add_artifact(incident_id, kind, fmt, path, sha256, size_bytes) def get_artifacts(self, incident_id: str) -> List[Dict]: self._maybe_recover() if not self._using_fallback: try: return self._get_primary().get_artifacts(incident_id) except Exception as e: self._switch_to_fallback(e) return self._get_fallback().get_artifacts(incident_id) # ─── Singleton ──────────────────────────────────────────────────────────────── _store: Optional[IncidentStore] = None _store_lock = threading.Lock() def get_incident_store() -> IncidentStore: global _store if _store is None: with _store_lock: if _store is None: _store = _create_store() return _store def set_incident_store(store: Optional[IncidentStore]) -> None: global _store with _store_lock: _store = store def _create_store() -> IncidentStore: backend = os.getenv("INCIDENT_BACKEND", "jsonl").lower() dsn = os.getenv("DATABASE_URL") or os.getenv("INCIDENT_DATABASE_URL", "") jsonl_dir = os.getenv( "INCIDENT_JSONL_DIR", str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"), ) if backend == "memory": logger.info("IncidentStore: in-memory (testing only)") return MemoryIncidentStore() if backend == "postgres": if dsn: logger.info("IncidentStore: postgres dsn=%s…", dsn[:30]) return PostgresIncidentStore(dsn) logger.warning("INCIDENT_BACKEND=postgres but no DATABASE_URL; falling back to jsonl") if backend == "auto": if dsn: logger.info("IncidentStore: auto (postgres→jsonl fallback) dsn=%s…", dsn[:30]) return AutoIncidentStore(pg_dsn=dsn, jsonl_dir=jsonl_dir) logger.info("IncidentStore: auto — no DATABASE_URL, using jsonl") if backend == "null": return MemoryIncidentStore() # Default: JSONL logger.info("IncidentStore: jsonl dir=%s", jsonl_dir) return JsonlIncidentStore(jsonl_dir)