New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
691 lines
28 KiB
Python
691 lines
28 KiB
Python
"""
|
|
incident_store.py — Incident Log storage abstraction.
|
|
|
|
Backends:
|
|
- MemoryIncidentStore (testing)
|
|
- JsonlIncidentStore (MVP/fallback — ops/incidents/ directory)
|
|
- PostgresIncidentStore (production — psycopg2 sync)
|
|
- AutoIncidentStore (Postgres primary → JSONL fallback)
|
|
|
|
All writes are non-fatal: exceptions are logged as warnings.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SECRET_PAT = re.compile(r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+')
|
|
|
|
|
|
def _redact_text(text: str, max_len: int = 4000) -> str:
|
|
"""Mask secrets, truncate."""
|
|
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
|
|
return text[:max_len] if len(text) > max_len else text
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
|
|
|
|
|
def _generate_incident_id() -> str:
|
|
now = datetime.datetime.now(datetime.timezone.utc)
|
|
rand = uuid.uuid4().hex[:6]
|
|
return f"inc_{now.strftime('%Y%m%d_%H%M')}_{rand}"
|
|
|
|
|
|
# ─── Abstract interface ──────────────────────────────────────────────────────
|
|
|
|
class IncidentStore(ABC):
    """Abstract interface every incident-storage backend implements."""

    @abstractmethod
    def create_incident(self, data: Dict) -> Dict:
        """Persist a new incident record built from *data* and return it."""

    @abstractmethod
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return one incident (with recent events/artifacts) or ``None``."""

    @abstractmethod
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Return incidents matching *filters*, newest first, up to *limit*."""

    @abstractmethod
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark an incident closed; return the updated record or ``None``."""

    @abstractmethod
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Attach a timeline event; return it, or ``None`` if the incident is unknown."""

    @abstractmethod
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to *limit* events for an incident."""

    @abstractmethod
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record an artifact reference; return it, or ``None`` if the incident is unknown."""

    @abstractmethod
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifacts recorded for an incident."""
|
|
|
|
|
|
# ─── In-memory (testing) ─────────────────────────────────────────────────────
|
|
|
|
class MemoryIncidentStore(IncidentStore):
    """In-process store for tests: plain dicts guarded by a single lock."""

    def __init__(self):
        # incident_id -> incident record
        self._incidents: Dict[str, Dict] = {}
        # incident_id -> ordered list of timeline events
        self._events: Dict[str, List[Dict]] = {}
        # incident_id -> list of artifact references
        self._artifacts: Dict[str, List[Dict]] = {}
        self._lock = threading.Lock()

    def create_incident(self, data: Dict) -> Dict:
        """Create and register a new incident.

        Requires ``data["service"]`` (raises KeyError otherwise); every other
        field has a default. Title/summary pass through secret redaction.
        """
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        with self._lock:
            self._incidents[inc_id] = inc
            self._events[inc_id] = []
            self._artifacts[inc_id] = []
        return inc

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident plus its last 20 events and all artifacts, or None."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        events = self._events.get(incident_id, [])[-20:]
        artifacts = self._artifacts.get(incident_id, [])
        return {**inc, "events": events, "artifacts": artifacts}

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest first, capped at *limit*."""
        filters = filters or {}
        result = list(self._incidents.values())
        for key in ("status", "service", "env", "severity"):
            if filters.get(key):
                result = [i for i in result if i[key] == filters[key]]
        result.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return result[:limit]

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark the incident closed and record a status_change event.

        Keeps the existing summary when *resolution* is empty/None.
        Returns the updated record, or None if the incident is unknown.
        """
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        with self._lock:
            inc["status"] = "closed"
            inc["ended_at"] = ended_at
            inc["summary"] = _redact_text(resolution, 2000) if resolution else inc.get("summary")
            inc["updated_at"] = _now_iso()
            self._events.setdefault(incident_id, []).append({
                "ts": _now_iso(),
                "type": "status_change",
                # BUGFIX: guard resolution=None — _redact_text requires a str
                # (JsonlIncidentStore already guards with `resolution or ''`).
                "message": f"Incident closed: {_redact_text(resolution or '', 500)}",
                "meta": None,
            })
        return inc

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event; returns None for unknown incidents."""
        if incident_id not in self._incidents:
            return None
        ev = {
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),
            "meta": meta,
        }
        with self._lock:
            self._events.setdefault(incident_id, []).append(ev)
            self._incidents[incident_id]["updated_at"] = _now_iso()
        return ev

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to *limit* events (oldest first)."""
        return self._events.get(incident_id, [])[:limit]

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record an artifact reference; returns None for unknown incidents."""
        if incident_id not in self._incidents:
            return None
        art = {
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        with self._lock:
            self._artifacts.setdefault(incident_id, []).append(art)
        return art

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifacts recorded for *incident_id*."""
        return self._artifacts.get(incident_id, [])
|
|
|
|
|
|
# ─── JSONL (MVP file backend) ────────────────────────────────────────────────
|
|
|
|
class JsonlIncidentStore(IncidentStore):
    """
    Stores incidents/events/artifacts as separate JSONL files in a directory.
    Layout:
        <base_dir>/incidents.jsonl
        <base_dir>/events.jsonl
        <base_dir>/artifacts.jsonl
    """

    def __init__(self, base_dir: str):
        self._dir = Path(base_dir)
        self._dir.mkdir(parents=True, exist_ok=True)
        # One lock serializes all file writes; reads are lock-free best-effort.
        self._lock = threading.Lock()

    def _incidents_path(self) -> Path:
        return self._dir / "incidents.jsonl"

    def _events_path(self) -> Path:
        return self._dir / "events.jsonl"

    def _artifacts_path(self) -> Path:
        return self._dir / "artifacts.jsonl"

    def _read_jsonl(self, path: Path) -> List[Dict]:
        """Read every parseable JSON line from *path*; best-effort, never raises."""
        if not path.exists():
            return []
        items = []
        try:
            with open(path, "r", encoding="utf-8") as fh:
                for line in fh:
                    line = line.strip()
                    if line:
                        try:
                            items.append(json.loads(line))
                        except json.JSONDecodeError:
                            pass  # skip corrupt lines, keep the rest
        except Exception as exc:
            # Module contract: storage failures are logged as warnings, not raised.
            logger.warning("JsonlIncidentStore: failed to read %s: %s", path, exc)
        return items

    def _append_jsonl(self, path: Path, record: Dict) -> None:
        """Append one record as a single JSON line."""
        with self._lock:
            with open(path, "a", encoding="utf-8") as fh:
                fh.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")

    def _rewrite_jsonl(self, path: Path, items: List[Dict]) -> None:
        """Replace *path* with *items* atomically (temp file + rename).

        Writing in place with mode "w" would truncate the file first, so a
        crash mid-write could lose every incident; os.replace is atomic on
        POSIX and Windows.
        """
        with self._lock:
            tmp = path.with_suffix(path.suffix + ".tmp")
            with open(tmp, "w", encoding="utf-8") as fh:
                for item in items:
                    fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n")
            os.replace(tmp, path)

    def create_incident(self, data: Dict) -> Dict:
        """Append a new incident record. Requires ``data["service"]``."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        self._append_jsonl(self._incidents_path(), inc)
        return inc

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident plus its last 20 events and all artifacts, or None."""
        incidents = self._read_jsonl(self._incidents_path())
        inc = next((i for i in incidents if i.get("id") == incident_id), None)
        if not inc:
            return None
        events = [e for e in self._read_jsonl(self._events_path())
                  if e.get("incident_id") == incident_id][-20:]
        artifacts = [a for a in self._read_jsonl(self._artifacts_path())
                     if a.get("incident_id") == incident_id]
        return {**inc, "events": events, "artifacts": artifacts}

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest first, capped at *limit*."""
        filters = filters or {}
        incidents = self._read_jsonl(self._incidents_path())
        for key in ("status", "service", "env", "severity"):
            if filters.get(key):
                incidents = [i for i in incidents if i.get(key) == filters[key]]
        incidents.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return incidents[:limit]

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Close the incident (rewrites incidents.jsonl) and log a status event.

        Keeps the existing summary when *resolution* is empty.
        Returns the updated record, or None if the incident is unknown.
        """
        incidents = self._read_jsonl(self._incidents_path())
        found = None
        for inc in incidents:
            if inc.get("id") == incident_id:
                inc["status"] = "closed"
                inc["ended_at"] = ended_at
                if resolution:
                    inc["summary"] = _redact_text(resolution, 2000)
                inc["updated_at"] = _now_iso()
                found = inc
                break
        if not found:
            return None
        self._rewrite_jsonl(self._incidents_path(), incidents)
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return found

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event; returns None for unknown incidents."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        ev = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),
            "meta": meta,
        }
        self._append_jsonl(self._events_path(), ev)
        return ev

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to *limit* events for *incident_id* (file order, oldest first)."""
        events = self._read_jsonl(self._events_path())
        return [e for e in events if e.get("incident_id") == incident_id][:limit]

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record an artifact reference; returns None for unknown incidents."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        art = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        self._append_jsonl(self._artifacts_path(), art)
        return art

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifacts recorded for *incident_id*."""
        artifacts = self._read_jsonl(self._artifacts_path())
        return [a for a in artifacts if a.get("incident_id") == incident_id]
|
|
|
|
|
|
# ─── Postgres backend ─────────────────────────────────────────────────────────
|
|
|
|
class PostgresIncidentStore(IncidentStore):
    """
    Production backend using psycopg2 (sync).
    Tables created by ops/scripts/migrate_incidents_postgres.py.

    Uses one autocommit connection per thread (thread-local), since psycopg2
    connections must not be shared across threads without external locking.
    """

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()

    def _conn(self):
        """Get or create a per-thread autocommit connection."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore  # deferred: optional dependency
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn

    def create_incident(self, data: Dict) -> Dict:
        """Insert a new incident row. Requires ``data["service"]``.

        Returns a partial record (id/status/service/severity/timestamps);
        use get_incident() for the full row.
        """
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            """INSERT INTO incidents (id,workspace_id,service,env,severity,status,
               title,summary,started_at,created_by,created_at,updated_at)
               VALUES (%s,%s,%s,%s,%s,'open',%s,%s,%s,%s,%s,%s)""",
            (inc_id, data.get("workspace_id", "default"),
             data["service"], data.get("env", "prod"),
             data.get("severity", "P2"),
             _redact_text(data.get("title", ""), 500),
             _redact_text(data.get("summary", "") or "", 2000),
             data.get("started_at") or now,
             data.get("created_by", "unknown"), now, now),
        )
        cur.close()
        return {"id": inc_id, "status": "open", "service": data["service"],
                "severity": data.get("severity", "P2"),
                "started_at": data.get("started_at") or now,
                "created_at": now}

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident plus its last 200 events and all artifacts, or None."""
        cur = self._conn().cursor()
        cur.execute("SELECT id,workspace_id,service,env,severity,status,title,summary,"
                    "started_at,ended_at,created_by,created_at,updated_at "
                    "FROM incidents WHERE id=%s", (incident_id,))
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        cols = [d[0] for d in cur.description]
        # Stringify timestamps so the record is JSON-serializable like the
        # other backends' output.
        inc = {c: (v.isoformat() if isinstance(v, datetime.datetime) else v) for c, v in zip(cols, row)}
        # Events: fetch the newest 200, then flip to chronological order.
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts DESC LIMIT 200", (incident_id,))
        events = []
        for r in cur.fetchall():
            events.append({"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                           "message": r[2], "meta": r[3]})
        events.reverse()
        # Artifacts (chronological)
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = []
        for r in cur.fetchall():
            artifacts.append({"ts": r[0].isoformat() if r[0] else "", "kind": r[1],
                              "format": r[2], "path": r[3], "sha256": r[4], "size_bytes": r[5]})
        cur.close()
        return {**inc, "events": events, "artifacts": artifacts}

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity and optional window_days;
        newest first, limit capped at 200."""
        filters = filters or {}
        clauses = []
        params: list = []
        for k in ("status", "service", "env", "severity"):
            if filters.get(k):
                clauses.append(f"{k}=%s")
                params.append(filters[k])
        if filters.get("window_days"):
            # Parameterize interval arithmetic instead of embedding %s inside
            # a quoted INTERVAL literal, which relies on psycopg2 substituting
            # placeholders inside string literals.
            clauses.append("created_at >= NOW() - (%s * INTERVAL '1 day')")
            params.append(int(filters["window_days"]))
        where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
        params.append(min(limit, 200))
        cur = self._conn().cursor()
        cur.execute(f"SELECT id,workspace_id,service,env,severity,status,title,summary,"
                    f"started_at,ended_at,created_by,created_at,updated_at "
                    f"FROM incidents {where} ORDER BY created_at DESC LIMIT %s", params)
        cols = [d[0] for d in cur.description]
        rows = []
        for row in cur.fetchall():
            rows.append({c: (v.isoformat() if isinstance(v, datetime.datetime) else v)
                         for c, v in zip(cols, row)})
        cur.close()
        return rows

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Close the incident and record a status_change event.

        BUGFIX: COALESCE keeps the existing summary when *resolution* is
        empty — previously an empty resolution nulled the summary, unlike the
        Memory/JSONL backends which preserve it.
        """
        cur = self._conn().cursor()
        cur.execute("UPDATE incidents SET status='closed', ended_at=%s, "
                    "summary=COALESCE(%s, summary), updated_at=%s "
                    "WHERE id=%s RETURNING id",
                    (ended_at or _now_iso(), _redact_text(resolution, 2000) if resolution else None,
                     _now_iso(), incident_id))
        if not cur.fetchone():
            cur.close()
            return None
        cur.close()
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return {"id": incident_id, "status": "closed"}

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Insert a timeline event (no existence check — relies on FK, if any)."""
        now = _now_iso()
        cur = self._conn().cursor()
        meta_json = json.dumps(meta, default=str) if meta else None
        cur.execute("INSERT INTO incident_events (incident_id,ts,type,message,meta) "
                    "VALUES (%s,%s,%s,%s,%s)",
                    (incident_id, now, event_type, _redact_text(message, 4000), meta_json))
        cur.close()
        return {"ts": now, "type": event_type, "message": _redact_text(message, 4000), "meta": meta}

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to *limit* events, oldest first."""
        cur = self._conn().cursor()
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts LIMIT %s", (incident_id, limit))
        events = [{"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                   "message": r[2], "meta": r[3]} for r in cur.fetchall()]
        cur.close()
        return events

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Insert an artifact reference row and return the echoed record."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute("INSERT INTO incident_artifacts (incident_id,ts,kind,format,path,sha256,size_bytes) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s)",
                    (incident_id, now, kind, fmt, path, sha256, size_bytes))
        cur.close()
        return {"ts": now, "kind": kind, "format": fmt, "path": path,
                "sha256": sha256, "size_bytes": size_bytes}

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifacts for *incident_id*, oldest first."""
        cur = self._conn().cursor()
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = [{"ts": r[0].isoformat() if r[0] else "", "kind": r[1], "format": r[2],
                      "path": r[3], "sha256": r[4], "size_bytes": r[5]} for r in cur.fetchall()]
        cur.close()
        return artifacts

    def close(self):
        """Close this thread's connection, if open."""
        conn = getattr(self._local, "conn", None)
        if conn and not conn.closed:
            conn.close()
|
|
|
|
|
|
# ─── Auto backend (Postgres → JSONL fallback) ────────────────────────────────
|
|
|
|
class AutoIncidentStore(IncidentStore):
    """
    Tries Postgres first; on any failure falls back to JSONL.
    Re-attempts Postgres after RECOVERY_INTERVAL_S (5 min).
    """

    _RECOVERY_INTERVAL_S = 300

    def __init__(self, pg_dsn: str, jsonl_dir: str):
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        # Both backends are built lazily on first use.
        self._primary: Optional[PostgresIncidentStore] = None
        self._fallback: Optional[JsonlIncidentStore] = None
        self._using_fallback = False
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()

    def _get_primary(self) -> PostgresIncidentStore:
        """Lazily build the Postgres backend (double-checked lock)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresIncidentStore(self._pg_dsn)
        return self._primary

    def _get_fallback(self) -> JsonlIncidentStore:
        """Lazily build the JSONL backend (double-checked lock)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlIncidentStore(self._jsonl_dir)
        return self._fallback

    def _maybe_recover(self) -> None:
        """Re-enable Postgres once the recovery interval has elapsed."""
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoIncidentStore: attempting Postgres recovery")
                self._using_fallback = False
                self._fallback_since = 0.0

    def _switch_to_fallback(self, err: Exception) -> None:
        """Record a Postgres failure and start the fallback window."""
        logger.warning("AutoIncidentStore: Postgres failed (%s), using JSONL fallback", err)
        self._using_fallback = True
        self._fallback_since = time.monotonic()

    def active_backend(self) -> str:
        return "jsonl_fallback" if self._using_fallback else "postgres"

    # ── Delegate methods ──────────────────────────────────────────────────────

    def _dispatch(self, method: str, *args):
        """Invoke *method* on Postgres first; on any error, switch to and
        retry on the JSONL fallback."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return getattr(self._get_primary(), method)(*args)
            except Exception as exc:
                self._switch_to_fallback(exc)
        return getattr(self._get_fallback(), method)(*args)

    def create_incident(self, data: Dict) -> Dict:
        return self._dispatch("create_incident", data)

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        return self._dispatch("get_incident", incident_id)

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        return self._dispatch("list_incidents", filters, limit)

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        return self._dispatch("close_incident", incident_id, ended_at, resolution)

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        return self._dispatch("append_event", incident_id, event_type, message, meta)

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        return self._dispatch("get_events", incident_id, limit)

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        return self._dispatch("add_artifact", incident_id, kind, fmt, path, sha256, size_bytes)

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        return self._dispatch("get_artifacts", incident_id)
|
|
|
|
|
|
# ─── Singleton ────────────────────────────────────────────────────────────────
|
|
|
|
# Process-wide singleton store, lazily created by get_incident_store().
_store: Optional[IncidentStore] = None
# Guards lazy initialization / replacement of `_store`.
_store_lock = threading.Lock()
|
|
|
|
|
|
def get_incident_store() -> IncidentStore:
    """Return the process-wide store, creating it on first use.

    Double-checked locking: the fast path skips the lock once initialized.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
|
|
|
|
|
|
def set_incident_store(store: Optional[IncidentStore]) -> None:
    """Inject a specific store instance (pass ``None`` to force re-creation
    on the next get_incident_store() call); primarily for tests."""
    global _store
    with _store_lock:
        _store = store
|
|
|
|
|
|
def _create_store() -> IncidentStore:
    """Build the store selected by INCIDENT_BACKEND (memory|postgres|auto|null|jsonl).

    Falls back to JSONL whenever Postgres is requested but no DSN is
    configured. The default backend is JSONL.
    """
    backend = os.getenv("INCIDENT_BACKEND", "jsonl").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("INCIDENT_DATABASE_URL", "")
    jsonl_dir = os.getenv(
        "INCIDENT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
    )
    # SECURITY FIX: never log a raw DSN prefix — "postgresql://user:password@host"
    # puts the password inside the first 30 characters. Mask the credential
    # before truncating for the log line.
    safe_dsn = re.sub(r"//([^:/@]+):[^@]+@", r"//\1:***@", dsn)[:30]

    if backend == "memory":
        logger.info("IncidentStore: in-memory (testing only)")
        return MemoryIncidentStore()

    if backend == "postgres":
        if dsn:
            logger.info("IncidentStore: postgres dsn=%s…", safe_dsn)
            return PostgresIncidentStore(dsn)
        logger.warning("INCIDENT_BACKEND=postgres but no DATABASE_URL; falling back to jsonl")

    if backend == "auto":
        if dsn:
            logger.info("IncidentStore: auto (postgres→jsonl fallback) dsn=%s…", safe_dsn)
            return AutoIncidentStore(pg_dsn=dsn, jsonl_dir=jsonl_dir)
        logger.info("IncidentStore: auto — no DATABASE_URL, using jsonl")

    if backend == "null":
        # "null" maps to the in-memory store: accepted writes, discarded on exit.
        return MemoryIncidentStore()

    # Default: JSONL
    logger.info("IncidentStore: jsonl dir=%s", jsonl_dir)
    return JsonlIncidentStore(jsonl_dir)
|