New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
1032 lines
40 KiB
Python
1032 lines
40 KiB
Python
"""
|
|
alert_store.py — Alert ingestion storage with state machine.
|
|
|
|
State machine: new → processing → acked | failed
|
|
|
|
Backends:
|
|
- MemoryAlertStore (testing / single-process)
|
|
- PostgresAlertStore (production — uses psycopg2 sync)
|
|
- AutoAlertStore (Postgres primary → Memory fallback)
|
|
|
|
DDL: ops/scripts/migrate_alerts_postgres.py
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Constants ────────────────────────────────────────────────────────────────

MAX_LOG_SAMPLES = 40  # max evidence.log_samples kept per alert (see _sanitize_alert)
MAX_SUMMARY_CHARS = 1000  # summary truncation limit (see _sanitize_alert)
MAX_ALERT_JSON_BYTES = 32 * 1024  # 32 KB per alert
# NOTE(review): MAX_ALERT_JSON_BYTES is not enforced anywhere in this module —
# confirm whether callers apply it or it is dead config.

# Alert status values (lifecycle: new → processing → acked | failed)
STATUS_NEW = "new"
STATUS_PROCESSING = "processing"
STATUS_ACKED = "acked"
STATUS_FAILED = "failed"

PROCESSING_LOCK_TTL_S = 600  # default 10 min lock while an alert is being processed
|
|
|
def _now_iso() -> str:
|
|
return datetime.datetime.utcnow().isoformat()
|
|
|
|
|
|
def _now_dt() -> datetime.datetime:
|
|
return datetime.datetime.utcnow()
|
|
|
|
|
|
def _generate_alert_ref() -> str:
|
|
ts = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
|
short = uuid.uuid4().hex[:6]
|
|
return f"alrt_{ts}_{short}"
|
|
|
|
|
|
def _compute_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str:
|
|
raw = f"{service}|{env}|{kind}|{fingerprint}"
|
|
return hashlib.sha256(raw.encode()).hexdigest()[:32]
|
|
|
|
|
|
def _redact_text(text: str, max_chars: int = 500) -> str:
|
|
import re
|
|
_SECRET_PAT = re.compile(
|
|
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
|
|
)
|
|
redacted = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text or "")
|
|
return redacted[:max_chars]
|
|
|
|
|
|
def _sanitize_alert(alert_data: Dict) -> Dict:
    """Return a shallow copy of the alert safe for storage.

    - summary/title are redacted and capped (MAX_SUMMARY_CHARS / 300 chars)
    - evidence.log_samples is capped at MAX_LOG_SAMPLES entries of 300 chars

    Robustness fix: a sloppy producer sending a non-list ``log_samples``
    (single string) or non-string entries previously crashed redaction;
    both are now tolerated (wrapped / stringified).

    Note the copy is shallow — nested dicts (labels, metrics, untouched
    evidence keys) are shared with the input.
    """
    safe = dict(alert_data)
    safe["summary"] = _redact_text(safe.get("summary", ""), MAX_SUMMARY_CHARS)
    safe["title"] = _redact_text(safe.get("title", ""), 300)
    ev = safe.get("evidence", {})
    if isinstance(ev, dict):
        logs = ev.get("log_samples", [])
        if not isinstance(logs, list):
            # Tolerate a single sample passed bare instead of in a list.
            logs = [logs]
        safe["evidence"] = {
            **ev,
            "log_samples": [
                _redact_text(s if isinstance(s, str) else str(s), 300)
                for s in logs[:MAX_LOG_SAMPLES]
            ],
        }
    return safe
|
|
|
|
|
|
# ─── Abstract interface ────────────────────────────────────────────────────────
|
|
|
|
class AlertStore(ABC):
    """Abstract alert store: dedupe-on-ingest plus a claim/ack/fail state machine.

    Status lifecycle: new → processing → acked | failed.  A failed alert
    becomes claimable again once its retry lock (processing_lock_until)
    expires; a deduped repeat of an acked/failed alert is re-opened as new.
    """

    @abstractmethod
    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        """
        Store alert with dedupe.

        Repeats of the same (service, env, kind, fingerprint) signature
        inside the TTL collapse into one record with a bumped occurrences
        counter.

        Returns: {accepted, deduped, dedupe_key, alert_ref, occurrences}
        """

    @abstractmethod
    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """List alerts metadata (evidence omitted), newest first.

        Supported filters: service, env, window_minutes, status_in
        (list of statuses; None means all).
        """

    @abstractmethod
    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        """Return full alert record (including evidence), or None if unknown."""

    @abstractmethod
    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """Mark alert as acked (status=acked). Legacy compat alias for mark_acked."""

    @abstractmethod
    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        """Lookup by dedupe key (for reuse-open-incident logic)."""

    # ── State machine methods ──────────────────────────────────────────────────

    @abstractmethod
    def claim_next_alerts(
        self,
        window_minutes: int = 240,
        limit: int = 25,
        owner: str = "loop",
        lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
    ) -> List[Dict]:
        """
        Atomically move status=new (or failed/processing with expired lock)
        → processing, stamping claimed_at / processing_lock_until / owner.
        Skips already-processing-and-locked alerts.
        Returns the claimed alert records.
        """

    @abstractmethod
    def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """status=acked, acked_at=now; clears the processing lock/owner."""

    @abstractmethod
    def mark_failed(
        self, alert_ref: str, error: str, retry_after_seconds: int = 300
    ) -> Optional[Dict]:
        """status=failed, lock_until=now+retry (acts as a retry backoff),
        last_error=redacted/truncated error text."""

    @abstractmethod
    def requeue_expired_processing(self) -> int:
        """processing + lock_until < now → status=new. Returns count reset."""

    @abstractmethod
    def dashboard_counts(self, window_minutes: int = 240) -> Dict:
        """Return {new, processing, acked, failed} counts for the window."""

    @abstractmethod
    def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
        """Return top dedupe_keys by total occurrences within the window."""

    @abstractmethod
    def compute_loop_slo(self, window_minutes: int = 240,
                         p95_threshold_s: float = 60.0,
                         failed_rate_threshold_pct: float = 5.0,
                         stuck_minutes: float = 15.0) -> Dict:
        """Compute alert-loop SLO metrics for the dashboard.

        Returns: {claim_to_ack_p95_seconds, failed_rate_pct,
        processing_stuck_count, sample_count, violations}
        """
|
|
|
# ─── Memory backend ────────────────────────────────────────────────────────────
|
|
|
|
class MemoryAlertStore(AlertStore):
    """In-process backend for tests and single-process deployments.

    State is two dicts guarded by one lock:
      _alerts: alert_ref → full alert record
      _dedupe: dedupe_key → alert_ref of the record for that signature
    All timestamps are naive-UTC ISO-8601 strings, so window and lock
    checks are plain lexicographic string comparisons.

    Fixes vs. previous revision: removed an unused local in
    dashboard_counts; get_alert no longer double-looks-up the dict;
    compute_loop_slo catches only timestamp-parse errors.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self._alerts: Dict[str, Dict] = {}
        self._dedupe: Dict[str, str] = {}  # dedupe_key → alert_ref

    def _new_record(self, alert_data: Dict, dedupe_key: str, now: str) -> Dict:
        """Build a fresh status=new record from a raw alert payload."""
        safe = _sanitize_alert(alert_data)
        service = alert_data.get("service", "unknown")
        env = alert_data.get("env", "prod")
        kind = alert_data.get("kind", "custom")
        # Caller-supplied alert_id wins; otherwise mint a unique ref.
        alert_ref = alert_data.get("alert_id") or _generate_alert_ref()
        return {
            "alert_ref": alert_ref,
            "dedupe_key": dedupe_key,
            "source": safe.get("source", "unknown"),
            "service": service,
            "env": env,
            "severity": safe.get("severity", "P2"),
            "kind": kind,
            "title": safe.get("title", ""),
            "summary": safe.get("summary", ""),
            "started_at": safe.get("started_at") or now,
            "labels": safe.get("labels", {}),
            "metrics": safe.get("metrics", {}),
            "evidence": safe.get("evidence", {}),
            "links": safe.get("links", [])[:10],
            "created_at": now,
            "last_seen_at": now,
            "occurrences": 1,
            # State machine fields
            "status": STATUS_NEW,
            "claimed_at": None,  # set when claimed
            "processing_lock_until": None,
            "processing_owner": None,
            "last_error": None,
            "acked_at": None,
            # Legacy compat
            "ack_status": "pending",
            "ack_actor": None,
            "ack_note": None,
            "ack_at": None,
        }

    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        """Store an alert, collapsing repeats of the same signature.

        A repeat inside the TTL bumps occurrences/last_seen_at on the
        existing record (re-opening it as new if it was acked/failed);
        otherwise a new record is created.
        """
        service = alert_data.get("service", "unknown")
        env = alert_data.get("env", "prod")
        kind = alert_data.get("kind", "custom")
        labels = alert_data.get("labels", {})
        fingerprint = labels.get("fingerprint", "")
        dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint)

        now = _now_iso()
        with self._lock:
            existing_ref = self._dedupe.get(dedupe_key)
            if existing_ref and existing_ref in self._alerts:
                existing = self._alerts[existing_ref]
                created_at = existing.get("created_at", "")
                ttl_cutoff = (
                    _now_dt() - datetime.timedelta(minutes=dedupe_ttl_minutes)
                ).isoformat()
                if created_at >= ttl_cutoff:
                    existing["occurrences"] = existing.get("occurrences", 1) + 1
                    existing["last_seen_at"] = now
                    if alert_data.get("metrics"):
                        existing["metrics"] = alert_data["metrics"]
                    # If previously acked/failed, reset to new so the loop
                    # picks it up again.
                    if existing.get("status") in (STATUS_ACKED, STATUS_FAILED):
                        existing["status"] = STATUS_NEW
                        existing["processing_lock_until"] = None
                        existing["last_error"] = None
                    return {
                        "accepted": True,
                        "deduped": True,
                        "dedupe_key": dedupe_key,
                        "alert_ref": existing_ref,
                        "occurrences": existing["occurrences"],
                    }

            record = self._new_record(alert_data, dedupe_key, now)
            alert_ref = record["alert_ref"]
            self._alerts[alert_ref] = record
            self._dedupe[dedupe_key] = alert_ref

        return {
            "accepted": True,
            "deduped": False,
            "dedupe_key": dedupe_key,
            "alert_ref": alert_ref,
            "occurrences": 1,
        }

    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Newest-first alert metadata within the window; evidence omitted.

        Supported filters: service, env, window_minutes (default 240),
        status_in (list of statuses; None = all).
        NOTE(review): unlike the Postgres backend this does not cap limit
        at 200 — confirm whether that cap should apply here too.
        """
        filters = filters or {}
        service = filters.get("service")
        env = filters.get("env")
        window = int(filters.get("window_minutes", 240))
        status_in = filters.get("status_in")  # list of statuses or None (all)
        cutoff = (_now_dt() - datetime.timedelta(minutes=window)).isoformat()
        with self._lock:
            results = []
            for a in sorted(self._alerts.values(),
                            key=lambda x: x.get("created_at", ""), reverse=True):
                if a.get("created_at", "") < cutoff:
                    continue
                if service and a.get("service") != service:
                    continue
                if env and a.get("env") != env:
                    continue
                if status_in and a.get("status", STATUS_NEW) not in status_in:
                    continue
                # Strip the (potentially large) evidence payload from listings.
                results.append({k: v for k, v in a.items() if k != "evidence"})
                if len(results) >= limit:
                    break
            return results

    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        """Return a shallow copy of the full record, or None if unknown."""
        with self._lock:
            rec = self._alerts.get(alert_ref)
            return dict(rec) if rec is not None else None

    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """Legacy compat alias for mark_acked."""
        return self.mark_acked(alert_ref, actor, note)

    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        """Return a copy of the record for a signature, or None."""
        with self._lock:
            ref = self._dedupe.get(dedupe_key)
            if ref and ref in self._alerts:
                return dict(self._alerts[ref])
            return None

    def claim_next_alerts(
        self,
        window_minutes: int = 240,
        limit: int = 25,
        owner: str = "loop",
        lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
    ) -> List[Dict]:
        """Claim up to `limit` claimable alerts, oldest first.

        Claimable = status new, or failed/processing whose lock expired.
        NOTE(review): the Postgres backend additionally prioritizes by
        severity; this backend orders purely by created_at — confirm the
        difference is acceptable.
        """
        now_dt = _now_dt()
        now_str = now_dt.isoformat()
        lock_until = (now_dt + datetime.timedelta(seconds=lock_ttl_seconds)).isoformat()
        cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat()

        claimed = []
        with self._lock:
            for a in sorted(self._alerts.values(),
                            key=lambda x: x.get("created_at", "")):
                if len(claimed) >= limit:
                    break
                if a.get("created_at", "") < cutoff:
                    continue
                st = a.get("status", STATUS_NEW)
                lock_exp = a.get("processing_lock_until")

                # Claimable: new, OR failed/processing with expired/no lock
                if st == STATUS_ACKED:
                    continue
                if st in (STATUS_PROCESSING, STATUS_FAILED):
                    if lock_exp and lock_exp > now_str:
                        continue  # still locked (retry window not passed)
                # Claim it
                a["status"] = STATUS_PROCESSING
                a["claimed_at"] = now_str
                a["processing_lock_until"] = lock_until
                a["processing_owner"] = owner
                claimed.append(dict(a))

        return claimed

    def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """status=acked; stamps acked_at and the legacy ack_* fields."""
        now = _now_iso()
        with self._lock:
            if alert_ref not in self._alerts:
                return None
            rec = self._alerts[alert_ref]
            rec["status"] = STATUS_ACKED
            rec["acked_at"] = now
            rec["ack_status"] = "acked"
            rec["ack_actor"] = _redact_text(actor, 100)
            rec["ack_note"] = _redact_text(note, 500)
            rec["ack_at"] = now
            rec["processing_lock_until"] = None
            rec["processing_owner"] = None
            return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"}

    def mark_failed(
        self, alert_ref: str, error: str, retry_after_seconds: int = 300
    ) -> Optional[Dict]:
        """status=failed; the lock doubles as the retry-backoff deadline."""
        now_dt = _now_dt()
        retry_at = (now_dt + datetime.timedelta(seconds=retry_after_seconds)).isoformat()
        with self._lock:
            if alert_ref not in self._alerts:
                return None
            rec = self._alerts[alert_ref]
            rec["status"] = STATUS_FAILED
            rec["last_error"] = _redact_text(error, 500)
            rec["processing_lock_until"] = retry_at
            rec["processing_owner"] = None
            return {"alert_ref": alert_ref, "status": STATUS_FAILED,
                    "ack_status": "failed", "retry_at": retry_at}

    def requeue_expired_processing(self) -> int:
        """Reset processing alerts whose lock expired back to new."""
        now_str = _now_iso()
        count = 0
        with self._lock:
            for a in self._alerts.values():
                if a.get("status") == STATUS_PROCESSING:
                    lock_exp = a.get("processing_lock_until")
                    if lock_exp and lock_exp <= now_str:
                        a["status"] = STATUS_NEW
                        a["processing_lock_until"] = None
                        a["processing_owner"] = None
                        count += 1
        return count

    def dashboard_counts(self, window_minutes: int = 240) -> Dict:
        """Per-status counts for alerts created within the window."""
        cutoff = (
            _now_dt() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0}
        with self._lock:
            for a in self._alerts.values():
                if a.get("created_at", "") < cutoff:
                    continue
                st = a.get("status", STATUS_NEW)
                if st in counts:
                    counts[st] += 1
        return counts

    def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
        """Top dedupe signatures by total occurrences within the window."""
        cutoff = (
            _now_dt() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        sigs: Dict[str, Dict] = {}
        with self._lock:
            for a in self._alerts.values():
                if a.get("created_at", "") < cutoff:
                    continue
                key = a.get("dedupe_key", "")
                if key not in sigs:
                    sigs[key] = {
                        "signature": key,
                        "service": a.get("service", ""),
                        "kind": a.get("kind", ""),
                        "occurrences": 0,
                        "last_seen": a.get("last_seen_at", ""),
                    }
                sigs[key]["occurrences"] += a.get("occurrences", 1)
                if a.get("last_seen_at", "") > sigs[key]["last_seen"]:
                    sigs[key]["last_seen"] = a.get("last_seen_at", "")
        return sorted(sigs.values(), key=lambda x: x["occurrences"], reverse=True)[:limit]

    def compute_loop_slo(self, window_minutes: int = 240,
                         p95_threshold_s: float = 60.0,
                         failed_rate_threshold_pct: float = 5.0,
                         stuck_minutes: float = 15.0) -> Dict:
        """Alert-loop SLO metrics for the dashboard.

        Returns {claim_to_ack_p95_seconds, failed_rate_pct,
        processing_stuck_count, sample_count, violations}.
        """
        now_dt = _now_dt()
        cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat()
        stuck_cutoff = (now_dt - datetime.timedelta(minutes=stuck_minutes)).isoformat()

        durations_s: list = []
        acked = 0
        failed = 0
        stuck = 0

        with self._lock:
            for a in self._alerts.values():
                if a.get("created_at", "") < cutoff:
                    continue
                st = a.get("status", STATUS_NEW)
                if st == STATUS_ACKED:
                    acked += 1
                    claimed_at = a.get("claimed_at")
                    acked_at = a.get("acked_at")
                    if claimed_at and acked_at:
                        try:
                            c = datetime.datetime.fromisoformat(claimed_at)
                            k = datetime.datetime.fromisoformat(acked_at)
                            durations_s.append((k - c).total_seconds())
                        except (ValueError, TypeError):
                            # Malformed timestamp — skip this sample.
                            pass
                elif st == STATUS_FAILED:
                    failed += 1
                elif st == STATUS_PROCESSING:
                    claimed_at = a.get("claimed_at") or ""
                    if claimed_at and claimed_at < stuck_cutoff:
                        stuck += 1

        # P95 via nearest-rank on sorted samples (matches the Postgres
        # backend's computation).
        p95 = None
        if durations_s:
            durations_s.sort()
            idx = max(0, int(len(durations_s) * 0.95) - 1)
            p95 = round(durations_s[idx], 1)

        # Failed rate over terminal alerts only (acked + failed).
        total_terminal = acked + failed
        failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1)

        violations = []
        if p95 is not None and p95 > p95_threshold_s:
            violations.append({
                "metric": "claim_to_ack_p95_seconds",
                "value": p95,
                "threshold": p95_threshold_s,
                "message": f"P95 claim→ack latency {p95}s exceeds {p95_threshold_s}s",
            })
        if failed_pct > failed_rate_threshold_pct:
            violations.append({
                "metric": "failed_rate_pct",
                "value": failed_pct,
                "threshold": failed_rate_threshold_pct,
                "message": f"Failed alert rate {failed_pct}% exceeds {failed_rate_threshold_pct}%",
            })
        if stuck > 0:
            violations.append({
                "metric": "processing_stuck_count",
                "value": stuck,
                "threshold": 0,
                "message": f"{stuck} alerts stuck in processing > {stuck_minutes}min",
            })

        return {
            "claim_to_ack_p95_seconds": p95,
            "failed_rate_pct": failed_pct,
            "processing_stuck_count": stuck,
            "sample_count": len(durations_s),
            "violations": violations,
        }
|
|
|
|
|
|
# ─── Postgres backend ──────────────────────────────────────────────────────────
|
|
|
|
class PostgresAlertStore(AlertStore):
    """Production backend via psycopg2 (sync, per-thread connections).

    Each thread gets its own lazily-created connection (threading.local);
    autocommit is off, so every method commits its own work explicitly.
    Schema is created by ops/scripts/migrate_alerts_postgres.py.
    NOTE(review): most methods do not close the cursor / roll back if a
    query raises (only claim_next_alerts does) — a failure can leave an
    open transaction on this thread's connection; confirm acceptable.
    """

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()  # holds .conn per thread

    def _conn(self):
        """Return this thread's connection, (re)connecting if needed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            # Imported lazily so the module loads without psycopg2 installed
            # (memory backend users never touch this path).
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = False
            self._local.conn = conn
        return conn

    def _commit(self):
        """Commit this thread's current transaction."""
        self._conn().commit()

    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        """Store an alert, collapsing repeats of the same signature.

        NOTE(review): the SELECT-then-INSERT is not atomic — two concurrent
        ingests of a new signature can both miss the SELECT and insert
        twice; confirm the DDL has a unique index on dedupe_key or that
        duplicate rows are tolerated downstream.
        """
        service = alert_data.get("service", "unknown")
        env = alert_data.get("env", "prod")
        kind = alert_data.get("kind", "custom")
        labels = alert_data.get("labels", {})
        fingerprint = labels.get("fingerprint", "")
        dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint)
        now = _now_iso()

        conn = self._conn()
        cur = conn.cursor()
        # Dedupe window: only records created within the TTL count.
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=dedupe_ttl_minutes)
        ).isoformat()
        cur.execute(
            "SELECT alert_ref, occurrences, status FROM alerts "
            "WHERE dedupe_key=%s AND created_at >= %s LIMIT 1",
            (dedupe_key, cutoff),
        )
        row = cur.fetchone()
        if row:
            existing_ref, occ, existing_status = row
            new_occ = occ + 1
            # Reset to new if previously terminal
            new_status = STATUS_NEW if existing_status in (STATUS_ACKED, STATUS_FAILED) else existing_status
            cur.execute(
                "UPDATE alerts SET occurrences=%s, last_seen_at=%s, metrics=%s, status=%s "
                "WHERE alert_ref=%s",
                (new_occ, now,
                 json.dumps(alert_data.get("metrics", {}), default=str),
                 new_status, existing_ref),
            )
            conn.commit()
            cur.close()
            return {
                "accepted": True,
                "deduped": True,
                "dedupe_key": dedupe_key,
                "alert_ref": existing_ref,
                "occurrences": new_occ,
            }

        # No live duplicate — insert a fresh status=new record.
        safe = _sanitize_alert(alert_data)
        alert_ref = alert_data.get("alert_id") or _generate_alert_ref()
        cur.execute(
            """INSERT INTO alerts (alert_ref,dedupe_key,source,service,env,severity,kind,
                                   title,summary,started_at,labels,metrics,evidence,links,
                                   created_at,last_seen_at,occurrences,status)
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,1,%s)""",
            (alert_ref, dedupe_key,
             safe.get("source", "unknown"), service, env,
             safe.get("severity", "P2"), kind,
             safe.get("title", ""), safe.get("summary", ""),
             safe.get("started_at") or now,
             json.dumps(safe.get("labels", {}), default=str),
             json.dumps(safe.get("metrics", {}), default=str),
             json.dumps(safe.get("evidence", {}), default=str),
             json.dumps(safe.get("links", [])[:10], default=str),
             now, now, STATUS_NEW),
        )
        conn.commit()
        cur.close()
        return {
            "accepted": True,
            "deduped": False,
            "dedupe_key": dedupe_key,
            "alert_ref": alert_ref,
            "occurrences": 1,
        }

    def _row_to_dict(self, cur, row) -> Dict:
        """Map a fetched row to a dict keyed by cursor column names.

        datetimes become ISO strings; JSON-ish text columns are parsed
        (left as the raw string if parsing fails).
        """
        cols = [d[0] for d in cur.description]
        d: Dict = {}
        for c, v in zip(cols, row):
            if isinstance(v, datetime.datetime):
                d[c] = v.isoformat()
            elif isinstance(v, str) and c in ("labels", "metrics", "evidence", "links"):
                try:
                    d[c] = json.loads(v)
                except Exception:
                    d[c] = v
            else:
                d[c] = v
        return d

    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Newest-first alert metadata within the window; evidence column
        deliberately excluded from the SELECT. Limit is capped at 200.
        """
        filters = filters or {}
        window = int(filters.get("window_minutes", 240))
        cutoff = (datetime.datetime.utcnow() - datetime.timedelta(minutes=window)).isoformat()
        status_in = filters.get("status_in")
        clauses = ["created_at >= %s"]
        params: list = [cutoff]
        if filters.get("service"):
            clauses.append("service=%s")
            params.append(filters["service"])
        if filters.get("env"):
            clauses.append("env=%s")
            params.append(filters["env"])
        if status_in:
            # Expand to one placeholder per status value.
            placeholders = ",".join(["%s"] * len(status_in))
            clauses.append(f"status IN ({placeholders})")
            params.extend(status_in)
        params.append(min(limit, 200))
        where = " AND ".join(clauses)
        cur = self._conn().cursor()
        cur.execute(
            f"SELECT alert_ref,dedupe_key,source,service,env,severity,kind,"
            f"title,summary,started_at,labels,metrics,links,"
            f"created_at,last_seen_at,occurrences,status,processing_owner,acked_at,last_error "
            f"FROM alerts WHERE {where} ORDER BY created_at DESC LIMIT %s",
            params,
        )
        rows = [self._row_to_dict(cur, r) for r in cur.fetchall()]
        cur.close()
        return rows

    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        """Return the full record (including evidence), or None if unknown."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT alert_ref,dedupe_key,source,service,env,severity,kind,"
            "title,summary,started_at,labels,metrics,evidence,links,"
            "created_at,last_seen_at,occurrences,status,processing_lock_until,"
            "processing_owner,last_error,acked_at,ack_actor,ack_note "
            "FROM alerts WHERE alert_ref=%s",
            (alert_ref,),
        )
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        result = self._row_to_dict(cur, row)
        cur.close()
        return result

    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """Legacy compat alias for mark_acked."""
        return self.mark_acked(alert_ref, actor, note)

    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        """Most recent record for a signature, or None."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary,"
            "started_at,labels,metrics,created_at,last_seen_at,occurrences,status "
            "FROM alerts WHERE dedupe_key=%s ORDER BY created_at DESC LIMIT 1",
            (dedupe_key,),
        )
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        result = self._row_to_dict(cur, row)
        cur.close()
        return result

    def claim_next_alerts(
        self,
        window_minutes: int = 240,
        limit: int = 25,
        owner: str = "loop",
        lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
    ) -> List[Dict]:
        """Atomic claim via SELECT FOR UPDATE SKIP LOCKED.

        The row locks from the SELECT keep concurrent workers from claiming
        the same alerts; SKIP LOCKED makes them take disjoint batches.
        Priority: severity (P0 first) then created_at ascending.
        """
        conn = self._conn()
        now_str = _now_iso()
        lock_until = (
            datetime.datetime.utcnow() + datetime.timedelta(seconds=lock_ttl_seconds)
        ).isoformat()
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()

        cur = conn.cursor()
        try:
            # Select claimable: new, or failed/processing with expired lock
            cur.execute(
                """
                SELECT alert_ref FROM alerts
                WHERE created_at >= %s
                  AND status IN ('new', 'failed', 'processing')
                  AND (processing_lock_until IS NULL OR processing_lock_until <= %s)
                ORDER BY
                  CASE severity WHEN 'P0' THEN 0 WHEN 'P1' THEN 1
                                WHEN 'P2' THEN 2 WHEN 'P3' THEN 3 ELSE 4 END,
                  created_at
                LIMIT %s
                FOR UPDATE SKIP LOCKED
                """,
                (cutoff, now_str, limit),
            )
            refs = [row[0] for row in cur.fetchall()]
            if not refs:
                conn.commit()
                cur.close()
                return []

            placeholders = ",".join(["%s"] * len(refs))
            cur.execute(
                f"""UPDATE alerts SET status='processing',
                        claimed_at=%s, processing_lock_until=%s, processing_owner=%s
                    WHERE alert_ref IN ({placeholders})""",
                [now_str, lock_until, owner] + refs,
            )
            # Fetch updated rows
            cur.execute(
                f"SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary,"
                f"started_at,labels,metrics,created_at,last_seen_at,occurrences,"
                f"status,processing_owner,last_error "
                f"FROM alerts WHERE alert_ref IN ({placeholders})",
                refs,
            )
            rows = [self._row_to_dict(cur, r) for r in cur.fetchall()]
            conn.commit()
            cur.close()
            return rows
        except Exception:
            # Release the row locks before propagating.
            conn.rollback()
            cur.close()
            raise

    def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """status=acked; clears the processing lock/owner. None if unknown ref."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            "UPDATE alerts SET status='acked', acked_at=%s, ack_actor=%s, ack_note=%s, "
            "processing_lock_until=NULL, processing_owner=NULL "
            "WHERE alert_ref=%s RETURNING alert_ref",
            (now, _redact_text(actor, 100), _redact_text(note, 500), alert_ref),
        )
        row = cur.fetchone()
        self._commit()
        cur.close()
        if not row:
            return None
        return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"}

    def mark_failed(
        self, alert_ref: str, error: str, retry_after_seconds: int = 300
    ) -> Optional[Dict]:
        """status=failed; the lock doubles as the retry-backoff deadline."""
        retry_at = (
            datetime.datetime.utcnow() + datetime.timedelta(seconds=retry_after_seconds)
        ).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "UPDATE alerts SET status='failed', last_error=%s, "
            "processing_lock_until=%s, processing_owner=NULL "
            "WHERE alert_ref=%s RETURNING alert_ref",
            (_redact_text(error, 500), retry_at, alert_ref),
        )
        row = cur.fetchone()
        self._commit()
        cur.close()
        if not row:
            return None
        return {"alert_ref": alert_ref, "status": STATUS_FAILED,
                "ack_status": "failed", "retry_at": retry_at}

    def requeue_expired_processing(self) -> int:
        """Reset processing alerts whose lock expired back to new."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            "UPDATE alerts SET status='new', processing_lock_until=NULL, "
            "processing_owner=NULL "
            "WHERE status='processing' AND processing_lock_until <= %s",
            (now,),
        )
        count = cur.rowcount
        self._commit()
        cur.close()
        return count

    def dashboard_counts(self, window_minutes: int = 240) -> Dict:
        """Per-status counts for alerts created within the window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT status, COUNT(*) FROM alerts WHERE created_at >= %s GROUP BY status",
            (cutoff,),
        )
        counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0}
        for row in cur.fetchall():
            st, cnt = row
            if st in counts:
                counts[st] = int(cnt)
        cur.close()
        return counts

    def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
        """Top dedupe signatures by summed occurrences within the window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT dedupe_key, service, kind, SUM(occurrences) AS occ, MAX(last_seen_at) AS ls "
            "FROM alerts WHERE created_at >= %s "
            "GROUP BY dedupe_key, service, kind "
            "ORDER BY occ DESC LIMIT %s",
            (cutoff, limit),
        )
        rows = []
        for row in cur.fetchall():
            key, svc, kind, occ, ls = row
            rows.append({
                "signature": key,
                "service": svc,
                "kind": kind,
                "occurrences": int(occ),
                # last_seen_at may come back as datetime or text depending
                # on the column type — normalize to a string either way.
                "last_seen": ls.isoformat() if hasattr(ls, "isoformat") else str(ls),
            })
        cur.close()
        return rows

    def compute_loop_slo(self, window_minutes: int = 240,
                         p95_threshold_s: float = 60.0,
                         failed_rate_threshold_pct: float = 5.0,
                         stuck_minutes: float = 15.0) -> Dict:
        """Alert-loop SLO metrics for the dashboard (four queries).

        Returns {claim_to_ack_p95_seconds, failed_rate_pct,
        processing_stuck_count, sample_count, violations}.
        """
        now = datetime.datetime.utcnow()
        cutoff = (now - datetime.timedelta(minutes=window_minutes)).isoformat()
        stuck_cutoff = (now - datetime.timedelta(minutes=stuck_minutes)).isoformat()
        cur = self._conn().cursor()

        # P95 duration: only for acked with both claimed_at and acked_at
        cur.execute(
            "SELECT EXTRACT(EPOCH FROM (acked_at - claimed_at)) "
            "FROM alerts "
            "WHERE created_at >= %s AND status='acked' "
            "AND claimed_at IS NOT NULL AND acked_at IS NOT NULL "
            "ORDER BY 1",
            (cutoff,),
        )
        durations = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

        cur.execute(
            "SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='acked'",
            (cutoff,),
        )
        acked = int(cur.fetchone()[0])
        cur.execute(
            "SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='failed'",
            (cutoff,),
        )
        failed = int(cur.fetchone()[0])
        cur.execute(
            "SELECT COUNT(*) FROM alerts "
            "WHERE created_at >= %s AND status='processing' AND claimed_at < %s",
            (cutoff, stuck_cutoff),
        )
        stuck = int(cur.fetchone()[0])
        cur.close()

        # Nearest-rank P95 on the pre-sorted samples (matches the memory
        # backend's computation).
        p95 = None
        if durations:
            idx = max(0, int(len(durations) * 0.95) - 1)
            p95 = round(durations[idx], 1)

        # Failed rate over terminal alerts only (acked + failed).
        total_terminal = acked + failed
        failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1)

        violations = []
        if p95 is not None and p95 > p95_threshold_s:
            violations.append({
                "metric": "claim_to_ack_p95_seconds", "value": p95,
                "threshold": p95_threshold_s,
                "message": f"P95 claim→ack {p95}s > {p95_threshold_s}s",
            })
        if failed_pct > failed_rate_threshold_pct:
            violations.append({
                "metric": "failed_rate_pct", "value": failed_pct,
                "threshold": failed_rate_threshold_pct,
                "message": f"Failed rate {failed_pct}% > {failed_rate_threshold_pct}%",
            })
        if stuck > 0:
            violations.append({
                "metric": "processing_stuck_count", "value": stuck,
                "threshold": 0,
                "message": f"{stuck} alerts stuck in processing > {stuck_minutes}min",
            })
        return {
            "claim_to_ack_p95_seconds": p95,
            "failed_rate_pct": failed_pct,
            "processing_stuck_count": stuck,
            "sample_count": len(durations),
            "violations": violations,
        }
|
|
|
|
|
|
# ─── Auto backend ──────────────────────────────────────────────────────────────
|
|
|
|
class AutoAlertStore(AlertStore):
    """Postgres primary → MemoryAlertStore fallback, with 5 min recovery.

    Every call goes through _delegate: while healthy it hits Postgres;
    on any exception it flips to the in-memory store and retries Postgres
    after _RECOVERY_INTERVAL_S seconds. Data written to the fallback is
    not replayed to Postgres on recovery.
    """

    _RECOVERY_INTERVAL_S = 300

    def __init__(self, pg_dsn: str):
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresAlertStore] = None  # built lazily
        self._fallback = MemoryAlertStore()
        self._using_fallback = False
        self._fallback_since: float = 0.0  # monotonic time of the switch
        self._init_lock = threading.Lock()

    def _get_primary(self) -> PostgresAlertStore:
        """Lazily construct the Postgres store (double-checked locking)."""
        if self._primary is not None:
            return self._primary
        with self._init_lock:
            if self._primary is None:
                self._primary = PostgresAlertStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self) -> None:
        """Flip back to Postgres once the recovery interval has elapsed."""
        if not (self._using_fallback and self._fallback_since > 0):
            return
        if time.monotonic() - self._fallback_since < self._RECOVERY_INTERVAL_S:
            return
        logger.info("AutoAlertStore: attempting Postgres recovery")
        self._using_fallback = False
        self._fallback_since = 0.0

    def _switch_to_fallback(self, err: Exception) -> None:
        """Record a Postgres failure and route traffic to the memory store."""
        logger.warning("AutoAlertStore: Postgres failed (%s), using Memory fallback", err)
        self._using_fallback = True
        self._fallback_since = time.monotonic()

    def active_backend(self) -> str:
        """Name of the backend currently serving requests."""
        return "memory_fallback" if self._using_fallback else "postgres"

    def _delegate(self, method: str, *args, **kwargs):
        """Invoke `method` on the healthy backend, failing over on error."""
        self._maybe_recover()
        if self._using_fallback:
            return getattr(self._fallback, method)(*args, **kwargs)
        try:
            return getattr(self._get_primary(), method)(*args, **kwargs)
        except Exception as exc:
            self._switch_to_fallback(exc)
            return getattr(self._fallback, method)(*args, **kwargs)

    # ── AlertStore interface: thin pass-throughs ───────────────────────────────

    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        return self._delegate("ingest", alert_data, dedupe_ttl_minutes)

    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        return self._delegate("list_alerts", filters, limit)

    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        return self._delegate("get_alert", alert_ref)

    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        # Legacy compat: routed straight to mark_acked, same as the backends.
        return self._delegate("mark_acked", alert_ref, actor, note)

    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        return self._delegate("get_by_dedupe_key", dedupe_key)

    def claim_next_alerts(self, window_minutes=240, limit=25, owner="loop",
                          lock_ttl_seconds=PROCESSING_LOCK_TTL_S) -> List[Dict]:
        return self._delegate("claim_next_alerts", window_minutes, limit, owner, lock_ttl_seconds)

    def mark_acked(self, alert_ref, actor, note="") -> Optional[Dict]:
        return self._delegate("mark_acked", alert_ref, actor, note)

    def mark_failed(self, alert_ref, error, retry_after_seconds=300) -> Optional[Dict]:
        return self._delegate("mark_failed", alert_ref, error, retry_after_seconds)

    def requeue_expired_processing(self) -> int:
        return self._delegate("requeue_expired_processing")

    def dashboard_counts(self, window_minutes=240) -> Dict:
        return self._delegate("dashboard_counts", window_minutes)

    def top_signatures(self, window_minutes=240, limit=20) -> List[Dict]:
        return self._delegate("top_signatures", window_minutes, limit)

    def compute_loop_slo(self, window_minutes=240, p95_threshold_s=60.0,
                         failed_rate_threshold_pct=5.0, stuck_minutes=15.0) -> Dict:
        return self._delegate("compute_loop_slo", window_minutes, p95_threshold_s,
                              failed_rate_threshold_pct, stuck_minutes)
|
|
|
|
|
|
# ─── Singleton ────────────────────────────────────────────────────────────────

# Process-wide store instance, created lazily by get_alert_store();
# _store_lock guards initialization and replacement.
_store: Optional[AlertStore] = None
_store_lock = threading.Lock()
|
|
|
|
|
|
def get_alert_store() -> AlertStore:
    """Return the process-wide AlertStore, creating it on first use.

    Double-checked locking: the unlocked fast path avoids lock contention
    once the singleton exists.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_alert_store()
    return _store
|
|
|
|
|
|
def set_alert_store(store: Optional[AlertStore]) -> None:
    """Swap in a specific store (or None to force re-creation on next get).

    Primarily for tests that inject a MemoryAlertStore.
    """
    global _store
    with _store_lock:
        _store = store
|
|
|
|
|
|
def _create_alert_store() -> AlertStore:
    """Build the store selected by ALERT_BACKEND (memory | postgres | auto).

    postgres/auto require a DSN; without one they degrade to the memory
    backend. Unknown backend values also fall through to memory.
    """
    backend = os.getenv("ALERT_BACKEND", "memory").lower()
    # ALERT_DATABASE_URL takes precedence (service-specific), then DATABASE_URL (shared)
    dsn = os.getenv("ALERT_DATABASE_URL") or os.getenv("DATABASE_URL", "")

    if backend == "postgres" and dsn:
        logger.info("AlertStore: postgres dsn=%s…", dsn[:30])
        return PostgresAlertStore(dsn)
    if backend == "postgres":
        logger.warning(
            "ALERT_BACKEND=postgres but no ALERT_DATABASE_URL/DATABASE_URL; falling back to memory"
        )

    if backend == "auto" and dsn:
        logger.info("AlertStore: auto (postgres→memory fallback) dsn=%s…", dsn[:30])
        return AutoAlertStore(dsn)
    if backend == "auto":
        logger.info("AlertStore: auto — no ALERT_DATABASE_URL/DATABASE_URL, using memory")

    logger.info("AlertStore: memory (in-process)")
    return MemoryAlertStore()
|