Files
microdao-daarion/services/router/alert_store.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

1032 lines
40 KiB
Python

"""
alert_store.py — Alert ingestion storage with state machine.
State machine: new → processing → acked | failed
Backends:
- MemoryAlertStore (testing / single-process)
- PostgresAlertStore (production — uses psycopg2 sync)
- AutoAlertStore (Postgres primary → Memory fallback)
DDL: ops/scripts/migrate_alerts_postgres.py
"""
from __future__ import annotations

import datetime
import hashlib
import json
import logging
import os
import re
import threading
import time
import uuid
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Constants ────────────────────────────────────────────────────────────────
MAX_LOG_SAMPLES = 40
MAX_SUMMARY_CHARS = 1000
MAX_ALERT_JSON_BYTES = 32 * 1024 # 32 KB per alert
# Alert status values
STATUS_NEW = "new"
STATUS_PROCESSING = "processing"
STATUS_ACKED = "acked"
STATUS_FAILED = "failed"
PROCESSING_LOCK_TTL_S = 600 # default 10 min lock
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _now_dt() -> datetime.datetime:
return datetime.datetime.utcnow()
def _generate_alert_ref() -> str:
ts = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
short = uuid.uuid4().hex[:6]
return f"alrt_{ts}_{short}"
def _compute_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str:
raw = f"{service}|{env}|{kind}|{fingerprint}"
return hashlib.sha256(raw.encode()).hexdigest()[:32]
def _redact_text(text: str, max_chars: int = 500) -> str:
import re
_SECRET_PAT = re.compile(
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
)
redacted = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text or "")
return redacted[:max_chars]
def _sanitize_alert(alert_data: Dict) -> Dict:
"""Truncate/redact alert payload for safe storage."""
safe = dict(alert_data)
safe["summary"] = _redact_text(safe.get("summary", ""), MAX_SUMMARY_CHARS)
safe["title"] = _redact_text(safe.get("title", ""), 300)
ev = safe.get("evidence", {})
if isinstance(ev, dict):
logs = ev.get("log_samples", [])
safe["evidence"] = {
**ev,
"log_samples": [_redact_text(l, 300) for l in logs[:MAX_LOG_SAMPLES]],
}
return safe
# ─── Abstract interface ────────────────────────────────────────────────────────
class AlertStore(ABC):
@abstractmethod
def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
"""
Store alert with dedupe.
Returns: {accepted, deduped, dedupe_key, alert_ref, occurrences}
"""
@abstractmethod
def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
"""List alerts metadata. Supports status_in filter."""
@abstractmethod
def get_alert(self, alert_ref: str) -> Optional[Dict]:
"""Return full alert record."""
@abstractmethod
def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
"""Mark alert as acked (status=acked). Legacy compat."""
@abstractmethod
def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
"""Lookup by dedupe key (for reuse-open-incident logic)."""
# ── State machine methods ──────────────────────────────────────────────────
@abstractmethod
def claim_next_alerts(
self,
window_minutes: int = 240,
limit: int = 25,
owner: str = "loop",
lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
) -> List[Dict]:
"""
Atomically move status=new (or failed+expired) → processing.
Skips already-processing-and-locked alerts.
Returns the claimed alert records.
"""
@abstractmethod
def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
"""status=acked, acked_at=now."""
@abstractmethod
def mark_failed(
self, alert_ref: str, error: str, retry_after_seconds: int = 300
) -> Optional[Dict]:
"""status=failed, lock_until=now+retry, last_error=truncated."""
@abstractmethod
def requeue_expired_processing(self) -> int:
"""processing + lock_until < now → status=new. Returns count reset."""
@abstractmethod
def dashboard_counts(self, window_minutes: int = 240) -> Dict:
"""Return {new, processing, failed, acked} counts for window."""
@abstractmethod
def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
"""Return top dedupe_keys by occurrences."""
@abstractmethod
def compute_loop_slo(self, window_minutes: int = 240,
p95_threshold_s: float = 60.0,
failed_rate_threshold_pct: float = 5.0,
stuck_minutes: float = 15.0) -> Dict:
"""Compute alert-loop SLO metrics for the dashboard.
Returns: {claim_to_ack_p95_seconds, failed_rate_pct, processing_stuck_count, violations}
"""
# ─── Memory backend ────────────────────────────────────────────────────────────
class MemoryAlertStore(AlertStore):
def __init__(self):
self._lock = threading.Lock()
self._alerts: Dict[str, Dict] = {}
self._dedupe: Dict[str, str] = {} # dedupe_key → alert_ref
def _new_record(self, alert_data: Dict, dedupe_key: str, now: str) -> Dict:
safe = _sanitize_alert(alert_data)
service = alert_data.get("service", "unknown")
env = alert_data.get("env", "prod")
kind = alert_data.get("kind", "custom")
alert_ref = alert_data.get("alert_id") or _generate_alert_ref()
return {
"alert_ref": alert_ref,
"dedupe_key": dedupe_key,
"source": safe.get("source", "unknown"),
"service": service,
"env": env,
"severity": safe.get("severity", "P2"),
"kind": kind,
"title": safe.get("title", ""),
"summary": safe.get("summary", ""),
"started_at": safe.get("started_at") or now,
"labels": safe.get("labels", {}),
"metrics": safe.get("metrics", {}),
"evidence": safe.get("evidence", {}),
"links": safe.get("links", [])[:10],
"created_at": now,
"last_seen_at": now,
"occurrences": 1,
# State machine fields
"status": STATUS_NEW,
"claimed_at": None, # set when claimed
"processing_lock_until": None,
"processing_owner": None,
"last_error": None,
"acked_at": None,
# Legacy compat
"ack_status": "pending",
"ack_actor": None,
"ack_note": None,
"ack_at": None,
}
def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
service = alert_data.get("service", "unknown")
env = alert_data.get("env", "prod")
kind = alert_data.get("kind", "custom")
labels = alert_data.get("labels", {})
fingerprint = labels.get("fingerprint", "")
dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint)
now = _now_iso()
with self._lock:
existing_ref = self._dedupe.get(dedupe_key)
if existing_ref and existing_ref in self._alerts:
existing = self._alerts[existing_ref]
created_at = existing.get("created_at", "")
ttl_cutoff = (
datetime.datetime.utcnow()
- datetime.timedelta(minutes=dedupe_ttl_minutes)
).isoformat()
if created_at >= ttl_cutoff:
existing["occurrences"] = existing.get("occurrences", 1) + 1
existing["last_seen_at"] = now
if alert_data.get("metrics"):
existing["metrics"] = alert_data["metrics"]
# If previously acked/failed, reset to new so it gets picked up again
if existing.get("status") in (STATUS_ACKED, STATUS_FAILED):
existing["status"] = STATUS_NEW
existing["processing_lock_until"] = None
existing["last_error"] = None
return {
"accepted": True,
"deduped": True,
"dedupe_key": dedupe_key,
"alert_ref": existing_ref,
"occurrences": existing["occurrences"],
}
record = self._new_record(alert_data, dedupe_key, now)
alert_ref = record["alert_ref"]
self._alerts[alert_ref] = record
self._dedupe[dedupe_key] = alert_ref
return {
"accepted": True,
"deduped": False,
"dedupe_key": dedupe_key,
"alert_ref": alert_ref,
"occurrences": 1,
}
def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
filters = filters or {}
service = filters.get("service")
env = filters.get("env")
window = int(filters.get("window_minutes", 240))
status_in = filters.get("status_in") # list of statuses or None (all)
cutoff = (
datetime.datetime.utcnow() - datetime.timedelta(minutes=window)
).isoformat()
with self._lock:
results = []
for a in sorted(self._alerts.values(),
key=lambda x: x.get("created_at", ""), reverse=True):
if a.get("created_at", "") < cutoff:
continue
if service and a.get("service") != service:
continue
if env and a.get("env") != env:
continue
if status_in and a.get("status", STATUS_NEW) not in status_in:
continue
results.append({k: v for k, v in a.items() if k not in ("evidence",)})
if len(results) >= limit:
break
return results
def get_alert(self, alert_ref: str) -> Optional[Dict]:
with self._lock:
return dict(self._alerts[alert_ref]) if alert_ref in self._alerts else None
def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
return self.mark_acked(alert_ref, actor, note)
def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
with self._lock:
ref = self._dedupe.get(dedupe_key)
if ref and ref in self._alerts:
return dict(self._alerts[ref])
return None
def claim_next_alerts(
self,
window_minutes: int = 240,
limit: int = 25,
owner: str = "loop",
lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
) -> List[Dict]:
now_dt = _now_dt()
now_str = now_dt.isoformat()
lock_until = (now_dt + datetime.timedelta(seconds=lock_ttl_seconds)).isoformat()
cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat()
claimed = []
with self._lock:
for a in sorted(self._alerts.values(),
key=lambda x: x.get("created_at", "")):
if len(claimed) >= limit:
break
if a.get("created_at", "") < cutoff:
continue
st = a.get("status", STATUS_NEW)
lock_exp = a.get("processing_lock_until")
# Claimable: new, OR failed/processing with expired/no lock
if st == STATUS_ACKED:
continue
if st in (STATUS_PROCESSING, STATUS_FAILED):
if lock_exp and lock_exp > now_str:
continue # still locked (retry window not passed)
# Claim it
a["status"] = STATUS_PROCESSING
a["claimed_at"] = now_str
a["processing_lock_until"] = lock_until
a["processing_owner"] = owner
claimed.append(dict(a))
return claimed
def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
now = _now_iso()
with self._lock:
if alert_ref not in self._alerts:
return None
rec = self._alerts[alert_ref]
rec["status"] = STATUS_ACKED
rec["acked_at"] = now
rec["ack_status"] = "acked"
rec["ack_actor"] = _redact_text(actor, 100)
rec["ack_note"] = _redact_text(note, 500)
rec["ack_at"] = now
rec["processing_lock_until"] = None
rec["processing_owner"] = None
return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"}
def mark_failed(
self, alert_ref: str, error: str, retry_after_seconds: int = 300
) -> Optional[Dict]:
now_dt = _now_dt()
retry_at = (now_dt + datetime.timedelta(seconds=retry_after_seconds)).isoformat()
with self._lock:
if alert_ref not in self._alerts:
return None
rec = self._alerts[alert_ref]
rec["status"] = STATUS_FAILED
rec["last_error"] = _redact_text(error, 500)
rec["processing_lock_until"] = retry_at
rec["processing_owner"] = None
return {"alert_ref": alert_ref, "status": STATUS_FAILED,
"ack_status": "failed", "retry_at": retry_at}
def requeue_expired_processing(self) -> int:
now_str = _now_iso()
count = 0
with self._lock:
for a in self._alerts.values():
if a.get("status") == STATUS_PROCESSING:
lock_exp = a.get("processing_lock_until")
if lock_exp and lock_exp <= now_str:
a["status"] = STATUS_NEW
a["processing_lock_until"] = None
a["processing_owner"] = None
count += 1
return count
def dashboard_counts(self, window_minutes: int = 240) -> Dict:
cutoff = (
_now_dt() - datetime.timedelta(minutes=window_minutes)
).isoformat()
counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0}
now_str = _now_iso()
with self._lock:
for a in self._alerts.values():
if a.get("created_at", "") < cutoff:
continue
st = a.get("status", STATUS_NEW)
if st in counts:
counts[st] += 1
return counts
def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
cutoff = (
_now_dt() - datetime.timedelta(minutes=window_minutes)
).isoformat()
sigs: Dict[str, Dict] = {}
with self._lock:
for a in self._alerts.values():
if a.get("created_at", "") < cutoff:
continue
key = a.get("dedupe_key", "")
if key not in sigs:
sigs[key] = {
"signature": key,
"service": a.get("service", ""),
"kind": a.get("kind", ""),
"occurrences": 0,
"last_seen": a.get("last_seen_at", ""),
}
sigs[key]["occurrences"] += a.get("occurrences", 1)
if a.get("last_seen_at", "") > sigs[key]["last_seen"]:
sigs[key]["last_seen"] = a.get("last_seen_at", "")
return sorted(sigs.values(), key=lambda x: x["occurrences"], reverse=True)[:limit]
def compute_loop_slo(self, window_minutes: int = 240,
p95_threshold_s: float = 60.0,
failed_rate_threshold_pct: float = 5.0,
stuck_minutes: float = 15.0) -> Dict:
now_dt = _now_dt()
cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat()
stuck_cutoff = (now_dt - datetime.timedelta(minutes=stuck_minutes)).isoformat()
durations_s: list = []
acked = 0
failed = 0
stuck = 0
with self._lock:
for a in self._alerts.values():
if a.get("created_at", "") < cutoff:
continue
st = a.get("status", STATUS_NEW)
if st == STATUS_ACKED:
acked += 1
claimed_at = a.get("claimed_at")
acked_at = a.get("acked_at")
if claimed_at and acked_at:
try:
c = datetime.datetime.fromisoformat(claimed_at)
k = datetime.datetime.fromisoformat(acked_at)
durations_s.append((k - c).total_seconds())
except Exception:
pass
elif st == STATUS_FAILED:
failed += 1
elif st == STATUS_PROCESSING:
claimed_at = a.get("claimed_at") or ""
if claimed_at and claimed_at < stuck_cutoff:
stuck += 1
# P95
p95 = None
if durations_s:
durations_s.sort()
idx = max(0, int(len(durations_s) * 0.95) - 1)
p95 = round(durations_s[idx], 1)
# Failed rate
total_terminal = acked + failed
failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1)
violations = []
if p95 is not None and p95 > p95_threshold_s:
violations.append({
"metric": "claim_to_ack_p95_seconds",
"value": p95,
"threshold": p95_threshold_s,
"message": f"P95 claim→ack latency {p95}s exceeds {p95_threshold_s}s",
})
if failed_pct > failed_rate_threshold_pct:
violations.append({
"metric": "failed_rate_pct",
"value": failed_pct,
"threshold": failed_rate_threshold_pct,
"message": f"Failed alert rate {failed_pct}% exceeds {failed_rate_threshold_pct}%",
})
if stuck > 0:
violations.append({
"metric": "processing_stuck_count",
"value": stuck,
"threshold": 0,
"message": f"{stuck} alerts stuck in processing > {stuck_minutes}min",
})
return {
"claim_to_ack_p95_seconds": p95,
"failed_rate_pct": failed_pct,
"processing_stuck_count": stuck,
"sample_count": len(durations_s),
"violations": violations,
}
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresAlertStore(AlertStore):
"""Production backend via psycopg2 (sync, per-thread connections)."""
def __init__(self, dsn: str):
self._dsn = dsn
self._local = threading.local()
def _conn(self):
conn = getattr(self._local, "conn", None)
if conn is None or conn.closed:
import psycopg2 # type: ignore
conn = psycopg2.connect(self._dsn)
conn.autocommit = False
self._local.conn = conn
return conn
def _commit(self):
self._conn().commit()
def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
service = alert_data.get("service", "unknown")
env = alert_data.get("env", "prod")
kind = alert_data.get("kind", "custom")
labels = alert_data.get("labels", {})
fingerprint = labels.get("fingerprint", "")
dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint)
now = _now_iso()
conn = self._conn()
cur = conn.cursor()
cutoff = (
datetime.datetime.utcnow() - datetime.timedelta(minutes=dedupe_ttl_minutes)
).isoformat()
cur.execute(
"SELECT alert_ref, occurrences, status FROM alerts "
"WHERE dedupe_key=%s AND created_at >= %s LIMIT 1",
(dedupe_key, cutoff),
)
row = cur.fetchone()
if row:
existing_ref, occ, existing_status = row
new_occ = occ + 1
# Reset to new if previously terminal
new_status = STATUS_NEW if existing_status in (STATUS_ACKED, STATUS_FAILED) else existing_status
cur.execute(
"UPDATE alerts SET occurrences=%s, last_seen_at=%s, metrics=%s, status=%s "
"WHERE alert_ref=%s",
(new_occ, now,
json.dumps(alert_data.get("metrics", {}), default=str),
new_status, existing_ref),
)
conn.commit()
cur.close()
return {
"accepted": True,
"deduped": True,
"dedupe_key": dedupe_key,
"alert_ref": existing_ref,
"occurrences": new_occ,
}
safe = _sanitize_alert(alert_data)
alert_ref = alert_data.get("alert_id") or _generate_alert_ref()
cur.execute(
"""INSERT INTO alerts (alert_ref,dedupe_key,source,service,env,severity,kind,
title,summary,started_at,labels,metrics,evidence,links,
created_at,last_seen_at,occurrences,status)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,1,%s)""",
(alert_ref, dedupe_key,
safe.get("source", "unknown"), service, env,
safe.get("severity", "P2"), kind,
safe.get("title", ""), safe.get("summary", ""),
safe.get("started_at") or now,
json.dumps(safe.get("labels", {}), default=str),
json.dumps(safe.get("metrics", {}), default=str),
json.dumps(safe.get("evidence", {}), default=str),
json.dumps(safe.get("links", [])[:10], default=str),
now, now, STATUS_NEW),
)
conn.commit()
cur.close()
return {
"accepted": True,
"deduped": False,
"dedupe_key": dedupe_key,
"alert_ref": alert_ref,
"occurrences": 1,
}
def _row_to_dict(self, cur, row) -> Dict:
cols = [d[0] for d in cur.description]
d: Dict = {}
for c, v in zip(cols, row):
if isinstance(v, datetime.datetime):
d[c] = v.isoformat()
elif isinstance(v, str) and c in ("labels", "metrics", "evidence", "links"):
try:
d[c] = json.loads(v)
except Exception:
d[c] = v
else:
d[c] = v
return d
def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
filters = filters or {}
window = int(filters.get("window_minutes", 240))
cutoff = (datetime.datetime.utcnow() - datetime.timedelta(minutes=window)).isoformat()
status_in = filters.get("status_in")
clauses = ["created_at >= %s"]
params: list = [cutoff]
if filters.get("service"):
clauses.append("service=%s")
params.append(filters["service"])
if filters.get("env"):
clauses.append("env=%s")
params.append(filters["env"])
if status_in:
placeholders = ",".join(["%s"] * len(status_in))
clauses.append(f"status IN ({placeholders})")
params.extend(status_in)
params.append(min(limit, 200))
where = " AND ".join(clauses)
cur = self._conn().cursor()
cur.execute(
f"SELECT alert_ref,dedupe_key,source,service,env,severity,kind,"
f"title,summary,started_at,labels,metrics,links,"
f"created_at,last_seen_at,occurrences,status,processing_owner,acked_at,last_error "
f"FROM alerts WHERE {where} ORDER BY created_at DESC LIMIT %s",
params,
)
rows = [self._row_to_dict(cur, r) for r in cur.fetchall()]
cur.close()
return rows
def get_alert(self, alert_ref: str) -> Optional[Dict]:
cur = self._conn().cursor()
cur.execute(
"SELECT alert_ref,dedupe_key,source,service,env,severity,kind,"
"title,summary,started_at,labels,metrics,evidence,links,"
"created_at,last_seen_at,occurrences,status,processing_lock_until,"
"processing_owner,last_error,acked_at,ack_actor,ack_note "
"FROM alerts WHERE alert_ref=%s",
(alert_ref,),
)
row = cur.fetchone()
if not row:
cur.close()
return None
result = self._row_to_dict(cur, row)
cur.close()
return result
def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
return self.mark_acked(alert_ref, actor, note)
def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
cur = self._conn().cursor()
cur.execute(
"SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary,"
"started_at,labels,metrics,created_at,last_seen_at,occurrences,status "
"FROM alerts WHERE dedupe_key=%s ORDER BY created_at DESC LIMIT 1",
(dedupe_key,),
)
row = cur.fetchone()
if not row:
cur.close()
return None
result = self._row_to_dict(cur, row)
cur.close()
return result
def claim_next_alerts(
self,
window_minutes: int = 240,
limit: int = 25,
owner: str = "loop",
lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
) -> List[Dict]:
"""Atomic claim via SELECT FOR UPDATE SKIP LOCKED."""
conn = self._conn()
now_str = _now_iso()
lock_until = (
datetime.datetime.utcnow() + datetime.timedelta(seconds=lock_ttl_seconds)
).isoformat()
cutoff = (
datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
).isoformat()
cur = conn.cursor()
try:
# Select claimable: new, or failed/processing with expired lock
cur.execute(
"""
SELECT alert_ref FROM alerts
WHERE created_at >= %s
AND status IN ('new', 'failed', 'processing')
AND (processing_lock_until IS NULL OR processing_lock_until <= %s)
ORDER BY
CASE severity WHEN 'P0' THEN 0 WHEN 'P1' THEN 1
WHEN 'P2' THEN 2 WHEN 'P3' THEN 3 ELSE 4 END,
created_at
LIMIT %s
FOR UPDATE SKIP LOCKED
""",
(cutoff, now_str, limit),
)
refs = [row[0] for row in cur.fetchall()]
if not refs:
conn.commit()
cur.close()
return []
placeholders = ",".join(["%s"] * len(refs))
cur.execute(
f"""UPDATE alerts SET status='processing',
claimed_at=%s, processing_lock_until=%s, processing_owner=%s
WHERE alert_ref IN ({placeholders})""",
[now_str, lock_until, owner] + refs,
)
# Fetch updated rows
cur.execute(
f"SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary,"
f"started_at,labels,metrics,created_at,last_seen_at,occurrences,"
f"status,processing_owner,last_error "
f"FROM alerts WHERE alert_ref IN ({placeholders})",
refs,
)
rows = [self._row_to_dict(cur, r) for r in cur.fetchall()]
conn.commit()
cur.close()
return rows
except Exception:
conn.rollback()
cur.close()
raise
def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
now = _now_iso()
cur = self._conn().cursor()
cur.execute(
"UPDATE alerts SET status='acked', acked_at=%s, ack_actor=%s, ack_note=%s, "
"processing_lock_until=NULL, processing_owner=NULL "
"WHERE alert_ref=%s RETURNING alert_ref",
(now, _redact_text(actor, 100), _redact_text(note, 500), alert_ref),
)
row = cur.fetchone()
self._commit()
cur.close()
if not row:
return None
return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"}
def mark_failed(
self, alert_ref: str, error: str, retry_after_seconds: int = 300
) -> Optional[Dict]:
retry_at = (
datetime.datetime.utcnow() + datetime.timedelta(seconds=retry_after_seconds)
).isoformat()
cur = self._conn().cursor()
cur.execute(
"UPDATE alerts SET status='failed', last_error=%s, "
"processing_lock_until=%s, processing_owner=NULL "
"WHERE alert_ref=%s RETURNING alert_ref",
(_redact_text(error, 500), retry_at, alert_ref),
)
row = cur.fetchone()
self._commit()
cur.close()
if not row:
return None
return {"alert_ref": alert_ref, "status": STATUS_FAILED,
"ack_status": "failed", "retry_at": retry_at}
def requeue_expired_processing(self) -> int:
now = _now_iso()
cur = self._conn().cursor()
cur.execute(
"UPDATE alerts SET status='new', processing_lock_until=NULL, "
"processing_owner=NULL "
"WHERE status='processing' AND processing_lock_until <= %s",
(now,),
)
count = cur.rowcount
self._commit()
cur.close()
return count
def dashboard_counts(self, window_minutes: int = 240) -> Dict:
cutoff = (
datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
).isoformat()
cur = self._conn().cursor()
cur.execute(
"SELECT status, COUNT(*) FROM alerts WHERE created_at >= %s GROUP BY status",
(cutoff,),
)
counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0}
for row in cur.fetchall():
st, cnt = row
if st in counts:
counts[st] = int(cnt)
cur.close()
return counts
def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
cutoff = (
datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
).isoformat()
cur = self._conn().cursor()
cur.execute(
"SELECT dedupe_key, service, kind, SUM(occurrences) AS occ, MAX(last_seen_at) AS ls "
"FROM alerts WHERE created_at >= %s "
"GROUP BY dedupe_key, service, kind "
"ORDER BY occ DESC LIMIT %s",
(cutoff, limit),
)
rows = []
for row in cur.fetchall():
key, svc, kind, occ, ls = row
rows.append({
"signature": key,
"service": svc,
"kind": kind,
"occurrences": int(occ),
"last_seen": ls.isoformat() if hasattr(ls, "isoformat") else str(ls),
})
cur.close()
return rows
def compute_loop_slo(self, window_minutes: int = 240,
p95_threshold_s: float = 60.0,
failed_rate_threshold_pct: float = 5.0,
stuck_minutes: float = 15.0) -> Dict:
now = datetime.datetime.utcnow()
cutoff = (now - datetime.timedelta(minutes=window_minutes)).isoformat()
stuck_cutoff = (now - datetime.timedelta(minutes=stuck_minutes)).isoformat()
cur = self._conn().cursor()
# P95 duration: only for acked with both claimed_at and acked_at
cur.execute(
"SELECT EXTRACT(EPOCH FROM (acked_at - claimed_at)) "
"FROM alerts "
"WHERE created_at >= %s AND status='acked' "
"AND claimed_at IS NOT NULL AND acked_at IS NOT NULL "
"ORDER BY 1",
(cutoff,),
)
durations = [float(r[0]) for r in cur.fetchall() if r[0] is not None]
cur.execute(
"SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='acked'",
(cutoff,),
)
acked = int(cur.fetchone()[0])
cur.execute(
"SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='failed'",
(cutoff,),
)
failed = int(cur.fetchone()[0])
cur.execute(
"SELECT COUNT(*) FROM alerts "
"WHERE created_at >= %s AND status='processing' AND claimed_at < %s",
(cutoff, stuck_cutoff),
)
stuck = int(cur.fetchone()[0])
cur.close()
p95 = None
if durations:
idx = max(0, int(len(durations) * 0.95) - 1)
p95 = round(durations[idx], 1)
total_terminal = acked + failed
failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1)
violations = []
if p95 is not None and p95 > p95_threshold_s:
violations.append({
"metric": "claim_to_ack_p95_seconds", "value": p95,
"threshold": p95_threshold_s,
"message": f"P95 claim→ack {p95}s > {p95_threshold_s}s",
})
if failed_pct > failed_rate_threshold_pct:
violations.append({
"metric": "failed_rate_pct", "value": failed_pct,
"threshold": failed_rate_threshold_pct,
"message": f"Failed rate {failed_pct}% > {failed_rate_threshold_pct}%",
})
if stuck > 0:
violations.append({
"metric": "processing_stuck_count", "value": stuck,
"threshold": 0,
"message": f"{stuck} alerts stuck in processing > {stuck_minutes}min",
})
return {
"claim_to_ack_p95_seconds": p95,
"failed_rate_pct": failed_pct,
"processing_stuck_count": stuck,
"sample_count": len(durations),
"violations": violations,
}
# ─── Auto backend ──────────────────────────────────────────────────────────────
class AutoAlertStore(AlertStore):
"""Postgres primary → MemoryAlertStore fallback, with 5 min recovery."""
_RECOVERY_INTERVAL_S = 300
def __init__(self, pg_dsn: str):
self._pg_dsn = pg_dsn
self._primary: Optional[PostgresAlertStore] = None
self._fallback = MemoryAlertStore()
self._using_fallback = False
self._fallback_since: float = 0.0
self._init_lock = threading.Lock()
def _get_primary(self) -> PostgresAlertStore:
if self._primary is None:
with self._init_lock:
if self._primary is None:
self._primary = PostgresAlertStore(self._pg_dsn)
return self._primary
def _maybe_recover(self) -> None:
if self._using_fallback and self._fallback_since > 0:
if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
logger.info("AutoAlertStore: attempting Postgres recovery")
self._using_fallback = False
self._fallback_since = 0.0
def _switch_to_fallback(self, err: Exception) -> None:
logger.warning("AutoAlertStore: Postgres failed (%s), using Memory fallback", err)
self._using_fallback = True
self._fallback_since = time.monotonic()
def active_backend(self) -> str:
return "memory_fallback" if self._using_fallback else "postgres"
def _delegate(self, method: str, *args, **kwargs):
self._maybe_recover()
if not self._using_fallback:
try:
return getattr(self._get_primary(), method)(*args, **kwargs)
except Exception as e:
self._switch_to_fallback(e)
return getattr(self._fallback, method)(*args, **kwargs)
def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
return self._delegate("ingest", alert_data, dedupe_ttl_minutes)
def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
return self._delegate("list_alerts", filters, limit)
def get_alert(self, alert_ref: str) -> Optional[Dict]:
return self._delegate("get_alert", alert_ref)
def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
return self._delegate("mark_acked", alert_ref, actor, note)
def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
return self._delegate("get_by_dedupe_key", dedupe_key)
def claim_next_alerts(self, window_minutes=240, limit=25, owner="loop",
lock_ttl_seconds=PROCESSING_LOCK_TTL_S) -> List[Dict]:
return self._delegate("claim_next_alerts", window_minutes, limit, owner, lock_ttl_seconds)
def mark_acked(self, alert_ref, actor, note="") -> Optional[Dict]:
return self._delegate("mark_acked", alert_ref, actor, note)
def mark_failed(self, alert_ref, error, retry_after_seconds=300) -> Optional[Dict]:
return self._delegate("mark_failed", alert_ref, error, retry_after_seconds)
def requeue_expired_processing(self) -> int:
return self._delegate("requeue_expired_processing")
def dashboard_counts(self, window_minutes=240) -> Dict:
return self._delegate("dashboard_counts", window_minutes)
def top_signatures(self, window_minutes=240, limit=20) -> List[Dict]:
return self._delegate("top_signatures", window_minutes, limit)
def compute_loop_slo(self, window_minutes=240, p95_threshold_s=60.0,
failed_rate_threshold_pct=5.0, stuck_minutes=15.0) -> Dict:
return self._delegate("compute_loop_slo", window_minutes, p95_threshold_s,
failed_rate_threshold_pct, stuck_minutes)
# ─── Singleton ────────────────────────────────────────────────────────────────
_store: Optional[AlertStore] = None
_store_lock = threading.Lock()
def get_alert_store() -> AlertStore:
global _store
if _store is None:
with _store_lock:
if _store is None:
_store = _create_alert_store()
return _store
def set_alert_store(store: Optional[AlertStore]) -> None:
global _store
with _store_lock:
_store = store
def _create_alert_store() -> AlertStore:
backend = os.getenv("ALERT_BACKEND", "memory").lower()
# ALERT_DATABASE_URL takes precedence (service-specific), then DATABASE_URL (shared)
dsn = os.getenv("ALERT_DATABASE_URL") or os.getenv("DATABASE_URL", "")
if backend == "postgres":
if dsn:
logger.info("AlertStore: postgres dsn=%s", dsn[:30])
return PostgresAlertStore(dsn)
logger.warning(
"ALERT_BACKEND=postgres but no ALERT_DATABASE_URL/DATABASE_URL; falling back to memory"
)
if backend == "auto":
if dsn:
logger.info("AlertStore: auto (postgres→memory fallback) dsn=%s", dsn[:30])
return AutoAlertStore(dsn)
logger.info("AlertStore: auto — no ALERT_DATABASE_URL/DATABASE_URL, using memory")
logger.info("AlertStore: memory (in-process)")
return MemoryAlertStore()