New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
377 lines
15 KiB
Python
377 lines
15 KiB
Python
"""
|
|
signature_state_store.py — Cooldown tracking per incident signature.
|
|
|
|
Prevents triage from running too frequently for the same failure type.
|
|
A "signature" is the same one computed by alert_routing.compute_incident_signature.
|
|
|
|
Backends:
|
|
- MemorySignatureStateStore (tests / single-process)
|
|
- PostgresSignatureStateStore (production)
|
|
- AutoSignatureStateStore (Postgres → Memory fallback)
|
|
|
|
Table: incident_signature_state
|
|
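signature text PK, last_triage_at timestamptz, last_alert_at timestamptz,
triage_count_24h int, updated_at timestamptz,
occurrences_60m, occurrences_60m_bucket_start (rolling 60-minute alert counter
and the start of its current window; column types are defined in the DDL)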
|
|
|
|
DDL: ops/scripts/migrate_alerts_postgres.py
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from typing import Dict, List, Optional
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Cooldown, in minutes, applied by should_run_triage() when the caller does
# not pass an explicit cooldown_minutes value.
DEFAULT_COOLDOWN_MINUTES = 15
|
|
|
|
|
|
def _now_dt() -> datetime.datetime:
    """Return the current UTC time as a naive ``datetime``.

    ``datetime.utcnow()`` is deprecated since Python 3.12, so the value is
    read from an aware UTC clock and the tzinfo is stripped afterwards. The
    result is kept naive because callers in this module compare it against
    naive datetimes and naive ISO-8601 strings.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
|
|
|
|
|
|
def _now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string (no UTC offset).

    Uses an aware UTC clock with the tzinfo stripped instead of the
    deprecated ``datetime.utcnow()``; the resulting text format is unchanged.
    """
    current = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return current.isoformat()
|
|
|
|
|
|
# ─── Abstract ─────────────────────────────────────────────────────────────────
|
|
|
|
class SignatureStateStore(ABC):
    """Interface for per-signature cooldown and occurrence tracking."""

    @abstractmethod
    def should_run_triage(
        self,
        signature: str,
        cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES,
    ) -> bool:
        """Tell whether triage may proceed, i.e. the cooldown has elapsed."""

    @abstractmethod
    def mark_alert_seen(self, signature: str) -> None:
        """Note that an alert carrying this signature was observed.

        Implementations also maintain the occurrences_60m rolling bucket.
        """

    @abstractmethod
    def mark_triage_run(self, signature: str) -> None:
        """Note that triage has been executed for this signature."""

    @abstractmethod
    def get_state(self, signature: str) -> Optional[Dict]:
        """Fetch the raw state dict for the signature, or None if absent."""

    @abstractmethod
    def list_active_signatures(
        self,
        window_minutes: int = 60,
        limit: int = 100,
    ) -> List[Dict]:
        """List signatures seen within the last window_minutes.

        Results are ordered by occurrences_60m, highest first.
        """
|
|
|
|
|
|
# ─── Memory backend ────────────────────────────────────────────────────────────
|
|
|
|
class MemorySignatureStateStore(SignatureStateStore):
    """In-process signature state kept in a dict guarded by a lock.

    Timestamps are stored as ISO-8601 strings produced by ``_now_iso()`` and
    are compared as strings against ISO-formatted cutoffs.
    """

    BUCKET_MINUTES = 60  # rolling window for occurrences_60m

    def __init__(self):
        self._states: Dict[str, Dict] = {}
        self._lock = threading.Lock()

    def _update_bucket(self, state: Dict, now: str) -> None:
        """Advance the 60-minute occurrence bucket of ``state`` in place.

        The counter restarts at 1 (and the bucket start moves to ``now``)
        when the current bucket began before the window; otherwise the
        counter is incremented.
        """
        window = datetime.timedelta(minutes=self.BUCKET_MINUTES)
        started = state.get("occurrences_60m_bucket_start") or ""
        oldest_allowed = (_now_dt() - window).isoformat()
        if started >= oldest_allowed:
            state["occurrences_60m"] = state.get("occurrences_60m", 0) + 1
            return
        state["occurrences_60m"] = 1
        state["occurrences_60m_bucket_start"] = now

    def should_run_triage(
        self,
        signature: str,
        cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES,
    ) -> bool:
        """Return True when no triage is recorded or the cooldown has passed."""
        with self._lock:
            entry = self._states.get(signature)
            last_run = entry.get("last_triage_at") if entry is not None else None
            if not last_run:
                return True
            threshold = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
            return last_run < threshold.isoformat()

    def mark_alert_seen(self, signature: str) -> None:
        """Record an alert sighting, creating the state entry if needed."""
        stamp = _now_iso()
        with self._lock:
            if signature in self._states:
                entry = self._states[signature]
                entry["last_alert_at"] = stamp
                entry["updated_at"] = stamp
                self._update_bucket(entry, stamp)
                return
            # First sighting: the bucket starts now with a single occurrence.
            self._states[signature] = {
                "signature": signature,
                "last_triage_at": None,
                "last_alert_at": stamp,
                "triage_count_24h": 0,
                "occurrences_60m": 1,
                "occurrences_60m_bucket_start": stamp,
                "updated_at": stamp,
            }

    def mark_triage_run(self, signature: str) -> None:
        """Record a triage run and maintain the 24-hour triage counter."""
        stamp = _now_iso()
        day_ago = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        with self._lock:
            if signature in self._states:
                entry = self._states[signature]
                previous = entry.get("last_triage_at") or ""
                # Restart the counter when the previous run is missing or
                # older than 24 hours; otherwise count this run on top.
                if previous >= day_ago:
                    entry["triage_count_24h"] = entry.get("triage_count_24h", 0) + 1
                else:
                    entry["triage_count_24h"] = 1
                entry["last_triage_at"] = stamp
                entry["updated_at"] = stamp
                return
            self._states[signature] = {
                "signature": signature,
                "last_triage_at": stamp,
                "last_alert_at": stamp,
                "triage_count_24h": 1,
                "occurrences_60m": 0,
                "occurrences_60m_bucket_start": stamp,
                "updated_at": stamp,
            }

    def get_state(self, signature: str) -> Optional[Dict]:
        """Return a shallow copy of the stored state, or None if absent."""
        with self._lock:
            entry = self._states.get(signature)
            if not entry:
                return None
            return dict(entry)

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return copies of states whose last alert falls inside the window.

        The result is sorted by ``occurrences_60m`` descending and truncated
        to ``limit`` entries.
        """
        oldest = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        recent: List[Dict] = []
        with self._lock:
            for entry in self._states.values():
                seen_at = entry.get("last_alert_at") or ""
                if seen_at >= oldest:
                    recent.append(dict(entry))
        recent.sort(key=lambda item: item.get("occurrences_60m", 0), reverse=True)
        return recent[:limit]
|
|
|
|
|
|
# ─── Postgres backend ──────────────────────────────────────────────────────────
|
|
|
|
class PostgresSignatureStateStore(SignatureStateStore):
    """Signature state persisted in the ``incident_signature_state`` table.

    Each thread lazily opens its own psycopg2 connection (held in a
    ``threading.local``) with autocommit enabled. Every statement runs
    through ``_execute``, which closes its cursor even when the statement
    raises, so a failing query no longer leaks a cursor.
    """

    # Shared SELECT prefix; the column order must match _row_to_state().
    _SELECT_STATE = (
        "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
        "occurrences_60m, occurrences_60m_bucket_start "
        "FROM incident_signature_state "
    )

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()

    def _conn(self):
        """Return this thread's connection, (re)connecting when missing or closed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            # Imported lazily so the module can be loaded without psycopg2.
            import psycopg2  # type: ignore

            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn

    def _execute(self, sql: str, params: tuple, fetch: Optional[str] = None):
        """Run one statement and always close the cursor.

        Args:
            sql: Statement text with ``%s`` placeholders.
            params: Values bound to the placeholders.
            fetch: ``"one"`` to return ``fetchone()``, ``"all"`` to return
                ``fetchall()``, or None to return nothing.

        Returns:
            The fetched row, the list of rows, or None.
        """
        cur = self._conn().cursor()
        try:
            cur.execute(sql, params)
            if fetch == "one":
                return cur.fetchone()
            if fetch == "all":
                return cur.fetchall()
            return None
        finally:
            cur.close()

    @staticmethod
    def _iso(value):
        """Render datetime-like values via ``isoformat()``; pass others through."""
        return value.isoformat() if hasattr(value, "isoformat") else value

    @classmethod
    def _row_to_state(cls, row) -> Dict:
        """Map one row selected with ``_SELECT_STATE`` to a state dict."""
        sig, lta, laa, cnt, upd, occ60, occ_start = row
        return {
            "signature": sig,
            "last_triage_at": cls._iso(lta),
            "last_alert_at": cls._iso(laa),
            "triage_count_24h": cnt,
            "updated_at": cls._iso(upd),
            "occurrences_60m": occ60 or 0,
            "occurrences_60m_bucket_start": cls._iso(occ_start),
        }

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True when no triage is recorded or the cooldown has passed."""
        row = self._execute(
            "SELECT last_triage_at FROM incident_signature_state WHERE signature=%s",
            (signature,),
            fetch="one",
        )
        if not row or row[0] is None:
            return True
        cutoff = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
        last = row[0]
        # NOTE(review): the offset is dropped without converting to UTC. This
        # mirrors the naive UTC strings this class writes, and presumably
        # relies on a consistent session time zone — confirm.
        if hasattr(last, "tzinfo") and last.tzinfo:
            last = last.replace(tzinfo=None)
        return last < cutoff

    def mark_alert_seen(self, signature: str) -> None:
        """Upsert an alert sighting and maintain the 60-minute bucket.

        On conflict the counter restarts at 1 (and the bucket start moves to
        now) when the stored bucket start is NULL or older than 60 minutes;
        otherwise the counter is incremented.
        """
        now = _now_iso()
        cutoff_60m = (_now_dt() - datetime.timedelta(minutes=60)).isoformat()
        self._execute(
            """INSERT INTO incident_signature_state
                (signature, last_alert_at, triage_count_24h, updated_at,
                 occurrences_60m, occurrences_60m_bucket_start)
            VALUES (%s, %s, 0, %s, 1, %s)
            ON CONFLICT (signature) DO UPDATE
            SET last_alert_at=EXCLUDED.last_alert_at,
                updated_at=EXCLUDED.updated_at,
                occurrences_60m = CASE
                    WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                         OR incident_signature_state.occurrences_60m_bucket_start < %s
                    THEN 1
                    ELSE incident_signature_state.occurrences_60m + 1
                END,
                occurrences_60m_bucket_start = CASE
                    WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                         OR incident_signature_state.occurrences_60m_bucket_start < %s
                    THEN EXCLUDED.occurrences_60m_bucket_start
                    ELSE incident_signature_state.occurrences_60m_bucket_start
                END""",
            (signature, now, now, now, cutoff_60m, cutoff_60m),
        )

    def mark_triage_run(self, signature: str) -> None:
        """Upsert a triage run and maintain the 24-hour triage counter.

        On conflict the counter restarts at 1 when the stored
        ``last_triage_at`` is NULL or older than 24 hours; otherwise it is
        incremented.
        """
        now = _now_iso()
        cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        self._execute(
            """INSERT INTO incident_signature_state
                (signature, last_triage_at, last_alert_at, triage_count_24h, updated_at,
                 occurrences_60m, occurrences_60m_bucket_start)
            VALUES (%s, %s, %s, 1, %s, 0, %s)
            ON CONFLICT (signature) DO UPDATE
            SET last_triage_at=EXCLUDED.last_triage_at,
                triage_count_24h = CASE
                    WHEN incident_signature_state.last_triage_at IS NULL
                         OR incident_signature_state.last_triage_at < %s
                    THEN 1
                    ELSE incident_signature_state.triage_count_24h + 1
                END,
                updated_at=EXCLUDED.updated_at""",
            (signature, now, now, now, now, cutoff_24h),
        )

    def get_state(self, signature: str) -> Optional[Dict]:
        """Return the state dict for the signature, or None when no row exists."""
        row = self._execute(
            self._SELECT_STATE + "WHERE signature=%s",
            (signature,),
            fetch="one",
        )
        if not row:
            return None
        return self._row_to_state(row)

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return states with ``last_alert_at`` inside the window.

        Rows are ordered by ``occurrences_60m`` descending (NULLs last) and
        capped at ``limit``.
        """
        cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        fetched = self._execute(
            self._SELECT_STATE
            + "WHERE last_alert_at >= %s "
            + "ORDER BY occurrences_60m DESC NULLS LAST LIMIT %s",
            (cutoff, limit),
            fetch="all",
        )
        return [self._row_to_state(row) for row in fetched]
|
|
|
|
|
|
# ─── Auto backend ──────────────────────────────────────────────────────────────
|
|
|
|
class AutoSignatureStateStore(SignatureStateStore):
    """Postgres-backed store that falls back to an in-memory store on errors.

    After a Postgres failure all calls go to the memory store until
    ``_RECOVERY_S`` seconds have elapsed, after which Postgres is tried again.
    """

    _RECOVERY_S = 300

    def __init__(self, pg_dsn: str):
        self._lock = threading.Lock()
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresSignatureStateStore] = None
        self._fallback = MemorySignatureStateStore()
        self._using_fallback = False
        self._since: float = 0.0

    def _get_primary(self) -> PostgresSignatureStateStore:
        """Create the Postgres store on first use and return it."""
        if self._primary is not None:
            return self._primary
        with self._lock:
            # Re-check under the lock so only one instance is ever created.
            if self._primary is None:
                self._primary = PostgresSignatureStateStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self):
        """Leave fallback mode once the recovery interval has elapsed."""
        if not self._using_fallback:
            return
        elapsed = time.monotonic() - self._since
        if elapsed >= self._RECOVERY_S:
            self._using_fallback = False

    def _delegate(self, method: str, *args, **kwargs):
        """Call ``method`` on Postgres, or on the memory store when degraded.

        Any exception from the Postgres call is logged, switches the store
        into fallback mode, and the same call is then served by the memory
        store.
        """
        self._maybe_recover()
        if self._using_fallback:
            return getattr(self._fallback, method)(*args, **kwargs)
        try:
            return getattr(self._get_primary(), method)(*args, **kwargs)
        except Exception as exc:
            logger.warning("AutoSignatureStateStore Postgres failed: %s", exc)
            self._using_fallback = True
            self._since = time.monotonic()
        return getattr(self._fallback, method)(*args, **kwargs)

    def should_run_triage(self, signature, cooldown_minutes=DEFAULT_COOLDOWN_MINUTES):
        """Delegate the cooldown check to the active backend."""
        verdict = self._delegate("should_run_triage", signature, cooldown_minutes)
        return verdict

    def mark_alert_seen(self, signature):
        """Delegate the alert sighting to the active backend."""
        self._delegate("mark_alert_seen", signature)

    def mark_triage_run(self, signature):
        """Delegate the triage-run record to the active backend."""
        self._delegate("mark_triage_run", signature)

    def get_state(self, signature):
        """Delegate the state lookup to the active backend."""
        state = self._delegate("get_state", signature)
        return state

    def list_active_signatures(self, window_minutes=60, limit=100):
        """Delegate the active-signature listing to the active backend."""
        listing = self._delegate("list_active_signatures", window_minutes, limit)
        return listing
|
|
|
|
|
|
# ─── Singleton ────────────────────────────────────────────────────────────────
|
|
|
|
# Process-wide store instance; created lazily by get_signature_state_store()
# and replaceable through set_signature_state_store().
_sig_store: Optional[SignatureStateStore] = None

# Guards creation and replacement of _sig_store.
_sig_lock = threading.Lock()
|
|
|
|
|
|
def get_signature_state_store() -> SignatureStateStore:
    """Return the shared store, building it on first use.

    Creation happens under ``_sig_lock`` with a second check inside the lock
    so concurrent first callers end up with a single instance.
    """
    global _sig_store
    if _sig_store is not None:
        return _sig_store
    with _sig_lock:
        if _sig_store is None:
            _sig_store = _create_sig_store()
    return _sig_store
|
|
|
|
|
|
def set_signature_state_store(store: Optional[SignatureStateStore]) -> None:
    """Replace the shared store; passing None clears it.

    After clearing, the next ``get_signature_state_store()`` call builds a
    fresh store.
    """
    global _sig_store
    _sig_lock.acquire()
    try:
        _sig_store = store
    finally:
        _sig_lock.release()
|
|
|
|
|
|
def _create_sig_store() -> SignatureStateStore:
    """Build the store selected by environment variables.

    ``ALERT_BACKEND`` (case-insensitive, default ``"memory"``) picks the
    backend: ``"postgres"`` or ``"auto"`` use the DSN taken from
    ``DATABASE_URL``, falling back to ``ALERT_DATABASE_URL``. Any other
    value, or a missing DSN, yields the in-memory store.

    A Postgres-based backend requested without a DSN is logged as a warning
    instead of silently degrading to process-local state.

    Returns:
        The store instance for the configured backend.
    """
    # strip() tolerates stray whitespace around the configured value.
    backend = os.getenv("ALERT_BACKEND", "memory").strip().lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL", "")

    if backend in ("postgres", "auto"):
        if dsn:
            if backend == "postgres":
                return PostgresSignatureStateStore(dsn)
            return AutoSignatureStateStore(dsn)
        logger.warning(
            "ALERT_BACKEND=%s but neither DATABASE_URL nor ALERT_DATABASE_URL is set; "
            "using in-memory signature state store",
            backend,
        )
    return MemorySignatureStateStore()
|