Files
microdao-daarion/services/router/signature_state_store.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

377 lines
15 KiB
Python

"""
signature_state_store.py — Cooldown tracking per incident signature.
Prevents triage from running too frequently for the same failure type.
A "signature" is the same one computed by alert_routing.compute_incident_signature.
Backends:
- MemorySignatureStateStore (tests / single-process)
- PostgresSignatureStateStore (production)
- AutoSignatureStateStore (Postgres → Memory fallback)
Table: incident_signature_state
signature text PK, last_triage_at timestamptz, last_alert_at timestamptz,
triage_count_24h int, updated_at timestamptz,
occurrences_60m, occurrences_60m_bucket_start (rolling alert counter; column types per the DDL)
DDL: ops/scripts/migrate_alerts_postgres.py
"""
from __future__ import annotations
import datetime
import logging
import os
import threading
import time
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Default value of the ``cooldown_minutes`` parameter of ``should_run_triage``.
DEFAULT_COOLDOWN_MINUTES = 15
def _now_dt() -> datetime.datetime:
    """Return the current UTC time as a naive ``datetime`` (``tzinfo`` is None).

    ``datetime.utcnow()`` is deprecated since Python 3.12, so the value is
    taken from an aware UTC clock and the tzinfo is then dropped. The result
    is the same kind of naive-UTC value the rest of this module compares
    against and formats with ``isoformat()``.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
def _now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string (no offset suffix).

    Built from an aware UTC clock with the tzinfo dropped, because
    ``datetime.utcnow()`` is deprecated since Python 3.12. The string has no
    ``+00:00`` suffix, so it stays comparable with the other naive ISO
    strings this module stores.
    """
    return (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )
# ─── Abstract ─────────────────────────────────────────────────────────────────
class SignatureStateStore(ABC):
    """Abstract interface for per-signature cooldown state.

    Concrete backends implement every method below; the class itself cannot
    be instantiated.
    """

    @abstractmethod
    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Tell whether triage may proceed for ``signature``.

        Returns True once the cooldown of ``cooldown_minutes`` has passed.
        """

    @abstractmethod
    def mark_alert_seen(self, signature: str) -> None:
        """Note that an alert carrying ``signature`` was observed.

        Implementations also maintain the rolling ``occurrences_60m`` bucket.
        """

    @abstractmethod
    def mark_triage_run(self, signature: str) -> None:
        """Note that triage was executed for ``signature``."""

    @abstractmethod
    def get_state(self, signature: str) -> Optional[Dict]:
        """Fetch the raw state dict for ``signature``, or None if absent."""

    @abstractmethod
    def list_active_signatures(
        self, window_minutes: int = 60, limit: int = 100
    ) -> List[Dict]:
        """List signatures seen within the last ``window_minutes``.

        Results are ordered by ``occurrences_60m`` descending.
        """
# ─── Memory backend ────────────────────────────────────────────────────────────
class MemorySignatureStateStore(SignatureStateStore):
    """In-process backend holding one state dict per signature.

    Access to the internal mapping is guarded by a ``threading.Lock``.
    Timestamps are kept as ISO strings produced by ``_now_iso`` and are
    compared as strings.
    """

    BUCKET_MINUTES = 60  # rolling window for occurrences_60m

    def __init__(self):
        self._states: Dict[str, Dict] = {}
        self._lock = threading.Lock()

    def _update_bucket(self, state: Dict, now: str) -> None:
        """Advance the rolling occurrence counter of ``state`` in place.

        If the bucket started within the last ``BUCKET_MINUTES`` the counter
        is incremented; otherwise a new bucket starting at ``now`` is opened
        with a count of 1.
        """
        window = datetime.timedelta(minutes=self.BUCKET_MINUTES)
        oldest_allowed = (_now_dt() - window).isoformat()
        started = state.get("occurrences_60m_bucket_start") or ""
        if started >= oldest_allowed:
            state["occurrences_60m"] = state.get("occurrences_60m", 0) + 1
            return
        state["occurrences_60m"] = 1
        state["occurrences_60m_bucket_start"] = now

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True when no triage is recorded or the cooldown has elapsed."""
        with self._lock:
            entry = self._states.get(signature)
            previous = entry.get("last_triage_at") if entry is not None else None
            if not previous:
                # Unknown signature, or one that was only ever seen as an alert.
                return True
            threshold = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
            return previous < threshold.isoformat()

    def mark_alert_seen(self, signature: str) -> None:
        """Record an alert: refresh timestamps and bump the rolling bucket."""
        stamp = _now_iso()
        with self._lock:
            existing = self._states.get(signature)
            if existing is not None:
                existing["last_alert_at"] = stamp
                existing["updated_at"] = stamp
                self._update_bucket(existing, stamp)
                return
            # First sighting: the bucket opens with this alert counted once.
            self._states[signature] = {
                "signature": signature,
                "last_triage_at": None,
                "last_alert_at": stamp,
                "triage_count_24h": 0,
                "occurrences_60m": 1,
                "occurrences_60m_bucket_start": stamp,
                "updated_at": stamp,
            }

    def mark_triage_run(self, signature: str) -> None:
        """Record a triage run and maintain ``triage_count_24h``.

        The counter restarts at 1 when the previous triage is missing or
        older than 24 hours; otherwise it is incremented.
        """
        stamp = _now_iso()
        day_ago = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        with self._lock:
            existing = self._states.get(signature)
            if existing is None:
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": stamp,
                    "last_alert_at": stamp,
                    "triage_count_24h": 1,
                    "occurrences_60m": 0,
                    "occurrences_60m_bucket_start": stamp,
                    "updated_at": stamp,
                }
                return
            last_run = existing.get("last_triage_at") or ""
            if last_run >= day_ago:
                existing["triage_count_24h"] = existing.get("triage_count_24h", 0) + 1
            else:
                existing["triage_count_24h"] = 1
            existing["last_triage_at"] = stamp
            existing["updated_at"] = stamp

    def get_state(self, signature: str) -> Optional[Dict]:
        """Return a shallow copy of the stored state, or None if unknown."""
        with self._lock:
            found = self._states.get(signature)
            if not found:
                return None
            return dict(found)

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return copies of states whose last alert falls inside the window.

        Sorted by ``occurrences_60m`` descending and truncated to ``limit``.
        """
        threshold = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        with self._lock:
            recent = []
            for entry in self._states.values():
                if (entry.get("last_alert_at") or "") >= threshold:
                    recent.append(dict(entry))
        recent.sort(key=lambda item: item.get("occurrences_60m", 0), reverse=True)
        return recent[:limit]
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresSignatureStateStore(SignatureStateStore):
    """Backend that persists state in the ``incident_signature_state`` table.

    A psycopg2 connection is cached per thread (``threading.local``) and put
    in autocommit mode. Every statement runs through ``_execute``, which
    closes its cursor even when the statement raises.
    """

    BUCKET_MINUTES = 60  # rolling window for occurrences_60m

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()

    def _conn(self):
        """Return this thread's connection, (re)connecting if missing or closed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn

    def _execute(self, sql: str, params: tuple, fetch: Optional[str] = None):
        """Run one statement and always close the cursor.

        ``fetch`` selects the result: ``"one"`` returns ``fetchone()``,
        ``"all"`` returns ``fetchall()``, anything else returns None.
        """
        cur = self._conn().cursor()
        try:
            cur.execute(sql, params)
            if fetch == "one":
                return cur.fetchone()
            if fetch == "all":
                return cur.fetchall()
            return None
        finally:
            # Close on the error path too, so a failing query does not leak
            # the cursor on the long-lived per-thread connection.
            cur.close()

    @staticmethod
    def _iso(value):
        """Return ``value.isoformat()`` when available, else ``value`` unchanged."""
        return value.isoformat() if hasattr(value, "isoformat") else value

    @classmethod
    def _row_to_state(cls, row) -> Dict:
        """Convert one 7-column result row into the public state dict."""
        sig, lta, laa, cnt, upd, occ60, occ_start = row
        return {
            "signature": sig,
            "last_triage_at": cls._iso(lta),
            "last_alert_at": cls._iso(laa),
            "triage_count_24h": cnt,
            "updated_at": cls._iso(upd),
            "occurrences_60m": occ60 or 0,
            "occurrences_60m_bucket_start": cls._iso(occ_start),
        }

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True when no triage is recorded or the cooldown has elapsed."""
        row = self._execute(
            "SELECT last_triage_at FROM incident_signature_state WHERE signature=%s",
            (signature,),
            fetch="one",
        )
        if not row or row[0] is None:
            return True
        cutoff = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
        last = row[0]
        if hasattr(last, "tzinfo") and last.tzinfo:
            # NOTE(review): tzinfo is dropped without converting to UTC. This
            # matches how this class writes naive timestamp strings, but
            # presumably relies on a stable session TimeZone - confirm.
            last = last.replace(tzinfo=None)
        return last < cutoff

    def mark_alert_seen(self, signature: str) -> None:
        """Upsert the row for ``signature`` and advance the rolling bucket.

        On conflict the bucket restarts at 1 when its start is NULL or older
        than ``BUCKET_MINUTES``; otherwise the counter is incremented.
        """
        now = _now_iso()
        cutoff = (_now_dt() - datetime.timedelta(minutes=self.BUCKET_MINUTES)).isoformat()
        self._execute(
            """INSERT INTO incident_signature_state
                   (signature, last_alert_at, triage_count_24h, updated_at,
                    occurrences_60m, occurrences_60m_bucket_start)
               VALUES (%s, %s, 0, %s, 1, %s)
               ON CONFLICT (signature) DO UPDATE
               SET last_alert_at=EXCLUDED.last_alert_at,
                   updated_at=EXCLUDED.updated_at,
                   occurrences_60m = CASE
                       WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                            OR incident_signature_state.occurrences_60m_bucket_start < %s
                       THEN 1
                       ELSE incident_signature_state.occurrences_60m + 1
                   END,
                   occurrences_60m_bucket_start = CASE
                       WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                            OR incident_signature_state.occurrences_60m_bucket_start < %s
                       THEN EXCLUDED.occurrences_60m_bucket_start
                       ELSE incident_signature_state.occurrences_60m_bucket_start
                   END""",
            (signature, now, now, now, cutoff, cutoff),
        )

    def mark_triage_run(self, signature: str) -> None:
        """Upsert the row for ``signature`` and maintain ``triage_count_24h``.

        On conflict the counter restarts at 1 when the previous triage is
        NULL or older than 24 hours; otherwise it is incremented. The
        conflict path does not touch ``last_alert_at``.
        """
        now = _now_iso()
        cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        self._execute(
            """INSERT INTO incident_signature_state
                   (signature, last_triage_at, last_alert_at, triage_count_24h, updated_at,
                    occurrences_60m, occurrences_60m_bucket_start)
               VALUES (%s, %s, %s, 1, %s, 0, %s)
               ON CONFLICT (signature) DO UPDATE
               SET last_triage_at=EXCLUDED.last_triage_at,
                   triage_count_24h = CASE
                       WHEN incident_signature_state.last_triage_at IS NULL
                            OR incident_signature_state.last_triage_at < %s
                       THEN 1
                       ELSE incident_signature_state.triage_count_24h + 1
                   END,
                   updated_at=EXCLUDED.updated_at""",
            (signature, now, now, now, now, cutoff_24h),
        )

    def get_state(self, signature: str) -> Optional[Dict]:
        """Return the state dict for ``signature``, or None when no row exists."""
        row = self._execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state WHERE signature=%s",
            (signature,),
            fetch="one",
        )
        if not row:
            return None
        return self._row_to_state(row)

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return states whose ``last_alert_at`` is within ``window_minutes``.

        Ordered by ``occurrences_60m`` descending (NULLs last), at most
        ``limit`` rows.
        """
        cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        fetched = self._execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state "
            "WHERE last_alert_at >= %s "
            "ORDER BY occurrences_60m DESC NULLS LAST LIMIT %s",
            (cutoff, limit),
            fetch="all",
        )
        return [self._row_to_state(row) for row in fetched]
# ─── Auto backend ──────────────────────────────────────────────────────────────
class AutoSignatureStateStore(SignatureStateStore):
    """Postgres-first store that falls back to an in-memory store on failure.

    When a Postgres call raises, the call and all later ones are served by a
    ``MemorySignatureStateStore`` until ``_RECOVERY_S`` seconds have passed,
    after which Postgres is tried again. State written to the fallback is not
    copied back to Postgres.
    """

    _RECOVERY_S = 300  # seconds to stay on the fallback before retrying Postgres

    def __init__(self, pg_dsn: str):
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresSignatureStateStore] = None
        self._fallback = MemorySignatureStateStore()
        self._using_fallback = False
        self._since: float = 0.0  # time.monotonic() of the last failover
        self._lock = threading.Lock()

    def _get_primary(self) -> PostgresSignatureStateStore:
        """Create the Postgres store on first use and return it."""
        if self._primary is None:
            with self._lock:
                if self._primary is None:
                    self._primary = PostgresSignatureStateStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self) -> None:
        """Switch back to Postgres once the recovery window has elapsed."""
        if not self._using_fallback:
            return
        with self._lock:
            if self._using_fallback and time.monotonic() - self._since >= self._RECOVERY_S:
                self._using_fallback = False
                logger.info("AutoSignatureStateStore retrying Postgres after fallback window")

    def _delegate(self, method: str, *args, **kwargs):
        """Call ``method`` on Postgres, or on the memory store after a failure."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return getattr(self._get_primary(), method)(*args, **kwargs)
            except Exception as e:
                logger.warning("AutoSignatureStateStore Postgres failed: %s", e)
                with self._lock:
                    # Write the timestamp before the flag, under the lock, so
                    # a concurrent _maybe_recover never pairs the flag with a
                    # stale _since and flips straight back to Postgres.
                    self._since = time.monotonic()
                    self._using_fallback = True
        return getattr(self._fallback, method)(*args, **kwargs)

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Delegate ``should_run_triage`` to the active backend."""
        return self._delegate("should_run_triage", signature, cooldown_minutes)

    def mark_alert_seen(self, signature: str) -> None:
        """Delegate ``mark_alert_seen`` to the active backend."""
        self._delegate("mark_alert_seen", signature)

    def mark_triage_run(self, signature: str) -> None:
        """Delegate ``mark_triage_run`` to the active backend."""
        self._delegate("mark_triage_run", signature)

    def get_state(self, signature: str) -> Optional[Dict]:
        """Delegate ``get_state`` to the active backend."""
        return self._delegate("get_state", signature)

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Delegate ``list_active_signatures`` to the active backend."""
        return self._delegate("list_active_signatures", window_minutes, limit)
# ─── Singleton ────────────────────────────────────────────────────────────────
# Process-wide store instance; None until first requested or after being cleared.
_sig_store: Optional[SignatureStateStore] = None
# Guards creation and replacement of ``_sig_store``.
_sig_lock = threading.Lock()
def get_signature_state_store() -> SignatureStateStore:
    """Return the process-wide store, building it on first use.

    The instance is created by ``_create_sig_store`` under ``_sig_lock`` and
    reused by later calls.
    """
    global _sig_store
    current = _sig_store
    if current is not None:
        return current
    with _sig_lock:
        # Re-check inside the lock: another thread may have created it.
        if _sig_store is None:
            _sig_store = _create_sig_store()
        current = _sig_store
    return current
def set_signature_state_store(store: Optional[SignatureStateStore]) -> None:
    """Install ``store`` as the process-wide instance.

    Passing None clears it, so the next ``get_signature_state_store`` call
    builds a new one.
    """
    global _sig_store
    _sig_lock.acquire()
    try:
        _sig_store = store
    finally:
        _sig_lock.release()
def _create_sig_store() -> SignatureStateStore:
    """Build the store selected by environment variables.

    ``ALERT_BACKEND`` (default ``"memory"``, case-insensitive, surrounding
    whitespace ignored) picks the backend:

    - ``postgres``: ``PostgresSignatureStateStore``
    - ``auto``: ``AutoSignatureStateStore``
    - anything else: ``MemorySignatureStateStore``

    The DSN comes from ``DATABASE_URL``, falling back to
    ``ALERT_DATABASE_URL``. If ``postgres`` or ``auto`` is requested without
    a DSN, the in-memory store is returned and a warning is logged.
    """
    backend = os.getenv("ALERT_BACKEND", "memory").strip().lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL", "")
    if backend in ("postgres", "auto"):
        if dsn:
            if backend == "postgres":
                return PostgresSignatureStateStore(dsn)
            return AutoSignatureStateStore(dsn)
        # Make the downgrade visible: cooldown state would otherwise silently
        # live only in this process.
        logger.warning(
            "ALERT_BACKEND=%s but neither DATABASE_URL nor ALERT_DATABASE_URL is set; "
            "using in-memory signature state store",
            backend,
        )
    return MemorySignatureStateStore()