feat(platform): add new services, tools, tests and crews modules

New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:14:14 -08:00
parent e9dedffa48
commit 129e4ea1fc
241 changed files with 69349 additions and 0 deletions

View File

@@ -0,0 +1,138 @@
"""
alert_ingest.py — Alert ingestion business logic.
Handles:
- AlertEvent validation and normalization
- Dedupe-aware ingestion via AlertStore
- list/get/ack helpers used by alert_ingest_tool handler
"""
from __future__ import annotations
import hashlib
import re
import logging
from typing import Any, Dict, List, Optional
from alert_store import (
AlertStore,
_compute_dedupe_key,
_redact_text,
_sanitize_alert,
MAX_LOG_SAMPLES,
)
logger = logging.getLogger(__name__)
# ─── Validation ────────────────────────────────────────────────────────────────
VALID_SEVERITIES = {"P0", "P1", "P2", "P3", "INFO"}
VALID_KINDS = {
    "slo_breach", "crashloop", "latency", "error_rate",
    "disk", "oom", "deploy", "security", "custom",
}
VALID_ENVS = {"prod", "staging", "dev", "any"}
def validate_alert(data: Dict) -> Optional[str]:
    """Check required fields and enum values; return an error message, or None when valid."""
    for required in ("service", "title"):
        if not data.get(required):
            return f"alert.{required} is required"
    if data.get("severity", "P2") not in VALID_SEVERITIES:
        return f"alert.severity must be one of {VALID_SEVERITIES}"
    if data.get("kind", "custom") not in VALID_KINDS:
        return f"alert.kind must be one of {VALID_KINDS}"
    return None
def normalize_alert(data: Dict) -> Dict:
    """Sanitize an alert payload and fill in default fields.

    Log samples in evidence are capped at MAX_LOG_SAMPLES entries and each
    sample is redacted down to 300 characters.
    """
    alert = _sanitize_alert(data)
    defaults = {
        "kind": "custom",
        "env": "prod",
        "severity": "P2",
        "labels": {},
        "metrics": {},
        "links": [],
        "evidence": {},
    }
    for key, value in defaults.items():
        alert.setdefault(key, value)
    evidence = alert.get("evidence", {})
    samples = evidence.get("log_samples", [])
    trimmed = [_redact_text(sample, 300) for sample in samples[:MAX_LOG_SAMPLES]]
    alert["evidence"] = {**evidence, "log_samples": trimmed}
    return alert
# ─── Ingest ────────────────────────────────────────────────────────────────────
def ingest_alert(
    store: AlertStore,
    alert_data: Dict,
    dedupe_ttl_minutes: int = 30,
) -> Dict:
    """
    Validate and normalize the alert, then hand it to the store with dedupe.
    Returns the store's result dict, or {"accepted": False, "error": ...}
    when validation fails.
    """
    problem = validate_alert(alert_data)
    if problem is not None:
        return {"accepted": False, "error": problem}
    normalized = normalize_alert(alert_data)
    return store.ingest(normalized, dedupe_ttl_minutes=dedupe_ttl_minutes)
# ─── List/Get/Ack ──────────────────────────────────────────────────────────────
def list_alerts(
    store: AlertStore,
    service: Optional[str] = None,
    env: Optional[str] = None,
    window_minutes: int = 240,
    limit: int = 50,
) -> List[Dict]:
    """List recent alerts via the store, filtered by service/env.

    env="any" disables the environment filter; limit is capped at 200.
    """
    criteria: Dict = {}
    if service:
        criteria["service"] = service
    if env and env != "any":
        criteria["env"] = env
    criteria["window_minutes"] = window_minutes
    return store.list_alerts(criteria, limit=min(limit, 200))
def get_alert(store: AlertStore, alert_ref: str) -> Optional[Dict]:
    """Fetch a single alert by id/reference; None when not found."""
    found = store.get_alert(alert_ref)
    return found
def ack_alert(store: AlertStore, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
    """Acknowledge an alert; the free-text note is redacted and capped at 500 chars.

    Returns None for an empty alert_ref, otherwise the store's ack result.
    """
    if not alert_ref:
        return None
    redacted_note = _redact_text(note, 500)
    return store.ack_alert(alert_ref, actor, redacted_note)
# ─── Dedupe helpers ────────────────────────────────────────────────────────────
def build_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str:
    """Build the canonical dedupe key for (service, env, kind[, fingerprint])."""
    key = _compute_dedupe_key(service, env, kind, fingerprint)
    return key
def map_alert_severity_to_incident(
    alert_severity: str,
    cap: str = "P1",
) -> str:
    """
    Map an alert severity onto an incident severity, never exceeding `cap`.
    e.g. alert P0 with cap P1 → P1. Unknown severities fall back to P2,
    unknown caps to P1.
    """
    rank = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
    severity = alert_severity if alert_severity in rank else "P2"
    ceiling = cap if cap in rank else "P1"
    # Keep whichever is LESS critical (numerically larger rank).
    return severity if rank[severity] >= rank[ceiling] else ceiling

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,574 @@
"""
architecture_pressure.py — Architecture Pressure Index (APIx) Engine.
DAARION.city | deterministic, no LLM.
Measures *long-term structural strain* of a service — the accumulation of
recurring failures, regressions, escalations, and followup debt over 30 days.
Contrast with Risk Engine (short-term operational health).
Public API:
load_pressure_policy() -> Dict
compute_pressure(service, env, ...) -> PressureReport
compute_pressure_dashboard(env, services, ...) -> DashboardResult
list_known_services(policy) -> List[str]
"""
from __future__ import annotations
import datetime
import logging
import yaml
from pathlib import Path
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
_PRESSURE_POLICY_CACHE: Optional[Dict] = None
_PRESSURE_POLICY_PATHS = [
    Path("config/architecture_pressure_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "architecture_pressure_policy.yml",
]
def load_pressure_policy() -> Dict:
    """Load and memoize the pressure policy YAML; fall back to built-in defaults."""
    global _PRESSURE_POLICY_CACHE
    if _PRESSURE_POLICY_CACHE is not None:
        return _PRESSURE_POLICY_CACHE
    for candidate in _PRESSURE_POLICY_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                loaded = yaml.safe_load(fh) or {}
            _PRESSURE_POLICY_CACHE = loaded
            return loaded
        except Exception as exc:
            logger.warning("Failed to load architecture_pressure_policy from %s: %s", candidate, exc)
    # Every candidate missing or unreadable → use the baked-in defaults.
    _PRESSURE_POLICY_CACHE = _builtin_pressure_defaults()
    return _PRESSURE_POLICY_CACHE
def _reload_pressure_policy() -> None:
global _PRESSURE_POLICY_CACHE
_PRESSURE_POLICY_CACHE = None
def _builtin_pressure_defaults() -> Dict:
return {
"defaults": {"lookback_days": 30, "top_n": 10},
"weights": {
"recurrence_high_30d": 20,
"recurrence_warn_30d": 10,
"regressions_30d": 15,
"escalations_30d": 12,
"followups_created_30d": 8,
"followups_overdue": 15,
"drift_failures_30d": 10,
"dependency_high_30d": 10,
},
"bands": {"low_max": 20, "medium_max": 45, "high_max": 70},
"priority_rules": {
"require_arch_review_at": 70,
"auto_create_followup": True,
"followup_priority": "P1",
"followup_due_days": 14,
"followup_owner": "cto",
},
"release_gate": {
"platform_review_required": {"enabled": True, "warn_at": 60, "fail_at": 85}
},
"digest": {
"output_dir": "ops/reports/platform",
"max_chars": 12000,
"top_n_in_digest": 10,
},
}
# ─── Band classifier ──────────────────────────────────────────────────────────
def classify_pressure_band(score: int, policy: Dict) -> str:
    """Bucket a pressure score into low/medium/high/critical per policy bands."""
    bands = policy.get("bands", {})
    ladder = (
        ("low", int(bands.get("low_max", 20))),
        ("medium", int(bands.get("medium_max", 45))),
        ("high", int(bands.get("high_max", 70))),
    )
    for band, upper in ladder:
        if score <= upper:
            return band
    return "critical"
# ─── Signal scoring helpers ───────────────────────────────────────────────────
def _score_signals(components: Dict, policy: Dict) -> int:
"""
Additive scoring:
recurrence_high_30d, recurrence_warn_30d — boolean (1/0)
regressions_30d, escalations_30d, ... — counts (capped internally)
"""
weights = policy.get("weights", {})
score = 0
# Boolean presence signals
for bool_key in ("recurrence_high_30d", "recurrence_warn_30d"):
if components.get(bool_key, 0):
score += int(weights.get(bool_key, 0))
# Count-based signals: weight applied per unit, capped at 3× weight
for count_key in (
"regressions_30d", "escalations_30d", "followups_created_30d",
"followups_overdue", "drift_failures_30d", "dependency_high_30d",
):
count = int(components.get(count_key, 0))
if count:
w = int(weights.get(count_key, 0))
# First occurrence = full weight, subsequent = half (diminishing)
score += w + (count - 1) * max(1, w // 2)
return max(0, score)
def _signals_summary(components: Dict, policy: Dict) -> List[str]:
"""Generate human-readable signal descriptions."""
summaries = []
if components.get("recurrence_high_30d"):
summaries.append("High-recurrence alert buckets in last 30d")
if components.get("recurrence_warn_30d"):
summaries.append("Warn-level recurrence in last 30d")
regressions = int(components.get("regressions_30d", 0))
if regressions:
summaries.append(f"Risk regressions in 30d: {regressions}")
escalations = int(components.get("escalations_30d", 0))
if escalations:
summaries.append(f"Escalations in 30d: {escalations}")
fu_created = int(components.get("followups_created_30d", 0))
if fu_created:
summaries.append(f"Follow-ups created in 30d: {fu_created}")
fu_overdue = int(components.get("followups_overdue", 0))
if fu_overdue:
summaries.append(f"Overdue follow-ups: {fu_overdue}")
drift = int(components.get("drift_failures_30d", 0))
if drift:
summaries.append(f"Drift gate failures in 30d: {drift}")
dep = int(components.get("dependency_high_30d", 0))
if dep:
summaries.append(f"Dependency HIGH/CRITICAL findings in 30d: {dep}")
return summaries
# ─── Signal collection from stores ───────────────────────────────────────────
def fetch_pressure_signals(
    service: str,
    env: str,
    lookback_days: int = 30,
    *,
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Collect all signals needed for compute_pressure from existing stores.

    Always non-fatal per store: each store is queried inside its own try
    block and a failure only logs a warning, leaving that signal at 0.
    Returns a components dict ready to pass to compute_pressure.
    """
    if policy is None:
        policy = load_pressure_policy()
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(days=lookback_days)
    ).isoformat()
    components: Dict = {
        "recurrence_high_30d": 0,
        "recurrence_warn_30d": 0,
        "regressions_30d": 0,
        "escalations_30d": 0,
        "followups_created_30d": 0,
        "followups_overdue": 0,
        "drift_failures_30d": 0,
        "dependency_high_30d": 0,
    }
    # ── Escalations + followups from incident_store ───────────────────────────
    try:
        if incident_store is not None:
            # Hoisted out of the event loop — one "today" per call.
            today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
            incs = incident_store.list_incidents({"service": service}, limit=100)
            for inc in incs:
                inc_id = inc.get("id", "")
                try:
                    events = incident_store.get_events(inc_id, limit=200)
                    for ev in events:
                        # ISO-8601 strings compare chronologically.
                        if ev.get("ts", "") < cutoff:
                            continue
                        ev_type = ev.get("type", "")
                        msg = ev.get("message") or ""
                        # Escalation decisions (message mentions "Escalat…").
                        if ev_type == "decision" and "Escalat" in msg:
                            components["escalations_30d"] += 1
                        # Follow-up creation events.
                        if ev_type in ("followup", "follow_up") or "followup" in msg.lower():
                            components["followups_created_30d"] += 1
                        # Overdue follow-ups: still open and past due date.
                        if ev_type == "followup":
                            due = ev.get("due_date", "")
                            if ev.get("status", "") == "open" and due and due < today:
                                components["followups_overdue"] += 1
                except Exception as e:
                    logger.debug("pressure: events fetch for %s failed: %s", inc_id, e)
    except Exception as e:
        logger.warning("pressure: incident_store fetch failed: %s", e)
    # ── Regressions from risk_history_store ───────────────────────────────────
    try:
        if risk_history_store is not None:
            series = risk_history_store.get_series(service, env, limit=90)
            # BUGFIX: the previous code re-ran this pairwise comparison once
            # per snapshot in the outer loop, inflating regressions_30d by up
            # to len(series)×. Count each consecutive score increase once.
            ordered = sorted(series, key=lambda s: s.get("ts", ""))
            for prev, curr in zip(ordered, ordered[1:]):
                if (curr.get("ts", "") >= cutoff
                        and curr.get("score", 0) > prev.get("score", 0)):
                    components["regressions_30d"] += 1
    except Exception as e:
        logger.warning("pressure: risk_history_store fetch failed: %s", e)
    # ── Recurrence from alert_store top_signatures ───────────────────────────
    try:
        if alert_store is not None:
            # Approximate the 30-day window via a large window_minutes.
            sigs = alert_store.top_signatures(
                window_minutes=lookback_days * 24 * 60, limit=30
            )
            # Simplified recurrence thresholds: >=6 occurrences → high,
            # >=3 → warn (flags, not counts).
            for sig in sigs:
                occ = int(sig.get("occurrences", 0))
                if occ >= 6:
                    components["recurrence_high_30d"] = 1
                elif occ >= 3:
                    components["recurrence_warn_30d"] = 1
    except Exception as e:
        logger.warning("pressure: alert_store recurrence fetch failed: %s", e)
    return components
# ─── Core engine ──────────────────────────────────────────────────────────────
def compute_pressure(
    service: str,
    env: str = "prod",
    *,
    components: Optional[Dict] = None,
    lookback_days: int = 30,
    policy: Optional[Dict] = None,
    # Optional stores for signal collection when components not pre-fetched
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
) -> Dict:
    """
    Compute the Architecture Pressure score for a single service.

    When `components` is supplied the stores are never touched; otherwise
    signals are collected from the stores (each failure is non-fatal).
    Returns a PressureReport dict.
    """
    policy = policy if policy is not None else load_pressure_policy()
    effective_days = lookback_days or int(
        policy.get("defaults", {}).get("lookback_days", 30)
    )
    if components is None:
        signal_set = fetch_pressure_signals(
            service, env, effective_days,
            incident_store=incident_store,
            alert_store=alert_store,
            risk_history_store=risk_history_store,
            policy=policy,
        )
    else:
        signal_set = dict(components)
    # Guarantee every expected key exists so scoring never KeyErrors.
    for key in (
        "recurrence_high_30d", "recurrence_warn_30d", "regressions_30d",
        "escalations_30d", "followups_created_30d", "followups_overdue",
        "drift_failures_30d", "dependency_high_30d",
    ):
        signal_set.setdefault(key, 0)
    score = _score_signals(signal_set, policy)
    review_threshold = int(
        policy.get("priority_rules", {}).get("require_arch_review_at", 70)
    )
    return {
        "service": service,
        "env": env,
        "lookback_days": effective_days,
        "score": score,
        "band": classify_pressure_band(score, policy),
        "components": signal_set,
        "signals_summary": _signals_summary(signal_set, policy),
        "requires_arch_review": score >= review_threshold,
        "computed_at": datetime.datetime.utcnow().isoformat(),
    }
# ─── Dashboard ────────────────────────────────────────────────────────────────
def compute_pressure_dashboard(
    env: str = "prod",
    services: Optional[List[str]] = None,
    top_n: int = 10,
    *,
    policy: Optional[Dict] = None,
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
    risk_reports: Optional[Dict[str, Dict]] = None,
) -> Dict:
    """
    Compute Architecture Pressure for multiple services and return a dashboard.

    `risk_reports` is an optional {service: RiskReport} mapping used to attach
    the current risk score/band to each entry for side-by-side comparison.
    """
    if policy is None:
        policy = load_pressure_policy()
    effective_top_n = top_n or int(policy.get("defaults", {}).get("top_n", 10))
    # Fall back to store-derived service discovery when no list is given.
    target_services = services or _list_services_from_stores(
        env=env, incident_store=incident_store, policy=policy
    )
    reports: List[Dict] = []
    for svc in target_services:
        try:
            entry = compute_pressure(
                svc, env,
                policy=policy,
                incident_store=incident_store,
                alert_store=alert_store,
                risk_history_store=risk_history_store,
            )
            # Enrich with the service's current risk report when available.
            if risk_reports and svc in risk_reports:
                rr = risk_reports[svc]
                entry["risk_score"] = rr.get("score")
                entry["risk_band"] = rr.get("band")
                entry["risk_delta_24h"] = (rr.get("trend") or {}).get("delta_24h")
            reports.append(entry)
        except Exception as e:
            logger.warning("pressure dashboard: compute_pressure failed for %s: %s", svc, e)
    reports.sort(key=lambda r: -r.get("score", 0))
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for entry in reports:
        band = entry.get("band", "low")
        band_counts[band] = band_counts.get(band, 0) + 1
    return {
        "env": env,
        "computed_at": datetime.datetime.utcnow().isoformat(),
        "top_pressure_services": reports[:effective_top_n],
        "band_counts": band_counts,
        "critical_services": [r["service"] for r in reports if r.get("band") == "critical"],
        "high_services": [r["service"] for r in reports if r.get("band") in ("high", "critical")],
        "arch_review_required": [r["service"] for r in reports if r.get("requires_arch_review")],
        "total_services_evaluated": len(reports),
    }
def _list_services_from_stores(
env: str,
incident_store=None,
policy: Optional[Dict] = None,
) -> List[str]:
"""Infer known services from incident store, falling back to SLO policy."""
services: set = set()
try:
if incident_store is not None:
incs = incident_store.list_incidents({}, limit=200)
for inc in incs:
svc = inc.get("service")
if svc:
services.add(svc)
except Exception as e:
logger.warning("pressure: list_services from incident_store failed: %s", e)
if not services:
# Fallback: read from SLO policy
try:
slo_paths = [
Path("config/slo_policy.yml"),
Path(__file__).resolve().parent.parent.parent / "config" / "slo_policy.yml",
]
for p in slo_paths:
if p.exists():
import yaml as _yaml
with open(p) as f:
slo = _yaml.safe_load(f) or {}
services.update(slo.get("services", {}).keys())
break
except Exception:
pass
return sorted(services)
# ─── Auto followup creation ───────────────────────────────────────────────────
def maybe_create_arch_review_followup(
    pressure_report: Dict,
    *,
    incident_store=None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
) -> Dict:
    """
    If pressure score >= require_arch_review_at and auto_create_followup=True,
    create an architecture-review follow-up on the latest open incident,
    creating a synthetic architecture_review incident when none is open.
    Deduped by key: arch_review:{YYYY-WW}:{service}

    Returns: {"created": bool, "dedupe_key": str|None, "skipped_reason": str|None}
    plus incident_id/due_date/priority on success.
    """
    if policy is None:
        policy = load_pressure_policy()
    service = pressure_report.get("service", "")
    score = int(pressure_report.get("score", 0))
    rules = policy.get("priority_rules", {})
    review_at = int(rules.get("require_arch_review_at", 70))
    auto_create = bool(rules.get("auto_create_followup", True))
    if score < review_at:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": f"score {score} < require_arch_review_at {review_at}"}
    if not auto_create:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": "auto_create_followup disabled"}
    if incident_store is None:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": "incident_store not available"}
    if week_str is None:
        week_str = datetime.datetime.utcnow().strftime("%Y-W%V")
    dedupe_key = f"arch_review:{week_str}:{service}"
    priority = rules.get("followup_priority", "P1")
    owner = rules.get("followup_owner", "cto")
    due_days = int(rules.get("followup_due_days", 14))
    due_date = (
        datetime.datetime.utcnow() + datetime.timedelta(days=due_days)
    ).strftime("%Y-%m-%d")
    try:
        incs = incident_store.list_incidents({"service": service}, limit=50)
        open_inc = next(
            (i for i in incs if i.get("status") in ("open", "triaged", "escalated")),
            None,
        )
        # BUGFIX: the dedupe check previously read the loop variable `inc`
        # (the last incident iterated — or NameError when the list was empty)
        # instead of the selected open incident. Inspect open_inc's events.
        if open_inc is not None:
            try:
                events = incident_store.get_events(open_inc.get("id", ""), limit=100)
                for ev in events:
                    if ev.get("dedupe_key") == dedupe_key:
                        return {"created": False, "dedupe_key": dedupe_key,
                                "skipped_reason": f"already exists: {dedupe_key}"}
            except Exception:
                pass
        else:
            # No open incident — create a synthetic architecture_review incident.
            open_inc = incident_store.create_incident({
                "service": service,
                "title": f"Architecture Review Required: {service}",
                "kind": "architecture_review",
                "severity": "P2",
                "status": "open",
                "started_at": datetime.datetime.utcnow().isoformat(),
                "source": "architecture_pressure_engine",
            })
        inc_id = open_inc.get("id", "")
        followup_event = {
            "type": "followup",
            "ts": datetime.datetime.utcnow().isoformat(),
            "message": (
                f"[Architecture Pressure] Score={score} >= {review_at}. "
                f"Schedule architecture review for '{service}'."
            ),
            "owner": owner,
            "priority": priority,
            "due_date": due_date,
            "status": "open",
            "dedupe_key": dedupe_key,
            "source": "architecture_pressure_engine",
        }
        # Stores expose either add_event or append_event; otherwise log only.
        if hasattr(incident_store, "add_event"):
            incident_store.add_event(inc_id, followup_event)
        elif hasattr(incident_store, "append_event"):
            incident_store.append_event(inc_id, followup_event)
        else:
            logger.info(
                "pressure: would create followup for %s (inc=%s, key=%s)",
                service, inc_id, dedupe_key
            )
        return {"created": True, "dedupe_key": dedupe_key, "skipped_reason": None,
                "incident_id": inc_id, "due_date": due_date, "priority": priority}
    except Exception as e:
        logger.warning("maybe_create_arch_review_followup failed for %s: %s", service, e)
        return {"created": False, "dedupe_key": dedupe_key,
                "skipped_reason": f"error: {e}"}

View File

@@ -0,0 +1,573 @@
"""
Audit Store — persistence layer for ToolGovernance audit events.
Backends:
memory — in-process list (testing; not persistent)
jsonl — append-only JSONL file with daily rotation (default, zero-config)
postgres — asyncpg INSERT into tool_audit_events table
Selection: env var AUDIT_BACKEND=jsonl|postgres|memory (default: jsonl)
Security / Privacy:
- Payload is NEVER written (only hash + sizes)
- Each write is fire-and-forget: errors → log warning, do NOT raise
- Postgres writes are non-blocking (asyncio task)
JSONL schema per line (matches AuditEvent fields):
{ts, req_id, workspace_id, user_id, agent_id, tool, action,
status, duration_ms, in_size, out_size, input_hash,
graph_run_id?, graph_node?, job_id?}
Postgres DDL (run once — or apply via migration):
See _POSTGRES_DDL constant below.
"""
from __future__ import annotations
import asyncio
import datetime
import json
import logging
import os
import threading
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── DDL ──────────────────────────────────────────────────────────────────────
# Schema for the Postgres audit backend. PostgresAuditStore executes this
# idempotently (IF NOT EXISTS) when its connection pool is first created.
# The four indexes match the read() filter columns: ts, (tool, ts),
# (agent_id, ts) and (workspace_id, ts).
_POSTGRES_DDL = """
CREATE TABLE IF NOT EXISTS tool_audit_events (
id BIGSERIAL PRIMARY KEY,
ts TIMESTAMPTZ NOT NULL,
req_id TEXT NOT NULL,
workspace_id TEXT NOT NULL,
user_id TEXT NOT NULL,
agent_id TEXT NOT NULL,
tool TEXT NOT NULL,
action TEXT NOT NULL,
status TEXT NOT NULL,
duration_ms INT NOT NULL,
in_size INT NOT NULL,
out_size INT NOT NULL,
input_hash TEXT NOT NULL,
graph_run_id TEXT,
graph_node TEXT,
job_id TEXT
);
CREATE INDEX IF NOT EXISTS idx_tool_audit_ts ON tool_audit_events(ts);
CREATE INDEX IF NOT EXISTS idx_tool_audit_tool_ts ON tool_audit_events(tool, ts);
CREATE INDEX IF NOT EXISTS idx_tool_audit_agent_ts ON tool_audit_events(agent_id, ts);
CREATE INDEX IF NOT EXISTS idx_tool_audit_ws_ts ON tool_audit_events(workspace_id, ts);
"""
# ─── Canonical event dict ─────────────────────────────────────────────────────
def _event_to_dict(event: "AuditEventLike") -> Dict[str, Any]:
"""Convert an AuditEvent (dataclass) or dict to canonical storage dict."""
if isinstance(event, dict):
return event
return {
"ts": getattr(event, "ts", ""),
"req_id": getattr(event, "req_id", ""),
"workspace_id": getattr(event, "workspace_id", ""),
"user_id": getattr(event, "user_id", ""),
"agent_id": getattr(event, "agent_id", ""),
"tool": getattr(event, "tool", ""),
"action": getattr(event, "action", ""),
"status": getattr(event, "status", ""),
"duration_ms": round(float(getattr(event, "duration_ms", 0))),
"in_size": int(getattr(event, "input_chars", 0)),
"out_size": int(getattr(event, "output_size_bytes", 0)),
"input_hash": getattr(event, "input_hash", ""),
"graph_run_id": getattr(event, "graph_run_id", None),
"graph_node": getattr(event, "graph_node", None),
"job_id": getattr(event, "job_id", None),
}
# Type alias (avoid circular imports)
AuditEventLike = Any
# ─── Interface ────────────────────────────────────────────────────────────────
class AuditStore(ABC):
    """Abstract persistence backend for tool-audit events."""
    @abstractmethod
    def write(self, event: AuditEventLike) -> None:
        """Non-blocking write. MUST NOT raise on error."""
        ...
    @abstractmethod
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict[str, Any]]:
        """Return events matching the filters, as a list of dicts."""
        ...
    def close(self) -> None:
        """Release backend resources; the default implementation is a no-op."""
        pass
# ─── Memory store ─────────────────────────────────────────────────────────────
class MemoryAuditStore(AuditStore):
    """In-process store for testing. Thread-safe; keeps at most max_events entries."""
    def __init__(self, max_events: int = 100_000):
        # _events is guarded by _lock and trimmed to the newest _max on overflow.
        self._events: List[Dict] = []
        self._lock = threading.Lock()
        self._max = max_events
    def write(self, event: AuditEventLike) -> None:
        """Append one event; never raises (errors are logged as warnings)."""
        try:
            record = _event_to_dict(event)
            with self._lock:
                self._events.append(record)
                overflow = len(self._events) - self._max
                if overflow > 0:
                    self._events = self._events[overflow:]
        except Exception as e:
            logger.warning("MemoryAuditStore.write error: %s", e)
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Filter the in-memory events; returns at most `limit` newest matches."""
        with self._lock:
            snapshot = list(self._events)
        def keep(row: Dict) -> bool:
            ts = row.get("ts", "")
            if from_ts and ts < from_ts:
                return False
            if to_ts and ts > to_ts:
                return False
            if tool and row.get("tool") != tool:
                return False
            if agent_id and row.get("agent_id") != agent_id:
                return False
            if workspace_id and row.get("workspace_id") != workspace_id:
                return False
            return True
        matched = [row for row in snapshot if keep(row)]
        return matched[-limit:]
    def clear(self) -> None:
        """Drop all stored events (test helper)."""
        with self._lock:
            self._events.clear()
# ─── JSONL store ──────────────────────────────────────────────────────────────
class JsonlAuditStore(AuditStore):
    """
    Append-only JSONL files with daily rotation.
    File pattern: ops/audit/tool_audit_YYYY-MM-DD.jsonl
    Writes are serialised through a threading.Lock (safe for multi-thread, not multi-process).
    """
    def __init__(self, directory: str = "ops/audit"):
        self._dir = Path(directory)
        self._dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._current_file: Optional[Path] = None
        self._current_date: Optional[str] = None
        self._fh = None  # open handle for the current day's file
    def _get_fh(self, date_str: str):
        """Return a line-buffered handle for date_str, rotating when the day changes."""
        if date_str == self._current_date:
            return self._fh
        if self._fh:
            try:
                self._fh.close()
            except Exception:
                pass
        target = self._dir / f"tool_audit_{date_str}.jsonl"
        self._fh = open(target, "a", encoding="utf-8", buffering=1)  # line-buffered
        self._current_date = date_str
        self._current_file = target
        return self._fh
    def write(self, event: AuditEventLike) -> None:
        """Serialize one event and append it to the day file; never raises."""
        try:
            record = _event_to_dict(event)
            # Day is derived from the event's own timestamp, not wall clock.
            day = (record.get("ts") or "")[:10] or datetime.date.today().isoformat()
            payload = json.dumps(record, ensure_ascii=False)
            with self._lock:
                self._get_fh(day).write(payload + "\n")
        except Exception as e:
            logger.warning("JsonlAuditStore.write error: %s", e)
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Stream-read the JSONL files whose dates overlap [from_ts, to_ts]."""
        # Prune whole files by the date embedded in the filename first.
        candidates = sorted(self._dir.glob("tool_audit_*.jsonl"))
        if from_ts:
            candidates = [f for f in candidates if f.stem[-10:] >= from_ts[:10]]
        if to_ts:
            candidates = [f for f in candidates if f.stem[-10:] <= to_ts[:10]]
        out: List[Dict] = []
        for fpath in candidates:
            try:
                with open(fpath, "r", encoding="utf-8") as fh:
                    for raw in fh:
                        raw = raw.strip()
                        if not raw:
                            continue
                        try:
                            record = json.loads(raw)
                        except Exception:
                            continue  # skip corrupt lines
                        ts = record.get("ts", "")
                        if from_ts and ts < from_ts:
                            continue
                        if to_ts and ts > to_ts:
                            continue
                        if tool and record.get("tool") != tool:
                            continue
                        if agent_id and record.get("agent_id") != agent_id:
                            continue
                        if workspace_id and record.get("workspace_id") != workspace_id:
                            continue
                        out.append(record)
                        if len(out) >= limit:
                            break
            except Exception as e:
                logger.warning("JsonlAuditStore.read error %s: %s", fpath, e)
            if len(out) >= limit:
                break
        return out
    def close(self) -> None:
        """Close the current day's file handle, if any."""
        with self._lock:
            if self._fh:
                try:
                    self._fh.close()
                except Exception:
                    pass
            self._fh = None
# ─── Postgres store ───────────────────────────────────────────────────────────
class PostgresAuditStore(AuditStore):
    """
    Async Postgres store using asyncpg.
    Writes are enqueued to an asyncio queue and flushed in batches (up to 50,
    2s max wait) by a background task. Falls back gracefully (drops events,
    logs warnings) if Postgres is unavailable.
    """
    def __init__(self, dsn: str):
        self._dsn = dsn
        self._pool = None  # lazily-created asyncpg pool
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=10_000)
        self._task: Optional[asyncio.Task] = None
        self._started = False
    def _ensure_started(self):
        """Start the background flush task once, if an event loop is running."""
        if self._started:
            return
        try:
            # BUGFIX: use get_running_loop() — get_event_loop() is deprecated
            # outside a running loop and could implicitly create a loop that
            # never runs, leaving _started=True while no flush ever happens.
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop yet: stay un-started; write() will retry later.
            return
        self._task = loop.create_task(self._flush_loop())
        self._started = True
    async def _get_pool(self):
        """Create the connection pool and apply the DDL on first use."""
        if self._pool is None:
            import asyncpg
            self._pool = await asyncpg.create_pool(self._dsn, min_size=1, max_size=3)
            async with self._pool.acquire() as conn:
                await conn.execute(_POSTGRES_DDL)
        return self._pool
    async def _flush_loop(self):
        """Forever: batch up to 50 queued events (2s max wait) and INSERT them."""
        while True:
            events = []
            try:
                evt = await asyncio.wait_for(self._queue.get(), timeout=2.0)
                events.append(evt)
                while not self._queue.empty() and len(events) < 50:
                    events.append(self._queue.get_nowait())
            except asyncio.TimeoutError:
                pass
            except Exception:
                pass
            if not events:
                continue
            try:
                pool = await self._get_pool()
                async with pool.acquire() as conn:
                    await conn.executemany(
                        """
                        INSERT INTO tool_audit_events
                        (ts, req_id, workspace_id, user_id, agent_id, tool, action,
                         status, duration_ms, in_size, out_size, input_hash,
                         graph_run_id, graph_node, job_id)
                        VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
                        """,
                        [
                            (
                                e["ts"], e["req_id"], e["workspace_id"], e["user_id"],
                                e["agent_id"], e["tool"], e["action"], e["status"],
                                e["duration_ms"], e["in_size"], e["out_size"],
                                e["input_hash"], e.get("graph_run_id"),
                                e.get("graph_node"), e.get("job_id"),
                            )
                            for e in events
                        ],
                    )
            except Exception as ex:
                # Audit writes are best-effort: the failed batch is dropped.
                logger.warning("PostgresAuditStore flush error: %s", ex)
    def write(self, event: AuditEventLike) -> None:
        """Enqueue one event for background flush; drops events when the queue is full."""
        try:
            record = _event_to_dict(event)
            self._ensure_started()
            if self._started and not self._queue.full():
                self._queue.put_nowait(record)
        except Exception as e:
            logger.warning("PostgresAuditStore.write error: %s", e)
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Synchronous read via asyncio.run() — for analyzer queries. Returns [] on error."""
        try:
            return asyncio.run(self._async_read(from_ts, to_ts, tool, agent_id, workspace_id, limit))
        except Exception as e:
            logger.warning("PostgresAuditStore.read error: %s", e)
            return []
    async def _async_read(self, from_ts, to_ts, tool, agent_id, workspace_id, limit):
        """Build a parameterized WHERE clause and fetch matching rows ordered by ts.

        `limit` is interpolated directly, which is safe only because the
        signature types it as int.
        """
        pool = await self._get_pool()
        conditions = ["TRUE"]
        params: List[Any] = []
        p = 1
        if from_ts:
            conditions.append(f"ts >= ${p}"); params.append(from_ts); p += 1
        if to_ts:
            conditions.append(f"ts <= ${p}"); params.append(to_ts); p += 1
        if tool:
            conditions.append(f"tool = ${p}"); params.append(tool); p += 1
        if agent_id:
            conditions.append(f"agent_id = ${p}"); params.append(agent_id); p += 1
        if workspace_id:
            conditions.append(f"workspace_id = ${p}"); params.append(workspace_id); p += 1
        sql = f"SELECT * FROM tool_audit_events WHERE {' AND '.join(conditions)} ORDER BY ts LIMIT {limit}"
        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)
        return [dict(r) for r in rows]
# ─── Null store ───────────────────────────────────────────────────────────────
class NullAuditStore(AuditStore):
    """Disabled-audit backend: accepts every write, answers every read with nothing."""

    def write(self, event: AuditEventLike) -> None:
        """Discard the event."""
        return None

    def read(self, **kwargs) -> List[Dict]:
        """Report an empty history regardless of filters."""
        return []
# ─── Global singleton ─────────────────────────────────────────────────────────
# Process-wide audit store instance; created lazily by get_audit_store().
_store: Optional[AuditStore] = None
# Guards initialisation/replacement of _store across threads.
_store_lock = threading.Lock()
def get_audit_store() -> AuditStore:
    """Return the process-wide audit store, building it on first use.

    Double-checked locking: the common path returns without taking the lock.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
def set_audit_store(store: AuditStore) -> None:
    """Install *store* as the process-wide audit store (primarily a test seam)."""
    global _store
    _store_lock.acquire()
    try:
        _store = store
    finally:
        _store_lock.release()
class AutoAuditStore(AuditStore):
    """
    Smart backend: tries Postgres first, falls back to JSONL on failure.
    Used when AUDIT_BACKEND=auto (or unset with DATABASE_URL present).
    - Writes go to whichever backend is currently healthy.
    - On Postgres failure, transparently falls back to JsonlAuditStore.
    - Recovers to Postgres on next health check (every ~5 min).
    Non-fatal: write errors are logged as warnings.
    """
    _RECOVERY_INTERVAL_S = 300  # retry Postgres after 5 minutes

    def __init__(self, pg_dsn: str, jsonl_dir: str):
        # Both backends are built lazily, so constructing this store never
        # touches the database or the filesystem.
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        self._primary: Optional[PostgresAuditStore] = None
        self._fallback: Optional[JsonlAuditStore] = None
        self._using_fallback = False
        # time.monotonic() of the moment we demoted to JSONL; 0.0 = never.
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()

    def _get_primary(self) -> Optional[PostgresAuditStore]:
        """Lazily build the Postgres store (double-checked locking)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresAuditStore(self._pg_dsn)
        return self._primary

    def _get_fallback(self) -> JsonlAuditStore:
        """Lazily build the JSONL store (double-checked locking)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlAuditStore(self._jsonl_dir)
        return self._fallback

    def _maybe_recover(self) -> None:
        """Try to switch back to Postgres if enough time has passed since fallback."""
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoAuditStore: attempting Postgres recovery")
                # Optimistic flip: the next write/read retries Postgres and
                # re-enters fallback mode if it is still unhealthy.
                self._using_fallback = False
                self._fallback_since = 0.0

    def write(self, event: AuditEventLike) -> None:
        """Write to the healthy backend; demote to JSONL on a Postgres error.

        NOTE(review): PostgresAuditStore.write swallows its own exceptions, so
        this except branch rarely fires in practice — confirm demotion still
        happens as intended when Postgres is down.
        """
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    primary.write(event)
                    return
            except Exception as pg_err:
                logger.warning(
                    "AutoAuditStore: Postgres write failed (%s), switching to JSONL fallback", pg_err
                )
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        # Write to JSONL fallback
        try:
            self._get_fallback().write(event)
        except Exception as jl_err:
            logger.warning("AutoAuditStore: JSONL fallback write failed: %s", jl_err)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Read from Postgres if available, else JSONL."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    return primary.read(from_ts=from_ts, to_ts=to_ts, tool=tool,
                                        agent_id=agent_id, workspace_id=workspace_id, limit=limit)
            except Exception as pg_err:
                # A failed read also demotes, so subsequent writes go to JSONL.
                logger.warning("AutoAuditStore: Postgres read failed (%s), using JSONL", pg_err)
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        return self._get_fallback().read(
            from_ts=from_ts, to_ts=to_ts, tool=tool,
            agent_id=agent_id, workspace_id=workspace_id, limit=limit,
        )

    def active_backend(self) -> str:
        """Return the name of the currently active backend."""
        return "jsonl_fallback" if self._using_fallback else "postgres"

    def close(self) -> None:
        # Close whichever backends were actually instantiated; ignore errors.
        if self._primary:
            try:
                self._primary.close()
            except Exception:
                pass
        if self._fallback:
            try:
                self._fallback.close()
            except Exception:
                pass
def _create_store() -> AuditStore:
    """Instantiate the audit store selected by the AUDIT_BACKEND env var.

    Backends:
        memory   — in-process only (tests)
        postgres — PostgresAuditStore (requires DATABASE_URL / POSTGRES_DSN)
        auto     — AutoAuditStore (Postgres with JSONL fallback)
        null     — NullAuditStore (audit disabled)
        jsonl    — JsonlAuditStore (default; also the fallback for misconfig)
    Unknown values fall through to jsonl.
    """
    backend = os.getenv("AUDIT_BACKEND", "jsonl").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("POSTGRES_DSN", "")
    audit_dir = os.getenv(
        "AUDIT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
    )
    # Log only the host/database part of the DSN. The previous `dsn[:30]`
    # leaked "postgresql://user:password@..." credentials into the logs.
    dsn_label = dsn.rsplit("@", 1)[-1] if dsn else ""
    if backend == "memory":
        logger.info("AuditStore: in-memory (testing only)")
        return MemoryAuditStore()
    if backend == "postgres":
        if not dsn:
            logger.warning("AUDIT_BACKEND=postgres but DATABASE_URL not set; falling back to jsonl")
        else:
            logger.info("AuditStore: postgres dsn=%s", dsn_label)
            return PostgresAuditStore(dsn)
    if backend == "auto":
        if dsn:
            logger.info("AuditStore: auto (postgres→jsonl fallback) dsn=%s", dsn_label)
            return AutoAuditStore(pg_dsn=dsn, jsonl_dir=audit_dir)
        else:
            logger.info("AuditStore: auto — no DATABASE_URL, using jsonl")
    if backend == "null":
        return NullAuditStore()
    # Default / jsonl
    logger.info("AuditStore: jsonl dir=%s", audit_dir)
    return JsonlAuditStore(audit_dir)

View File

@@ -0,0 +1,530 @@
"""
backlog_generator.py — Auto-generation of Engineering Backlog items
from Platform Priority / Risk digests.
DAARION.city | deterministic, no LLM.
Public API:
load_backlog_policy() -> Dict
generate_from_pressure_digest(digest_data, env, ...) -> GenerateResult
generate_from_risk_digest(digest_data, env, ...) -> GenerateResult
_build_item_from_rule(service, rule, context, policy, week_str, env) -> BacklogItem | None
_make_dedupe_key(prefix, week_str, env, service, category) -> str
"""
from __future__ import annotations
import datetime
import json
import logging
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
from backlog_store import (
BacklogItem, BacklogEvent, BacklogStore,
_new_id, _now_iso,
)
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
# Parsed policy cache; populated by the first load_backlog_policy() call.
_BACKLOG_POLICY_CACHE: Optional[Dict] = None
# Candidate policy-file locations, probed in order: CWD-relative first,
# then relative to this module's grandparent directory.
_BACKLOG_POLICY_PATHS = [
    Path("config/backlog_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "backlog_policy.yml",
]
def load_backlog_policy() -> Dict:
    """Load backlog_policy.yml from the first readable candidate path, caching it.

    Falls back to the built-in defaults when no candidate can be read.
    """
    global _BACKLOG_POLICY_CACHE
    if _BACKLOG_POLICY_CACHE is not None:
        return _BACKLOG_POLICY_CACHE
    for candidate in _BACKLOG_POLICY_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                parsed = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("Failed to load backlog_policy from %s: %s", candidate, exc)
            continue
        _BACKLOG_POLICY_CACHE = parsed
        return parsed
    _BACKLOG_POLICY_CACHE = _builtin_backlog_defaults()
    return _BACKLOG_POLICY_CACHE
def _reload_backlog_policy() -> None:
global _BACKLOG_POLICY_CACHE
_BACKLOG_POLICY_CACHE = None
def _builtin_backlog_defaults() -> Dict:
return {
"defaults": {"env": "prod", "retention_days": 180, "max_items_per_run": 50},
"dedupe": {
"scheme": "YYYY-WW",
"key_fields": ["service", "category", "env"],
"key_prefix": "platform_backlog",
},
"categories": {
"arch_review": {"priority": "P1", "due_days": 14},
"refactor": {"priority": "P1", "due_days": 21},
"slo_hardening": {"priority": "P2", "due_days": 30},
"cleanup_followups": {"priority": "P2", "due_days": 14},
"security": {"priority": "P0", "due_days": 7},
},
"generation": {
"weekly_from_pressure_digest": True,
"daily_from_risk_digest": False,
"rules": [
{
"name": "arch_review_required",
"when": {"pressure_requires_arch_review": True},
"create": {
"category": "arch_review",
"title_template": "[ARCH] Review required: {service}",
},
},
{
"name": "high_pressure_refactor",
"when": {
"pressure_band_in": ["high", "critical"],
"risk_band_in": ["high", "critical"],
},
"create": {
"category": "refactor",
"title_template": "[REF] Reduce pressure & risk: {service}",
},
},
{
"name": "slo_violations",
"when": {"risk_has_slo_violations": True},
"create": {
"category": "slo_hardening",
"title_template": "[SLO] Fix violations: {service}",
},
},
{
"name": "followup_backlog",
"when": {"followups_overdue_gt": 0},
"create": {
"category": "cleanup_followups",
"title_template": "[OPS] Close overdue followups: {service}",
},
},
],
},
"ownership": {
"default_owner": "oncall",
"overrides": {"gateway": "cto"},
},
"workflow": {
"statuses": ["open", "in_progress", "blocked", "done", "canceled"],
"allowed_transitions": {
"open": ["in_progress", "blocked", "canceled"],
"in_progress": ["blocked", "done", "canceled"],
"blocked": ["open", "in_progress", "canceled"],
"done": [],
"canceled": [],
},
},
}
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _now_week() -> str:
return datetime.datetime.utcnow().strftime("%Y-W%V")
def _make_dedupe_key(prefix: str, week_str: str, env: str,
service: str, category: str) -> str:
return f"{prefix}:{week_str}:{env}:{service}:{category}"
def _due_date(due_days: int) -> str:
return (
datetime.datetime.utcnow() + datetime.timedelta(days=due_days)
).strftime("%Y-%m-%d")
def _owner_for(service: str, policy: Dict) -> str:
overrides = policy.get("ownership", {}).get("overrides", {})
return overrides.get(service, policy.get("ownership", {}).get("default_owner", "oncall"))
def _match_rule(rule: Dict, ctx: Dict) -> bool:
"""
Evaluate a rule's `when` conditions against the service context dict.
All conditions must hold (AND logic).
"""
when = rule.get("when", {})
for key, expected in when.items():
if key == "pressure_requires_arch_review":
if bool(ctx.get("pressure_requires_arch_review")) is not bool(expected):
return False
elif key == "pressure_band_in":
if ctx.get("pressure_band") not in expected:
return False
elif key == "risk_band_in":
if ctx.get("risk_band") not in expected:
return False
elif key == "risk_has_slo_violations":
slo_v = int(ctx.get("slo_violations", 0))
if (slo_v > 0) is not bool(expected):
return False
elif key == "followups_overdue_gt":
overdue = int(ctx.get("followups_overdue", 0))
if not (overdue > int(expected)):
return False
return True
def _build_description(service: str, ctx: Dict, rule: Dict) -> str:
"""Generate deterministic bullet-list description from context."""
lines = [f"Auto-generated by Engineering Backlog Bridge — rule: {rule.get('name', '?')}.", ""]
p_score = ctx.get("pressure_score")
p_band = ctx.get("pressure_band")
r_score = ctx.get("risk_score")
r_band = ctx.get("risk_band")
r_delta = ctx.get("risk_delta_24h")
if p_score is not None:
lines.append(f"- Architecture Pressure: {p_score} ({p_band})")
if r_score is not None:
lines.append(f"- Risk Score: {r_score} ({r_band})"
+ (f" Δ24h: +{r_delta}" if r_delta else ""))
slo_v = int(ctx.get("slo_violations", 0))
if slo_v:
lines.append(f"- Active SLO violations: {slo_v}")
overdue = int(ctx.get("followups_overdue", 0))
if overdue:
lines.append(f"- Overdue follow-ups: {overdue}")
if ctx.get("signals_summary"):
lines.append(f"- Pressure signals: {'; '.join(ctx['signals_summary'][:3])}")
if ctx.get("risk_reasons"):
lines.append(f"- Risk signals: {'; '.join(ctx['risk_reasons'][:3])}")
return "\n".join(lines)
def _build_item_from_rule(
    service: str,
    rule: Dict,
    ctx: Dict,
    policy: Dict,
    week_str: str,
    env: str,
) -> Optional[BacklogItem]:
    """Materialise a BacklogItem for *service* from a matched generation rule."""
    create_cfg = rule.get("create", {})
    category = create_cfg.get("category", "arch_review")
    title = create_cfg.get("title_template", "[BACKLOG] {service}").format(service=service)
    cat_cfg = policy.get("categories", {}).get(category, {})
    key_prefix = policy.get("dedupe", {}).get("key_prefix", "platform_backlog")
    return BacklogItem(
        id=_new_id("bl"),
        created_at=_now_iso(),
        updated_at=_now_iso(),
        env=env,
        service=service,
        category=category,
        title=title,
        description=_build_description(service, ctx, rule),
        priority=cat_cfg.get("priority", "P2"),
        status="open",
        owner=_owner_for(service, policy),
        due_date=_due_date(int(cat_cfg.get("due_days", 14))),
        source="digest",
        dedupe_key=_make_dedupe_key(key_prefix, week_str, env, service, category),
        # Evidence refs are copied so later mutation of ctx cannot leak in.
        evidence_refs=dict(ctx.get("evidence_refs") or {}),
        tags=["auto", f"week:{week_str}", f"rule:{rule.get('name', '?')}"],
        meta={
            "rule_name": rule.get("name", ""),
            "pressure_score": ctx.get("pressure_score"),
            "risk_score": ctx.get("risk_score"),
            "week": week_str,
        },
    )
# ─── Context builder from digest ──────────────────────────────────────────────
def _build_service_context(
service_entry: Dict,
risk_entry: Optional[Dict] = None,
) -> Dict:
"""
Build a unified service context dict from a platform_priority_digest
top_pressure_services entry plus an optional risk_digest service entry.
"""
p_score = service_entry.get("score")
p_band = service_entry.get("band", "low")
requires_review = bool(service_entry.get("requires_arch_review", False))
signals_summary = service_entry.get("signals_summary", [])
comp = service_entry.get("components", {})
followups_overdue = int(comp.get("followups_overdue", 0))
evidence_refs = service_entry.get("evidence_refs") or {}
ctx: Dict[str, Any] = {
"pressure_score": p_score,
"pressure_band": p_band,
"pressure_requires_arch_review": requires_review,
"signals_summary": signals_summary,
"followups_overdue": followups_overdue,
"evidence_refs": dict(evidence_refs),
}
# Merge risk data
if risk_entry:
ctx["risk_score"] = risk_entry.get("score")
ctx["risk_band"] = risk_entry.get("band", "low")
ctx["risk_delta_24h"] = (risk_entry.get("trend") or {}).get("delta_24h")
slo_comp = (risk_entry.get("components") or {}).get("slo") or {}
ctx["slo_violations"] = int(slo_comp.get("violations", 0))
ctx["risk_reasons"] = risk_entry.get("reasons", [])
# Merge evidence_refs from risk
risk_attrs = risk_entry.get("attribution") or {}
risk_erefs = risk_attrs.get("evidence_refs") or {}
for k, v in risk_erefs.items():
if k not in ctx["evidence_refs"]:
ctx["evidence_refs"][k] = v
else:
ctx.setdefault("risk_band", service_entry.get("risk_band", "low"))
ctx.setdefault("risk_score", service_entry.get("risk_score"))
ctx.setdefault("risk_delta_24h", service_entry.get("risk_delta_24h"))
ctx.setdefault("slo_violations", 0)
return ctx
# ─── Main generation function ─────────────────────────────────────────────────
def generate_from_pressure_digest(
    digest_data: Dict,
    env: str = "prod",
    *,
    store: Optional[BacklogStore] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
    risk_digest_data: Optional[Dict] = None,
) -> Dict:
    """
    Generate backlog items from a weekly_platform_priority_digest JSON output.
    Args:
        digest_data: JSON dict from platform_priority_digest (top_pressure_services list)
        env: deployment environment
        store: backlog store (loaded from factory if None)
        policy: backlog_policy (loaded if None)
        week_str: override ISO week (defaults to digest's "week" field or current)
        risk_digest_data: optional daily risk digest JSON to enrich context
    Returns GenerateResult dict: created, updated, skipped, items

    Behaviour notes:
        - At most one item per (service, category) per run; at most
          max_items_per_run items written in total.
        - Per-rule failures are logged and counted in `skipped`; the run
          never aborts on a single bad rule/service.
    """
    if policy is None:
        policy = load_backlog_policy()
    if store is None:
        from backlog_store import get_backlog_store
        store = get_backlog_store()
    gen_cfg = policy.get("generation", {})
    # Feature flag: weekly generation can be disabled wholesale in the policy.
    if not gen_cfg.get("weekly_from_pressure_digest", True):
        return {"created": 0, "updated": 0, "skipped": 0, "items": [],
                "skipped_reason": "weekly_from_pressure_digest disabled in policy"}
    effective_week = week_str or digest_data.get("week") or _now_week()
    max_items = int(policy.get("defaults", {}).get("max_items_per_run", 50))
    rules = gen_cfg.get("rules", [])
    # Build risk_by_service lookup
    risk_by_service: Dict[str, Dict] = {}
    if risk_digest_data:
        for rs in (risk_digest_data.get("top_services") or []):
            svc = rs.get("service", "")
            if svc:
                risk_by_service[svc] = rs
    created = updated = skipped = 0
    items_out: List[Dict] = []
    total_written = 0
    for svc_entry in (digest_data.get("top_pressure_services") or []):
        service = svc_entry.get("service", "")
        if not service:
            continue  # nameless entry — nothing to key a backlog item on
        if total_written >= max_items:
            skipped += 1  # per-run item budget exhausted
            continue
        ctx = _build_service_context(svc_entry, risk_by_service.get(service))
        # Evaluate rules — one item per matched rule
        matched_categories: set = set()
        for rule in rules:
            try:
                if not _match_rule(rule, ctx):
                    continue
                category = rule.get("create", {}).get("category", "")
                if category in matched_categories:
                    continue  # dedupe same category within a service
                matched_categories.add(category)
                item = _build_item_from_rule(service, rule, ctx, policy,
                                             effective_week, env)
                if item is None:
                    continue
                # Upsert keys on dedupe_key, so a re-run within the same
                # week updates rather than duplicates.
                result = store.upsert(item)
                action = result["action"]
                upserted = result["item"]
                # Emit event
                ev_type = "created" if action == "created" else "auto_update"
                store.add_event(BacklogEvent(
                    id=_new_id("ev"),
                    item_id=upserted.id,
                    ts=_now_iso(),
                    type=ev_type,
                    message=f"Auto-generated by weekly digest — rule: {rule.get('name', '?')}",
                    actor="backlog_generator",
                    meta={"week": effective_week, "rule": rule.get("name", "")},
                ))
                if action == "created":
                    created += 1
                else:
                    updated += 1
                total_written += 1
                items_out.append({
                    "id": upserted.id,
                    "service": service,
                    "category": upserted.category,
                    "status": upserted.status,
                    "action": action,
                })
            except Exception as e:
                # One bad rule/service must not abort the whole run.
                logger.warning("backlog_generator: skip rule %s for %s: %s",
                               rule.get("name"), service, e)
                skipped += 1
    return {
        "created": created,
        "updated": updated,
        "skipped": skipped,
        "items": items_out,
        "week": effective_week,
    }
def generate_from_risk_digest(
    risk_digest_data: Dict,
    env: str = "prod",
    *,
    store: Optional[BacklogStore] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
) -> Dict:
    """
    Optional: generate items from a daily risk digest JSON.
    Only active when generation.daily_from_risk_digest=true.

    Fixes vs. previous version (behaviour now mirrors
    generate_from_pressure_digest):
      - nameless service entries are ignored silently instead of inflating
        the `skipped` counter;
      - a components dict with an explicit ``"slo": None`` no longer raises
        AttributeError when reading violations.

    Returns GenerateResult dict: created, updated, skipped, items, week.
    """
    if policy is None:
        policy = load_backlog_policy()
    gen_cfg = policy.get("generation", {})
    if not gen_cfg.get("daily_from_risk_digest", False):
        return {"created": 0, "updated": 0, "skipped": 0, "items": [],
                "skipped_reason": "daily_from_risk_digest disabled in policy"}
    if store is None:
        from backlog_store import get_backlog_store
        store = get_backlog_store()
    # Convert risk digest top_services into pressure-like entries
    effective_week = week_str or _now_week()
    max_items = int(policy.get("defaults", {}).get("max_items_per_run", 50))
    rules = gen_cfg.get("rules", [])
    created = updated = skipped = 0
    items_out: List[Dict] = []
    total_written = 0
    for svc_entry in (risk_digest_data.get("top_services") or []):
        service = svc_entry.get("service", "")
        if not service:
            continue  # nameless entry — silently ignored (parity with pressure path)
        if total_written >= max_items:
            skipped += 1  # per-run item budget exhausted
            continue
        # Build a minimal pressure context from risk data
        components = svc_entry.get("components") or {}
        slo_comp = components.get("slo") or {}
        ctx: Dict = {
            "pressure_score": None,
            "pressure_band": "low",
            "pressure_requires_arch_review": False,
            "signals_summary": [],
            "followups_overdue": 0,
            "risk_score": svc_entry.get("score"),
            "risk_band": svc_entry.get("band", "low"),
            "risk_delta_24h": (svc_entry.get("trend") or {}).get("delta_24h"),
            "slo_violations": slo_comp.get("violations", 0),
            "risk_reasons": svc_entry.get("reasons", []),
            "evidence_refs": (svc_entry.get("attribution") or {}).get("evidence_refs") or {},
        }
        matched_categories: set = set()
        for rule in rules:
            try:
                if not _match_rule(rule, ctx):
                    continue
                category = rule.get("create", {}).get("category", "")
                if category in matched_categories:
                    continue  # one item per category per service
                matched_categories.add(category)
                item = _build_item_from_rule(service, rule, ctx, policy,
                                             effective_week, env)
                if item is None:
                    continue
                result = store.upsert(item)
                action = result["action"]
                upserted = result["item"]
                store.add_event(BacklogEvent(
                    id=_new_id("ev"),
                    item_id=upserted.id,
                    ts=_now_iso(),
                    type="created" if action == "created" else "auto_update",
                    message="Auto-generated from daily risk digest",
                    actor="backlog_generator",
                    meta={"week": effective_week},
                ))
                if action == "created":
                    created += 1
                else:
                    updated += 1
                total_written += 1
                items_out.append({
                    "id": upserted.id, "service": service,
                    "category": upserted.category, "status": upserted.status,
                    "action": action,
                })
            except Exception as e:
                # One bad rule/service must not abort the whole run.
                logger.warning("backlog_generator(risk): skip rule %s for %s: %s",
                               rule.get("name"), service, e)
                skipped += 1
    return {"created": created, "updated": updated, "skipped": skipped,
            "items": items_out, "week": effective_week}

View File

@@ -0,0 +1,705 @@
"""
backlog_store.py — Engineering Backlog Storage Layer.
DAARION.city | deterministic, no LLM.
Backends:
MemoryBacklogStore — in-process (tests + fallback)
JsonlBacklogStore — filesystem append-only JSONL (MVP)
PostgresBacklogStore — Postgres primary (psycopg2 sync)
AutoBacklogStore — Postgres → JSONL → Memory cascade
Factory: get_backlog_store() → respects BACKLOG_BACKEND env var.
BACKLOG_BACKEND: auto | postgres | jsonl | memory | null
"""
from __future__ import annotations
import datetime
import json
import logging
import os
import threading
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Data model ───────────────────────────────────────────────────────────────
# Closed vocabularies for item fields.
# NOTE(review): neither set is referenced in this chunk — presumably used for
# validation elsewhere in the module; confirm before removing.
_VALID_STATUSES = {"open", "in_progress", "blocked", "done", "canceled"}
_VALID_PRIORITIES = {"P0", "P1", "P2", "P3"}
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _new_id(prefix: str = "bl") -> str:
return f"{prefix}_{uuid.uuid4().hex[:12]}"
@dataclass
class BacklogItem:
    """One engineering-backlog entry."""
    id: str
    created_at: str
    updated_at: str
    env: str
    service: str
    # arch_review / refactor / slo_hardening / cleanup_followups / security
    category: str
    title: str
    description: str
    priority: str   # P0..P3
    status: str     # open / in_progress / blocked / done / canceled
    owner: str
    due_date: str   # YYYY-MM-DD
    source: str     # risk | pressure | digest | manual
    dedupe_key: str
    evidence_refs: Dict = field(default_factory=dict)  # alerts, incidents, release_checks, ...
    tags: List[str] = field(default_factory=list)
    meta: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Plain-dict snapshot (asdict recurses into nested dataclasses)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict) -> "BacklogItem":
        """Rebuild an item from a (possibly partial) dict, filling sane defaults."""
        def pick(key, make_default):
            # Lazily build id/timestamp defaults only when the key is missing.
            return d[key] if key in d else make_default()

        return cls(
            id=pick("id", _new_id),
            created_at=pick("created_at", _now_iso),
            updated_at=pick("updated_at", _now_iso),
            env=d.get("env", "prod"),
            service=d.get("service", ""),
            category=d.get("category", ""),
            title=d.get("title", ""),
            description=d.get("description", ""),
            priority=d.get("priority", "P2"),
            status=d.get("status", "open"),
            owner=d.get("owner", "oncall"),
            due_date=d.get("due_date", ""),
            source=d.get("source", "manual"),
            dedupe_key=d.get("dedupe_key", ""),
            evidence_refs=d.get("evidence_refs") or {},
            tags=d.get("tags") or [],
            meta=d.get("meta") or {},
        )
@dataclass
class BacklogEvent:
    """Audit-trail record attached to one backlog item."""
    id: str
    item_id: str
    ts: str
    type: str   # created | status_change | comment | auto_update
    message: str
    actor: str
    meta: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Plain-dict snapshot."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict) -> "BacklogEvent":
        """Rebuild an event from a dict, defaulting any missing field."""
        def pick(key, make_default):
            # Lazily build id/timestamp defaults only when the key is missing.
            return d[key] if key in d else make_default()

        return cls(
            id=pick("id", lambda: _new_id("ev")),
            item_id=d.get("item_id", ""),
            ts=pick("ts", _now_iso),
            type=d.get("type", "comment"),
            message=d.get("message", ""),
            actor=d.get("actor", "system"),
            meta=d.get("meta") or {},
        )
# ─── Abstract base ────────────────────────────────────────────────────────────
class BacklogStore(ABC):
    """Storage contract for backlog items, plus shared upsert/dashboard logic."""

    # ── abstract persistence primitives ───────────────────────────────────
    @abstractmethod
    def create(self, item: BacklogItem) -> BacklogItem:
        """Persist a brand-new item."""

    @abstractmethod
    def get(self, item_id: str) -> Optional[BacklogItem]:
        """Fetch by primary id, or None."""

    @abstractmethod
    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        """Fetch by dedupe key, or None."""

    @abstractmethod
    def update(self, item: BacklogItem) -> BacklogItem:
        """Persist changes to an existing item."""

    @abstractmethod
    def list_items(self, filters: Optional[Dict] = None, limit: int = 50,
                   offset: int = 0) -> List[BacklogItem]:
        """Filtered, paginated listing."""

    @abstractmethod
    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        """Append a history event."""

    @abstractmethod
    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """History for one item."""

    @abstractmethod
    def cleanup(self, retention_days: int = 180) -> int:
        """Purge old closed items; returns the number removed."""

    # ── shared behaviour ──────────────────────────────────────────────────
    def upsert(self, item: BacklogItem) -> Dict:
        """Create or update by dedupe_key. Returns {"action": created|updated, "item": ...}"""
        existing = self.get_by_dedupe_key(item.dedupe_key)
        if existing is None:
            return {"action": "created", "item": self.create(item)}
        # Refresh generated fields; status/owner are operator-controlled and kept.
        existing.title = item.title
        existing.description = item.description
        existing.evidence_refs = item.evidence_refs
        existing.tags = list(set(existing.tags + item.tags))
        existing.meta.update(item.meta or {})
        existing.updated_at = _now_iso()
        return {"action": "updated", "item": self.update(existing)}

    def dashboard(self, env: str = "prod") -> Dict:
        """Return aggregated backlog counts for *env* (first 1000 items)."""
        snapshot = self.list_items({"env": env}, limit=1000)
        today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
        by_status: Dict[str, int] = {}
        by_priority: Dict[str, int] = {}
        by_category: Dict[str, int] = {}
        by_service: Dict[str, int] = {}
        overdue: List[Dict] = []
        for entry in snapshot:
            by_status[entry.status] = by_status.get(entry.status, 0) + 1
            by_priority[entry.priority] = by_priority.get(entry.priority, 0) + 1
            by_category[entry.category] = by_category.get(entry.category, 0) + 1
            by_service[entry.service] = by_service.get(entry.service, 0) + 1
            still_open = entry.status not in ("done", "canceled")
            if still_open and entry.due_date and entry.due_date < today:
                overdue.append({
                    "id": entry.id, "service": entry.service,
                    "title": entry.title, "priority": entry.priority,
                    "due_date": entry.due_date, "owner": entry.owner,
                })
        overdue.sort(key=lambda row: (row["priority"], row["due_date"]))
        ranked = sorted(by_service.items(), key=lambda kv: kv[1], reverse=True)[:10]
        return {
            "env": env,
            "total": len(snapshot),
            "status_counts": by_status,
            "priority_counts": by_priority,
            "category_counts": by_category,
            "overdue": overdue[:20],
            "overdue_count": len(overdue),
            "top_services": [{"service": svc, "count": n} for svc, n in ranked],
        }
# ─── Workflow helper ──────────────────────────────────────────────────────────
def validate_transition(current_status: str, new_status: str,
policy: Optional[Dict] = None) -> bool:
"""Return True if transition is allowed, False otherwise."""
defaults = _builtin_workflow()
if policy is None:
allowed = defaults
else:
allowed = policy.get("workflow", {}).get("allowed_transitions", defaults)
return new_status in allowed.get(current_status, [])
def _builtin_workflow() -> Dict:
return {
"open": ["in_progress", "blocked", "canceled"],
"in_progress": ["blocked", "done", "canceled"],
"blocked": ["open", "in_progress", "canceled"],
"done": [],
"canceled": [],
}
# ─── Memory backend ───────────────────────────────────────────────────────────
class MemoryBacklogStore(BacklogStore):
    """Thread-safe in-process store; used for tests and as a last-resort fallback."""

    def __init__(self) -> None:
        self._items: Dict[str, BacklogItem] = {}
        self._events: List[BacklogEvent] = []
        self._lock = threading.Lock()

    def create(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._items[item.id] = item
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        with self._lock:
            return self._items.get(item_id)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        with self._lock:
            return next(
                (candidate for candidate in self._items.values()
                 if candidate.dedupe_key == key),
                None,
            )

    def update(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._items[item.id] = item
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        criteria = filters or {}
        with self._lock:
            snapshot = list(self._items.values())
        # Filtering/sorting happens outside the lock on a snapshot copy.
        matched = _apply_filters(snapshot, criteria)
        matched.sort(key=lambda it: (it.priority, it.due_date or "9999"))
        return matched[offset: offset + limit]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        with self._lock:
            self._events.append(event)
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        with self._lock:
            matching = [ev for ev in self._events if ev.item_id == item_id]
        return matching[-limit:]

    def cleanup(self, retention_days: int = 180) -> int:
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._lock:
            stale = [
                item_id for item_id, candidate in self._items.items()
                if candidate.status in ("done", "canceled")
                and candidate.updated_at < cutoff
            ]
            for item_id in stale:
                self._items.pop(item_id)
        return len(stale)
# ─── JSONL backend ────────────────────────────────────────────────────────────
# Default on-disk locations, relative to the process working directory.
_JSONL_ITEMS = "ops/backlog/items.jsonl"
_JSONL_EVENTS = "ops/backlog/events.jsonl"
# NOTE(review): not referenced in this chunk — presumably a scan bound for
# loaders; confirm it is enforced elsewhere.
_JSONL_CACHE_MAX = 50_000  # lines to scan
class JsonlBacklogStore(BacklogStore):
    """
    Append-only JSONL filesystem store.
    Last-write-wins: items keyed by id, updates appended (read returns latest).

    Every read re-scans the items file, so reads are O(file size); acceptable
    for the MVP volumes this backend targets.
    """
    def __init__(
        self,
        items_path: str = _JSONL_ITEMS,
        events_path: str = _JSONL_EVENTS,
    ) -> None:
        self._items_path = Path(items_path)
        self._events_path = Path(events_path)
        self._lock = threading.Lock()
        # Create parent directories eagerly so first append cannot fail on them.
        self._items_path.parent.mkdir(parents=True, exist_ok=True)
        self._events_path.parent.mkdir(parents=True, exist_ok=True)

    def _load_items(self) -> Dict[str, BacklogItem]:
        """Scan file, last-write-wins per id."""
        items: Dict[str, BacklogItem] = {}
        if not self._items_path.exists():
            return items
        try:
            with open(self._items_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        # Later lines for the same id overwrite earlier ones.
                        items[d["id"]] = BacklogItem.from_dict(d)
                    except Exception:
                        pass  # skip corrupt/partial lines rather than fail the load
        except Exception as e:
            logger.warning("JsonlBacklogStore: load_items error: %s", e)
        return items

    def _append_item(self, item: BacklogItem) -> None:
        # default=str keeps non-JSON-native values (e.g. dates) serialisable.
        with open(self._items_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(item.to_dict(), default=str) + "\n")

    def create(self, item: BacklogItem) -> BacklogItem:
        """Append the new item; no uniqueness check is performed here."""
        with self._lock:
            self._append_item(item)
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        """Full scan; returns the latest version of the item or None."""
        with self._lock:
            items = self._load_items()
        return items.get(item_id)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        """Full scan; returns the first item whose dedupe_key matches, or None."""
        with self._lock:
            items = self._load_items()
            for it in items.values():
                if it.dedupe_key == key:
                    return it
        return None

    def update(self, item: BacklogItem) -> BacklogItem:
        """Append a newer version of the item (last-write-wins on read)."""
        item.updated_at = _now_iso()
        with self._lock:
            self._append_item(item)
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        """Filtered, sorted (priority, due_date) page of the latest item versions."""
        with self._lock:
            items = list(self._load_items().values())
        items = _apply_filters(items, filters or {})
        # Items without a due date sort last ("9999" > any YYYY-MM-DD).
        items.sort(key=lambda x: (x.priority, x.due_date or "9999"))
        return items[offset: offset + limit]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        """Append one event line; recreates the parent dir if it disappeared."""
        with self._lock:
            if not self._events_path.parent.exists():
                self._events_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self._events_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(event.to_dict(), default=str) + "\n")
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """Scan the events file; returns the last *limit* events for the item.

        NOTE: intentionally lock-free — appends are line-atomic enough for
        this best-effort read path.
        """
        events: List[BacklogEvent] = []
        if not self._events_path.exists():
            return events
        try:
            with open(self._events_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        if d.get("item_id") == item_id:
                            events.append(BacklogEvent.from_dict(d))
                    except Exception:
                        pass  # tolerate corrupt lines
        except Exception as e:
            logger.warning("JsonlBacklogStore: get_events error: %s", e)
        return events[-limit:]

    def cleanup(self, retention_days: int = 180) -> int:
        """Compact the items file, dropping closed items older than the cutoff.

        Returns the number of items removed. This is the only operation that
        rewrites (rather than appends to) the items file.
        """
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._lock:
            items = self._load_items()
            to_keep = {
                iid: it for iid, it in items.items()
                if not (it.status in ("done", "canceled") and it.updated_at < cutoff)
            }
            deleted = len(items) - len(to_keep)
            if deleted:
                # Rewrite the file
                with open(self._items_path, "w", encoding="utf-8") as f:
                    for it in to_keep.values():
                        f.write(json.dumps(it.to_dict(), default=str) + "\n")
        return deleted
# ─── Postgres backend ─────────────────────────────────────────────────────────
class PostgresBacklogStore(BacklogStore):
    """
    Postgres-backed store using psycopg2 (sync).
    Tables: backlog_items, backlog_events (created by migration script).

    Connection handling fix: psycopg2's ``with conn`` block only scopes the
    transaction — it does NOT close the connection — so the previous code
    leaked one connection per call. Every statement now runs through
    _cursor(), which commits on success, rolls back on error, and always
    closes the underlying connection.
    """
    def __init__(self, dsn: Optional[str] = None) -> None:
        # DSN resolution: explicit arg → BACKLOG_POSTGRES_DSN → POSTGRES_DSN → local default.
        self._dsn = dsn or os.environ.get(
            "BACKLOG_POSTGRES_DSN",
            os.environ.get("POSTGRES_DSN", "postgresql://localhost/daarion")
        )
        self._lock = threading.Lock()

    def _conn(self):
        """Open a new raw connection (AutoBacklogStore probes health via this)."""
        import psycopg2
        import psycopg2.extras
        return psycopg2.connect(self._dsn)

    def _cursor(self):
        """Context manager yielding a cursor; commits/rolls back the
        transaction and always closes the connection afterwards."""
        from contextlib import contextmanager

        @contextmanager
        def _managed():
            conn = self._conn()
            try:
                with conn:  # transaction scope only (commit/rollback)
                    with conn.cursor() as cur:
                        yield cur
            finally:
                conn.close()  # `with conn` does not close — do it explicitly

        return _managed()

    def create(self, item: BacklogItem) -> BacklogItem:
        """Insert *item*; an existing row with the same dedupe_key is kept as-is."""
        sql = """
            INSERT INTO backlog_items
            (id, created_at, updated_at, env, service, category, title, description,
             priority, status, owner, due_date, source, dedupe_key,
             evidence_refs, tags, meta)
            VALUES
            (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON CONFLICT (dedupe_key) DO NOTHING
        """
        with self._cursor() as cur:
            cur.execute(sql, (
                item.id, item.created_at, item.updated_at,
                item.env, item.service, item.category,
                item.title, item.description, item.priority,
                item.status, item.owner, item.due_date or None,
                item.source, item.dedupe_key,
                json.dumps(item.evidence_refs),
                json.dumps(item.tags),
                json.dumps(item.meta),
            ))
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        """Fetch one item by primary key, or None."""
        with self._cursor() as cur:
            cur.execute("SELECT * FROM backlog_items WHERE id=%s", (item_id,))
            row = cur.fetchone()
            if row:
                return self._row_to_item(row, cur.description)
        return None

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        """Fetch one item by its dedupe key, or None."""
        with self._cursor() as cur:
            cur.execute("SELECT * FROM backlog_items WHERE dedupe_key=%s", (key,))
            row = cur.fetchone()
            if row:
                return self._row_to_item(row, cur.description)
        return None

    def update(self, item: BacklogItem) -> BacklogItem:
        """Persist the mutable fields of *item*, refreshing updated_at."""
        item.updated_at = _now_iso()
        sql = """
            UPDATE backlog_items SET
              updated_at=%s, title=%s, description=%s, priority=%s,
              status=%s, owner=%s, due_date=%s, evidence_refs=%s, tags=%s, meta=%s
            WHERE id=%s
        """
        with self._cursor() as cur:
            cur.execute(sql, (
                item.updated_at, item.title, item.description,
                item.priority, item.status, item.owner,
                item.due_date or None,
                json.dumps(item.evidence_refs),
                json.dumps(item.tags),
                json.dumps(item.meta),
                item.id,
            ))
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        """Return filtered items ordered by priority, then due date (NULLs last)."""
        filters = filters or {}
        where, params = _pg_where_clause(filters)
        sql = f"""
            SELECT * FROM backlog_items {where}
            ORDER BY priority ASC, due_date ASC NULLS LAST
            LIMIT %s OFFSET %s
        """
        with self._cursor() as cur:
            cur.execute(sql, params + [limit, offset])
            rows = cur.fetchall()
            desc = cur.description
        return [self._row_to_item(r, desc) for r in rows]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        """Insert one event row for an item."""
        sql = """
            INSERT INTO backlog_events (id, item_id, ts, type, message, actor, meta)
            VALUES (%s,%s,%s,%s,%s,%s,%s)
        """
        with self._cursor() as cur:
            cur.execute(sql, (
                event.id, event.item_id, event.ts,
                event.type, event.message, event.actor,
                json.dumps(event.meta),
            ))
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """Return up to *limit* newest events for *item_id*, most recent first."""
        with self._cursor() as cur:
            cur.execute(
                "SELECT * FROM backlog_events WHERE item_id=%s ORDER BY ts DESC LIMIT %s",
                (item_id, limit)
            )
            rows = cur.fetchall()
            desc = cur.description
        return [self._row_to_event(r, desc) for r in rows]

    def cleanup(self, retention_days: int = 180) -> int:
        """Delete done/canceled items older than the retention window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._cursor() as cur:
            cur.execute(
                """DELETE FROM backlog_items
                   WHERE status IN ('done','canceled') AND updated_at < %s""",
                (cutoff,)
            )
            return cur.rowcount

    @staticmethod
    def _row_to_item(row, description) -> BacklogItem:
        """Map a DB row to BacklogItem, decoding the JSON-encoded columns."""
        d = {col.name: val for col, val in zip(description, row)}
        for json_key in ("evidence_refs", "tags", "meta"):
            v = d.get(json_key)
            if isinstance(v, str):
                try:
                    d[json_key] = json.loads(v)
                except Exception:
                    d[json_key] = {} if json_key != "tags" else []
        return BacklogItem.from_dict(d)

    @staticmethod
    def _row_to_event(row, description) -> BacklogEvent:
        """Map a DB row to BacklogEvent, decoding the JSON meta column."""
        d = {col.name: val for col, val in zip(description, row)}
        if isinstance(d.get("meta"), str):
            try:
                d["meta"] = json.loads(d["meta"])
            except Exception:
                d["meta"] = {}
        return BacklogEvent.from_dict(d)
def _pg_where_clause(filters: Dict):
clauses, params = [], []
if filters.get("env"):
clauses.append("env=%s"); params.append(filters["env"])
if filters.get("service"):
clauses.append("service=%s"); params.append(filters["service"])
if filters.get("status"):
if isinstance(filters["status"], list):
ph = ",".join(["%s"] * len(filters["status"]))
clauses.append(f"status IN ({ph})"); params.extend(filters["status"])
else:
clauses.append("status=%s"); params.append(filters["status"])
if filters.get("owner"):
clauses.append("owner=%s"); params.append(filters["owner"])
if filters.get("category"):
clauses.append("category=%s"); params.append(filters["category"])
if filters.get("due_before"):
clauses.append("due_date < %s"); params.append(filters["due_before"])
return ("WHERE " + " AND ".join(clauses)) if clauses else "", params
# ─── Null backend ─────────────────────────────────────────────────────────────
class NullBacklogStore(BacklogStore):
    """No-op backend: accepts writes, returns empty reads (storage disabled)."""

    def create(self, item):
        return item

    def get(self, item_id):
        return None

    def get_by_dedupe_key(self, key):
        return None

    def update(self, item):
        return item

    def list_items(self, filters=None, limit=50, offset=0):
        return []

    def add_event(self, event):
        return event

    def get_events(self, item_id, limit=50):
        return []

    def cleanup(self, retention_days=180):
        return 0
# ─── Auto backend (Postgres → JSONL fallback) ─────────────────────────────────
class AutoBacklogStore(BacklogStore):
    """Postgres primary with JSONL fallback. Retries Postgres after 5 min.

    Every public method delegates to _backend(), which returns the active
    Postgres store when available and otherwise the JSONL store, re-probing
    Postgres at most once per _RETRY_SEC.

    NOTE(review): _backend() does the re-probe without taking self._lock, so
    two threads may call _try_init_pg() concurrently — confirm that is
    acceptable before relying on this under heavy concurrency.
    """
    # Seconds to wait after a Postgres failure before probing it again.
    _RETRY_SEC = 300
    def __init__(
        self,
        postgres_dsn: Optional[str] = None,
        jsonl_items: str = _JSONL_ITEMS,
        jsonl_events: str = _JSONL_EVENTS,
    ) -> None:
        self._pg: Optional[PostgresBacklogStore] = None
        self._jsonl = JsonlBacklogStore(jsonl_items, jsonl_events)
        self._dsn = postgres_dsn
        # Monotonic-ish wall-clock time of the last Postgres failure; None = healthy.
        self._pg_failed_at: Optional[float] = None
        self._lock = threading.Lock()
        self._try_init_pg()
    def _try_init_pg(self) -> None:
        """Probe Postgres; on failure, record the time and fall back to JSONL."""
        try:
            self._pg = PostgresBacklogStore(self._dsn)
            self._pg._conn().close()  # test connection
            self._pg_failed_at = None
            logger.info("AutoBacklogStore: Postgres backend active")
        except Exception as e:
            logger.warning("AutoBacklogStore: Postgres unavailable, using JSONL: %s", e)
            self._pg = None
            import time
            self._pg_failed_at = time.time()
    def _backend(self) -> BacklogStore:
        """Return the active store, re-probing Postgres after the retry window."""
        if self._pg is not None:
            return self._pg
        import time
        if (self._pg_failed_at is None
                or time.time() - self._pg_failed_at >= self._RETRY_SEC):
            self._try_init_pg()
        return self._pg if self._pg is not None else self._jsonl
    # Pure delegation — the active backend is resolved per call.
    def create(self, item): return self._backend().create(item)
    def get(self, item_id): return self._backend().get(item_id)
    def get_by_dedupe_key(self, key): return self._backend().get_by_dedupe_key(key)
    def update(self, item): return self._backend().update(item)
    def list_items(self, filters=None, limit=50, offset=0):
        return self._backend().list_items(filters, limit, offset)
    def add_event(self, event): return self._backend().add_event(event)
    def get_events(self, item_id, limit=50): return self._backend().get_events(item_id, limit)
    def cleanup(self, retention_days=180): return self._backend().cleanup(retention_days)
# ─── Filters helper ───────────────────────────────────────────────────────────
def _apply_filters(items: List[BacklogItem], filters: Dict) -> List[BacklogItem]:
result = []
for it in items:
if filters.get("env") and it.env != filters["env"]:
continue
if filters.get("service") and it.service != filters["service"]:
continue
if filters.get("status"):
statuses = filters["status"] if isinstance(filters["status"], list) else [filters["status"]]
if it.status not in statuses:
continue
if filters.get("owner") and it.owner != filters["owner"]:
continue
if filters.get("category") and it.category != filters["category"]:
continue
if filters.get("due_before") and it.due_date and it.due_date >= filters["due_before"]:
continue
result.append(it)
return result
# ─── Factory ──────────────────────────────────────────────────────────────────
# Process-wide singleton, guarded by _STORE_LOCK.
_STORE_INSTANCE: Optional[BacklogStore] = None
_STORE_LOCK = threading.Lock()
def get_backlog_store() -> BacklogStore:
    """Return the process-wide BacklogStore singleton.

    Backend is chosen once from BACKLOG_BACKEND (memory | jsonl | postgres |
    null); anything else — including the default "auto" — yields
    AutoBacklogStore (Postgres primary with JSONL fallback).
    """
    global _STORE_INSTANCE
    with _STORE_LOCK:
        if _STORE_INSTANCE is None:
            choice = os.environ.get("BACKLOG_BACKEND", "auto").lower()
            factories = {
                "memory": MemoryBacklogStore,
                "jsonl": JsonlBacklogStore,
                "postgres": PostgresBacklogStore,
                "null": NullBacklogStore,
            }
            _STORE_INSTANCE = factories.get(choice, AutoBacklogStore)()
            logger.info("backlog_store: using %s backend",
                        type(_STORE_INSTANCE).__name__)
        return _STORE_INSTANCE
def _reset_store_for_tests() -> None:
    """Drop the cached singleton so tests can force backend re-selection."""
    global _STORE_INSTANCE
    with _STORE_LOCK:
        _STORE_INSTANCE = None

View File

@@ -0,0 +1,595 @@
"""
Cost & Resource Analyzer (FinOps MVP)
Reads audit events from AuditStore and computes:
- Aggregated cost_units by tool/agent/workspace/status
- Top spenders (tools, agents, users)
- Anomalies (cost spikes, error rate spikes)
- Cost model weights
"cost_units" = cost_per_call(tool) + duration_ms * cost_per_ms(tool)
These are relative units, not real dollars.
No payload access — all inputs are aggregation parameters only.
"""
from __future__ import annotations
import datetime
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Config loader ────────────────────────────────────────────────────────────
# Memoized parse of cost_weights.yml; None means "not loaded yet".
_weights_cache: Optional[Dict] = None
# Config lives at <repo>/config/cost_weights.yml; REPO_ROOT overrides the
# default of three directories above this module.
_WEIGHTS_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "cost_weights.yml",
)
def _load_weights() -> Dict:
    """Load and memoize cost_weights.yml; return {} when missing/unparsable.

    A failed load is cached as {} too — call reload_cost_weights() to retry.
    """
    global _weights_cache
    if _weights_cache is not None:
        return _weights_cache
    try:
        import yaml
        with open(_WEIGHTS_PATH, "r") as f:
            _weights_cache = yaml.safe_load(f) or {}
    except Exception as e:
        logger.warning("cost_weights.yml not loaded: %s", e)
        _weights_cache = {}
    return _weights_cache
def reload_cost_weights() -> None:
    """Invalidate the cached weights so the next _load_weights() re-reads disk.

    Primarily used by tests to force a fresh configuration read.
    """
    global _weights_cache
    _weights_cache = None
def get_weights_for_tool(tool: str) -> Tuple[float, float]:
    """Return (cost_per_call, cost_per_ms) for *tool*.

    Per-tool overrides come from the `tools:` section of cost_weights.yml,
    falling back to `defaults:`, then to the hard-coded 1.0 / 0.001.
    """
    cfg = _load_weights()
    fallback = cfg.get("defaults", {})
    override = (cfg.get("tools") or {}).get(tool, {})
    per_call = float(override.get("cost_per_call", fallback.get("cost_per_call", 1.0)))
    per_ms = float(override.get("cost_per_ms", fallback.get("cost_per_ms", 0.001)))
    return per_call, per_ms
def compute_event_cost(event: Dict) -> float:
    """Compute cost_units for a single audit event.

    cost = cost_per_call(tool) + duration_ms * cost_per_ms(tool), rounded to
    4 decimals. Robustness fix: events serialized with duration_ms=null no
    longer crash — a None (or missing) duration counts as 0.
    """
    tool = event.get("tool", "")
    # `or 0` also guards against an explicit None in the event payload.
    duration_ms = float(event.get("duration_ms") or 0)
    cpc, cpm = get_weights_for_tool(tool)
    return round(cpc + duration_ms * cpm, 4)
# ─── Time helpers ─────────────────────────────────────────────────────────────
def _now_utc() -> datetime.datetime:
return datetime.datetime.now(datetime.timezone.utc)
def _iso(dt: datetime.datetime) -> str:
return dt.isoformat()
def _parse_iso(s: str) -> datetime.datetime:
s = s.replace("Z", "+00:00")
try:
return datetime.datetime.fromisoformat(s)
except Exception:
return _now_utc()
def _bucket_hour(ts: str) -> str:
    """Truncate an ISO timestamp to its hour bucket.

    '2026-02-23T10:15:30+00:00' -> '2026-02-23T10:00' (first 13 chars + ':00').
    The shortened form is only used as a grouping/sort key, so the missing
    seconds/offset are intentional.
    """
    return ts[:13] + ":00"
# ─── Aggregation helpers ──────────────────────────────────────────────────────
def _aggregate(
    events: List[Dict],
    group_keys: List[str],
) -> Dict[str, Dict]:
    """
    Aggregate events by composite key (e.g. ["tool"] or ["agent_id", "tool"]).
    Returns {key_str: {count, cost_units, duration_sum, failed_count, ...}}.

    NOTE(review): cost/duration sums are rounded after every accumulation, so
    results depend on event order; any restructure must preserve that.
    """
    # Each new key starts from a zeroed metrics record.
    result: Dict[str, Dict] = defaultdict(lambda: {
        "count": 0,
        "cost_units": 0.0,
        "duration_ms_sum": 0.0,
        "failed_count": 0,
        "denied_count": 0,
        "in_size_sum": 0,
        "out_size_sum": 0,
    })
    for ev in events:
        # Composite key: one part per group field, "unknown" when absent.
        parts = [str(ev.get(k, "unknown")) for k in group_keys]
        key = ":".join(parts)
        cost = compute_event_cost(ev)
        status = ev.get("status", "pass")
        r = result[key]
        r["count"] += 1
        r["cost_units"] = round(r["cost_units"] + cost, 4)
        r["duration_ms_sum"] = round(r["duration_ms_sum"] + float(ev.get("duration_ms", 0)), 2)
        r["in_size_sum"] += int(ev.get("in_size", 0))
        r["out_size_sum"] += int(ev.get("out_size", 0))
        if status in ("failed", "error"):
            r["failed_count"] += 1
        elif status == "denied":
            r["denied_count"] += 1
    # Enrich with averages
    for key, r in result.items():
        n = r["count"] or 1
        r["avg_duration_ms"] = round(r["duration_ms_sum"] / n, 1)
        r["avg_cost_units"] = round(r["cost_units"] / n, 4)
        r["error_rate"] = round(r["failed_count"] / (r["count"] or 1), 4)
    return dict(result)
def _top_n(aggregated: Dict[str, Dict], key_field: str, n: int, sort_by: str = "cost_units") -> List[Dict]:
"""Sort aggregated dict by sort_by and return top N."""
items = [
{"key": k, key_field: k, **v}
for k, v in aggregated.items()
]
items.sort(key=lambda x: x.get(sort_by, 0), reverse=True)
return items[:n]
# ─── Actions ──────────────────────────────────────────────────────────────────
def action_report(
    store,
    time_range: Optional[Dict[str, str]] = None,
    group_by: Optional[List[str]] = None,
    top_n: int = 10,
    include_failed: bool = True,
    include_hourly: bool = False,
) -> Dict[str, Any]:
    """
    Generate aggregated cost report for a time range.

    Args:
        store: AuditStore-like object exposing read(from_ts, to_ts, limit).
        time_range: optional {"from": iso, "to": iso}; defaults to last 7 days.
        group_by: breakdown dimensions (default ["tool"]).
        top_n: rows kept per breakdown.
        include_failed: when False, failed/error events are dropped up front.
        include_hourly: add an hourly count/cost trend.

    Returns:
        totals, breakdowns by group_by keys, top spenders, optional hourly trend.
    """
    now = _now_utc()
    tr = time_range or {}
    from_ts = tr.get("from") or _iso(now - datetime.timedelta(days=7))
    to_ts = tr.get("to") or _iso(now)
    events = store.read(from_ts=from_ts, to_ts=to_ts, limit=200_000)
    if not include_failed:
        events = [e for e in events if e.get("status", "pass") not in ("failed", "error")]
    # Totals
    total_cost = sum(compute_event_cost(e) for e in events)
    total_calls = len(events)
    total_failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    total_denied = sum(1 for e in events if e.get("status") == "denied")
    # Breakdowns
    by_key = group_by or ["tool"]
    breakdowns: Dict[str, List[Dict]] = {}
    for gk in by_key:
        agg = _aggregate(events, [gk])
        breakdowns[gk] = _top_n(agg, gk, top_n)
    # Hourly trend (optional, for last 7d max)
    hourly: List[Dict] = []
    if include_hourly and events:
        hourly_agg: Dict[str, Dict] = defaultdict(lambda: {"count": 0, "cost_units": 0.0})
        for ev in events:
            bucket = _bucket_hour(ev.get("ts", ""))
            hourly_agg[bucket]["count"] += 1
            hourly_agg[bucket]["cost_units"] = round(
                hourly_agg[bucket]["cost_units"] + compute_event_cost(ev), 4
            )
        hourly = [{"hour": k, **v} for k, v in sorted(hourly_agg.items())]
    return {
        "time_range": {"from": from_ts, "to": to_ts},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": total_failed,
            "denied": total_denied,
            "error_rate": round(total_failed / (total_calls or 1), 4),
        },
        "breakdowns": breakdowns,
        **({"hourly": hourly} if include_hourly else {}),
    }
def action_top(
    store,
    window_hours: int = 24,
    top_n: int = 10,
) -> Dict[str, Any]:
    """
    Quick top-N report for tools, agents, users and workspaces over window_hours.
    """
    now = _now_utc()
    window_start = _iso(now - datetime.timedelta(hours=window_hours))
    window_end = _iso(now)
    events = store.read(from_ts=window_start, to_ts=window_end, limit=100_000)
    report: Dict[str, Any] = {
        "window_hours": window_hours,
        "time_range": {"from": window_start, "to": window_end},
        "total_calls": len(events),
    }
    # One ranked breakdown per grouping dimension, in a fixed output order.
    for result_key, group_field in (
        ("top_tools", "tool"),
        ("top_agents", "agent_id"),
        ("top_users", "user_id"),
        ("top_workspaces", "workspace_id"),
    ):
        report[result_key] = _top_n(_aggregate(events, [group_field]), group_field, top_n)
    return report
def action_anomalies(
    store,
    window_minutes: int = 60,
    baseline_hours: int = 24,
    ratio_threshold: Optional[float] = None,
    min_calls: Optional[int] = None,
    tools_filter: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Detect cost/call spikes and elevated error rates.
    Algorithm:
      1. Compute per-tool metrics for window [now-window_minutes, now]
      2. Compute per-tool metrics for baseline [now-baseline_hours, now-window_minutes]
      3. Spike = window_rate / baseline_rate >= ratio_threshold AND calls >= min_calls
      4. Error spike = failed_rate > 10% AND calls >= min_calls

    Thresholds default from cost_weights.yml `anomaly:` (3.0 ratio, 10 calls).

    NOTE(review): the "ratio" field in a cost_spike anomaly is a float
    normally but the string "∞ (no baseline)" when the tool has no baseline
    traffic — consumers must handle both types.
    """
    cfg = _load_weights()
    anomaly_cfg = cfg.get("anomaly", {})
    if ratio_threshold is None:
        ratio_threshold = float(anomaly_cfg.get("spike_ratio_threshold", 3.0))
    if min_calls is None:
        min_calls = int(anomaly_cfg.get("min_calls_threshold", 10))
    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(minutes=window_minutes))
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
    baseline_to = window_from  # non-overlapping
    # Fetch both windows
    window_events = store.read(from_ts=window_from, to_ts=_iso(now), limit=50_000)
    baseline_events = store.read(from_ts=baseline_from, to_ts=baseline_to, limit=200_000)
    if tools_filter:
        window_events = [e for e in window_events if e.get("tool") in tools_filter]
        baseline_events = [e for e in baseline_events if e.get("tool") in tools_filter]
    # Aggregate by tool
    window_by_tool = _aggregate(window_events, ["tool"])
    baseline_by_tool = _aggregate(baseline_events, ["tool"])
    # Normalise baseline to per-minute rate
    baseline_minutes = (baseline_hours * 60) - window_minutes
    baseline_minutes = max(baseline_minutes, 1)
    window_minutes_actual = float(window_minutes)
    anomalies = []
    all_tools = set(window_by_tool.keys()) | set(baseline_by_tool.keys())
    for tool_key in sorted(all_tools):
        w = window_by_tool.get(tool_key, {})
        b = baseline_by_tool.get(tool_key, {})
        w_calls = w.get("count", 0)
        b_calls = b.get("count", 0)
        if w_calls < min_calls:
            continue  # Not enough traffic for meaningful anomaly
        # Per-minute rates
        w_rate = w_calls / window_minutes_actual
        b_rate = b_calls / baseline_minutes if b_calls > 0 else 0.0
        # Cost spike
        w_cost_pm = w.get("cost_units", 0) / window_minutes_actual
        b_cost_pm = b.get("cost_units", 0) / baseline_minutes if b_calls > 0 else 0.0
        # Zero baseline -> infinite ratio (always above any finite threshold).
        call_ratio = (w_rate / b_rate) if b_rate > 0 else float("inf")
        cost_ratio = (w_cost_pm / b_cost_pm) if b_cost_pm > 0 else float("inf")
        if call_ratio >= ratio_threshold or cost_ratio >= ratio_threshold:
            ratio_display = round(max(call_ratio, cost_ratio), 2)
            if ratio_display == float("inf"):
                # Display value switches type to str here (see docstring note).
                ratio_display = "∞ (no baseline)"
            w_cost = w.get("cost_units", 0)
            b_cost = b.get("cost_units", 0)
            anomalies.append({
                "type": "cost_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "baseline": f"prev_{baseline_hours}h",
                "window_calls": w_calls,
                "baseline_calls": b_calls,
                "window_cost_units": round(w_cost, 2),
                "baseline_cost_units": round(b_cost, 2),
                "ratio": ratio_display,
                "recommendation": _spike_recommendation(tool_key, ratio_display, w_calls),
            })
        # Error rate spike
        w_err_rate = w.get("error_rate", 0)
        if w_err_rate > 0.10 and w_calls >= min_calls:
            anomalies.append({
                "type": "error_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "failed_calls": w.get("failed_count", 0),
                "total_calls": w_calls,
                "error_rate": round(w_err_rate, 4),
                "recommendation": f"Investigate failures for '{tool_key}': {w.get('failed_count',0)} failed / {w_calls} calls ({round(w_err_rate*100,1)}% error rate).",
            })
    # De-duplicate tool+type combos (error_spike already separate)
    seen = set()
    unique_anomalies = []
    for a in anomalies:
        key = (a["type"], a.get("tool", ""))
        if key not in seen:
            unique_anomalies.append(a)
            seen.add(key)
    return {
        "anomalies": unique_anomalies,
        "anomaly_count": len(unique_anomalies),
        "window_minutes": window_minutes,
        "baseline_hours": baseline_hours,
        "ratio_threshold": ratio_threshold,
        "min_calls": min_calls,
        "stats": {
            "window_calls": len(window_events),
            "baseline_calls": len(baseline_events),
        },
    }
def action_weights(repo_root: Optional[str] = None) -> Dict[str, Any]:
    """Return the current cost-weights configuration, always re-read from disk.

    Args:
        repo_root: accepted for interface compatibility but currently unused —
            the config path is fixed at import time via _WEIGHTS_PATH.
    """
    # Consistency fix: invalidate the cache via reload_cost_weights() instead
    # of poking the module-global directly (same effect, one code path).
    reload_cost_weights()
    cfg = _load_weights()
    return {
        "defaults": cfg.get("defaults", {}),
        "tools": cfg.get("tools", {}),
        "anomaly": cfg.get("anomaly", {}),
        "config_path": _WEIGHTS_PATH,
    }
# ─── Recommendation templates ─────────────────────────────────────────────────
def _spike_recommendation(tool: str, ratio: Any, calls: int) -> str:
    """Build a human-readable remediation hint for a cost spike on *tool*.

    The tool's `category` in cost_weights.yml selects a tailored template;
    unknown or missing categories fall through to the generic advice.
    """
    category = (_load_weights().get("tools") or {}).get(tool, {}).get("category", "")
    if category == "media":
        return (
            f"'{tool}' cost spike (ratio={ratio}, {calls} calls). "
            "Consider: rate-limit per workspace, queue with priority, review calling agents."
        )
    if category == "release":
        return (
            f"'{tool}' called more frequently than baseline (ratio={ratio}). "
            "Review if release_check is looping or being triggered too often."
        )
    if category == "web":
        return (
            f"'{tool}' spike (ratio={ratio}). Consider: result caching, dedup identical queries."
        )
    return (
        f"'{tool}' cost spike (ratio={ratio}, {calls} calls in window). "
        "Review caller agents and apply rate limits if needed."
    )
# ─── backend=auto store resolver ─────────────────────────────────────────────
def _resolve_store(backend: str = "auto"):
    """
    Return an AuditStore based on backend param.
    backend='auto' (default): uses the globally configured store (which may be
        AutoAuditStore, Postgres, or JSONL).
    backend='jsonl': forces JsonlAuditStore (7-day window max recommended).
    backend='memory': MemoryAuditStore (testing).
    Any unrecognised value falls back to the global store.
    """
    # Imported lazily to avoid a hard dependency at module import time.
    from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
    if backend in ("auto", None, ""):
        return get_audit_store()
    if backend == "jsonl":
        # Fix: dropped the redundant local `import os` / `from pathlib import
        # Path` — both are already module-level imports in this file.
        audit_dir = os.getenv(
            "AUDIT_JSONL_DIR",
            str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
        )
        return JsonlAuditStore(audit_dir)
    if backend == "memory":
        return MemoryAuditStore()
    return get_audit_store()
# ─── Digest action ────────────────────────────────────────────────────────────
def action_digest(
    store,
    window_hours: int = 24,
    baseline_hours: int = 168,  # 7 days
    top_n: int = 10,
    max_markdown_chars: int = 3800,
) -> Dict:
    """
    Daily/weekly cost digest: top tools/agents + anomalies + recommendations.
    Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.

    Args:
        store: AuditStore-like object exposing read(from_ts, to_ts, limit).
        window_hours: reporting window size.
        baseline_hours: baseline span passed to the anomaly detector.
        top_n: rows kept in the structured top_tools/top_agents lists.
        max_markdown_chars: hard cap for the rendered markdown (truncated after).
    """
    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(hours=window_hours))
    window_to = _iso(now)
    # NOTE(review): baseline_from is computed but never used below — the
    # anomaly call passes baseline_hours directly. Dead variable?
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
    # ── Top ──────────────────────────────────────────────────────────────────
    top_data = action_top(store, window_hours=window_hours, top_n=top_n)
    top_tools = top_data.get("top_tools") or []
    top_agents = top_data.get("top_agents") or []
    total_calls = top_data.get("total_calls", 0)
    # ── Anomalies ─────────────────────────────────────────────────────────────
    # Anomaly window = a quarter of the digest window, expressed in minutes.
    anomaly_data = action_anomalies(
        store,
        window_minutes=int(window_hours * 60 / 4),
        baseline_hours=baseline_hours,
        min_calls=5,
    )
    anomalies = anomaly_data.get("anomalies") or []
    # ── Total cost ────────────────────────────────────────────────────────────
    events = store.read(from_ts=window_from, to_ts=window_to, limit=200_000)
    total_cost = sum(compute_event_cost(e) for e in events)
    failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    error_rate = round(failed / max(len(events), 1), 4)
    # ── Recommendations ───────────────────────────────────────────────────────
    recs = []
    for a in anomalies[:5]:
        r = a.get("recommendation", "")
        if r:
            recs.append(r)
    if error_rate > 0.05:
        recs.append(f"High error rate {round(error_rate*100,1)}% — investigate failing tools.")
    if top_tools and top_tools[0].get("cost_units", 0) > 500:
        tool_name = top_tools[0].get("tool", "?")
        recs.append(f"Top spender '{tool_name}' used {top_tools[0]['cost_units']:.0f} cost units — review frequency.")
    # dict.fromkeys dedupes while preserving first-seen order.
    recs = list(dict.fromkeys(recs))[:8]
    # ── Markdown ─────────────────────────────────────────────────────────────
    period_label = f"Last {window_hours}h" if window_hours <= 48 else f"Last {window_hours//24}d"
    lines = [
        f"📊 **Cost Digest** ({period_label})",
        f"Total calls: {total_calls} | Cost units: {total_cost:.0f} | Errors: {round(error_rate*100,1)}%",
        "",
        "**Top Tools:**",
    ]
    for t in top_tools[:5]:
        lines.append(f"  • `{t.get('tool','?')}` — {t.get('cost_units',0):.1f}u, {t.get('count',0)} calls")
    lines.append("")
    lines.append("**Top Agents:**")
    for a in top_agents[:3]:
        lines.append(f"  • `{a.get('agent_id','?')}` — {a.get('cost_units',0):.1f}u, {a.get('count',0)} calls")
    if anomalies:
        lines.append("")
        lines.append(f"⚠️ **{len(anomalies)} Anomaly(ies):**")
        for anm in anomalies[:3]:
            lines.append(f"  • [{anm.get('type','?')}] `{anm.get('tool','?')}` ratio={anm.get('ratio','?')}")
    if recs:
        lines.append("")
        lines.append("💡 **Recommendations:**")
        for r in recs[:5]:
            lines.append(f"  {r[:200]}")
    markdown = "\n".join(lines)
    if len(markdown) > max_markdown_chars:
        markdown = markdown[:max_markdown_chars] + "\n…[truncated]"
    return {
        "period": period_label,
        "window_hours": window_hours,
        "time_range": {"from": window_from, "to": window_to},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": failed,
            "error_rate": error_rate,
        },
        "top_tools": top_tools[:top_n],
        "top_agents": top_agents[:top_n],
        "anomalies": anomalies[:10],
        "anomaly_count": len(anomalies),
        "recommendations": recs,
        "markdown": markdown,
    }
# ─── Main entrypoint ─────────────────────────────────────────────────────────
def analyze_cost_dict(action: str, params: Optional[Dict] = None, store=None) -> Dict:
    """
    Wrapper called by tool_manager handler.
    Returns plain dict for ToolResult.

    Args:
        action: one of "digest", "report", "top", "anomalies", "weights";
            anything else yields an {"error": ...} dict.
        params: per-action keyword arguments (numeric values are coerced).
        store: optional AuditStore override; when None it is resolved from
            params["backend"] (default "auto").
    """
    params = params or {}
    if store is None:
        backend = params.get("backend", "auto")
        store = _resolve_store(backend)
    if action == "digest":
        return action_digest(
            store,
            window_hours=int(params.get("window_hours", 24)),
            baseline_hours=int(params.get("baseline_hours", 168)),
            top_n=int(params.get("top_n", 10)),
            max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
        )
    if action == "report":
        return action_report(
            store,
            time_range=params.get("time_range"),
            group_by=params.get("group_by", ["tool"]),
            top_n=int(params.get("top_n", 10)),
            include_failed=bool(params.get("include_failed", True)),
            include_hourly=bool(params.get("include_hourly", False)),
        )
    if action == "top":
        return action_top(
            store,
            window_hours=int(params.get("window_hours", 24)),
            top_n=int(params.get("top_n", 10)),
        )
    if action == "anomalies":
        return action_anomalies(
            store,
            window_minutes=int(params.get("window_minutes", 60)),
            baseline_hours=int(params.get("baseline_hours", 24)),
            # These three keep their None default so the action applies its own
            # config-driven fallbacks.
            ratio_threshold=params.get("ratio_threshold"),
            min_calls=params.get("min_calls"),
            tools_filter=params.get("tools_filter"),
        )
    if action == "weights":
        return action_weights()
    return {"error": f"Unknown action '{action}'. Valid: digest, report, top, anomalies, weights"}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,968 @@
"""
Dependency & Supply Chain Scanner.
Scans Python and Node.js dependencies for:
1. Known vulnerabilities (via OSV.dev API or offline cache)
2. Outdated packages (lockfile_only mode, using OSV fixed_versions)
3. License policy enforcement (optional, MVP: offline-only)
Ecosystems supported:
Python → poetry.lock, pipfile.lock, requirements*.txt, pyproject.toml
Node → package-lock.json, pnpm-lock.yaml, yarn.lock, package.json
Pass rule: pass=false if any vuln with severity in fail_on (default: CRITICAL, HIGH).
MEDIUM → warning (not blocking by default). UNKNOWN → warning if not in fail_on.
Security:
- Read-only: no file writes except cache update (explicit)
- Evidence masked for secrets
- Payload not logged; only hash + counts
- Max files/deps enforced via limits
- Timeout via deadline
"""
from __future__ import annotations
import csv
import fnmatch
import hashlib
import json
import logging
import os
import re
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ─── Constants ────────────────────────────────────────────────────────────────
# Directory names skipped during dependency-file discovery.
# Contains one multi-segment entry ("docs/consolidation") — exclusion matching
# must handle path fragments, not just single path components.
EXCLUDED_DIRS: FrozenSet[str] = frozenset({
    "node_modules", ".git", "dist", "build", "vendor",
    ".venv", "venv", "venv_models", "sofia_venv",
    "__pycache__", ".pytest_cache", "rollback_backups",
    "docs/consolidation",
})
# OSV.dev batch vulnerability-query endpoint.
OSV_API_URL = "https://api.osv.dev/v1/querybatch"
OSV_BATCH_SIZE = 100  # max per request
OSV_TIMEOUT_SEC = 15.0  # HTTP timeout for OSV requests, in seconds
# OSV ecosystems
ECOSYSTEM_PYPI = "PyPI"
ECOSYSTEM_NPM = "npm"
# Higher value = more severe; used for ranking/threshold comparisons.
SEVERITY_ORDER = {"CRITICAL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1, "UNKNOWN": 0}
# ─── Data Structures ──────────────────────────────────────────────────────────
@dataclass
class Package:
    """One declared dependency discovered in a manifest or lockfile."""
    name: str          # as written in the manifest (original casing)
    version: str       # empty string = unresolved/unpinned
    ecosystem: str     # "PyPI" | "npm"
    source_file: str   # path of the manifest this package came from
    pinned: bool = True  # False for range/unpinned requirement lines
    @property
    def normalized_name(self) -> str:
        """Lowercased name with underscores folded to dashes (PyPI-style)."""
        return self.name.lower().replace("_", "-")
    @property
    def cache_key(self) -> str:
        """Stable lookup key: 'ecosystem:normalized-name:version'."""
        return f"{self.ecosystem}:{self.normalized_name}:{self.version}"
@dataclass
class Vulnerability:
    """One OSV advisory matched against a scanned package version."""
    osv_id: str                # OSV advisory identifier
    ecosystem: str             # "PyPI" | "npm"
    package: str
    version: str
    severity: str  # CRITICAL | HIGH | MEDIUM | LOW | UNKNOWN
    fixed_versions: List[str]  # versions that remediate the advisory
    aliases: List[str]  # CVE-XXXX-XXXX etc.
    evidence: Dict[str, str]   # supporting context (masked per module policy)
    recommendation: str        # suggested remediation text
@dataclass
class OutdatedPackage:
    """A package behind a newer known version (derived from OSV fixed_versions)."""
    ecosystem: str
    package: str
    current: str             # currently pinned version
    latest: Optional[str]    # None when no newer version could be determined
    notes: str
@dataclass
class LicenseFinding:
    """License-policy evaluation result for one package."""
    package: str
    license: str
    policy: str  # "deny" | "warn" | "ok" | "unknown"
    recommendation: str
@dataclass
class ScanResult:
    """Top-level scan outcome returned to the tool handler."""
    pass_: bool  # trailing underscore avoids the `pass` keyword
    summary: str               # one-line human summary
    stats: Dict[str, Any]      # counters (files, deps, findings, ...)
    vulnerabilities: List[Dict]
    outdated: List[Dict]
    licenses: List[Dict]
    recommendations: List[str]
# ─── Helpers ──────────────────────────────────────────────────────────────────
_SECRET_PAT = re.compile(
r'(?i)(api[_-]?key|token|secret|password|bearer|jwt|private[_-]?key)'
r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?'
)
def _redact(text: str) -> str:
return _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***REDACTED***", text or "")
def _is_excluded(path: str) -> bool:
    """Return True when *path* falls under any excluded directory.

    Fix: EXCLUDED_DIRS entries containing '/' (e.g. "docs/consolidation")
    could never equal a single Path component, so that exclusion silently
    never fired; multi-segment entries are now matched as sub-paths.
    """
    parts = Path(path).parts
    if any(p in EXCLUDED_DIRS for p in parts):
        return True
    # Multi-segment exclusions: look for the fragment anywhere in the path.
    posix = "/".join(parts)
    return any(
        "/" in entry and f"/{entry}/" in f"/{posix}/"
        for entry in EXCLUDED_DIRS
    )
def _read_file(path: str, max_bytes: int = 524288) -> str:
try:
size = os.path.getsize(path)
with open(path, "r", errors="replace") as f:
return f.read(min(size, max_bytes))
except Exception:
return ""
def _normalize_pkg_name(name: str) -> str:
"""Normalize: lowercase, underscores → dashes."""
return name.strip().lower().replace("_", "-")
def _compare_versions(v1: str, v2: str) -> int:
"""
Simple version comparison. Returns -1 / 0 / 1.
Handles semver and PEP 440 in a best-effort way.
"""
def _parts(v: str) -> List[int]:
nums = re.findall(r'\d+', v.split("+")[0].split("-")[0])
return [int(x) for x in nums] if nums else [0]
p1, p2 = _parts(v1), _parts(v2)
# Pad to equal length
max_len = max(len(p1), len(p2))
p1 += [0] * (max_len - len(p1))
p2 += [0] * (max_len - len(p2))
if p1 < p2:
return -1
if p1 > p2:
return 1
return 0
# ─── Python Parsers ───────────────────────────────────────────────────────────
def _parse_poetry_lock(content: str, source_file: str) -> List[Package]:
    """Extract pinned packages from poetry.lock `[[package]]` sections."""
    name_re = re.compile(r'^name\s*=\s*"([^"]+)"', re.MULTILINE)
    ver_re = re.compile(r'^version\s*=\s*"([^"]+)"', re.MULTILINE)
    found: List[Package] = []
    # The first split chunk is the preamble before any [[package]] header.
    for chunk in re.split(r'\[\[package\]\]', content)[1:]:
        name_match = name_re.search(chunk)
        version_match = ver_re.search(chunk)
        if name_match and version_match:
            found.append(Package(
                name=name_match.group(1),
                version=version_match.group(1),
                ecosystem=ECOSYSTEM_PYPI,
                source_file=source_file,
                pinned=True,
            ))
    return found
def _parse_pipfile_lock(content: str, source_file: str) -> List[Package]:
    """Extract pinned packages from Pipfile.lock (JSON), default + develop."""
    out: List[Package] = []
    try:
        data = json.loads(content)
        for section in ("default", "develop"):
            for pkg_name, pkg_info in (data.get(section) or {}).items():
                # Pipfile.lock stores versions as "==2.28.0" — strip the pin.
                version = re.sub(r'^==', '', pkg_info.get("version", ""))
                if not version:
                    continue
                out.append(Package(
                    name=pkg_name,
                    version=version,
                    ecosystem=ECOSYSTEM_PYPI,
                    source_file=source_file,
                    pinned=True,
                ))
    except Exception as e:
        logger.debug(f"Could not parse Pipfile.lock: {e}")
    return out
# Pinned requirement line: "name[extras]==version" → captures (name, version).
_REQ_LINE_PAT = re.compile(
    r'^([A-Za-z0-9_\-\.]+)(?:\[.*?\])?\s*==\s*([^\s;#]+)',
    re.MULTILINE,
)
# Range-constrained (unpinned) line: "name>=x", "name~=x", … → captures (name).
_REQ_UNPINNED_PAT = re.compile(
    r'^([A-Za-z0-9_\-\.]+)(?:\[.*?\])?\s*[><!~^]=?\s*[^\s;#]+',
    re.MULTILINE,
)
def _parse_requirements_txt(content: str, source_file: str) -> List[Package]:
    """
    Parse requirements.txt.

    Only pinned (==) lines yield concrete versions; range-constrained lines
    are recorded with an empty version (unresolved, no vuln scan possible).
    First occurrence of a normalized name wins.
    """
    packages: List[Package] = []
    seen: Set[str] = set()

    def _emit(name: str, version: str, pinned: bool) -> None:
        key = _normalize_pkg_name(name)
        if key in seen:
            return
        packages.append(Package(
            name=name, version=version,
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file, pinned=pinned,
        ))
        seen.add(key)

    for m in _REQ_LINE_PAT.finditer(content):
        _emit(m.group(1), m.group(2).strip(), True)
    # Record unpinned deps for reporting only.
    for m in _REQ_UNPINNED_PAT.finditer(content):
        _emit(m.group(1), "", False)
    return packages
def _parse_pyproject_toml(content: str, source_file: str) -> List[Package]:
    """Extract declared deps from pyproject.toml (without resolving versions).

    Handles two layouts:
    - ``[tool.poetry.dependencies]`` — one ``name = spec`` line per dependency.
    - ``[project]`` (PEP 621) — ``dependencies = ["name>=1.0", ...]`` list.

    Fixes the previous implementation, which treated EVERY ``key =`` line of
    the whole ``[project]`` table as a package (emitting bogus entries such as
    "name", "version", "readme") and whose ``[^\\[]*`` section capture stopped
    at the opening bracket of the dependencies list.
    """
    packages: List[Package] = []

    def _add(name: str) -> None:
        name = name.strip()
        # "python" / "python-version" are interpreter constraints, not deps.
        if not name or name.lower() in ("python", "python-version"):
            return
        packages.append(Package(
            name=name, version="",
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file, pinned=False,
        ))

    # Poetry layout: keys of the [tool.poetry.dependencies] table.
    poetry = re.search(r'\[tool\.poetry\.dependencies\]([^\[]*)', content, re.DOTALL)
    if poetry:
        for m in re.finditer(r'^([A-Za-z0-9_\-\.]+)\s*=', poetry.group(1), re.MULTILINE):
            _add(m.group(1))

    # PEP 621 layout: the [project] table runs until the next section header
    # at line start (the dependencies list's own "[" is mid-line).
    project = re.search(r'^\[project\]\s*\n(.*?)(?=\n\[|\Z)', content, re.MULTILINE | re.DOTALL)
    if project:
        dep_list = re.search(r'dependencies\s*=\s*\[(.*?)\]', project.group(1), re.DOTALL)
        if dep_list:
            for spec in re.findall(r'["\']([^"\']+)["\']', dep_list.group(1)):
                # Leading identifier of the requirement spec is the name.
                m = re.match(r'\s*([A-Za-z0-9_\-\.]+)', spec)
                if m:
                    _add(m.group(1))
    return packages
# ─── Node Parsers ─────────────────────────────────────────────────────────────
def _parse_package_lock_json(content: str, source_file: str) -> List[Package]:
    """Parse package-lock.json (npm v1/v2/v3 formats).

    v2/v3 lockfiles list installed packages under a flat "packages" map keyed
    by install path, e.g. "node_modules/a/node_modules/b". The package name is
    everything after the LAST "node_modules/" segment — this keeps scoped
    names ("@scope/pkg") intact and fixes the previous extraction, which
    stripped every "node_modules/" occurrence and produced bogus names like
    "a/b" for nested dependencies.
    """
    packages: List[Package] = []
    try:
        data = json.loads(content)
        # v2/v3: flat packages object keyed by install path
        pkg_map = data.get("packages") or {}
        for path_key, info in pkg_map.items():
            # "" is the root project entry; skip anything not under node_modules.
            if path_key == "" or not path_key.startswith("node_modules/"):
                continue
            # Name = segment after the last "node_modules/" prefix.
            name = path_key.split("node_modules/")[-1]
            version = info.get("version", "")
            if name and version:
                packages.append(Package(
                    name=name, version=version,
                    ecosystem=ECOSYSTEM_NPM,
                    source_file=source_file, pinned=True,
                ))
        # v1 fallback: nested dependencies (top level only)
        if not packages:
            for name, info in (data.get("dependencies") or {}).items():
                version = info.get("version", "")
                if version:
                    packages.append(Package(
                        name=name, version=version,
                        ecosystem=ECOSYSTEM_NPM,
                        source_file=source_file, pinned=True,
                    ))
    except Exception as e:
        logger.debug(f"Could not parse package-lock.json: {e}")
    return packages
def _parse_pnpm_lock(content: str, source_file: str) -> List[Package]:
    """Parse pnpm-lock.yaml packages section (entries shaped '/name@version:')."""
    matches = re.finditer(r'^/([^@\s]+)@([^\s:]+):', content, re.MULTILINE)
    return [
        Package(
            name=m.group(1), version=m.group(2),
            ecosystem=ECOSYSTEM_NPM,
            source_file=source_file, pinned=True,
        )
        for m in matches
    ]
def _parse_yarn_lock(content: str, source_file: str) -> List[Package]:
    """Parse yarn.lock v1: a header line followed by an indented 'version "X"'."""
    block_pat = re.compile(
        r'^"?([^@"\s]+)@[^:]+:\n(?:\s+.*\n)*?\s+version "([^"]+)"',
        re.MULTILINE,
    )
    packages: List[Package] = []
    recorded: Set[str] = set()
    for m in block_pat.finditer(content):
        name, version = m.groups()
        tag = f"{name}@{version}"
        if tag in recorded:
            continue
        recorded.add(tag)
        packages.append(Package(
            name=name, version=version,
            ecosystem=ECOSYSTEM_NPM,
            source_file=source_file, pinned=True,
        ))
    return packages
def _parse_package_json(content: str, source_file: str) -> List[Package]:
    """Declared deps from package.json — no lock file, so versions unresolved."""
    found: List[Package] = []
    try:
        manifest = json.loads(content)
        for section in ("dependencies", "devDependencies"):
            for dep_name in (manifest.get(section) or {}):
                found.append(Package(
                    name=dep_name, version="",
                    ecosystem=ECOSYSTEM_NPM,
                    source_file=source_file, pinned=False,
                ))
    except Exception:
        # Malformed JSON: keep whatever was collected (best-effort).
        pass
    return found
# ─── Dependency Discovery ─────────────────────────────────────────────────────
# Lock files matched exactly by file name.
_PYTHON_MANIFESTS = (
    "poetry.lock", "Pipfile.lock",
)
# NOTE(review): appears unused — _find_and_parse_deps matches *.txt files
# containing "requirements" directly rather than consulting this tuple.
_PYTHON_REQUIREMENTS = ("requirements",)  # matched via endswith
_PYTHON_PYPROJECT = ("pyproject.toml",)
# Node manifests; package.json is skipped when a sibling lock file exists.
_NODE_MANIFESTS = (
    "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "package.json",
)
def _find_and_parse_deps(
    repo_root: str,
    targets: List[str],
    max_files: int,
    deadline: float,
) -> List[Package]:
    """Walk repo and extract all packages from manifest files.

    Args:
        repo_root: absolute path of the repository to walk.
        targets: ecosystems to scan — any of "python", "node".
        max_files: cap on manifest files parsed across the whole walk.
        deadline: time.monotonic() value after which the walk aborts.

    Returns:
        Deduplicated Package list; for duplicates within one ecosystem a
        pinned entry wins over an unpinned one, otherwise first seen wins.
    """
    all_packages: List[Package] = []
    files_scanned = 0
    for dirpath, dirnames, filenames in os.walk(repo_root):
        # Prune excluded/hidden dirs in place so os.walk never descends into them.
        dirnames[:] = [
            d for d in dirnames
            if d not in EXCLUDED_DIRS and not d.startswith(".")
        ]
        if time.monotonic() > deadline:
            logger.warning("dependency_scanner: walk timeout")
            break
        for fname in filenames:
            # NOTE(review): this only breaks the inner loop; once the cap is hit
            # the walk still visits remaining dirs (each breaking immediately).
            if files_scanned >= max_files:
                break
            full = os.path.join(dirpath, fname)
            if _is_excluded(full):
                continue
            rel = os.path.relpath(full, repo_root)
            content = None
            if "python" in targets:
                if fname in _PYTHON_MANIFESTS:
                    content = _read_file(full)
                    if fname == "poetry.lock":
                        all_packages.extend(_parse_poetry_lock(content, rel))
                    elif fname == "Pipfile.lock":
                        all_packages.extend(_parse_pipfile_lock(content, rel))
                    files_scanned += 1
                # Matches requirements.txt, requirements-dev.txt, etc.
                elif fname.endswith(".txt") and "requirements" in fname.lower():
                    content = _read_file(full)
                    all_packages.extend(_parse_requirements_txt(content, rel))
                    files_scanned += 1
                elif fname in _PYTHON_PYPROJECT:
                    content = _read_file(full)
                    all_packages.extend(_parse_pyproject_toml(content, rel))
                    files_scanned += 1
            if "node" in targets:
                if fname in _NODE_MANIFESTS:
                    # Skip package.json if package-lock.json sibling exists
                    if fname == "package.json":
                        lock_exists = (
                            os.path.exists(os.path.join(dirpath, "package-lock.json")) or
                            os.path.exists(os.path.join(dirpath, "yarn.lock")) or
                            os.path.exists(os.path.join(dirpath, "pnpm-lock.yaml"))
                        )
                        if lock_exists:
                            continue
                    content = _read_file(full)
                    if fname == "package-lock.json":
                        all_packages.extend(_parse_package_lock_json(content, rel))
                    elif fname == "pnpm-lock.yaml":
                        all_packages.extend(_parse_pnpm_lock(content, rel))
                    elif fname == "yarn.lock":
                        all_packages.extend(_parse_yarn_lock(content, rel))
                    elif fname == "package.json":
                        all_packages.extend(_parse_package_json(content, rel))
                    files_scanned += 1
    # Deduplicate: prefer pinned over unpinned; first seen wins
    seen: Dict[str, Package] = {}
    for pkg in all_packages:
        key = f"{pkg.ecosystem}:{pkg.normalized_name}"
        if key not in seen or (not seen[key].pinned and pkg.pinned):
            seen[key] = pkg
    return list(seen.values())
# ─── OSV Cache ────────────────────────────────────────────────────────────────
def _load_osv_cache(cache_path: str) -> Dict[str, Any]:
"""Load offline OSV cache from JSON file."""
if not cache_path or not os.path.exists(cache_path):
return {}
try:
with open(cache_path, "r") as f:
data = json.load(f)
return data.get("entries", {})
except Exception as e:
logger.warning(f"Could not load OSV cache {cache_path}: {e}")
return {}
def _save_osv_cache(cache_path: str, entries: Dict[str, Any]):
"""Persist updated cache entries to disk."""
os.makedirs(os.path.dirname(os.path.abspath(cache_path)), exist_ok=True)
existing = {}
if os.path.exists(cache_path):
try:
with open(cache_path, "r") as f:
existing = json.load(f)
except Exception:
pass
existing_entries = existing.get("entries", {})
existing_entries.update(entries)
import datetime
output = {
"version": 1,
"updated_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"entries": existing_entries,
}
with open(cache_path, "w") as f:
json.dump(output, f, indent=2)
# ─── OSV API ──────────────────────────────────────────────────────────────────
def _query_osv_online(
    packages: List[Package],
    new_cache: Dict[str, Any],
    deadline: float,
) -> Dict[str, List[Dict]]:
    """
    Query OSV.dev /v1/querybatch in batches.

    Only pinned packages with a concrete version are queried; others are
    silently skipped. Every successful batch result is also written into
    *new_cache* (mutated in place) so the caller can persist it to disk.
    Failed batches are logged and skipped — partial results are returned.

    Returns {cache_key: [vuln_objects]}.
    """
    try:
        import httpx  # optional dependency; degrade gracefully when absent
    except ImportError:
        logger.warning("httpx not available for OSV online query")
        return {}
    results: Dict[str, List[Dict]] = {}
    batches = [packages[i:i + OSV_BATCH_SIZE] for i in range(0, len(packages), OSV_BATCH_SIZE)]
    for batch in batches:
        if time.monotonic() > deadline:
            break
        queries = []
        batch_keys = []
        for pkg in batch:
            if not pkg.pinned or not pkg.version:
                continue
            queries.append({
                "package": {"name": pkg.normalized_name, "ecosystem": pkg.ecosystem},
                "version": pkg.version,
            })
            batch_keys.append(pkg.cache_key)
        if not queries:
            continue
        try:
            # Clamp the HTTP timeout so one slow batch cannot blow the deadline.
            remaining = max(1.0, deadline - time.monotonic())
            timeout = min(OSV_TIMEOUT_SEC, remaining)
            with httpx.Client(timeout=timeout) as client:
                resp = client.post(OSV_API_URL, json={"queries": queries})
                resp.raise_for_status()
                data = resp.json()
        except Exception as e:
            logger.warning(f"OSV query failed: {e}")
            continue
        # querybatch returns results positionally, matching the query order.
        for key, result in zip(batch_keys, data.get("results", [])):
            vulns = result.get("vulns") or []
            results[key] = vulns
            new_cache[key] = {"vulns": vulns, "cached_at": _now_iso()}
    return results
def _parse_osv_severity(vuln: Dict) -> str:
    """
    Best-effort severity extraction from an OSV vulnerability object.

    Preference order: database_specific.severity, a numeric score found in
    severity[].score (CVSS-style thresholds), then ecosystem_specific.severity.
    Falls back to "UNKNOWN".
    """
    candidate = (vuln.get("database_specific", {}).get("severity") or "").upper()
    if candidate in SEVERITY_ORDER:
        return candidate
    for entry in (vuln.get("severity") or []):
        # CVSS vectors rarely embed a base score; grab any decimal number present.
        num_m = re.search(r'(\d+\.\d+)', entry.get("score", ""))
        if not num_m:
            continue
        score = float(num_m.group(1))
        if score >= 9.0:
            return "CRITICAL"
        if score >= 7.0:
            return "HIGH"
        if score >= 4.0:
            return "MEDIUM"
        if score > 0:
            return "LOW"
    candidate = (vuln.get("ecosystem_specific", {}).get("severity") or "").upper()
    if candidate in SEVERITY_ORDER:
        return candidate
    return "UNKNOWN"
def _extract_fixed_versions(vuln: Dict, pkg_name: str, ecosystem: str) -> List[str]:
    """Collect 'fixed' version events for *pkg_name* from OSV affected ranges."""
    target = _normalize_pkg_name(pkg_name)
    eco = ecosystem.lower()
    collected: List[str] = []
    for affected in (vuln.get("affected") or []):
        meta = affected.get("package", {})
        if (meta.get("ecosystem") or "").lower() != eco:
            continue
        if _normalize_pkg_name(meta.get("name", "")) != target:
            continue
        for rng in (affected.get("ranges") or []):
            collected.extend(
                event["fixed"]
                for event in (rng.get("events") or [])
                if "fixed" in event
            )
    return sorted(set(collected))
def _lookup_vulnerability(
    pkg: Package,
    osv_vulns: List[Dict],
) -> List[Vulnerability]:
    """Convert raw OSV vuln objects for *pkg* into Vulnerability records."""
    findings: List[Vulnerability] = []
    for raw in osv_vulns:
        vuln_id = raw.get("id", "UNKNOWN")
        cves = [a for a in (raw.get("aliases") or []) if a.startswith("CVE")]
        sev = _parse_osv_severity(raw)
        fixes = _extract_fixed_versions(raw, pkg.name, pkg.ecosystem)
        if fixes:
            advice = f"Upgrade {pkg.name} from {pkg.version} to {fixes[0]}"
        else:
            advice = f"No fix available for {pkg.name}@{pkg.version}. Monitor {vuln_id}."
        findings.append(Vulnerability(
            osv_id=vuln_id,
            ecosystem=pkg.ecosystem,
            package=pkg.name,
            version=pkg.version,
            severity=sev,
            fixed_versions=fixes,
            aliases=cves,
            evidence={
                "file": _redact(pkg.source_file),
                "details": f"{pkg.name}=={pkg.version} in {pkg.source_file}",
            },
            recommendation=advice,
        ))
    return findings
# ─── Outdated Analysis ────────────────────────────────────────────────────────
def _analyze_outdated(
    packages: List[Package],
    vuln_results: Dict[str, List[Dict]],
) -> List[OutdatedPackage]:
    """
    Lockfile-only outdated analysis.

    Uses fixed_versions from OSV results as a hint that a newer (security-fix)
    version exists. Emits at most one entry per package.
    """
    results: List[OutdatedPackage] = []
    for pkg in packages:
        if not pkg.pinned or not pkg.version:
            continue
        for vuln in vuln_results.get(pkg.cache_key, []):
            fixes = _extract_fixed_versions(vuln, pkg.name, pkg.ecosystem)
            upgrades = [f for f in fixes if _compare_versions(f, pkg.version) > 0]
            if not upgrades:
                continue
            # Smallest fix version strictly greater than the current one.
            smallest = min(upgrades, key=lambda v: [int(x) for x in re.findall(r'\d+', v)])
            results.append(OutdatedPackage(
                ecosystem=pkg.ecosystem,
                package=pkg.name,
                current=pkg.version,
                latest=smallest,
                notes=f"Security fix available (vuln: {vuln.get('id', '?')})",
            ))
            break  # One entry per package
    return results
# ─── License Policy ───────────────────────────────────────────────────────────
def _apply_license_policy(
packages: List[Package],
policy_cfg: Dict,
) -> List[LicenseFinding]:
"""MVP: license data is rarely in lock files, so most will be UNKNOWN."""
if not policy_cfg.get("enabled", False):
return []
deny_list = {l.upper() for l in (policy_cfg.get("deny") or [])}
warn_list = {l.upper() for l in (policy_cfg.get("warn") or [])}
findings = []
for pkg in packages:
# In MVP there's no way to get license from lockfile without network
license_str = "UNKNOWN"
if license_str == "UNKNOWN":
continue # skip unknown in MVP
policy = "ok"
if license_str.upper() in deny_list:
policy = "deny"
elif license_str.upper() in warn_list:
policy = "warn"
findings.append(LicenseFinding(
package=pkg.name,
license=license_str,
policy=policy,
recommendation=f"Review license {license_str} for {pkg.name}." if policy != "ok" else "",
))
return findings
# ─── Main Scanner ─────────────────────────────────────────────────────────────
def scan_dependencies(
    repo_root: str,
    targets: Optional[List[str]] = None,
    vuln_sources: Optional[Dict] = None,
    license_policy: Optional[Dict] = None,
    severity_thresholds: Optional[Dict] = None,
    outdated_cfg: Optional[Dict] = None,
    limits: Optional[Dict] = None,
    timeout_sec: float = 40.0,
) -> ScanResult:
    """
    Scan repo dependencies for vulnerabilities, outdated packages, license issues.

    Args:
        repo_root: absolute path to repo root
        targets: ["python", "node"] (default: both)
        vuln_sources: {"osv": {"enabled": true, "mode": "online|offline_cache", "cache_path": "..."}}
        license_policy: {"enabled": false, "deny": [...], "warn": [...]}
        severity_thresholds: {"fail_on": ["CRITICAL", "HIGH"], "warn_on": ["MEDIUM"]}
        outdated_cfg: {"enabled": true, "mode": "lockfile_only"}
        limits: {"max_files": 80, "max_deps": 2000, "max_vulns": 500}
        timeout_sec: hard deadline

    Returns:
        ScanResult with pass/fail verdict; pass is False when any finding in a
        fail_on severity (or a denied license) is present.
    """
    deadline = time.monotonic() + timeout_sec
    # Apply defaults for every optional config dict.
    targets = targets or ["python", "node"]
    vuln_sources = vuln_sources or {"osv": {"enabled": True, "mode": "offline_cache",
                                            "cache_path": "ops/cache/osv_cache.json"}}
    license_policy = license_policy or {"enabled": False}
    severity_thresholds = severity_thresholds or {"fail_on": ["CRITICAL", "HIGH"], "warn_on": ["MEDIUM"]}
    outdated_cfg = outdated_cfg or {"enabled": True, "mode": "lockfile_only"}
    limits = limits or {"max_files": 80, "max_deps": 2000, "max_vulns": 500}
    fail_on = {s.upper() for s in (severity_thresholds.get("fail_on") or ["CRITICAL", "HIGH"])}
    warn_on = {s.upper() for s in (severity_thresholds.get("warn_on") or ["MEDIUM"])}
    # ── Step 1: Extract dependencies ─────────────────────────────────────────
    all_packages = _find_and_parse_deps(
        repo_root, targets,
        max_files=limits.get("max_files", 80),
        deadline=deadline,
    )
    # Apply dep count limit
    max_deps = limits.get("max_deps", 2000)
    if len(all_packages) > max_deps:
        logger.warning(f"Dep count {len(all_packages)} > max {max_deps}, truncating")
        all_packages = all_packages[:max_deps]
    # Only pinned packages with a concrete version can be vuln-checked.
    pinned = [p for p in all_packages if p.pinned and p.version]
    unpinned = [p for p in all_packages if not p.pinned or not p.version]
    # ── Step 2: Vulnerability lookup ─────────────────────────────────────────
    osv_cfg = vuln_sources.get("osv", {})
    osv_enabled = osv_cfg.get("enabled", True)
    osv_mode = osv_cfg.get("mode", "offline_cache")
    # Resolve cache path (absolute or relative to repo_root)
    cache_path_raw = osv_cfg.get("cache_path", "ops/cache/osv_cache.json")
    cache_path = (
        cache_path_raw if os.path.isabs(cache_path_raw)
        else os.path.join(repo_root, cache_path_raw)
    )
    cache_entries = _load_osv_cache(cache_path) if osv_enabled else {}
    new_cache: Dict[str, Any] = {}
    # Sentinel convention: a list value means "known result"; None means
    # "cache miss → severity UNKNOWN for that package" (see Step 3).
    vuln_results: Dict[str, List[Dict]] = {}
    if osv_enabled:
        # Populate from cache first
        cache_miss: List[Package] = []
        for pkg in pinned:
            key = pkg.cache_key
            if key in cache_entries:
                vuln_results[key] = (cache_entries[key] or {}).get("vulns", [])
            else:
                cache_miss.append(pkg)
        # Online query for cache misses
        if osv_mode == "online" and cache_miss and time.monotonic() < deadline:
            online_results = _query_osv_online(cache_miss, new_cache, deadline)
            vuln_results.update(online_results)
            # Mark remaining misses as UNKNOWN (no cache entry)
            for pkg in cache_miss:
                if pkg.cache_key not in vuln_results:
                    vuln_results[pkg.cache_key] = None  # type: ignore[assignment]
        else:
            # Offline: cache misses → UNKNOWN
            for pkg in cache_miss:
                vuln_results[pkg.cache_key] = None  # type: ignore[assignment]
        # Persist new cache entries if online mode
        if new_cache and osv_mode == "online":
            try:
                _save_osv_cache(cache_path, new_cache)
            except Exception as e:
                logger.warning(f"Could not save OSV cache: {e}")
    # ── Step 3: Build vulnerability findings ─────────────────────────────────
    all_vulns: List[Vulnerability] = []
    cache_miss_pkgs: List[Package] = []
    for pkg in pinned:
        key = pkg.cache_key
        raw_vulns = vuln_results.get(key)
        if raw_vulns is None:
            cache_miss_pkgs.append(pkg)
            continue
        vulns = _lookup_vulnerability(pkg, raw_vulns)
        all_vulns.extend(vulns)
    # Apply vuln limit
    max_vulns = limits.get("max_vulns", 500)
    all_vulns = all_vulns[:max_vulns]
    # Sort by severity desc
    all_vulns.sort(key=lambda v: SEVERITY_ORDER.get(v.severity, 0), reverse=True)
    # ── Step 4: Outdated ──────────────────────────────────────────────────────
    outdated: List[OutdatedPackage] = []
    if outdated_cfg.get("enabled", True):
        # Filter out the None cache-miss sentinels before handing off.
        outdated = _analyze_outdated(pinned, {
            k: v for k, v in vuln_results.items() if v is not None
        })
    # ── Step 5: License policy ────────────────────────────────────────────────
    licenses = _apply_license_policy(all_packages, license_policy)
    # ── Step 6: Compute pass/fail ─────────────────────────────────────────────
    by_severity: Dict[str, int] = {s: 0 for s in SEVERITY_ORDER}
    for v in all_vulns:
        by_severity[v.severity] = by_severity.get(v.severity, 0) + 1
    blocking_count = sum(by_severity.get(s, 0) for s in fail_on)
    warning_count = sum(by_severity.get(s, 0) for s in warn_on)
    # License denials also block
    denied_licenses = [lf for lf in licenses if lf.policy == "deny"]
    if denied_licenses:
        blocking_count += len(denied_licenses)
    pass_ = blocking_count == 0
    # ── Step 7: Build recommendations ────────────────────────────────────────
    recommendations: List[str] = []
    if blocking_count > 0:
        top_crit = [v for v in all_vulns if v.severity in fail_on][:3]
        for v in top_crit:
            recommendations.append(v.recommendation)
    if warning_count > 0:
        recommendations.append(
            f"{warning_count} MEDIUM severity vulnerabilities found — review and upgrade where possible."
        )
    if cache_miss_pkgs:
        recommendations.append(
            f"{len(cache_miss_pkgs)} packages have no OSV cache entry (severity UNKNOWN). "
            "Run in online mode to populate cache: mode=online."
        )
    if unpinned:
        recommendations.append(
            f"{len(unpinned)} unpinned dependencies detected — cannot check for vulnerabilities. "
            "Pin versions in requirements.txt/lock files."
        )
    # ── Step 8: Summary ───────────────────────────────────────────────────────
    ecosystems_found = sorted({p.ecosystem for p in all_packages})
    # (deadline - timeout_sec) is the start timestamp; elapsed since then.
    elapsed_ms = round((time.monotonic() - (deadline - timeout_sec)) * 1000, 1)
    if pass_:
        summary = (
            f"✅ Dependency scan PASSED. "
            f"{len(pinned)} deps scanned, {len(all_vulns)} vulns found "
            f"({by_severity.get('CRITICAL', 0)} critical, {by_severity.get('HIGH', 0)} high)."
        )
    else:
        summary = (
            f"❌ Dependency scan FAILED. "
            f"{blocking_count} blocking issue(s): "
            f"{by_severity.get('CRITICAL', 0)} CRITICAL, {by_severity.get('HIGH', 0)} HIGH"
            + (f", {len(denied_licenses)} denied licenses" if denied_licenses else "")
            + "."
        )
    stats = {
        "ecosystems": ecosystems_found,
        "files_scanned": len(set(p.source_file for p in all_packages)),
        "deps_total": len(all_packages),
        "deps_pinned": len(pinned),
        "deps_unresolved": len(cache_miss_pkgs),
        "vulns_total": len(all_vulns),
        "by_severity": by_severity,
        "outdated_total": len(outdated),
        "elapsed_ms": elapsed_ms,
    }
    return ScanResult(
        pass_=pass_,
        summary=summary,
        stats=stats,
        vulnerabilities=[_vuln_to_dict(v) for v in all_vulns],
        outdated=[_outdated_to_dict(o) for o in outdated],
        licenses=[_license_to_dict(lf) for lf in licenses],
        recommendations=list(dict.fromkeys(recommendations)),  # dedupe
    )
def scan_dependencies_dict(repo_root: str, **kwargs) -> Dict:
    """Run scan_dependencies and flatten the result into a plain dict for ToolResult."""
    result = scan_dependencies(repo_root, **kwargs)
    payload = {"pass": result.pass_}
    for attr in ("summary", "stats", "vulnerabilities",
                 "outdated", "licenses", "recommendations"):
        payload[attr] = getattr(result, attr)
    return payload
# ─── Serializers ──────────────────────────────────────────────────────────────
def _vuln_to_dict(v: Vulnerability) -> Dict:
    """Serialize a Vulnerability, redacting evidence values on the way out."""
    payload = {"id": v.osv_id}
    for attr in ("ecosystem", "package", "version", "severity",
                 "fixed_versions", "aliases"):
        payload[attr] = getattr(v, attr)
    payload["evidence"] = {key: _redact(value) for key, value in v.evidence.items()}
    payload["recommendation"] = v.recommendation
    return payload
def _outdated_to_dict(o: OutdatedPackage) -> Dict:
return {
"ecosystem": o.ecosystem,
"package": o.package,
"current": o.current,
"latest": o.latest,
"notes": o.notes,
}
def _license_to_dict(lf: LicenseFinding) -> Dict:
return {
"package": lf.package,
"license": lf.license,
"policy": lf.policy,
"recommendation": lf.recommendation,
}
def _now_iso() -> str:
import datetime
return datetime.datetime.now(datetime.timezone.utc).isoformat()

View File

@@ -0,0 +1,898 @@
"""
Drift Analyzer — знаходить розбіжності між "джерелами правди" та "фактом".
4 категорії перевірок (незалежні, кожна повертає findings):
1. services — Service Catalog (inventory_services.csv / 01_SERVICE_CATALOG.md) vs docker-compose*.yml
2. openapi — OpenAPI specs (docs/contracts/*.yaml) vs routes у коді (FastAPI decorators)
3. nats — inventory_nats_topics.csv vs publish/subscribe usage у коді
4. tools — tools_rollout.yml + rbac_tools_matrix.yml vs фактичні handlers у tool_manager.py
Формат findings:
{ category, severity, id, title, evidence: {path, lines, details}, recommended_fix }
Pass rule: pass=false якщо errors > 0. Warnings/infos не валять gate.
"""
import csv
import fnmatch
import hashlib
import json
import logging
import os
import re
import time
import yaml
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple
logger = logging.getLogger(__name__)
# ─── Constants ────────────────────────────────────────────────────────────────
# Directory names never descended into during repo walks.
EXCLUDED_DIRS: FrozenSet[str] = frozenset({
    "node_modules", ".git", "dist", "build", "vendor",
    ".venv", "venv", "venv_models", "sofia_venv",
    "__pycache__", ".pytest_cache", "rollback_backups",
    # NOTE(review): this entry contains a slash but exclusion checks compare
    # single path components — confirm it ever matches as intended.
    "docs/consolidation",
})
MAX_FILES_PER_CATEGORY = 300  # cap on files collected per check category
MAX_BYTES_PER_FILE = 262144  # 256KB
TIMEOUT_SEC = 25.0  # Hard deadline per full analysis
# Known tool handlers (must be kept in sync with execute_tool dispatch in tool_manager.py)
# Source: Priority 117 handlers in tool_manager.py
KNOWN_TOOL_HANDLERS: FrozenSet[str] = frozenset({
    "memory_search", "graph_query",
    "web_search", "web_extract",
    "image_generate", "comfy_generate_image", "comfy_generate_video",
    "remember_fact",
    "presentation_create", "presentation_status", "presentation_download",
    "crawl4ai_scrape", "tts_speak", "file_tool",
    "market_data",
    "crm_search_client", "crm_upsert_client", "crm_upsert_site",
    "crm_upsert_window_unit", "crm_create_quote", "crm_update_quote",
    "crm_create_job", "calc_window_quote",
    "docs_render_quote_pdf", "docs_render_invoice_pdf",
    "schedule_propose_slots", "schedule_confirm_slot",
    "repo_tool", "pr_reviewer_tool", "contract_tool",
    "oncall_tool", "observability_tool", "config_linter_tool",
    "threatmodel_tool", "job_orchestrator_tool", "kb_tool",
    "drift_analyzer_tool",  # self-registration
})
# ─── Data Structures ──────────────────────────────────────────────────────────
@dataclass
class Finding:
    """A single drift finding produced by one analyzer category."""
    category: str
    severity: str  # "error" | "warning" | "info"
    id: str
    title: str
    evidence: Dict[str, str] = field(default_factory=dict)
    recommended_fix: str = ""

    def to_dict(self) -> Dict:
        """Serialize to the wire format consumed by the drift report."""
        return {
            key: getattr(self, key)
            for key in ("category", "severity", "id", "title",
                        "evidence", "recommended_fix")
        }
@dataclass
class DriftReport:
    """Top-level result of a full drift analysis run."""
    pass_: bool  # gate verdict; per module docstring: False when errors > 0
    summary: str  # one-line human-readable summary
    stats: Dict[str, Any]  # per-category counters (see the _analyze_* helpers)
    findings: List[Dict]  # Finding.to_dict() payloads
# ─── Utility helpers ──────────────────────────────────────────────────────────
def _is_excluded(path: str) -> bool:
    """True when any component of *path* is in the EXCLUDED_DIRS set."""
    return not EXCLUDED_DIRS.isdisjoint(Path(path).parts)
def _walk_files(root: str, extensions: Tuple[str, ...],
                deadline: float) -> List[str]:
    """
    Walk repo root and collect files with given extensions.
    Respects EXCLUDED_DIRS, MAX_FILES_PER_CATEGORY, TIMEOUT_SEC.

    Args:
        root: directory to walk.
        extensions: suffix tuple accepted by str.endswith.
        deadline: time.monotonic() cutoff; the walk stops once exceeded.

    Returns:
        Absolute file paths, at most MAX_FILES_PER_CATEGORY entries.
    """
    found = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune excluded dirs in-place (affects os.walk recursion)
        dirnames[:] = [
            d for d in dirnames
            if d not in EXCLUDED_DIRS and not d.startswith(".")
        ]
        if time.monotonic() > deadline:
            logger.warning("_walk_files: timeout reached")
            break
        for fname in filenames:
            if fname.endswith(extensions):
                full = os.path.join(dirpath, fname)
                if not _is_excluded(full):
                    found.append(full)
                    # Hard cap: bail out immediately once the limit is reached.
                    if len(found) >= MAX_FILES_PER_CATEGORY:
                        return found
    return found
def _read_file(path: str) -> str:
    """Read *path* as text, truncated to MAX_BYTES_PER_FILE; '' on any error."""
    try:
        oversized = os.path.getsize(path) > MAX_BYTES_PER_FILE
        with open(path, "r", errors="replace") as fh:
            return fh.read(MAX_BYTES_PER_FILE) if oversized else fh.read()
    except Exception:
        return ""
_SECRET_PAT = re.compile(
r'(?i)(api[_-]?key|token|secret|password|bearer|jwt|private[_-]?key)'
r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?'
)
def _redact_evidence(text: str) -> str:
"""Mask potential secrets in evidence strings."""
return _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***REDACTED***", text)
def _rel(path: str, root: str) -> str:
"""Return path relative to root, or absolute if outside."""
try:
return os.path.relpath(path, root)
except ValueError:
return path
# ─── Category 1: Services ─────────────────────────────────────────────────────
def _load_service_catalog(repo_root: str) -> Dict[str, str]:
    """
    Load services from inventory_services.csv.

    Falls back to scraping markdown table rows out of 01_SERVICE_CATALOG.md
    when the CSV is absent. Returns {service_name: status}.
    """
    csv_path = os.path.join(
        repo_root, "docs", "architecture_inventory", "inventory_services.csv"
    )
    services = {}
    if not os.path.exists(csv_path):
        # Fallback: scan 01_SERVICE_CATALOG.md for table rows
        md_path = os.path.join(
            repo_root, "docs", "architecture_inventory", "01_SERVICE_CATALOG.md"
        )
        if os.path.exists(md_path):
            content = _read_file(md_path)
            for line in content.splitlines():
                # Markdown row: "| service-name | DEPLOYED ..." → (name, status).
                m = re.match(r'\|\s*([\w\-]+)\s*\|\s*(DEPLOYED|DEFINED|PLANNED[^\|]*)', line)
                if m:
                    services[m.group(1).strip()] = m.group(2).strip()
        return services
    try:
        with open(csv_path, "r", newline="", errors="replace") as f:
            reader = csv.DictReader(f)
            for row in reader:
                name = (row.get("service") or "").strip()
                status = (row.get("type") or "").strip()  # csv has 'type' not 'status'
                if name:
                    services[name] = status
    except Exception as e:
        logger.warning(f"Could not load inventory_services.csv: {e}")
    return services
def _load_compose_services(repo_root: str, deadline: float) -> Dict[str, str]:
    """
    Parse docker-compose*.yml files and return {service_name: compose_file}.

    Looks at docker-compose*.yml in the repo root plus the conventional
    infra/compose/docker-compose.yml location. Unparseable files are skipped;
    later files overwrite earlier ones for duplicate service names.
    """
    compose_files = []
    for entry in os.listdir(repo_root):
        if fnmatch.fnmatch(entry, "docker-compose*.yml"):
            compose_files.append(os.path.join(repo_root, entry))
    # Also infra subdir
    infra_compose = os.path.join(repo_root, "infra", "compose", "docker-compose.yml")
    if os.path.exists(infra_compose):
        compose_files.append(infra_compose)
    services = {}
    for cf in compose_files:
        if time.monotonic() > deadline:
            break
        try:
            content = _read_file(cf)
            data = yaml.safe_load(content) or {}
            svc_section = data.get("services") or {}
            for svc_name in svc_section:
                services[svc_name] = _rel(cf, repo_root)
        except Exception as e:
            logger.debug(f"Could not parse {cf}: {e}")
    return services
def _analyze_services(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict]:
    """Cross-check the service catalog against docker-compose definitions.

    Emits:
    - DRIFT-SVC-001 (error): catalog says DEPLOYED but no compose file defines it.
    - DRIFT-SVC-002 (warning): compose defines a service the catalog omits.

    Dash/underscore spelling variants are treated as the same service in BOTH
    directions. (The previous implementation computed
    ``svc.replace("-", "_").replace("_", "-")`` — the chained replaces collapse
    to the all-dash variant only, so the underscore spelling was never checked
    for DRIFT-SVC-002.)
    """
    findings = []
    catalog = _load_service_catalog(repo_root)
    compose_svcs = _load_compose_services(repo_root, deadline)
    compose_names = set(compose_svcs.keys())
    catalog_names = set(catalog.keys())

    def _variants(name: str) -> Set[str]:
        # A service may be spelled with dashes in one source and underscores
        # in the other; compare all three spellings.
        return {name, name.replace("-", "_"), name.replace("_", "-")}

    # DEPLOYED in catalog but missing from ALL compose files
    for svc, status in catalog.items():
        if "DEPLOYED" in status.upper() and not _variants(svc).intersection(compose_names):
            findings.append(Finding(
                category="services",
                severity="error",
                id="DRIFT-SVC-001",
                title=f"Service '{svc}' marked DEPLOYED in catalog but absent from all docker-compose files",
                evidence={"path": "docs/architecture_inventory/inventory_services.csv",
                          "details": f"status={status}, not found in compose"},
                recommended_fix=f"Add '{svc}' to appropriate docker-compose*.yml or update catalog status to DEFINED.",
            ))
    # In compose but not mentioned in catalog at all
    for svc, compose_file in compose_svcs.items():
        if not _variants(svc).intersection(catalog_names):
            findings.append(Finding(
                category="services",
                severity="warning",
                id="DRIFT-SVC-002",
                title=f"Service '{svc}' found in compose but not in service catalog",
                evidence={"path": compose_file, "details": f"defined in {compose_file}"},
                recommended_fix=f"Add '{svc}' to inventory_services.csv / 01_SERVICE_CATALOG.md.",
            ))
    stats = {
        "catalog_entries": len(catalog),
        "compose_services": len(compose_svcs),
        "findings": len(findings),
    }
    return findings, stats
# ─── Category 2: OpenAPI ──────────────────────────────────────────────────────
def _load_openapi_paths(repo_root: str, deadline: float) -> Dict[str, Set[str]]:
    """
    Scan docs/contracts/*.openapi.yaml and any openapi*.yaml/yml/json.

    Methods from multiple specs describing the same path are merged.
    Returns {"/path": {"get", "post", ...}}.
    """
    spec_files = []
    contracts_dir = os.path.join(repo_root, "docs", "contracts")
    if os.path.isdir(contracts_dir):
        for f in os.listdir(contracts_dir):
            if f.endswith((".yaml", ".yml", ".json")):
                spec_files.append(os.path.join(contracts_dir, f))
    # Also find any openapi*.yaml in repo root and services
    for dirpath, dirnames, filenames in os.walk(repo_root):
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS and not d.startswith(".")]
        if time.monotonic() > deadline:
            break
        for f in filenames:
            if re.match(r'openapi.*\.(ya?ml|json)$', f, re.IGNORECASE):
                full = os.path.join(dirpath, f)
                if full not in spec_files:
                    spec_files.append(full)
    paths: Dict[str, Set[str]] = {}
    for sf in spec_files:
        if time.monotonic() > deadline:
            break
        try:
            content = _read_file(sf)
            # JSON specs go through json.loads; YAML handled by safe_load.
            data = yaml.safe_load(content) if sf.endswith((".yaml", ".yml")) else json.loads(content)
            if not isinstance(data, dict) or "paths" not in data:
                continue
            for path, methods in (data.get("paths") or {}).items():
                if not isinstance(methods, dict):
                    continue
                # Keep only real HTTP verbs (drops "parameters", "summary", ...).
                methods_set = {
                    m.lower() for m in methods
                    if m.lower() in {"get", "post", "put", "patch", "delete", "head", "options"}
                }
                if path not in paths:
                    paths[path] = set()
                paths[path].update(methods_set)
        except Exception as e:
            logger.debug(f"Could not parse OpenAPI spec {sf}: {e}")
    return paths
# Decorator-style routes: @app.get("/path"), @router.post("/path"), etc.
# Captures (method, path).
_FASTAPI_ROUTE_PAT = re.compile(
    r'@(?:app|router)\.(get|post|put|patch|delete|head|options)\(\s*[\'"]([^\'"]+)[\'"]',
    re.MULTILINE,
)
# Programmatic registration: x.add_api_route("/path", handler, methods=["GET"]).
# Captures (path, raw methods list contents).
_ADD_API_ROUTE_PAT = re.compile(
    r'\.add_api_route\(\s*[\'"]([^\'"]+)[\'"].*?methods\s*=\s*\[([^\]]+)\]',
    re.MULTILINE | re.DOTALL,
)
def _load_code_routes(repo_root: str, deadline: float) -> Dict[str, Set[str]]:
    """
    Scan Python files for FastAPI route decorators.
    Returns {"/path": {"get", "post", ...}}.

    Covers decorator routes (_FASTAPI_ROUTE_PAT) and programmatic
    .add_api_route(...) registrations (_ADD_API_ROUTE_PAT).  Paths are
    normalized by trimming the trailing slash ("" becomes "/").
    Stops early once *deadline* (time.monotonic()) is exceeded.
    """
    py_files = _walk_files(repo_root, (".py",), deadline)
    routes: Dict[str, Set[str]] = {}
    for pf in py_files:
        if time.monotonic() > deadline:
            break
        # Skip vendored/virtualenv code — those routes are not ours.
        if ".venv" in pf or "venv" in pf or "node_modules" in pf:
            continue
        content = _read_file(pf)
        if not content:
            continue
        for method, path in _FASTAPI_ROUTE_PAT.findall(content):
            norm = path.rstrip("/") or "/"
            if norm not in routes:
                routes[norm] = set()
            routes[norm].add(method.lower())
        for path, methods_raw in _ADD_API_ROUTE_PAT.findall(content):
            # methods=["GET", 'POST'] → {"get", "post"} (strip quotes/space).
            methods = {m.strip().strip('"\'').lower() for m in methods_raw.split(",")}
            norm = path.rstrip("/") or "/"
            if norm not in routes:
                routes[norm] = set()
            routes[norm].update(methods)
    return routes
def _normalize_path(path: str) -> str:
"""Normalize OAS path for comparison: remove trailing slash, lowercase."""
return path.rstrip("/").lower() or "/"
# Paths that are infrastructure-level and expected to be missing from OAS specs.
# Add /internal/* and /debug/* patterns if your project uses them.
_OAS_IGNORE_PATH_PREFIXES: Tuple[str, ...] = (
"/healthz", "/readyz", "/livez", "/metrics",
"/internal/", "/debug/", "/__", "/favicon",
)
def _is_oas_ignored(path: str) -> bool:
"""Return True if path is on the OAS ignore allowlist."""
p = path.lower()
return any(p == prefix.rstrip("/") or p.startswith(prefix)
for prefix in _OAS_IGNORE_PATH_PREFIXES)
def _load_openapi_deprecated(repo_root: str) -> Set[str]:
    """
    Return normalized paths marked as 'deprecated: true' in any OAS spec.
    Deprecated endpoints downgrade from error to warning (DRIFT-OAS-001).

    Note: unlike _load_openapi_paths, this walk takes no deadline — it is
    bounded by the number of spec files, assumed to be small.
    """
    deprecated: Set[str] = set()
    spec_files: List[str] = []
    for dirpath, dirnames, filenames in os.walk(repo_root):
        # Prune excluded/hidden dirs in-place so os.walk never descends into them.
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS and not d.startswith(".")]
        for f in filenames:
            if re.match(r'openapi.*\.(ya?ml|json)$', f, re.IGNORECASE):
                spec_files.append(os.path.join(dirpath, f))
    for sf in spec_files:
        try:
            content = _read_file(sf)
            data = yaml.safe_load(content) if sf.endswith((".yaml", ".yml")) else json.loads(content)
            if not isinstance(data, dict) or "paths" not in data:
                continue
            for path, methods in (data.get("paths") or {}).items():
                if not isinstance(methods, dict):
                    continue
                # A path counts as deprecated when ANY of its operations is.
                for method, operation in methods.items():
                    if isinstance(operation, dict) and operation.get("deprecated", False):
                        deprecated.add(_normalize_path(path))
        except Exception:
            # Unparseable spec simply contributes no deprecations.
            pass
    return deprecated
def _analyze_openapi(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict]:
    """
    Compare OpenAPI specs against FastAPI routes discovered in code.

    Emits:
        DRIFT-OAS-001 (error; warning if deprecated) — spec path missing from code
        DRIFT-OAS-003 (warning) — spec declares methods the code route lacks
        DRIFT-OAS-002 (error)   — /v1/ code route missing from every spec

    Returns (findings, stats).  When no specs exist at all, returns no
    findings — there is nothing to compare against.
    """
    findings = []
    spec_paths = _load_openapi_paths(repo_root, deadline)
    code_routes = _load_code_routes(repo_root, deadline)
    if not spec_paths:
        return findings, {"spec_paths": 0, "code_routes": len(code_routes), "findings": 0}
    deprecated_paths = _load_openapi_deprecated(repo_root)
    # Normalize both sides so "/Foo/" and "/foo" compare equal.
    spec_norm: Dict[str, Set[str]] = {
        _normalize_path(p): methods for p, methods in spec_paths.items()
    }
    code_norm: Dict[str, Set[str]] = {
        _normalize_path(p): methods for p, methods in code_routes.items()
    }
    # DRIFT-OAS-001: In spec but not in code
    for path, methods in sorted(spec_norm.items()):
        # Skip infra/health endpoints — they are expected to be absent from OAS
        if _is_oas_ignored(path):
            continue
        if path not in code_norm:
            # Deprecated spec paths → warning only, not blocking
            severity = "warning" if path in deprecated_paths else "error"
            dep_note = " (deprecated in spec)" if path in deprecated_paths else ""
            findings.append(Finding(
                category="openapi",
                severity=severity,
                id="DRIFT-OAS-001",
                title=f"OpenAPI path '{path}'{dep_note} not found in codebase routes",
                evidence={"path": "docs/contracts/",
                          "details": f"methods={sorted(methods)}, missing from FastAPI decorators"},
                recommended_fix=(
                    f"Mark '{path}' as removed in OpenAPI or implement the route."
                    if path in deprecated_paths
                    else f"Implement '{path}' route in code or remove from OpenAPI spec."
                ),
            ))
        else:
            # DRIFT-OAS-003: Method mismatch
            code_methods = code_norm[path]
            missing_in_code = methods - code_methods
            if missing_in_code:
                findings.append(Finding(
                    category="openapi",
                    severity="warning",
                    id="DRIFT-OAS-003",
                    title=f"Method mismatch for path '{path}': spec has {sorted(missing_in_code)}, code missing",
                    evidence={"path": "docs/contracts/",
                              "details": f"spec={sorted(methods)}, code={sorted(code_methods)}"},
                    recommended_fix=f"Add missing HTTP methods to code route for '{path}'.",
                ))
    # DRIFT-OAS-002: In code (/v1/ paths) but not in spec
    # Only versioned /v1/ routes are required to be documented.
    for path, methods in sorted(code_norm.items()):
        # Health/internal endpoints are expected to be absent from OAS
        if _is_oas_ignored(path):
            continue
        if not path.startswith("/v1/"):
            continue
        if path not in spec_norm:
            findings.append(Finding(
                category="openapi",
                severity="error",
                id="DRIFT-OAS-002",
                title=f"Code route '{path}' not documented in any OpenAPI spec",
                evidence={"path": "services/", "details": f"methods={sorted(methods)}"},
                recommended_fix=f"Add '{path}' to OpenAPI spec in docs/contracts/.",
            ))
    stats = {
        "spec_paths": len(spec_paths),
        "code_routes": len(code_routes),
        "findings": len(findings),
    }
    return findings, stats
# ─── Category 3: NATS ─────────────────────────────────────────────────────────
_NATS_WILDCARD_PAT = re.compile(r'\{[^}]+\}|\*|>') # {agent_id}, *, >
def _normalize_nats_subject(subj: str) -> str:
"""Replace wildcards with * for matching. Lowercase."""
return _NATS_WILDCARD_PAT.sub("*", subj.strip()).lower()
def _load_nats_inventory(repo_root: str) -> Optional[List[str]]:
    """
    Load documented NATS subjects from inventory_nats_topics.csv.
    Returns list of normalized subjects, or None if file absent.

    None (as opposed to []) signals "no inventory at all" so the caller can
    skip the NATS category entirely; a read/parse error is treated the same
    way after logging a warning.
    """
    csv_path = os.path.join(
        repo_root, "docs", "architecture_inventory", "inventory_nats_topics.csv"
    )
    if not os.path.exists(csv_path):
        return None
    subjects = []
    try:
        with open(csv_path, "r", newline="", errors="replace") as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Only the "subject" column matters; blanks are dropped.
                subj = (row.get("subject") or "").strip()
                if subj:
                    subjects.append(_normalize_nats_subject(subj))
    except Exception as e:
        logger.warning(f"Could not load nats inventory: {e}")
        return None
    return subjects
# Regexes that pull candidate NATS subjects out of Python source, ordered from
# most specific (publish/subscribe calls) to most speculative (any dotted
# lowercase string literal).  False positives are filtered afterwards by the
# "." requirement and _NATS_SUBJECT_VALIDATE in _load_nats_code_subjects.
_NATS_USAGE_PATTERNS = [
    re.compile(r'(?:nc|nats|js|jetstream)\.publish\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'(?:nc|nats|js|jetstream)\.subscribe\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'nc\.subscribe\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'subject\s*=\s*[\'"]([a-zA-Z0-9._{}*>-]{4,})[\'"]', re.IGNORECASE),
    re.compile(r'SUBJECT\s*=\s*[\'"]([a-zA-Z0-9._{}*>-]{4,})[\'"]'),
    re.compile(r'[\'"]([a-z][a-z0-9_]+\.[a-z][a-z0-9_]+(?:\.[a-zA-Z0-9_{}_.*>-]+){0,4})[\'"]'),
]
# A plausible subject: starts with a letter, ≥3 chars, subject alphabet only.
_NATS_SUBJECT_VALIDATE = re.compile(r'^[a-zA-Z][a-zA-Z0-9._{}*>-]{2,}$')
def _load_nats_code_subjects(repo_root: str, deadline: float) -> Set[str]:
    """Extract NATS subjects from code via regex patterns.

    Returns normalized subjects (wildcards collapsed to '*').  Only files
    containing a NATS-looking call are scanned in full; candidate strings
    must contain a dot and pass _NATS_SUBJECT_VALIDATE.  Stops early at
    *deadline* (time.monotonic()).
    """
    py_files = _walk_files(repo_root, (".py",), deadline)
    found: Set[str] = set()
    for pf in py_files:
        if time.monotonic() > deadline:
            break
        # Skip vendored/virtualenv code.
        if "venv" in pf or "node_modules" in pf:
            continue
        content = _read_file(pf)
        if not content:
            continue
        # Quick pre-filter: must contain at least one NATS-like call pattern
        _NATS_CALL_HINTS = ("nc.", "nats.", "js.", "jetstream.", "subject=", "SUBJECT=", ".publish(", ".subscribe(")
        if not any(hint in content for hint in _NATS_CALL_HINTS):
            continue
        for pat in _NATS_USAGE_PATTERNS:
            for m in pat.finditer(content):
                subj = m.group(1).strip()
                # Basic subject validation (must contain a dot)
                if "." in subj and _NATS_SUBJECT_VALIDATE.match(subj):
                    found.add(_normalize_nats_subject(subj))
    return found
def _nats_subject_matches(code_subj: str, inventory_subjects: List[str]) -> bool:
    """
    Check if a code subject matches any inventory subject (wildcard-aware).
    Supports * (one segment) and > (one or more segments).

    The comparison is symmetric: either side may carry the wildcards.
    """
    code_parts = code_subj.split(".")
    return any(
        _nats_match(code_parts, candidate.split("."))
        or _nats_match(candidate.split("."), code_parts)
        for candidate in inventory_subjects
    )
def _nats_match(a_parts: List[str], b_parts: List[str]) -> bool:
"""Match NATS subject a against pattern b (with * and > wildcards)."""
if not b_parts:
return not a_parts
if b_parts[-1] == ">":
return len(a_parts) >= len(b_parts) - 1
if len(a_parts) != len(b_parts):
return False
for a, b in zip(a_parts, b_parts):
if b == "*" or a == "*":
continue
if a != b:
return False
return True
def _analyze_nats(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict, bool]:
    """Returns (findings, stats, skipped).

    skipped=True (with empty findings) when the inventory CSV is absent —
    the category is then reported as skipped rather than passing/failing.

    Emits:
        DRIFT-NATS-001 (warning) — subject used in code, absent from inventory
        DRIFT-NATS-002 (info)    — documented subject never seen in code
    """
    inventory = _load_nats_inventory(repo_root)
    if inventory is None:
        return [], {"skipped": True}, True
    code_subjects = _load_nats_code_subjects(repo_root, deadline)
    findings = []
    # DRIFT-NATS-001: Used in code but not in inventory
    for subj in sorted(code_subjects):
        if not _nats_subject_matches(subj, inventory):
            findings.append(Finding(
                category="nats",
                severity="warning",
                id="DRIFT-NATS-001",
                title=f"NATS subject '{subj}' used in code but not in inventory",
                evidence={"path": "docs/architecture_inventory/inventory_nats_topics.csv",
                          "details": f"subject '{subj}' not found (wildcard-aware match)"},
                recommended_fix=f"Add '{subj}' to inventory_nats_topics.csv.",
            ))
    # DRIFT-NATS-002: In inventory but not used in code (info — may be legacy)
    for inv_subj in inventory:
        # NOTE(review): inventory subjects come pre-normalized ('>' → '*'),
        # so the ".>" case below looks unreachable — confirm intent.
        if inv_subj.endswith(".*") or inv_subj.endswith(".>"):
            continue  # wildcard subscriptions — skip
        if not _nats_subject_matches(inv_subj, list(code_subjects)):
            findings.append(Finding(
                category="nats",
                severity="info",
                id="DRIFT-NATS-002",
                title=f"Documented NATS subject '{inv_subj}' not found in code (possibly legacy)",
                evidence={"path": "docs/architecture_inventory/inventory_nats_topics.csv",
                          "details": "no matching publish/subscribe call found"},
                recommended_fix="Verify if subject is still active; mark as deprecated in inventory if not.",
            ))
    stats = {
        "inventory_subjects": len(inventory),
        "code_subjects": len(code_subjects),
        "findings": len(findings),
    }
    return findings, stats, False
# ─── Category 4: Tools ────────────────────────────────────────────────────────
def _load_rollout_tools(repo_root: str) -> Set[str]:
    """Extract all tool names mentioned in tools_rollout.yml groups.

    List entries starting with '@' are group references and are expanded
    recursively; every other string entry is a tool name.  Returns an
    empty set when the file is absent or unparseable.

    Fix: group expansion now tracks visited groups — a cyclic reference
    (e.g. a group listing "@itself", directly or transitively) previously
    recursed until RecursionError.  Since results go into a set, expanding
    each group at most once yields the same tool collection.
    """
    rollout_path = os.path.join(repo_root, "config", "tools_rollout.yml")
    tools: Set[str] = set()
    try:
        with open(rollout_path, "r") as f:
            data = yaml.safe_load(f) or {}
    except Exception:
        return tools
    expanded_groups: Set[str] = set()  # cycle guard for @group references
    # Collect all values from group lists (non-@group entries are tool names)
    def _collect(obj):
        if isinstance(obj, list):
            for item in obj:
                if isinstance(item, str) and not item.startswith("@"):
                    tools.add(item)
                elif isinstance(item, str) and item.startswith("@"):
                    group_name = item[1:]
                    if group_name in data and group_name not in expanded_groups:
                        expanded_groups.add(group_name)
                        _collect(data[group_name])
        elif isinstance(obj, dict):
            for v in obj.values():
                _collect(v)
    for key, value in data.items():
        if key not in ("role_map", "agent_roles"):  # these are role configs, not tool lists
            _collect(value)
    # Also scan role_map tool lists (tolerate malformed/non-dict entries).
    role_map = data.get("role_map", {})
    if isinstance(role_map, dict):
        for role_cfg in role_map.values():
            if isinstance(role_cfg, dict):
                _collect(role_cfg.get("tools", []))
    return tools
def _load_rbac_tools(repo_root: str) -> Dict[str, Set[str]]:
    """Load tool→{actions} from rbac_tools_matrix.yml.

    A missing or unparseable matrix yields an empty mapping; a mid-parse
    failure leaves whatever entries were already collected.
    """
    matrix_path = os.path.join(repo_root, "config", "rbac_tools_matrix.yml")
    result: Dict[str, Set[str]] = {}
    try:
        with open(matrix_path, "r") as f:
            data = yaml.safe_load(f) or {}
        tools_section = data.get("tools") or {}
        for tool_name, tool_cfg in tools_section.items():
            action_names = ((tool_cfg.get("actions") or {}).keys())
            result[tool_name] = set(action_names)
    except Exception:
        pass
    return result
def _get_effective_tools_for_roles(repo_root: str) -> Dict[str, Set[str]]:
    """Get effective tools for agent_default and agent_cto roles.

    Imports the router's agent_tools_config at runtime (extending sys.path
    with the router dir and repo root), reloads the rollout config, and
    queries two representative agent ids.  Returns {} (or a partial dict)
    on any failure, after logging a warning.

    Side effect: sys.path is mutated for the lifetime of the process.
    """
    result = {}
    try:
        import sys
        router_path = os.path.join(repo_root, "services", "router")
        if router_path not in sys.path:
            sys.path.insert(0, router_path)
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)
        from agent_tools_config import get_agent_tools, reload_rollout_config
        reload_rollout_config()
        # Use representative agents per role
        # (presumably an unknown agent id resolves to the default role —
        # verify against agent_tools_config.get_agent_tools)
        result["agent_default"] = set(get_agent_tools("brand_new_agent_xyz_test"))
        result["agent_cto"] = set(get_agent_tools("sofiia"))
    except Exception as e:
        logger.warning(f"Could not load effective tools: {e}")
    return result
def _analyze_tools(repo_root: str) -> Tuple[List[Finding], Dict]:
    """
    Cross-check tools_rollout.yml, rbac_tools_matrix.yml, the handler
    registry (KNOWN_TOOL_HANDLERS) and per-role effective tool sets.

    Emits:
        DRIFT-TOOLS-001 (error)         — rolled-out tool with no handler
        DRIFT-TOOLS-002 (error/warning) — handler without an RBAC entry
        DRIFT-TOOLS-003 (warning)       — RBAC entry never granted to a role
                                          (skipped when effective tools
                                          could not be loaded)
    """
    findings = []
    rollout_tools = _load_rollout_tools(repo_root)
    rbac_tools = _load_rbac_tools(repo_root)
    role_tools = _get_effective_tools_for_roles(repo_root)
    # Union of every role's effective tools.
    all_role_tools: Set[str] = set()
    for tools in role_tools.values():
        all_role_tools.update(tools)
    # DRIFT-TOOLS-001: Tool in rollout but no handler in tool_manager.py
    for tool in sorted(rollout_tools):
        if tool not in KNOWN_TOOL_HANDLERS:
            findings.append(Finding(
                category="tools",
                severity="error",
                id="DRIFT-TOOLS-001",
                title=f"Tool '{tool}' in tools_rollout.yml but no handler in tool_manager.py",
                evidence={"path": "config/tools_rollout.yml",
                          "details": f"'{tool}' referenced in rollout groups but missing from KNOWN_TOOL_HANDLERS"},
                recommended_fix=f"Add handler for '{tool}' in tool_manager.py execute_tool dispatch, or remove from rollout.",
            ))
    # DRIFT-TOOLS-002: Handler exists but not in RBAC matrix
    # Severity = error if tool is in rollout/standard_stack (actively used, no RBAC gate)
    # Severity = warning if tool appears experimental / not yet rolled out
    for tool in sorted(KNOWN_TOOL_HANDLERS):
        if tool not in rbac_tools:
            # Escalate to error if tool is actively distributed to agents
            is_rollouted = tool in rollout_tools or tool in all_role_tools
            severity = "error" if is_rollouted else "warning"
            findings.append(Finding(
                category="tools",
                severity=severity,
                id="DRIFT-TOOLS-002",
                title=f"Tool '{tool}' has a handler but is absent from rbac_tools_matrix.yml",
                evidence={"path": "config/rbac_tools_matrix.yml",
                          "details": (
                              f"'{tool}' not found in matrix.tools section. "
                              + ("In rollout → no RBAC gate applied." if is_rollouted
                                 else "Not in rollout (experimental/legacy).")
                          )},
                recommended_fix=f"Add '{tool}' with actions and entitlements to rbac_tools_matrix.yml.",
            ))
    # DRIFT-TOOLS-003: Tool in RBAC matrix but never appears in effective_tools
    if all_role_tools:
        for tool in sorted(rbac_tools.keys()):
            if tool not in all_role_tools:
                findings.append(Finding(
                    category="tools",
                    severity="warning",
                    id="DRIFT-TOOLS-003",
                    title=f"Tool '{tool}' is in RBAC matrix but never appears in effective_tools (dead config?)",
                    evidence={"path": "config/rbac_tools_matrix.yml",
                              "details": f"'{tool}' in matrix but not in any role's effective tool list"},
                    recommended_fix=f"Add '{tool}' to a role in tools_rollout.yml or remove from matrix.",
                ))
    stats = {
        "rollout_tools": len(rollout_tools),
        "rbac_tools": len(rbac_tools),
        "handlers": len(KNOWN_TOOL_HANDLERS),
        "role_tools": {role: len(tools) for role, tools in role_tools.items()},
        "findings": len(findings),
    }
    return findings, stats
# ─── Main Analyzer ────────────────────────────────────────────────────────────
def analyze_drift(
    repo_root: str,
    categories: Optional[List[str]] = None,
    timeout_sec: float = TIMEOUT_SEC,
) -> DriftReport:
    """
    Run drift analysis across requested categories.
    Args:
        repo_root: absolute path to repository root
        categories: subset of ["services", "openapi", "nats", "tools"] (all if None);
            unknown names are silently ignored
        timeout_sec: hard deadline for full analysis
    Returns:
        DriftReport with pass/fail verdict (pass == zero error-severity findings)
    """
    all_categories = {"services", "openapi", "nats", "tools"}
    if categories:
        run_cats = {c for c in categories if c in all_categories}
    else:
        run_cats = all_categories
    deadline = time.monotonic() + timeout_sec
    all_findings: List[Finding] = []
    skipped: List[str] = []
    items_checked: Dict[str, int] = {}
    cat_stats: Dict[str, Any] = {}
    if "services" in run_cats:
        findings, stats = _analyze_services(repo_root, deadline)
        all_findings.extend(findings)
        cat_stats["services"] = stats
        items_checked["services"] = stats.get("catalog_entries", 0) + stats.get("compose_services", 0)
    if "openapi" in run_cats:
        findings, stats = _analyze_openapi(repo_root, deadline)
        all_findings.extend(findings)
        cat_stats["openapi"] = stats
        items_checked["openapi"] = stats.get("spec_paths", 0) + stats.get("code_routes", 0)
    if "nats" in run_cats:
        findings, stats, was_skipped = _analyze_nats(repo_root, deadline)
        if was_skipped:
            # No inventory file → category reported as skipped, not failed.
            skipped.append("nats")
        else:
            all_findings.extend(findings)
            cat_stats["nats"] = stats
            items_checked["nats"] = stats.get("inventory_subjects", 0) + stats.get("code_subjects", 0)
    if "tools" in run_cats:
        # Config-file sized work — _analyze_tools takes no deadline.
        findings, stats = _analyze_tools(repo_root)
        all_findings.extend(findings)
        cat_stats["tools"] = stats
        items_checked["tools"] = stats.get("rollout_tools", 0) + stats.get("rbac_tools", 0)
    # Sort findings: severity desc (error > warning > info), then category, then id
    severity_order = {"error": 0, "warning": 1, "info": 2}
    all_findings.sort(key=lambda f: (severity_order.get(f.severity, 9), f.category, f.id))
    # Redact evidence
    for f in all_findings:
        if f.evidence.get("details"):
            f.evidence["details"] = _redact_evidence(f.evidence["details"])
    errors = sum(1 for f in all_findings if f.severity == "error")
    warnings = sum(1 for f in all_findings if f.severity == "warning")
    infos = sum(1 for f in all_findings if f.severity == "info")
    pass_ = errors == 0
    if pass_:
        summary = f"✅ Drift analysis PASSED. {len(all_findings)} findings ({warnings} warnings, {infos} infos)."
    else:
        summary = (
            f"❌ Drift analysis FAILED. {errors} error(s), {warnings} warning(s). "
            f"Categories checked: {sorted(run_cats - {'nats'} if 'nats' in skipped else run_cats)}."
        )
    if skipped:
        summary += f" Skipped (no inventory): {skipped}."
    # (deadline - timeout_sec) recovers the start timestamp.
    elapsed_ms = round((time.monotonic() - (deadline - timeout_sec)) * 1000, 1)
    return DriftReport(
        pass_=pass_,
        summary=summary,
        stats={
            "errors": errors,
            "warnings": warnings,
            "infos": infos,
            "skipped": skipped,
            "items_checked": items_checked,
            "elapsed_ms": elapsed_ms,
            "by_category": cat_stats,
        },
        findings=[f.to_dict() for f in all_findings],
    )
def analyze_drift_dict(repo_root: str, **kwargs) -> Dict:
    """Convenience wrapper that returns a plain dict (for ToolResult)."""
    report = analyze_drift(repo_root, **kwargs)
    payload: Dict = {"pass": report.pass_}
    payload["summary"] = report.summary
    payload["stats"] = report.stats
    payload["findings"] = report.findings
    return payload

View File

@@ -0,0 +1,106 @@
"""
incident_artifacts.py — File-based artifact storage for incidents.
Layout: ops/incidents/<incident_id>/<filename>
Security:
- Path traversal guard (realpath must stay within base_dir)
- Max 2MB per artifact
- Only allowed formats: json, md, txt
- Atomic writes (temp + rename)
"""
from __future__ import annotations
import base64
import hashlib
import logging
import os
import tempfile
from pathlib import Path
from typing import Dict, Optional
logger = logging.getLogger(__name__)
MAX_ARTIFACT_BYTES = 2 * 1024 * 1024 # 2MB
ALLOWED_FORMATS = {"json", "md", "txt"}
_ARTIFACTS_BASE = os.getenv(
"INCIDENT_ARTIFACTS_DIR",
str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
)
def _base_dir() -> Path:
return Path(os.getenv("INCIDENT_ARTIFACTS_DIR", _ARTIFACTS_BASE))
def _safe_filename(name: str) -> str:
"""Strip path separators and dangerous chars."""
safe = "".join(c for c in name if c.isalnum() or c in (".", "_", "-"))
return safe or "artifact"
def write_artifact(
incident_id: str,
filename: str,
content_bytes: bytes,
*,
base_dir: Optional[str] = None,
) -> Dict:
"""
Write an artifact file atomically.
Returns: {"path": str, "sha256": str, "size_bytes": int}
Raises: ValueError on validation failure, OSError on write failure.
"""
if not incident_id or "/" in incident_id or ".." in incident_id:
raise ValueError(f"Invalid incident_id: {incident_id}")
if len(content_bytes) > MAX_ARTIFACT_BYTES:
raise ValueError(f"Artifact too large: {len(content_bytes)} bytes (max {MAX_ARTIFACT_BYTES})")
safe_name = _safe_filename(filename)
ext = safe_name.rsplit(".", 1)[-1].lower() if "." in safe_name else ""
if ext not in ALLOWED_FORMATS:
raise ValueError(f"Format '{ext}' not allowed. Allowed: {ALLOWED_FORMATS}")
bd = Path(base_dir) if base_dir else _base_dir()
inc_dir = bd / incident_id
inc_dir.mkdir(parents=True, exist_ok=True)
target = inc_dir / safe_name
real_base = bd.resolve()
real_target = target.resolve()
if not str(real_target).startswith(str(real_base)):
raise ValueError("Path traversal detected")
sha = hashlib.sha256(content_bytes).hexdigest()
# Atomic write: temp file → rename
fd, tmp_path = tempfile.mkstemp(dir=str(inc_dir), suffix=f".{ext}.tmp")
try:
os.write(fd, content_bytes)
os.close(fd)
os.replace(tmp_path, str(target))
except Exception:
os.close(fd) if not os.get_inheritable(fd) else None
if os.path.exists(tmp_path):
os.unlink(tmp_path)
raise
rel_path = str(target.relative_to(bd.parent.parent)) if bd.parent.parent.exists() else str(target)
logger.info("Artifact written: %s (%d bytes, sha256=%s…)", rel_path, len(content_bytes), sha[:12])
return {
"path": rel_path,
"sha256": sha,
"size_bytes": len(content_bytes),
}
def decode_content(content_base64: str) -> bytes:
    """Decode base64-encoded content.

    Raises:
        ValueError: when the input is not valid base64.  The original
            decode error is chained as __cause__ for debuggability (B904).
    """
    try:
        return base64.b64decode(content_base64)
    except Exception as exc:
        raise ValueError(f"Invalid base64 content: {exc}") from exc

View File

@@ -0,0 +1,379 @@
"""
incident_escalation.py — Deterministic Incident Escalation Engine.
Actions (exposed via incident_escalation_tool):
evaluate — check active signatures against escalation thresholds
auto_resolve_candidates — find open incidents with no recent alerts
No LLM usage; all logic is policy-driven.
"""
from __future__ import annotations
import datetime
import logging
import os
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Severity ordering ────────────────────────────────────────────────────────
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
def _sev_higher(a: str, b: str) -> bool:
"""Return True if a is more severe (lower P number) than b."""
return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99)
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
"""Return next higher severity, or None if already at/above cap."""
idx = _SEV_ORDER.get(current)
if idx is None or idx == 0:
return None
target = _SEV_NAMES[idx - 1]
if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
return None # would exceed cap
return target
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _plus_hours(hours: int) -> str:
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
# ─── Policy loading ───────────────────────────────────────────────────────────
# Process-wide cache; populated once by load_escalation_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate locations, checked in order: CWD-relative first, then
# repo-root-relative (three levels up from this module).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
def load_escalation_policy() -> Dict:
    """Load (and cache) the escalation policy YAML; fall back to built-ins.

    Returns the first parseable policy from _POLICY_PATHS; a file that
    exists but fails to parse is logged and the next path is tried.  When
    nothing loads, _builtin_defaults() is cached and returned.  Never raises.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if path.exists():
            try:
                with open(path) as f:
                    data = yaml.safe_load(f) or {}
                _POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
def _builtin_defaults() -> Dict:
    """Fallback policy used when incident_escalation_policy.yml is absent.

    Mirrors the YAML schema: escalation thresholds, follow-up template,
    auto-resolve behavior, and alert-loop SLO targets.
    """
    return {
        "defaults": {"window_minutes": 60},
        "escalation": {
            "occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
            "triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
            "severity_cap": "P0",
            "create_followup_on_escalate": True,
            "followup": {
                "priority": "P1", "due_hours": 24, "owner": "oncall",
                "message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
            },
        },
        "auto_resolve": {
            "no_alerts_minutes_for_candidate": 60,
            "close_allowed_severities": ["P2", "P3"],
            "auto_close": False,
            "candidate_event_type": "note",
            "candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
        },
        "alert_loop_slo": {
            "claim_to_ack_p95_seconds": 60,
            "failed_rate_pct": 5,
            "processing_stuck_minutes": 15,
        },
    }
# ─── Escalation thresholds helper ────────────────────────────────────────────
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return target severity if escalation is needed, else None.

    A rule fires when EITHER the 60-minute occurrence count OR the 24-hour
    triage count reaches its policy threshold.  The result is clamped to
    the policy's severity_cap: an escalation that would pass the cap
    returns the cap itself.
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")
    # Build escalation rules in priority order (most → least severe)
    # Each entry: (from_severity, to_severity, occurrence_limit, triage_limit).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]
    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m >= occ_limit or triage_count_24h >= triage_limit:
            # Check cap
            if not _sev_higher(cap, to_sev) and to_sev != cap:
                # to_sev is more severe than cap — not allowed
                if _sev_higher(to_sev, cap):
                    return cap
            return to_sev
    return None
# ─── Core evaluate function ───────────────────────────────────────────────────
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """
    Main escalation evaluation. Returns structured summary.

    For every active alert signature, find a matching open (or mitigating)
    incident and escalate its severity when the policy's occurrence/triage
    thresholds are exceeded.  With dry_run=True, candidates are reported
    but no events are written.

    Args:
        params: {"env": str|None, "window_minutes": int, "limit": int}.
        alert_store: unused here; kept for a uniform store-injection signature.
        sig_state_store: exposes list_active_signatures(window_minutes, limit).
        incident_store: exposes list_incidents(filter, limit) and append_event().
        policy: escalation policy dict; loaded via load_escalation_policy() when None.
        dry_run: when True, compute candidates only (no store writes).

    Returns:
        {"evaluated", "escalated", "followups_created", "candidates",
         "recommendations", "dry_run"}
    """
    if policy is None:
        policy = load_escalation_policy()
    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get("window_minutes",
                                    policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))
    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})
    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )
    # Perf fix: incident fetches are hoisted out of the signature loop.  The
    # previous version re-queried the store once (or twice) PER SIGNATURE for
    # data that does not change during evaluation — O(signatures) round-trips.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None  # fetched lazily, at most once
    def _matching_for(incidents: List[Dict], signature: str) -> List[Dict]:
        """Incidents whose meta signature matches, honoring the env filter."""
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]
    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []
    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)
        # Find open incident with this signature
        matching = _matching_for(open_incidents, signature)
        if not matching:
            # Also check mitigating
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matching_for(mitigating_incidents, signature)
        if not matching:
            evaluated += 1
            continue
        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")
        evaluated += 1
        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed
        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })
        if dry_run:
            continue
        # Append escalation decision event
        esc_msg = (
            f"Escalated {current_sev}→{target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1
        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1
        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )
    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """
    Find open incidents where no alerts have been seen in the last N minutes.
    Returns list of candidate incidents.
    By default dry_run=True — no state changes.

    Args:
        params: {"no_alerts_minutes": int, "env": str|None, "limit": int}.
        sig_state_store: exposes get_state(signature).
        incident_store: exposes list_incidents/append_event/close_incident.
        policy: auto_resolve policy source; loaded from YAML when None.
        dry_run: when False, a candidate note is appended per incident and —
            if the policy enables auto_close — eligible incidents are closed.
    """
    if policy is None:
        policy = load_escalation_policy()
    ar = policy.get("auto_resolve", {})
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )
    now_dt = datetime.datetime.utcnow()
    # ISO strings compare lexicographically, so string >= below acts as time >=.
    no_alert_cutoff = (now_dt - datetime.timedelta(minutes=no_alerts_minutes)).isoformat()
    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]
    candidates: List[Dict] = []
    closed: List[str] = []
    for incident in all_open:
        inc_id = incident["id"]
        signature = incident.get("meta", {}).get("incident_signature")
        if not signature:
            continue  # cannot correlate without a signature
        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue
        last_alert = sig_state.get("last_alert_at") or ""
        if last_alert >= no_alert_cutoff:
            continue  # alert seen recently → not a candidate
        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed
        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            # Empty last_alert ("") → report the configured window itself.
            "minutes_without_alerts": round(
                (now_dt - datetime.datetime.fromisoformat(last_alert)).total_seconds() / 60
                if last_alert else no_alerts_minutes
            ),
            "auto_close_eligible": can_close and auto_close,
        })
        if dry_run:
            continue
        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })
        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)
    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }

View File

@@ -0,0 +1,143 @@
"""
incident_intel_utils.py — Data helpers for Incident Intelligence Layer.
Provides:
- kind extraction from incident (signature, meta, title heuristics)
- normalized key fields dict
- time-proximity helpers
- safe truncation/masking
No external dependencies beyond stdlib.
"""
from __future__ import annotations
import datetime
import re
from typing import Any, Dict, Optional, Tuple
# ─── Kind heuristics ──────────────────────────────────────────────────────────
_TITLE_KIND_PATTERNS = [
    (re.compile(r'\b(latency|slow|timeout|p9[5-9]|p100)\b', re.I), "latency"),
    (re.compile(r'\b(error.?rate|5xx|http.?error|exception)\b', re.I), "error_rate"),
    (re.compile(r'\b(slo.?breach|slo)\b', re.I), "slo_breach"),
    (re.compile(r'\b(oom|out.?of.?memory|memory.?pressure)\b', re.I), "oom"),
    (re.compile(r'\b(disk|storage|volume.?full|inode)\b', re.I), "disk"),
    (re.compile(r'\b(security|intrusion|cve|vuln|unauthorized)\b', re.I), "security"),
    (re.compile(r'\b(deploy|rollout|release|canary)\b', re.I), "deploy"),
    (re.compile(r'\b(crash.?loop|crashloop|restart)\b', re.I), "crashloop"),
    (re.compile(r'\b(queue|lag|consumer|backlog)\b', re.I), "queue"),
    (re.compile(r'\b(network|connectivity|dns|unreachable)\b', re.I), "network"),
]
_KNOWN_KINDS = frozenset([
    "slo_breach", "crashloop", "latency", "error_rate",
    "disk", "oom", "deploy", "security", "custom", "network", "queue",
])
def extract_kind(incident: Dict) -> str:
    """
    Best-effort kind extraction. Priority:
      1. incident.meta.kind (if present and a known kind)
      2. incident.meta.alert_kind (same constraint)
      3. Title heuristics (first matching pattern wins)
      4. 'custom'
    """
    metadata = incident.get("meta") or {}
    # Explicit meta fields win, but only when they name a known kind.
    for field in ("kind", "alert_kind"):
        candidate = metadata.get(field)
        if candidate and candidate in _KNOWN_KINDS:
            return candidate
    # Fall back to title pattern matching; list order defines precedence.
    title_text = incident.get("title", "") or ""
    matched = next(
        (label for pattern, label in _TITLE_KIND_PATTERNS if pattern.search(title_text)),
        None,
    )
    return matched if matched is not None else "custom"
def incident_key_fields(incident: Dict) -> Dict:
    """
    Return a normalized dict of the key fields used for correlation.

    Missing fields get conservative defaults (env=prod, severity=P2,
    status=open); the kind is derived via extract_kind().
    """
    metadata = incident.get("meta") or {}
    fetch = incident.get
    normalized = {
        "id": fetch("id", ""),
        "service": fetch("service", ""),
        "env": fetch("env", "prod"),
        "severity": fetch("severity", "P2"),
        "status": fetch("status", "open"),
        "started_at": fetch("started_at", ""),
        "signature": metadata.get("incident_signature", ""),
        "kind": extract_kind(incident),
    }
    return normalized
# ─── Time helpers ─────────────────────────────────────────────────────────────
def parse_iso(ts: str) -> Optional[datetime.datetime]:
    """
    Parse an ISO-8601 timestamp string into a *naive* datetime (wall-clock
    reading with tzinfo stripped); returns None for empty/invalid input.

    Bug fix: the previous implementation stripped only trailing 'Z' and
    '+HH:MM' offsets, so '-HH:MM' inputs stayed timezone-aware. Mixing the
    resulting aware and naive datetimes made minutes_apart() raise TypeError
    on subtraction. Every result is now normalized to naive.
    """
    if not ts:
        return None
    try:
        # fromisoformat() does not accept a 'Z' suffix before Python 3.11.
        dt = datetime.datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except (ValueError, AttributeError, TypeError):
        return None
    # Keep the wall-clock reading and drop the offset — same result as the
    # old behavior for 'Z' and '+HH:MM' inputs, now consistent for '-HH:MM'.
    return dt.replace(tzinfo=None)
def minutes_apart(ts_a: str, ts_b: str) -> Optional[float]:
    """Absolute gap in minutes between two ISO timestamps; None if either fails to parse."""
    first = parse_iso(ts_a)
    second = parse_iso(ts_b)
    if first is None or second is None:
        return None
    gap_seconds = (first - second).total_seconds()
    return abs(gap_seconds) / 60.0
def incidents_within_minutes(inc_a: Dict, inc_b: Dict, within: float) -> bool:
    """Return True if both incidents have parseable start times at most `within` minutes apart."""
    gap = minutes_apart(inc_a.get("started_at", ""), inc_b.get("started_at", ""))
    if gap is None:
        return False
    return gap <= within
# ─── Text helpers ─────────────────────────────────────────────────────────────
def safe_truncate(text: str, max_chars: int = 200) -> str:
    """
    Truncate text to max_chars, appending an ellipsis when content was cut.

    Bug fix: the previous conditional appended an empty string in *both*
    branches (the ellipsis literal was evidently lost), making it a no-op.
    """
    if not text:
        return ""
    return text[:max_chars] + ("…" if len(text) > max_chars else "")
def mask_signature(sig: str, prefix_len: int = 8) -> str:
    """Show only the first `prefix_len` chars of a SHA-256 signature for readability."""
    return sig[:prefix_len] if sig else ""
def severity_rank(sev: str) -> int:
    """Map a severity label to a sortable rank. Lower = more severe; unknown labels rank last (5)."""
    ranks = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
    return ranks.get(sev, 5)
def format_duration(started_at: str, ended_at: Optional[str]) -> str:
    """
    Human-readable duration string.

    Returns "unknown" when started_at is unparseable, "ongoing" when there is
    no parseable end time, otherwise "Ns" / "Nm" / "N.Nh".
    """
    start = parse_iso(started_at)
    if start is None:
        return "unknown"
    if ended_at:
        end = parse_iso(ended_at)
        if end:
            elapsed = (end - start).total_seconds()
            if elapsed < 60:
                return f"{int(elapsed)}s"
            if elapsed < 3600:
                return f"{int(elapsed / 60)}m"
            return f"{elapsed / 3600:.1f}h"
    return "ongoing"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,690 @@
"""
incident_store.py — Incident Log storage abstraction.
Backends:
- MemoryIncidentStore (testing)
- JsonlIncidentStore (MVP/fallback — ops/incidents/ directory)
- PostgresIncidentStore(production — psycopg2 sync)
- AutoIncidentStore (Postgres primary → JSONL fallback)
All writes are non-fatal: exceptions are logged as warnings.
"""
from __future__ import annotations
import datetime
import hashlib
import json
import logging
import os
import re
import threading
import time
import uuid
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
_SECRET_PAT = re.compile(r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+')
def _redact_text(text: str, max_len: int = 4000) -> str:
"""Mask secrets, truncate."""
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
return text[:max_len] if len(text) > max_len else text
def _now_iso() -> str:
return datetime.datetime.now(datetime.timezone.utc).isoformat()
def _generate_incident_id() -> str:
now = datetime.datetime.now(datetime.timezone.utc)
rand = uuid.uuid4().hex[:6]
return f"inc_{now.strftime('%Y%m%d_%H%M')}_{rand}"
# ─── Abstract interface ──────────────────────────────────────────────────────
class IncidentStore(ABC):
    """Abstract incident-log backend: incident CRUD plus per-incident events and artifacts."""
    @abstractmethod
    def create_incident(self, data: Dict) -> Dict:
        """Create and persist a new 'open' incident; returns the stored dict."""
        ...
    @abstractmethod
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident with recent events and artifacts attached, or None."""
        ...
    @abstractmethod
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """List incidents matching optional filters, newest first, up to `limit`."""
        ...
    @abstractmethod
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark the incident closed and record the resolution; None if id unknown."""
        ...
    @abstractmethod
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event to an incident; None if id unknown."""
        ...
    @abstractmethod
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to `limit` events for the incident."""
        ...
    @abstractmethod
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Attach artifact metadata (path/hash/size) to an incident; None if id unknown."""
        ...
    @abstractmethod
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact records for the incident."""
        ...
# ─── In-memory (testing) ─────────────────────────────────────────────────────
class MemoryIncidentStore(IncidentStore):
    """In-memory backend for testing only: plain dicts guarded by a single lock, no persistence."""
    def __init__(self):
        # All maps are keyed by incident id; events/artifacts keep insertion order.
        self._incidents: Dict[str, Dict] = {}
        self._events: Dict[str, List[Dict]] = {}
        self._artifacts: Dict[str, List[Dict]] = {}
        self._lock = threading.Lock()
    def create_incident(self, data: Dict) -> Dict:
        """Create an 'open' incident. Requires data['service'] (KeyError otherwise); other fields defaulted."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            # Title/summary are secret-redacted and length-capped at write time.
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        with self._lock:
            self._incidents[inc_id] = inc
            self._events[inc_id] = []
            self._artifacts[inc_id] = []
        return inc
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident with its last 20 events and all artifacts, or None."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        events = self._events.get(incident_id, [])[-20:]
        artifacts = self._artifacts.get(incident_id, [])
        return {**inc, "events": events, "artifacts": artifacts}
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest created_at first."""
        filters = filters or {}
        result = list(self._incidents.values())
        if filters.get("status"):
            result = [i for i in result if i["status"] == filters["status"]]
        if filters.get("service"):
            result = [i for i in result if i["service"] == filters["service"]]
        if filters.get("env"):
            result = [i for i in result if i["env"] == filters["env"]]
        if filters.get("severity"):
            result = [i for i in result if i["severity"] == filters["severity"]]
        result.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return result[:limit]
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark closed; overwrite summary with the resolution when given; log a status_change event."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        with self._lock:
            inc["status"] = "closed"
            inc["ended_at"] = ended_at
            inc["summary"] = _redact_text(resolution, 2000) if resolution else inc.get("summary")
            inc["updated_at"] = _now_iso()
            # NOTE(review): _redact_text(resolution, 500) here would raise if
            # resolution is None — the JSONL variant guards with `or ''`; confirm
            # callers always pass a string.
            self._events.setdefault(incident_id, []).append({
                "ts": _now_iso(),
                "type": "status_change",
                "message": f"Incident closed: {_redact_text(resolution, 500)}",
                "meta": None,
            })
        return inc
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a redacted event and bump updated_at; None if the incident is unknown."""
        if incident_id not in self._incidents:
            return None
        ev = {
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),
            "meta": meta,
        }
        with self._lock:
            self._events.setdefault(incident_id, []).append(ev)
            self._incidents[incident_id]["updated_at"] = _now_iso()
        return ev
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        # Oldest-first: returns the first `limit` events in insertion order.
        return self._events.get(incident_id, [])[:limit]
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record artifact metadata; None if the incident is unknown."""
        if incident_id not in self._incidents:
            return None
        art = {
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        with self._lock:
            self._artifacts.setdefault(incident_id, []).append(art)
        return art
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        return self._artifacts.get(incident_id, [])
# ─── JSONL (MVP file backend) ────────────────────────────────────────────────
class JsonlIncidentStore(IncidentStore):
    """
    Stores incidents/events/artifacts as separate JSONL files in a directory.
    Layout:
        <base_dir>/incidents.jsonl
        <base_dir>/events.jsonl
        <base_dir>/artifacts.jsonl

    Reads re-scan the whole file on every call (fine for MVP volumes);
    writes are append-only except close_incident, which rewrites
    incidents.jsonl in place.
    """
    def __init__(self, base_dir: str):
        self._dir = Path(base_dir)
        self._dir.mkdir(parents=True, exist_ok=True)
        # Single lock serializes all file writes within this process.
        self._lock = threading.Lock()
    def _incidents_path(self) -> Path:
        return self._dir / "incidents.jsonl"
    def _events_path(self) -> Path:
        return self._dir / "events.jsonl"
    def _artifacts_path(self) -> Path:
        return self._dir / "artifacts.jsonl"
    def _read_jsonl(self, path: Path) -> List[Dict]:
        """Best-effort read: skips malformed lines and returns [] on any I/O error."""
        if not path.exists():
            return []
        items = []
        try:
            with open(path, "r", encoding="utf-8") as fh:
                for line in fh:
                    line = line.strip()
                    if line:
                        try:
                            items.append(json.loads(line))
                        except json.JSONDecodeError:
                            pass
        except Exception:
            pass
        return items
    def _append_jsonl(self, path: Path, record: Dict) -> None:
        # Append one record per line under the write lock.
        with self._lock:
            with open(path, "a", encoding="utf-8") as fh:
                fh.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")
    def _rewrite_jsonl(self, path: Path, items: List[Dict]) -> None:
        # Full-file rewrite (used for in-place updates such as close_incident).
        with self._lock:
            with open(path, "w", encoding="utf-8") as fh:
                for item in items:
                    fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n")
    def create_incident(self, data: Dict) -> Dict:
        """Append a new 'open' incident record. Requires data['service']."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            # Title/summary are secret-redacted and length-capped at write time.
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        self._append_jsonl(self._incidents_path(), inc)
        return inc
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident with its last 20 events and all artifacts, or None."""
        incidents = self._read_jsonl(self._incidents_path())
        inc = next((i for i in incidents if i.get("id") == incident_id), None)
        if not inc:
            return None
        events = [e for e in self._read_jsonl(self._events_path())
                  if e.get("incident_id") == incident_id][-20:]
        artifacts = [a for a in self._read_jsonl(self._artifacts_path())
                     if a.get("incident_id") == incident_id]
        return {**inc, "events": events, "artifacts": artifacts}
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest created_at first."""
        filters = filters or {}
        incidents = self._read_jsonl(self._incidents_path())
        if filters.get("status"):
            incidents = [i for i in incidents if i.get("status") == filters["status"]]
        if filters.get("service"):
            incidents = [i for i in incidents if i.get("service") == filters["service"]]
        if filters.get("env"):
            incidents = [i for i in incidents if i.get("env") == filters["env"]]
        if filters.get("severity"):
            incidents = [i for i in incidents if i.get("severity") == filters["severity"]]
        incidents.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return incidents[:limit]
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark closed (keeps the old summary when resolution is falsy); logs a status_change event."""
        incidents = self._read_jsonl(self._incidents_path())
        found = None
        for inc in incidents:
            if inc.get("id") == incident_id:
                inc["status"] = "closed"
                inc["ended_at"] = ended_at
                if resolution:
                    inc["summary"] = _redact_text(resolution, 2000)
                inc["updated_at"] = _now_iso()
                found = inc
                break
        if not found:
            return None
        # Rewrite the incidents file, then append the audit event.
        self._rewrite_jsonl(self._incidents_path(), incidents)
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return found
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a redacted event; None if the incident is unknown."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        ev = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),
            "meta": meta,
        }
        self._append_jsonl(self._events_path(), ev)
        return ev
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        # Oldest-first: returns the first `limit` matching events in file order.
        events = self._read_jsonl(self._events_path())
        return [e for e in events if e.get("incident_id") == incident_id][:limit]
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record artifact metadata; None if the incident is unknown."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        art = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        self._append_jsonl(self._artifacts_path(), art)
        return art
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        artifacts = self._read_jsonl(self._artifacts_path())
        return [a for a in artifacts if a.get("incident_id") == incident_id]
# ─── Postgres backend ─────────────────────────────────────────────────────────
class PostgresIncidentStore(IncidentStore):
    """
    Production backend using psycopg2 (sync).
    Tables created by ops/scripts/migrate_incidents_postgres.py.

    One autocommit connection per thread (threading.local); psycopg2 is
    imported lazily so the module loads without the driver installed.
    """
    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()
    def _conn(self):
        """Get or create a per-thread autocommit connection."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn
    def create_incident(self, data: Dict) -> Dict:
        """Insert an 'open' incident row. Requires data['service']; returns a slim summary dict."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            """INSERT INTO incidents (id,workspace_id,service,env,severity,status,
               title,summary,started_at,created_by,created_at,updated_at)
               VALUES (%s,%s,%s,%s,%s,'open',%s,%s,%s,%s,%s,%s)""",
            (inc_id, data.get("workspace_id", "default"),
             data["service"], data.get("env", "prod"),
             data.get("severity", "P2"),
             # Title/summary are secret-redacted and length-capped at write time.
             _redact_text(data.get("title", ""), 500),
             _redact_text(data.get("summary", "") or "", 2000),
             data.get("started_at") or now,
             data.get("created_by", "unknown"), now, now),
        )
        cur.close()
        return {"id": inc_id, "status": "open", "service": data["service"],
                "severity": data.get("severity", "P2"),
                "started_at": data.get("started_at") or now,
                "created_at": now}
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Fetch the incident row plus its last 200 events (chronological) and artifacts, or None."""
        cur = self._conn().cursor()
        cur.execute("SELECT id,workspace_id,service,env,severity,status,title,summary,"
                    "started_at,ended_at,created_by,created_at,updated_at "
                    "FROM incidents WHERE id=%s", (incident_id,))
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        cols = [d[0] for d in cur.description]
        # Timestamps are serialized to ISO strings so the dict is JSON-safe.
        inc = {c: (v.isoformat() if isinstance(v, datetime.datetime) else v) for c, v in zip(cols, row)}
        # Events: fetch the newest 200, then reverse into chronological order.
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts DESC LIMIT 200", (incident_id,))
        events = []
        for r in cur.fetchall():
            events.append({"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                           "message": r[2], "meta": r[3]})
        events.reverse()
        # Artifacts in chronological order.
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = []
        for r in cur.fetchall():
            artifacts.append({"ts": r[0].isoformat() if r[0] else "", "kind": r[1],
                              "format": r[2], "path": r[3], "sha256": r[4], "size_bytes": r[5]})
        cur.close()
        return {**inc, "events": events, "artifacts": artifacts}
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """List incidents filtered by status/service/env/severity/window_days, newest first (max 200)."""
        filters = filters or {}
        clauses = []
        params: list = []
        for k in ("status", "service", "env", "severity"):
            if filters.get(k):
                clauses.append(f"{k}=%s")
                params.append(filters[k])
        if filters.get("window_days"):
            # Fix: parameterize the interval arithmetically. The previous form
            # placed %s inside a quoted literal (INTERVAL '%s days'), which
            # psycopg2's parameter binding explicitly does not support.
            clauses.append("created_at >= NOW() - (%s * INTERVAL '1 day')")
            params.append(int(filters["window_days"]))
        where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
        params.append(min(limit, 200))
        cur = self._conn().cursor()
        cur.execute(f"SELECT id,workspace_id,service,env,severity,status,title,summary,"
                    f"started_at,ended_at,created_by,created_at,updated_at "
                    f"FROM incidents {where} ORDER BY created_at DESC LIMIT %s", params)
        cols = [d[0] for d in cur.description]
        rows = []
        for row in cur.fetchall():
            rows.append({c: (v.isoformat() if isinstance(v, datetime.datetime) else v)
                         for c, v in zip(cols, row)})
        cur.close()
        return rows
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """
        Close the incident and log a status_change event; None if id unknown.

        Fix: use COALESCE so an empty resolution keeps the existing summary
        instead of nulling it — consistent with the Memory/JSONL backends.
        """
        cur = self._conn().cursor()
        cur.execute("UPDATE incidents SET status='closed', ended_at=%s, "
                    "summary=COALESCE(%s, summary), updated_at=%s "
                    "WHERE id=%s RETURNING id",
                    (ended_at or _now_iso(), _redact_text(resolution, 2000) if resolution else None,
                     _now_iso(), incident_id))
        if not cur.fetchone():
            cur.close()
            return None
        cur.close()
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return {"id": incident_id, "status": "closed"}
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Insert a redacted event row (meta serialized as JSON text)."""
        now = _now_iso()
        cur = self._conn().cursor()
        meta_json = json.dumps(meta, default=str) if meta else None
        cur.execute("INSERT INTO incident_events (incident_id,ts,type,message,meta) "
                    "VALUES (%s,%s,%s,%s,%s)",
                    (incident_id, now, event_type, _redact_text(message, 4000), meta_json))
        cur.close()
        return {"ts": now, "type": event_type, "message": _redact_text(message, 4000), "meta": meta}
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to `limit` events for the incident, oldest first."""
        cur = self._conn().cursor()
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts LIMIT %s", (incident_id, limit))
        events = [{"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                   "message": r[2], "meta": r[3]} for r in cur.fetchall()]
        cur.close()
        return events
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Insert artifact metadata (path/hash/size) for the incident."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute("INSERT INTO incident_artifacts (incident_id,ts,kind,format,path,sha256,size_bytes) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s)",
                    (incident_id, now, kind, fmt, path, sha256, size_bytes))
        cur.close()
        return {"ts": now, "kind": kind, "format": fmt, "path": path,
                "sha256": sha256, "size_bytes": size_bytes}
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact records for the incident, oldest first."""
        cur = self._conn().cursor()
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = [{"ts": r[0].isoformat() if r[0] else "", "kind": r[1], "format": r[2],
                      "path": r[3], "sha256": r[4], "size_bytes": r[5]} for r in cur.fetchall()]
        cur.close()
        return artifacts
    def close(self):
        """Close this thread's connection if open (other threads' connections are untouched)."""
        conn = getattr(self._local, "conn", None)
        if conn and not conn.closed:
            conn.close()
# ─── Auto backend (Postgres → JSONL fallback) ────────────────────────────────
class AutoIncidentStore(IncidentStore):
    """
    Tries Postgres first; on any failure falls back to JSONL.
    Re-attempts Postgres after RECOVERY_INTERVAL_S (5 min).

    Every public method follows the same delegate pattern: optionally flip
    back to the primary, try Postgres, and on any exception switch to the
    JSONL fallback and serve the call from there.
    """
    _RECOVERY_INTERVAL_S = 300
    def __init__(self, pg_dsn: str, jsonl_dir: str):
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        # Both backends are created lazily on first use.
        self._primary: Optional[PostgresIncidentStore] = None
        self._fallback: Optional[JsonlIncidentStore] = None
        self._using_fallback = False
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()
    def _get_primary(self) -> PostgresIncidentStore:
        """Lazily build the Postgres backend (double-checked locking)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresIncidentStore(self._pg_dsn)
        return self._primary
    def _get_fallback(self) -> JsonlIncidentStore:
        """Lazily build the JSONL backend (double-checked locking)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlIncidentStore(self._jsonl_dir)
        return self._fallback
    def _maybe_recover(self) -> None:
        # After the recovery interval, optimistically retry Postgres; the next
        # failed call will simply switch back to the fallback.
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoIncidentStore: attempting Postgres recovery")
                self._using_fallback = False
                self._fallback_since = 0.0
    def _switch_to_fallback(self, err: Exception) -> None:
        """Record the Postgres failure and start serving from JSONL."""
        logger.warning("AutoIncidentStore: Postgres failed (%s), using JSONL fallback", err)
        self._using_fallback = True
        self._fallback_since = time.monotonic()
    def active_backend(self) -> str:
        """Introspection helper: 'postgres' or 'jsonl_fallback'."""
        return "jsonl_fallback" if self._using_fallback else "postgres"
    # ── Delegate methods ──────────────────────────────────────────────────────
    def create_incident(self, data: Dict) -> Dict:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().create_incident(data)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().create_incident(data)
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().get_incident(incident_id)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().get_incident(incident_id)
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().list_incidents(filters, limit)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().list_incidents(filters, limit)
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().close_incident(incident_id, ended_at, resolution)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().close_incident(incident_id, ended_at, resolution)
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().append_event(incident_id, event_type, message, meta)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().append_event(incident_id, event_type, message, meta)
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().get_events(incident_id, limit)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().get_events(incident_id, limit)
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().add_artifact(incident_id, kind, fmt, path, sha256, size_bytes)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().add_artifact(incident_id, kind, fmt, path, sha256, size_bytes)
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return self._get_primary().get_artifacts(incident_id)
            except Exception as e:
                self._switch_to_fallback(e)
        return self._get_fallback().get_artifacts(incident_id)
# ─── Singleton ────────────────────────────────────────────────────────────────
# Process-wide singleton; guarded by _store_lock.
_store: Optional[IncidentStore] = None
_store_lock = threading.Lock()
def get_incident_store() -> IncidentStore:
    """Return the process-wide store, creating it on first use (double-checked locking)."""
    global _store
    if _store is None:
        with _store_lock:
            if _store is None:
                _store = _create_store()
    return _store
def set_incident_store(store: Optional[IncidentStore]) -> None:
    """Override the singleton (tests); pass None to force re-creation on next access."""
    global _store
    with _store_lock:
        _store = store
def _create_store() -> IncidentStore:
    """
    Build the backend selected by INCIDENT_BACKEND.

    Recognized values: memory | postgres | auto | null | jsonl (default).
    'postgres' and 'auto' without a DSN fall through to JSONL.
    """
    backend_name = os.getenv("INCIDENT_BACKEND", "jsonl").lower()
    pg_dsn = os.getenv("DATABASE_URL") or os.getenv("INCIDENT_DATABASE_URL", "")
    jsonl_dir = os.getenv(
        "INCIDENT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
    )
    if backend_name == "memory":
        logger.info("IncidentStore: in-memory (testing only)")
        return MemoryIncidentStore()
    if backend_name == "postgres":
        if pg_dsn:
            # Only a DSN prefix is logged to avoid leaking credentials.
            logger.info("IncidentStore: postgres dsn=%s", pg_dsn[:30])
            return PostgresIncidentStore(pg_dsn)
        logger.warning("INCIDENT_BACKEND=postgres but no DATABASE_URL; falling back to jsonl")
    if backend_name == "auto":
        if pg_dsn:
            logger.info("IncidentStore: auto (postgres→jsonl fallback) dsn=%s", pg_dsn[:30])
            return AutoIncidentStore(pg_dsn=pg_dsn, jsonl_dir=jsonl_dir)
        logger.info("IncidentStore: auto — no DATABASE_URL, using jsonl")
    if backend_name == "null":
        return MemoryIncidentStore()
    # Default: JSONL
    logger.info("IncidentStore: jsonl dir=%s", jsonl_dir)
    return JsonlIncidentStore(jsonl_dir)

View File

@@ -0,0 +1,261 @@
"""
llm_enrichment.py — Optional LLM enrichment for Risk Attribution (strictly bounded).
Design constraints:
- LLM output is explanatory ONLY — never changes scores or decisions.
- Default mode is OFF (llm_mode="off").
- Local mode calls a local HTTP model runner (Ollama-compatible by default).
- Triggers are checked before every call: off if delta < warn OR band not high/critical.
- Input is hard-truncated to llm_max_chars_in.
- Output is hard-truncated to llm_max_chars_out.
- Any error → graceful skip, returns {enabled: false, text: null}.
Hardening guards (new):
- model_allowlist: model must be in allowlist or call is skipped.
- max_calls_per_digest: caller passes a mutable counter dict; stops after limit.
- per_day_dedupe: in-memory key per (date, service, env) prevents duplicate calls.
Usage:
from llm_enrichment import maybe_enrich_attribution
call_counter = {"count": 0}
report["llm_enrichment"] = maybe_enrich_attribution(
attribution_report, risk_report, attr_policy,
call_counter=call_counter,
)
"""
from __future__ import annotations
import datetime
import json
import logging
from typing import Dict, Optional
logger = logging.getLogger(__name__)
# ─── Per-day dedupe store (module-level in-memory) ───────────────────────────
# key: "risk_enrich:{YYYY-MM-DD}:{service}:{env}" → True
_dedupe_store: Dict[str, bool] = {}
def _dedupe_key(service: str, env: str) -> str:
date = datetime.datetime.utcnow().strftime("%Y-%m-%d")
return f"risk_enrich:{date}:{service}:{env}"
def _is_deduped(service: str, env: str) -> bool:
return _dedupe_store.get(_dedupe_key(service, env), False)
def _mark_deduped(service: str, env: str) -> None:
_dedupe_store[_dedupe_key(service, env)] = True
def _clear_dedupe_store() -> None:
"""Test helper to reset per-day dedup state."""
_dedupe_store.clear()
# ─── Trigger guard ────────────────────────────────────────────────────────────
def _should_trigger(risk_report: Dict, attr_policy: Dict) -> bool:
"""
Returns True only if triggers are met:
delta_24h >= risk_delta_warn OR band in band_in
Both conditions are OR — either is enough.
"""
triggers = attr_policy.get("llm_triggers", {})
delta_warn = int(triggers.get("risk_delta_warn", 10))
band_in = set(triggers.get("band_in", ["high", "critical"]))
band = risk_report.get("band", "low")
delta_24h = (risk_report.get("trend") or {}).get("delta_24h")
if band in band_in:
return True
if delta_24h is not None and delta_24h >= delta_warn:
return True
return False
# ─── Prompt builder ───────────────────────────────────────────────────────────
def _build_prompt(
attribution_report: Dict,
risk_report: Dict,
max_chars: int,
) -> str:
"""Build a compact prompt for local LLM enrichment."""
service = attribution_report.get("service", "?")
env = attribution_report.get("env", "prod")
score = risk_report.get("score", 0)
band = risk_report.get("band", "?")
delta = attribution_report.get("delta_24h")
causes = attribution_report.get("causes", [])[:3]
reasons = risk_report.get("reasons", [])[:4]
causes_text = "\n".join(
f" - {c['type']} (score={c['score']}, confidence={c['confidence']}): "
+ "; ".join(c.get("evidence", []))
for c in causes
)
reasons_text = "\n".join(f" - {r}" for r in reasons)
prompt = (
f"You are a platform reliability assistant. Provide a 2-3 sentence human-readable "
f"explanation for a risk spike in service '{service}' (env={env}).\n\n"
f"Risk score: {score} ({band}). "
+ (f"Delta 24h: +{delta}.\n\n" if delta is not None else "\n\n")
+ f"Risk signals:\n{reasons_text}\n\n"
f"Attributed causes:\n{causes_text}\n\n"
f"Write a concise explanation (max 3 sentences). Do NOT include scores or numbers "
f"from above verbatim. Focus on actionable insight."
)
return prompt[:max_chars]
# ─── Local model call ─────────────────────────────────────────────────────────
def _is_model_allowed(model: str, attr_policy: Dict) -> bool:
"""Return True if model is in llm_local.model_allowlist (or list is empty/absent)."""
allowlist = attr_policy.get("llm_local", {}).get("model_allowlist")
if not allowlist:
return True # no restriction configured
return model in allowlist
def _call_local_llm(
    prompt: str,
    attr_policy: Dict,
    max_out: int,
) -> Optional[str]:
    """
    Call an Ollama-compatible local endpoint (llm_local.endpoint/model/timeout_seconds).

    Skips (returns None) when the model is not allowlisted; returns the
    response text hard-truncated to max_out chars, or None on empty output
    or any failure. Never raises.
    """
    llm_cfg = attr_policy.get("llm_local", {})
    endpoint = llm_cfg.get("endpoint", "http://localhost:11434/api/generate")
    model = llm_cfg.get("model", "llama3")
    timeout = int(llm_cfg.get("timeout_seconds", 15))
    if not _is_model_allowed(model, attr_policy):
        logger.warning("llm_enrichment: model '%s' not in allowlist; skipping", model)
        return None
    try:
        import urllib.request
        payload = json.dumps({
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {"num_predict": max_out // 4},  # approx token budget (~4 chars/token)
        }).encode()
        req = urllib.request.Request(
            endpoint,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read())
            text = body.get("response", "") or ""
            return text[:max_out] if text else None
    except Exception as e:  # OSError/ConnectionError are Exception subclasses; one clause suffices
        logger.warning("llm_enrichment: local LLM call failed: %s", e)
        return None
# ─── Public interface ─────────────────────────────────────────────────────────
def maybe_enrich_attribution(
    attribution_report: Dict,
    risk_report: Dict,
    attr_policy: Optional[Dict] = None,
    *,
    call_counter: Optional[Dict] = None,
) -> Dict:
    """
    Conditionally enrich attribution_report with LLM text.

    Hardening guards (checked in order):
      1. llm_mode must be "local" (not "off" or "remote")
      2. triggers must be met (delta >= warn OR band in high/critical)
      3. model must be in model_allowlist
      4. max_calls_per_digest not exceeded (via mutable `call_counter` dict)
      5. per-day dedupe: (service, env) pair not already enriched today

    Args:
        attribution_report: report to enrich; reads its "service" / "env".
        risk_report: risk report evaluated against the policy triggers.
        attr_policy: attribution policy; lazily loaded when None.
        call_counter: mutable {"count": int} shared across one digest run;
            incremented here on each successful enrichment.

    Returns:
        {"enabled": True/False, "text": str|None, "mode": str}
        (plus "skipped_reason" when a guard rejected the call).
    Never raises. LLM output does NOT alter scores.
    """
    if attr_policy is None:
        # Lazy policy load; any failure disables enrichment entirely.
        try:
            from risk_attribution import load_attribution_policy
            attr_policy = load_attribution_policy()
        except Exception:
            return {"enabled": False, "text": None, "mode": "off"}
    mode = (attr_policy.get("defaults") or {}).get("llm_mode", "off")
    if mode == "off":
        return {"enabled": False, "text": None, "mode": "off"}
    # Guard: triggers
    if not _should_trigger(risk_report, attr_policy):
        return {"enabled": False, "text": None, "mode": mode,
                "skipped_reason": "triggers not met"}
    service = attribution_report.get("service", "")
    env = attribution_report.get("env", "prod")
    # Guard: model allowlist (checked early so tests can assert without calling LLM)
    if mode == "local":
        llm_local_cfg_early = attr_policy.get("llm_local", {})
        model_cfg = llm_local_cfg_early.get("model", "llama3")
        if not _is_model_allowed(model_cfg, attr_policy):
            logger.warning("llm_enrichment: model '%s' not in allowlist; skipping", model_cfg)
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": f"model '{model_cfg}' not in allowlist"}
    # Guard: per-day dedupe
    llm_local_cfg = attr_policy.get("llm_local", {})
    if llm_local_cfg.get("per_day_dedupe", True):
        if _is_deduped(service, env):
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": "per_day_dedupe: already enriched today"}
    # Guard: max_calls_per_digest
    if call_counter is not None:
        max_calls = int(llm_local_cfg.get("max_calls_per_digest", 3))
        if call_counter.get("count", 0) >= max_calls:
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": f"max_calls_per_digest={max_calls} reached"}
    defaults = attr_policy.get("defaults", {})
    max_in = int(defaults.get("llm_max_chars_in", 3500))
    max_out = int(defaults.get("llm_max_chars_out", 800))
    prompt = _build_prompt(attribution_report, risk_report, max_in)
    if mode == "local":
        try:
            text = _call_local_llm(prompt, attr_policy, max_out)
        except Exception as e:
            # _call_local_llm is designed not to raise, but the "never
            # raises" contract of this function is enforced here too.
            logger.warning("llm_enrichment: local call raised: %s", e)
            text = None
        if text is not None:
            # Update guards on success only: a failed call neither burns a
            # digest-call slot nor marks the (service, env) pair as done.
            _mark_deduped(service, env)
            if call_counter is not None:
                call_counter["count"] = call_counter.get("count", 0) + 1
        return {
            "enabled": text is not None,
            "text": text,
            "mode": "local",
        }
    # mode == "remote" — not implemented; stub for future extensibility
    logger.debug("llm_enrichment: remote mode not implemented; skipping")
    return {"enabled": False, "text": None, "mode": "remote",
            "skipped_reason": "remote not implemented"}

View File

@@ -0,0 +1,340 @@
"""
platform_priority_digest.py — Weekly Platform Priority Digest.
DAARION.city | deterministic, no LLM.
Generates a Markdown + JSON report prioritising services by Architecture Pressure,
optionally correlated with Risk score/delta.
Outputs:
ops/reports/platform/{YYYY-WW}.md
ops/reports/platform/{YYYY-WW}.json
Public API:
weekly_platform_digest(env, ...) -> DigestResult
"""
from __future__ import annotations
import datetime
import json
import logging
import os
from pathlib import Path
from typing import Dict, List, Optional
from architecture_pressure import load_pressure_policy
logger = logging.getLogger(__name__)
# ─── Action templates ─────────────────────────────────────────────────────────
# Markdown snippets rendered into the digest's "Action Recommendations"
# section.  Keys are referenced by name in _build_priority_actions; the
# {placeholders} are filled via str.format.
_ACTION_TEMPLATES = {
    # Emitted when a pressure report has requires_arch_review set.
    "arch_review": (
        "📋 **Schedule architecture review**: '{service}' pressure={score} "
        "({band}). Review structural debt and recurring failure patterns."
    ),
    # Emitted when a service shows >=3 regressions AND >=2 escalations in 30d.
    "refactor_sprint": (
        "🔧 **Allocate refactor sprint**: '{service}' has {regressions} regressions "
        "and {escalations} escalations in 30d — structural instability requires investment."
    ),
    # Emitted when pressure band is critical AND risk band is high/critical.
    "freeze_features": (
        "🚫 **Freeze non-critical features**: '{service}' is critical-pressure + "
        "risk-high. Stabilise before new feature work."
    ),
    # Emitted when >=2 follow-ups are overdue.
    "reduce_backlog": (
        "📌 **Reduce followup backlog**: '{service}' has {overdue} overdue follow-ups. "
        "Address before next release cycle."
    ),
}
def _now_week() -> str:
"""Return ISO week string: YYYY-WNN."""
return datetime.datetime.utcnow().strftime("%Y-W%V")
def _now_date() -> str:
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
def _clamp(text: str, max_chars: int) -> str:
if max_chars and len(text) > max_chars:
return text[:max_chars - 3] + ""
return text
# ─── Action list builder ──────────────────────────────────────────────────────
def _build_priority_actions(pressure_reports: List[Dict], risk_reports: Optional[Dict] = None) -> List[str]:
    """Derive prioritised action strings from pressure reports.

    Each report can contribute up to four actions (arch review, refactor
    sprint, feature freeze, backlog reduction); the overall list is capped
    at 20 entries.
    """
    risk_lookup = risk_reports or {}
    actions: List[str] = []
    for report in pressure_reports:
        svc = report["service"]
        band = report.get("band", "low")
        comp = report.get("components", {})
        if report.get("requires_arch_review"):
            actions.append(_ACTION_TEMPLATES["arch_review"].format(
                service=svc,
                score=report.get("score", 0),
                band=band,
            ))
        regressions = int(comp.get("regressions_30d", 0))
        escalations = int(comp.get("escalations_30d", 0))
        if regressions >= 3 and escalations >= 2:
            actions.append(_ACTION_TEMPLATES["refactor_sprint"].format(
                service=svc,
                regressions=regressions,
                escalations=escalations,
            ))
        # Prefer the live risk report; fall back to the risk band embedded
        # in the pressure report.
        risk_entry = risk_lookup.get(svc, {})
        risk_band = risk_entry.get("band", "low") if risk_entry else report.get("risk_band", "low")
        if band == "critical" and risk_band in ("high", "critical"):
            actions.append(_ACTION_TEMPLATES["freeze_features"].format(service=svc))
        overdue = int(comp.get("followups_overdue", 0))
        if overdue >= 2:
            actions.append(_ACTION_TEMPLATES["reduce_backlog"].format(service=svc, overdue=overdue))
    return actions[:20]  # hard cap on digest length
# ─── Markdown builder ─────────────────────────────────────────────────────────
def _build_markdown(
    week_str: str,
    env: str,
    pressure_reports: List[Dict],
    investment_list: List[Dict],
    actions: List[str],
    band_counts: Dict[str, int],
) -> str:
    """Render the full digest as a Markdown document.

    Args:
        week_str: ISO week label used in the H1 title.
        env: environment name (upper-cased in the title).
        pressure_reports: pressure reports, already sorted/truncated by caller.
        investment_list: rows for the "Investment Priority List" section.
        actions: pre-built action recommendation strings.
        band_counts: {"critical"|"high"|"medium"|"low": count} for the summary table.

    Returns:
        Markdown text (unclamped — the caller applies _clamp).
    """
    # Header + band-summary table.
    lines = [
        f"# Platform Priority Digest — {env.upper()} | {week_str}",
        f"_Generated: {_now_date()} | Deterministic | No LLM_",
        "",
        "## Pressure Band Summary",
        "",
        f"| Band | Services |",
        f"|------|---------|",
        f"| 🔴 Critical | {band_counts.get('critical', 0)} |",
        f"| 🟠 High | {band_counts.get('high', 0)} |",
        f"| 🟡 Medium | {band_counts.get('medium', 0)} |",
        f"| 🟢 Low | {band_counts.get('low', 0)} |",
        "",
    ]
    # Critical pressure: one H3 per service with up to 3 signal summaries.
    critical = [r for r in pressure_reports if r.get("band") == "critical"]
    if critical:
        lines += ["## 🔴 Critical Structural Pressure", ""]
        for r in critical:
            svc = r["service"]
            score = r.get("score", 0)
            summary = "; ".join(r.get("signals_summary", [])[:3])
            arch_flag = " ⚠️ ARCH REVIEW REQUIRED" if r.get("requires_arch_review") else ""
            lines.append(f"### {svc} (score={score}){arch_flag}")
            lines.append(f"> {summary}")
            # Risk correlation
            if r.get("risk_score") is not None:
                lines.append(
                    f"> Risk: {r['risk_score']} ({r.get('risk_band', '?')})"
                    + (f" Δ24h: +{r['risk_delta_24h']}" if r.get("risk_delta_24h") else "")
                )
            lines.append("")
    # High pressure: compact one-bullet-per-service list (first summary only).
    high = [r for r in pressure_reports if r.get("band") == "high"]
    if high:
        lines += ["## 🟠 High Pressure Services", ""]
        for r in high:
            svc = r["service"]
            score = r.get("score", 0)
            summary = (r.get("signals_summary") or [""])[0]
            lines.append(
                f"- **{svc}** (score={score}): {summary}"
            )
        lines.append("")
    # Investment priority list (pre-filtered by the caller).
    if investment_list:
        lines += ["## 📊 Investment Priority List", ""]
        lines.append("Services where Pressure ≥ require_arch_review_at AND risk is elevated:")
        lines.append("")
        for i, item in enumerate(investment_list, 1):
            lines.append(
                f"{i}. **{item['service']}** — Pressure: {item['pressure_score']} "
                f"({item['pressure_band']}) | Risk: {item.get('risk_score', 'N/A')} "
                f"({item.get('risk_band', 'N/A')})"
            )
        lines.append("")
    # Action recommendations
    if actions:
        lines += ["## ✅ Action Recommendations", ""]
        for action in actions:
            lines.append(f"- {action}")
        lines.append("")
    lines += [
        "---",
        "_Generated by DAARION.city Platform Priority Digest (deterministic, no LLM)_",
    ]
    return "\n".join(lines)
# ─── Main digest function ─────────────────────────────────────────────────────
def weekly_platform_digest(
    env: str = "prod",
    *,
    pressure_reports: Optional[List[Dict]] = None,
    risk_reports: Optional[Dict[str, Dict]] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
    output_dir: Optional[str] = None,
    date_str: Optional[str] = None,
    write_files: bool = True,
    auto_followup: bool = True,
    incident_store=None,
) -> Dict:
    """
    Generate Weekly Platform Priority Digest.

    Args:
        pressure_reports: pre-computed pressure reports list (sorted by score desc)
        risk_reports: {service: RiskReport} for side-by-side correlation
        policy: architecture_pressure_policy (loaded if None)
        week_str: ISO week for filenames (defaults to current week)
        output_dir: override output directory
        date_str: override the date stamp (defaults to today, UTC)
        write_files: write .md and .json to disk
        auto_followup: call maybe_create_arch_review_followup for each requiring review
        incident_store: needed for auto_followup

    Returns: DigestResult dict with markdown, json_data, files_written, followups_created.
    """
    if policy is None:
        policy = load_pressure_policy()
    effective_week = week_str or _now_week()
    effective_date = date_str or _now_date()
    cfg_output_dir = policy.get("digest", {}).get("output_dir", "ops/reports/platform")
    effective_output_dir = output_dir or cfg_output_dir
    max_chars = int(policy.get("digest", {}).get("max_chars", 12000))
    top_n = int(policy.get("digest", {}).get("top_n_in_digest", 10))
    # Highest pressure first; the digest only covers the top N services.
    pressure_reports = sorted(pressure_reports or [], key=lambda r: -r.get("score", 0))[:top_n]
    risk_reports = risk_reports or {}
    # Band counts
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in pressure_reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1
    # Investment priority list: requires_arch_review AND (risk high/critical OR delta > 0)
    # NOTE(review): review_at is read but not used below — the threshold
    # appears to be applied upstream when requires_arch_review is set; confirm.
    review_at = int(policy.get("priority_rules", {}).get("require_arch_review_at", 70))
    investment_list = []
    for r in pressure_reports:
        if not r.get("requires_arch_review"):
            continue
        svc = r["service"]
        rr = risk_reports.get(svc, {})
        # Prefer the live risk report; fall back to risk fields embedded in
        # the pressure report itself.
        risk_band = rr.get("band", "low") if rr else r.get("risk_band", "low") or "low"
        risk_delta = (rr.get("trend") or {}).get("delta_24h") if rr else r.get("risk_delta_24h")
        if risk_band in ("high", "critical") or (risk_delta is not None and risk_delta > 0):
            investment_list.append({
                "service": svc,
                "pressure_score": r.get("score"),
                "pressure_band": r.get("band"),
                "risk_score": rr.get("score") if rr else r.get("risk_score"),
                "risk_band": risk_band,
                "risk_delta_24h": risk_delta,
            })
    actions = _build_priority_actions(pressure_reports, risk_reports)
    markdown_raw = _build_markdown(
        week_str=effective_week,
        env=env,
        pressure_reports=pressure_reports,
        investment_list=investment_list,
        actions=actions,
        band_counts=band_counts,
    )
    markdown = _clamp(markdown_raw, max_chars)
    # Machine-readable companion to the Markdown report.
    json_data = {
        "week": effective_week,
        "date": effective_date,
        "env": env,
        "generated_at": datetime.datetime.utcnow().isoformat(),
        "band_counts": band_counts,
        "top_pressure_services": [
            {
                "service": r.get("service"),
                "score": r.get("score"),
                "band": r.get("band"),
                "requires_arch_review": r.get("requires_arch_review"),
                "signals_summary": r.get("signals_summary", [])[:4],
                "components": r.get("components", {}),
                "risk_score": r.get("risk_score"),
                "risk_band": r.get("risk_band"),
                "risk_delta_24h": r.get("risk_delta_24h"),
            }
            for r in pressure_reports
        ],
        "investment_priority_list": investment_list,
        "actions": actions,
    }
    # ── Auto followup creation ────────────────────────────────────────────────
    followups_created = []
    if auto_followup and incident_store is not None:
        # Imported lazily to avoid a hard dependency when followups are off.
        from architecture_pressure import maybe_create_arch_review_followup
        for r in pressure_reports:
            if r.get("requires_arch_review"):
                fu_result = maybe_create_arch_review_followup(
                    r,
                    incident_store=incident_store,
                    policy=policy,
                    week_str=effective_week,
                )
                if fu_result.get("created"):
                    followups_created.append({
                        "service": r["service"],
                        "dedupe_key": fu_result.get("dedupe_key"),
                        "incident_id": fu_result.get("incident_id"),
                    })
    # ── Write files ───────────────────────────────────────────────────────────
    # Best-effort: a write failure is logged and files_written stays empty,
    # but the digest result is still returned to the caller.
    files_written: List[str] = []
    if write_files:
        try:
            out_path = Path(effective_output_dir)
            out_path.mkdir(parents=True, exist_ok=True)
            md_file = out_path / f"{effective_week}.md"
            json_file = out_path / f"{effective_week}.json"
            md_file.write_text(markdown, encoding="utf-8")
            json_file.write_text(json.dumps(json_data, indent=2, default=str), encoding="utf-8")
            files_written = [str(md_file), str(json_file)]
            logger.info("platform_priority_digest: wrote %s and %s", md_file, json_file)
        except Exception as e:
            logger.warning("platform_priority_digest: failed to write files: %s", e)
    return {
        "week": effective_week,
        "env": env,
        "markdown": markdown,
        "json_data": json_data,
        "files_written": files_written,
        "followups_created": followups_created,
        "band_counts": band_counts,
    }

View File

@@ -0,0 +1,419 @@
"""Provider Budget Tracker — real-money token usage accounting.
Tracks:
- Tokens used (input/output) per provider per model
- Estimated USD cost based on published pricing
- Approximate balance (if configured via env var)
- Rolling 24h / 7d / 30d windows
Pricing table: updated Feb 2026 (USD per 1M tokens)
"""
from __future__ import annotations
import json
import logging
import os
import threading
import time
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ── Pricing catalog (USD / 1M tokens) ─────────────────────────────────────────
# provider → model key (exact name or prefix) → {"input": $, "output": $}
# per 1M tokens.  "_default" is the per-provider fallback used by get_price
# when neither an exact nor a prefix match is found.
PRICING: Dict[str, Dict[str, Dict[str, float]]] = {
    # provider → model_pattern → {input, output}
    "anthropic": {
        "claude-sonnet-4-5": {"input": 3.0, "output": 15.0},
        "claude-opus-4-5": {"input": 15.0, "output": 75.0},
        "claude-haiku-3-5": {"input": 0.8, "output": 4.0},
        "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
        "_default": {"input": 3.0, "output": 15.0},
    },
    "grok": {
        "grok-4-1-fast-reasoning": {"input": 5.0, "output": 15.0},
        "grok-3": {"input": 5.0, "output": 25.0},
        "grok-2-1212": {"input": 2.0, "output": 10.0},
        "_default": {"input": 5.0, "output": 15.0},
    },
    "deepseek": {
        "deepseek-chat": {"input": 0.27, "output": 1.10},
        "deepseek-reasoner": {"input": 0.55, "output": 2.19},
        "_default": {"input": 0.27, "output": 1.10},
    },
    "mistral": {
        "mistral-large-latest": {"input": 2.0, "output": 6.0},
        "mistral-small-latest": {"input": 0.2, "output": 0.6},
        "_default": {"input": 2.0, "output": 6.0},
    },
    "openai": {
        "gpt-4o": {"input": 2.5, "output": 10.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "_default": {"input": 2.5, "output": 10.0},
    },
    "glm": {
        "glm-4-plus": {"input": 0.05, "output": 0.05},
        "glm-4-flash": {"input": 0.0, "output": 0.0},  # free tier
        "glm-4.7-flash": {"input": 0.0, "output": 0.0},
        "glm-z1-plus": {"input": 0.07, "output": 0.07},
        "_default": {"input": 0.05, "output": 0.05},
    },
    # Local models cost nothing.
    "ollama": {
        "_default": {"input": 0.0, "output": 0.0},
    },
}
def get_price(provider: str, model: str) -> Dict[str, float]:
    """Resolve {input, output} USD/1M-token pricing for a provider/model pair.

    Lookup order: exact model key, then first prefix match in catalog order,
    then the provider's "_default".  An unknown provider falls back to the
    anthropic table.
    """
    table = PRICING.get(provider.lower(), PRICING.get("anthropic"))
    # exact match
    if model in table:
        return table[model]
    # prefix match (first hit in catalog order wins)
    for pattern, price in table.items():
        if pattern != "_default" and model.startswith(pattern):
            return price
    return table.get("_default", {"input": 3.0, "output": 15.0})
def calc_cost_usd(provider: str, model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimated USD cost of one call, using the pricing catalog (per 1M tokens)."""
    rate = get_price(provider, model)
    in_cost = input_tokens * rate["input"]
    out_cost = output_tokens * rate["output"]
    return (in_cost + out_cost) / 1_000_000
# ── Usage record ──────────────────────────────────────────────────────────────
@dataclass
class UsageRecord:
    """One LLM call's accounting entry, persisted as a JSON line in usage.jsonl."""
    ts: float                      # unix timestamp (time.time()) when recorded
    provider: str                  # e.g. "anthropic", "openai", "ollama"
    model: str                     # provider-specific model id
    agent: str                     # name of the calling agent
    input_tokens: int
    output_tokens: int
    cost_usd: float                # precomputed via calc_cost_usd
    latency_ms: int = 0            # 0 means "not measured"
    task_type: str = ""
    fallback_used: bool = False    # True when a fallback provider served the call
# ── Storage ────────────────────────────────────────────────────────────────────
# Append-only JSONL usage log + JSON limits file, kept under ~/.sofiia/budget
# (overridable via the BUDGET_DATA_DIR env var).  _lock serialises all file
# access within this process.
_BUDGET_DIR = Path(os.getenv("BUDGET_DATA_DIR", os.path.expanduser("~/.sofiia/budget")))
_USAGE_FILE = _BUDGET_DIR / "usage.jsonl"
_LIMITS_FILE = _BUDGET_DIR / "limits.json"
_lock = threading.Lock()
def _ensure_dir() -> None:
    """Create the budget data directory (and parents) if missing."""
    _BUDGET_DIR.mkdir(parents=True, exist_ok=True)
def _append_usage(rec: UsageRecord) -> None:
    """Append one usage record to the JSONL log (serialised under the lock)."""
    _ensure_dir()
    line = json.dumps(asdict(rec))
    with _lock, open(_USAGE_FILE, "a", encoding="utf-8") as fh:
        fh.write(line + "\n")
def _load_usage(since_ts: float = 0.0) -> List[UsageRecord]:
    """Load usage records with ts >= since_ts.

    Blank and unparseable lines are skipped silently; a file-level read
    failure is logged and yields whatever was parsed so far.
    """
    if not _USAGE_FILE.exists():
        return []
    out: List[UsageRecord] = []
    with _lock:
        try:
            with open(_USAGE_FILE, "r", encoding="utf-8") as fh:
                for raw in fh:
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        data = json.loads(raw)
                        if data.get("ts", 0) >= since_ts:
                            out.append(UsageRecord(**data))
                    except Exception:
                        # Corrupt line: skip, keep reading.
                        continue
        except Exception as e:
            logger.warning("budget: failed to load usage: %s", e)
    return out
# ── Manual balance config ──────────────────────────────────────────────────────
def _load_limits() -> Dict[str, Any]:
    """Read limits.json ({provider: {monthly_limit_usd, topup_balance_usd}}).

    Returns {} when the file is missing or unreadable.
    """
    try:
        with open(_LIMITS_FILE, "r") as fh:
            return json.load(fh)
    except Exception:
        return {}
def _save_limits(data: Dict[str, Any]) -> None:
    """Persist the limits mapping to limits.json (under the module lock)."""
    _ensure_dir()
    with _lock, open(_LIMITS_FILE, "w") as fh:
        json.dump(data, fh, indent=2)
# ── Public API ─────────────────────────────────────────────────────────────────
def track_usage(
    provider: str,
    model: str,
    agent: str,
    input_tokens: int,
    output_tokens: int,
    latency_ms: int = 0,
    task_type: str = "",
    fallback_used: bool = False,
) -> float:
    """Record token usage and return cost in USD.

    Computes the cost from the pricing catalog, appends a UsageRecord to the
    JSONL log, and returns the computed cost so callers can accumulate it.
    """
    cost = calc_cost_usd(provider, model, input_tokens, output_tokens)
    rec = UsageRecord(
        ts=time.time(),
        provider=provider,
        model=model,
        agent=agent,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost_usd=cost,
        latency_ms=latency_ms,
        task_type=task_type,
        fallback_used=fallback_used,
    )
    _append_usage(rec)
    logger.debug(
        "💰 tracked: provider=%s model=%s tokens=%d+%d cost=$%.5f",
        provider, model, input_tokens, output_tokens, cost,
    )
    return cost
@dataclass
class ProviderStats:
    """Aggregated usage for one provider over a query window."""
    provider: str
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_cost_usd: float = 0.0
    call_count: int = 0
    avg_latency_ms: float = 0.0    # mean latency (ms); see _aggregate_records
    top_models: List[Dict[str, Any]] = field(default_factory=list)  # top-3 models by cost
    # Configured limits (from limits.json)
    monthly_limit_usd: Optional[float] = None
    topup_balance_usd: Optional[float] = None
    estimated_remaining_usd: Optional[float] = None  # topup balance minus window cost
def get_stats(window_hours: int = 720) -> Dict[str, ProviderStats]:
    """
    Aggregate usage stats per provider for the given time window.
    Default window = 720h = 30 days.

    Limits from limits.json are folded into each provider's stats:
    monthly_limit_usd, topup_balance_usd, and — when a top-up balance is
    configured — estimated_remaining_usd (balance minus window cost).
    """
    cutoff = time.time() - window_hours * 3600
    stats = _aggregate_records(_load_usage(cutoff))
    limits = _load_limits()
    for provider, st in stats.items():
        cfg = limits.get(provider, {})
        if "monthly_limit_usd" in cfg:
            st.monthly_limit_usd = cfg["monthly_limit_usd"]
        if "topup_balance_usd" in cfg:
            st.topup_balance_usd = cfg["topup_balance_usd"]
            st.estimated_remaining_usd = round(cfg["topup_balance_usd"] - st.total_cost_usd, 4)
    return stats
def get_dashboard_data() -> Dict[str, Any]:
    """
    Returns structured data for the budget dashboard UI.
    Includes 24h, 7d, 30d windows.
    Single file read + in-memory filtering for all three windows.
    """
    now = time.time()
    ts_30d = now - 720 * 3600
    ts_7d = now - 168 * 3600
    ts_24h = now - 24 * 3600
    # One disk read (30d superset), then narrow in memory.
    all_records = _load_usage(since_ts=ts_30d)
    records_7d = [r for r in all_records if r.ts >= ts_7d]
    records_24h = [r for r in records_7d if r.ts >= ts_24h]
    stats_30d = _aggregate_records(all_records)
    stats_7d = _aggregate_records(records_7d)
    stats_24h = _aggregate_records(records_24h)
    limits = _load_limits()
    # Apply limits to 30d stats
    for p, s in stats_30d.items():
        lim = limits.get(p, {})
        if "monthly_limit_usd" in lim:
            s.monthly_limit_usd = lim["monthly_limit_usd"]
        if "topup_balance_usd" in lim:
            s.topup_balance_usd = lim["topup_balance_usd"]
            s.estimated_remaining_usd = round(lim["topup_balance_usd"] - s.total_cost_usd, 4)
    # Every catalog provider (except local ollama) plus any provider that
    # actually has usage recorded.
    all_providers = sorted({
        *(k for k in PRICING if k != "ollama"),
        *stats_30d.keys(),
    })
    providers_data = []
    for p in all_providers:
        # Fall back to zeroed stats for providers with no usage in a window.
        s30 = stats_30d.get(p, ProviderStats(provider=p))
        s7 = stats_7d.get(p, ProviderStats(provider=p))
        s24 = stats_24h.get(p, ProviderStats(provider=p))
        plim = limits.get(p, {})
        providers_data.append({
            "provider": p,
            "display_name": _provider_display_name(p),
            "icon": _provider_icon(p),
            # "available" = the provider's API-key env var is set and non-blank.
            "available": bool(os.getenv(_provider_env_key(p), "").strip()),
            "cost_24h": round(s24.total_cost_usd, 5),
            "cost_7d": round(s7.total_cost_usd, 5),
            "cost_30d": round(s30.total_cost_usd, 5),
            "calls_24h": s24.call_count,
            "calls_30d": s30.call_count,
            "tokens_24h": s24.total_input_tokens + s24.total_output_tokens,
            "tokens_30d": s30.total_input_tokens + s30.total_output_tokens,
            "avg_latency_ms": round(s30.avg_latency_ms),
            "monthly_limit_usd": s30.monthly_limit_usd,
            "topup_balance_usd": plim.get("topup_balance_usd"),
            "estimated_remaining_usd": s30.estimated_remaining_usd,
            "top_models": s30.top_models,
        })
    total_24h = sum(s.total_cost_usd for s in stats_24h.values())
    total_7d = sum(s.total_cost_usd for s in stats_7d.values())
    total_30d = sum(s.total_cost_usd for s in stats_30d.values())
    return {
        "providers": providers_data,
        "summary": {
            "total_cost_24h": round(total_24h, 5),
            "total_cost_7d": round(total_7d, 5),
            "total_cost_30d": round(total_30d, 5),
            "total_calls_30d": sum(s.call_count for s in stats_30d.values()),
        },
        "generated_at": now,
    }
def _aggregate_records(records: List[UsageRecord]) -> Dict[str, ProviderStats]:
    """Aggregate a list of records into per-provider stats.

    ``avg_latency_ms`` is the mean over records that reported a latency;
    zero/absent latencies are excluded.  (The previous incremental formula
    multiplied the running mean by the *total* call count while only
    updating on latency-bearing records, which skewed the average whenever
    zero-latency records were interleaved.)
    """
    by_provider: Dict[str, ProviderStats] = {}
    latency_sum: Dict[str, float] = defaultdict(float)
    latency_n: Dict[str, int] = defaultdict(int)
    model_usage: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(
        lambda: defaultdict(lambda: {"calls": 0, "cost": 0.0, "tokens": 0})
    )
    for rec in records:
        p = rec.provider
        if p not in by_provider:
            by_provider[p] = ProviderStats(provider=p)
        s = by_provider[p]
        s.total_input_tokens += rec.input_tokens
        s.total_output_tokens += rec.output_tokens
        s.total_cost_usd += rec.cost_usd
        s.call_count += 1
        if rec.latency_ms:
            latency_sum[p] += rec.latency_ms
            latency_n[p] += 1
        mu = model_usage[p][rec.model]
        mu["calls"] += 1
        mu["cost"] += rec.cost_usd
        mu["tokens"] += rec.input_tokens + rec.output_tokens
    for p, s in by_provider.items():
        if latency_n[p]:
            s.avg_latency_ms = latency_sum[p] / latency_n[p]
        # Top 3 models by spend for the dashboard.
        top = sorted(model_usage[p].items(), key=lambda x: x[1]["cost"], reverse=True)[:3]
        s.top_models = [{"model": k, **v} for k, v in top]
    return by_provider
def rotate_usage_log(max_age_days: int = 90) -> int:
    """Remove records older than max_age_days. Returns count of removed lines.

    Rewrites the JSONL file in place under the module lock.  Unparseable
    lines are dropped and counted as removed.  On failure the error is
    logged and 0-or-partial progress is reported; note the rewrite is not
    atomic (a crash mid-write could truncate the log).
    """
    if not _USAGE_FILE.exists():
        return 0
    cutoff = time.time() - max_age_days * 86400
    kept = []
    removed = 0
    with _lock:
        try:
            with open(_USAGE_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        if d.get("ts", 0) >= cutoff:
                            kept.append(line)
                        else:
                            removed += 1
                    except Exception:
                        # Corrupt line: drop it, count as removed.
                        removed += 1
            with open(_USAGE_FILE, "w", encoding="utf-8") as f:
                for line in kept:
                    f.write(line + "\n")
        except Exception as e:
            logger.warning("budget: rotate failed: %s", e)
    if removed:
        logger.info("budget: rotated %d old records (>%dd)", removed, max_age_days)
    return removed
def set_provider_limit(provider: str, monthly_limit_usd: Optional[float] = None, topup_balance_usd: Optional[float] = None) -> None:
    """Configure budget limits for a provider.

    Only the supplied (non-None) fields are updated; existing values for
    other fields and other providers are preserved.
    """
    limits = _load_limits()
    entry = limits.setdefault(provider, {})
    if monthly_limit_usd is not None:
        entry["monthly_limit_usd"] = monthly_limit_usd
    if topup_balance_usd is not None:
        entry["topup_balance_usd"] = topup_balance_usd
    _save_limits(limits)
    logger.info("budget: set limits for %s: %s", provider, entry)
def _provider_display_name(p: str) -> str:
return {
"anthropic": "Anthropic Claude",
"grok": "xAI Grok",
"deepseek": "DeepSeek",
"mistral": "Mistral AI",
"openai": "OpenAI",
"glm": "GLM / Z.AI",
"ollama": "Local (Ollama)",
}.get(p, p.title())
def _provider_icon(p: str) -> str:
return {
"anthropic": "🟣",
"grok": "",
"deepseek": "🔵",
"mistral": "🌊",
"openai": "🟢",
"glm": "🐉",
"ollama": "🖥️",
}.get(p, "🤖")
def _provider_env_key(p: str) -> str:
return {
"anthropic": "ANTHROPIC_API_KEY",
"grok": "GROK_API_KEY",
"deepseek": "DEEPSEEK_API_KEY",
"mistral": "MISTRAL_API_KEY",
"openai": "OPENAI_API_KEY",
"glm": "GLM5_API_KEY",
}.get(p, f"{p.upper()}_API_KEY")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,731 @@
"""
risk_attribution.py — Change Impact Attribution Engine (deterministic, no LLM by default).
Given a service + env, explains WHY risk spiked by correlating signals:
deploy activity, dependency scan findings, drift errors, incident storms,
SLO violations, overdue follow-ups, alert-loop degradation.
New in this revision:
- Change Timeline: ordered event stream (deploy, incident, slo, followup, …)
- Evidence refs: alert_ref[], incident_id[], release_check_run_id, artifact paths
- Per-cause refs (clickthrough IDs for UI)
Provides:
load_attribution_policy() -> Dict
compute_attribution(service, env, ...) -> AttributionReport (includes timeline + evidence_refs)
build_timeline(events, policy) -> List[TimelineItem]
fetch_signals_from_stores(service, env, ...) -> SignalsData
LLM enrichment is separate (llm_enrichment.py) and off by default.
"""
from __future__ import annotations
import datetime
import logging
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
# Process-wide cache of the parsed policy YAML; None means "not loaded yet".
_ATTR_POLICY_CACHE: Optional[Dict] = None
# Checked in order: CWD-relative config first, then the config directory
# resolved relative to this file's location.
_ATTR_POLICY_SEARCH_PATHS = [
    Path("config/risk_attribution_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_attribution_policy.yml",
]
def load_attribution_policy() -> Dict:
    """Load and cache the risk-attribution policy.

    Tries each search path in order; a read/parse failure is logged and the
    next path is tried.  When no path yields a policy, built-in defaults
    are used.  The result is cached for the process lifetime (reset via
    _reload_attribution_policy).
    """
    global _ATTR_POLICY_CACHE
    if _ATTR_POLICY_CACHE is not None:
        return _ATTR_POLICY_CACHE
    for p in _ATTR_POLICY_SEARCH_PATHS:
        if p.exists():
            try:
                with open(p) as f:
                    # An empty YAML file parses to None → normalise to {}.
                    data = yaml.safe_load(f) or {}
                _ATTR_POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load risk_attribution_policy from %s: %s", p, e)
    _ATTR_POLICY_CACHE = _builtin_attr_defaults()
    return _ATTR_POLICY_CACHE
def _reload_attribution_policy() -> None:
    """Drop the cached policy so the next load_attribution_policy() re-reads disk."""
    global _ATTR_POLICY_CACHE
    _ATTR_POLICY_CACHE = None
def _builtin_attr_defaults() -> Dict:
return {
"defaults": {"lookback_hours": 24, "max_causes": 5, "llm_mode": "off",
"llm_max_chars_in": 3500, "llm_max_chars_out": 800},
"llm_triggers": {"risk_delta_warn": 10, "risk_delta_fail": 20,
"band_in": ["high", "critical"]},
"weights": {"deploy": 30, "dependency": 25, "drift": 25, "incident_storm": 20,
"slo_violation": 15, "followups_overdue": 10, "alert_loop_degraded": 10},
"signals": {
"deploy": {"kinds": ["deploy", "deployment", "rollout", "canary"]},
"dependency": {"release_gate_names": ["dependency_scan", "deps"]},
"drift": {"release_gate_names": ["drift", "config_drift"]},
"incident_storm": {"thresholds": {"occurrences_60m_warn": 10,
"escalations_24h_warn": 2}},
"slo": {"require_active_violation": True},
},
"output": {"confidence_bands": {"high": 60, "medium": 35}},
"timeline": {
"enabled": True,
"lookback_hours": 24,
"max_items": 30,
"include_types": ["deploy", "dependency", "drift", "incident", "slo",
"followup", "alert_loop", "release_gate"],
"time_bucket_minutes": 5,
},
"evidence_linking": {"enabled": True, "max_refs_per_cause": 10},
"llm_local": {
"endpoint": "http://localhost:11434/api/generate",
"model": "llama3",
"timeout_seconds": 15,
"model_allowlist": ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"],
"max_calls_per_digest": 3,
"per_day_dedupe": True,
},
}
# ─── Confidence ───────────────────────────────────────────────────────────────
def _score_to_confidence(score: int, policy: Dict) -> str:
bands = policy.get("output", {}).get("confidence_bands", {})
high_t = int(bands.get("high", 60))
med_t = int(bands.get("medium", 35))
if score >= high_t:
return "high"
if score >= med_t:
return "medium"
return "low"
# ─── Signal detection helpers (now also return refs) ──────────────────────────
def _cap_refs(refs: List[Any], max_refs: int) -> List[Any]:
    """Return at most the first max_refs evidence refs (new list, input untouched)."""
    return refs[:max_refs]
def _detect_deploy(
alerts: List[Dict],
cutoff_iso: str,
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
"""Returns (score, evidence_list, refs)."""
kinds = set(policy.get("signals", {}).get("deploy", {}).get(
"kinds", ["deploy", "deployment", "rollout", "canary"]
))
deploy_alerts = [
a for a in alerts
if a.get("kind", "").lower() in kinds and a.get("created_at", "") >= cutoff_iso
]
if not deploy_alerts:
return 0, [], []
weight = int(policy.get("weights", {}).get("deploy", 30))
last_seen = max(a.get("created_at", "") for a in deploy_alerts)
evidence = [
f"deploy alerts: {len(deploy_alerts)} in last 24h",
f"last seen: {last_seen[:16] if last_seen else 'unknown'}",
]
refs = _cap_refs(
[{"alert_ref": a["alert_ref"], "kind": a.get("kind", "deploy"),
"ts": a.get("created_at", "")}
for a in deploy_alerts if a.get("alert_ref")],
max_refs,
)
return weight, evidence, refs
def _detect_dependency(
release_gate_results: List[Dict],
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
gate_names = set(policy.get("signals", {}).get("dependency", {}).get(
"release_gate_names", ["dependency_scan", "deps"]
))
failing = [
g for g in release_gate_results
if g.get("gate") in gate_names and g.get("status") in ("fail", "warn")
]
if not failing:
return 0, [], []
weight = int(policy.get("weights", {}).get("dependency", 25))
evidence = [f"dependency_scan gate: {g['gate']} = {g['status']}" for g in failing[:3]]
refs = _cap_refs(
[{"release_check_run_id": g.get("run_id"), "gate": g["gate"],
"artifact": g.get("artifact")}
for g in failing if g.get("run_id") or g.get("artifact")],
max_refs,
)
return weight, evidence, refs
def _detect_drift(
release_gate_results: List[Dict],
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
gate_names = set(policy.get("signals", {}).get("drift", {}).get(
"release_gate_names", ["drift", "config_drift"]
))
failing = [
g for g in release_gate_results
if g.get("gate") in gate_names and g.get("status") in ("fail", "warn")
]
if not failing:
return 0, [], []
weight = int(policy.get("weights", {}).get("drift", 25))
evidence = [f"drift gate: {g['gate']} = {g['status']}" for g in failing[:3]]
refs = _cap_refs(
[{"release_check_run_id": g.get("run_id"), "gate": g["gate"],
"artifact": g.get("artifact")}
for g in failing if g.get("run_id") or g.get("artifact")],
max_refs,
)
return weight, evidence, refs
def _detect_incident_storm(
occurrences_60m: int,
escalations_24h: int,
policy: Dict,
incident_ids: Optional[List[str]] = None,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
storm_cfg = policy.get("signals", {}).get("incident_storm", {}).get("thresholds", {})
occ_warn = int(storm_cfg.get("occurrences_60m_warn", 10))
esc_warn = int(storm_cfg.get("escalations_24h_warn", 2))
triggered = (occurrences_60m >= occ_warn) or (escalations_24h >= esc_warn)
if not triggered:
return 0, [], []
weight = int(policy.get("weights", {}).get("incident_storm", 20))
evidence = []
if occurrences_60m >= occ_warn:
evidence.append(f"occurrences_60m={occurrences_60m} (≥{occ_warn})")
if escalations_24h >= esc_warn:
evidence.append(f"escalations_24h={escalations_24h} (≥{esc_warn})")
refs = _cap_refs(
[{"incident_id": iid} for iid in (incident_ids or [])],
max_refs,
)
return weight, evidence, refs
def _detect_slo(
slo_violations: int,
policy: Dict,
slo_metrics: Optional[List[str]] = None,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
require_active = policy.get("signals", {}).get("slo", {}).get("require_active_violation", True)
if require_active and slo_violations == 0:
return 0, [], []
if slo_violations == 0:
return 0, [], []
weight = int(policy.get("weights", {}).get("slo_violation", 15))
evidence = [f"active SLO violations: {slo_violations}"]
refs = _cap_refs(
[{"metric": m} for m in (slo_metrics or [])],
max_refs,
)
return weight, evidence, refs
def _detect_followups_overdue(
overdue_count: int,
policy: Dict,
followup_refs: Optional[List[Dict]] = None,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
if overdue_count == 0:
return 0, [], []
weight = int(policy.get("weights", {}).get("followups_overdue", 10))
evidence = [f"overdue follow-ups: {overdue_count}"]
refs = _cap_refs(followup_refs or [], max_refs)
return weight, evidence, refs
def _detect_alert_loop_degraded(
loop_slo_violations: int,
policy: Dict,
max_refs: int = 10,
) -> Tuple[int, List[str], List[Dict]]:
if loop_slo_violations == 0:
return 0, [], []
weight = int(policy.get("weights", {}).get("alert_loop_degraded", 10))
evidence = [f"alert-loop SLO violations: {loop_slo_violations}"]
refs: List[Dict] = []
return weight, evidence, refs
# ─── Timeline builder ────────────────────────────────────────────────────────
def _bucket_key(ts_iso: str, bucket_minutes: int) -> str:
"""Round timestamp down to the nearest bucket boundary."""
try:
dt = datetime.datetime.fromisoformat(ts_iso.rstrip("Z"))
total_mins = dt.hour * 60 + dt.minute
bucket_start = (total_mins // bucket_minutes) * bucket_minutes
return f"{dt.strftime('%Y-%m-%d')}T{bucket_start // 60:02d}:{bucket_start % 60:02d}"
except Exception:
return ts_iso[:13] # fallback: truncate to hour
def build_timeline(
    raw_events: List[Dict],
    policy: Optional[Dict] = None,
) -> List[Dict]:
    """
    Build an ordered Change Timeline from raw event dicts.
    raw_events is a list of:
        {ts, type, label, refs, ...}
    Returns newest-first list, bucketed and capped at max_items.
    Multiple same-type events in the same time bucket are coalesced into
    one "xN" item; the label shown is that of the newest event in the
    bucket (input is sorted newest-first before coalescing).
    Config comes from policy["timeline"]: enabled, max_items,
    time_bucket_minutes, include_types (empty list = allow all types).
    """
    if policy is None:
        policy = load_attribution_policy()
    tl_cfg = policy.get("timeline", {})
    if not tl_cfg.get("enabled", True):
        return []
    max_items = int(tl_cfg.get("max_items", 30))
    bucket_minutes = int(tl_cfg.get("time_bucket_minutes", 5))
    include_types = set(tl_cfg.get("include_types", []))
    # Filter by allowed types (an empty include_types allows everything)
    filtered = [
        e for e in raw_events
        if not include_types or e.get("type") in include_types
    ]
    # Sort newest-first
    filtered.sort(key=lambda e: e.get("ts", ""), reverse=True)
    # Bucket coalescing: same type + same bucket → single item with count
    seen: Dict[str, Dict] = {}  # key → accumulated item
    order: List[str] = []  # preserve insertion order
    for ev in filtered:
        bk = _bucket_key(ev.get("ts", ""), bucket_minutes)
        key = f"{ev.get('type', 'unknown')}:{bk}"
        if key not in seen:
            # First event of this (type, bucket): refs are normalized to a
            # list of (k, v) tuples when given as a dict, so later merging
            # can extend uniformly; converted back to a dict at the end.
            seen[key] = {
                "ts": ev.get("ts", ""),
                "type": ev.get("type", "unknown"),
                "label": ev.get("label", ""),
                "refs": list(ev.get("refs", {}).items() if isinstance(ev.get("refs"), dict)
                             else ev.get("refs", [])),
                "_count": 1,
                "_latest_ts": ev.get("ts", ""),
            }
            order.append(key)
        else:
            seen[key]["_count"] += 1
            # Keep latest ts
            if ev.get("ts", "") > seen[key]["_latest_ts"]:
                seen[key]["_latest_ts"] = ev.get("ts", "")
                seen[key]["ts"] = ev.get("ts", "")
            # Merge refs (up to 5 per bucket)
            new_refs = (list(ev.get("refs", {}).items()) if isinstance(ev.get("refs"), dict)
                        else ev.get("refs", []))
            if len(seen[key]["refs"]) < 5:
                seen[key]["refs"].extend(new_refs[:5 - len(seen[key]["refs"])])
    # Build final items: strip the private accumulators, annotate counts,
    # and restore refs to dict form when they were tuple pairs.
    items = []
    for key in order:
        item = seen[key]
        count = item.pop("_count", 1)
        item.pop("_latest_ts", None)
        if count > 1:
            item["label"] = f"{item['label']} (×{count})"
        # Convert refs back to dict if needed
        if isinstance(item["refs"], list) and item["refs"] and isinstance(item["refs"][0], tuple):
            item["refs"] = dict(item["refs"])
        items.append(item)
    return items[:max_items]
def _make_timeline_events_from_alerts(
alerts: List[Dict],
deploy_kinds: set,
cutoff_iso: str,
) -> List[Dict]:
"""Convert alert records to raw timeline events."""
events = []
for a in alerts:
if a.get("created_at", "") < cutoff_iso:
continue
kind = a.get("kind", "").lower()
ev_type = "deploy" if kind in deploy_kinds else "alert"
refs = {}
if a.get("alert_ref"):
refs["alert_ref"] = a["alert_ref"]
if a.get("service"):
refs["service"] = a["service"]
events.append({
"ts": a.get("created_at", ""),
"type": ev_type,
"label": f"Alert: {kind}" + (f" ({a.get('title', '')})"
if a.get("title") else ""),
"refs": refs,
})
return events
def _make_timeline_events_from_incidents(
incidents: List[Dict],
events_by_id: Dict[str, List[Dict]],
cutoff_iso: str,
) -> List[Dict]:
"""Convert incident + escalation events to raw timeline events."""
timeline_events = []
for inc in incidents:
inc_id = inc.get("id", "")
started = inc.get("started_at") or inc.get("created_at", "")
if started >= cutoff_iso:
timeline_events.append({
"ts": started,
"type": "incident",
"label": f"Incident started: {inc.get('title', inc_id)[:80]}",
"refs": {"incident_id": inc_id},
})
for ev in events_by_id.get(inc_id, []):
if (ev.get("type") == "decision"
and "Escalat" in (ev.get("message") or "")
and ev.get("ts", "") >= cutoff_iso):
timeline_events.append({
"ts": ev["ts"],
"type": "incident",
"label": f"Incident escalated: {inc_id}",
"refs": {"incident_id": inc_id,
"event_type": ev.get("type", "")},
})
return timeline_events
def _make_timeline_events_from_gates(
release_gate_results: List[Dict],
) -> List[Dict]:
"""Convert release gate results to raw timeline events."""
events = []
for g in release_gate_results:
if g.get("status") not in ("fail", "warn"):
continue
gate_type = "dependency" if "dep" in g.get("gate", "").lower() else "release_gate"
if "drift" in g.get("gate", "").lower():
gate_type = "drift"
refs: Dict = {}
if g.get("run_id"):
refs["release_check_run_id"] = g["run_id"]
if g.get("artifact"):
refs["artifact"] = g["artifact"]
events.append({
"ts": g.get("ts", datetime.datetime.utcnow().isoformat()),
"type": gate_type,
"label": f"Gate {g['gate']} = {g['status']}",
"refs": refs,
})
return events
# ─── Evidence refs builder ────────────────────────────────────────────────────
def build_evidence_refs(
    alerts_24h: List[Dict],
    incidents_24h: List[Dict],
    release_gate_results: List[Dict],
    followup_refs: Optional[List[Dict]] = None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Collect top-level evidence references — alert refs, incident ids,
    release-check run ids, artifacts, and follow-up refs — each capped at
    policy.evidence_linking.max_refs_per_cause.
    """
    if policy is None:
        policy = load_attribution_policy()
    cap = int(policy.get("evidence_linking", {}).get("max_refs_per_cause", 10))
    alert_refs = _cap_refs(
        [a["alert_ref"] for a in alerts_24h if a.get("alert_ref")], cap
    )
    incident_ids = _cap_refs(
        list({inc.get("id", "") for inc in incidents_24h if inc.get("id")}), cap
    )
    run_ids = _cap_refs(
        list({g.get("run_id") for g in release_gate_results if g.get("run_id")}), cap
    )
    artifact_refs = _cap_refs(
        list({g.get("artifact") for g in release_gate_results if g.get("artifact")}), cap
    )
    followups = _cap_refs(
        [ref for ref in (followup_refs or []) if ref], cap
    )
    return {
        "alerts": alert_refs,
        "incidents": incident_ids,
        "release_checks": [r for r in run_ids if r],
        "artifacts": [a for a in artifact_refs if a],
        "followups": followups,
    }
# ─── Summary builder ──────────────────────────────────────────────────────────
_TYPE_LABELS = {
"deploy": "deploy activity",
"dependency": "dependency change",
"drift": "config/infrastructure drift",
"incident_storm": "incident storm",
"slo_violation": "SLO violation",
"followups_overdue": "overdue follow-ups",
"alert_loop_degraded": "alert-loop degradation",
}
def _build_summary(causes: List[Dict]) -> str:
if not causes:
return "No significant attribution signals detected."
labels = [_TYPE_LABELS.get(c["type"], c["type"]) for c in causes[:3]]
return "Likely causes: " + " + ".join(labels) + "."
# ─── Main attribution function ────────────────────────────────────────────────
def compute_attribution(
    service: str,
    env: str,
    *,
    risk_report: Optional[Dict] = None,
    # Signals (pre-fetched)
    alerts_24h: Optional[List[Dict]] = None,
    occurrences_60m: int = 0,
    escalations_24h: int = 0,
    release_gate_results: Optional[List[Dict]] = None,
    slo_violations: int = 0,
    slo_metrics: Optional[List[str]] = None,
    overdue_followup_count: int = 0,
    followup_refs: Optional[List[Dict]] = None,
    loop_slo_violations: int = 0,
    # For evidence + timeline
    incidents_24h: Optional[List[Dict]] = None,
    incident_events: Optional[Dict[str, List[Dict]]] = None,
    window_hours: int = 24,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Deterministic attribution: causes with evidence, refs, timeline, evidence_refs.
    All signal arguments default to safe empty values.
    Never raises (returns minimal report on any error).

    Pipeline: (1) normalize inputs and backfill zero-valued signals from
    risk_report.components; (2) run each _detect_* scorer and keep the
    non-zero candidates; (3) sort by score, cap at policy.defaults.max_causes
    and attach a confidence label; (4) build the change timeline and
    evidence_refs blocks.  LLM enrichment is always returned disabled here;
    a separate enrichment step may fill it in.
    """
    if policy is None:
        policy = load_attribution_policy()
    # Window lower bound as a naive-UTC ISO string; signal timestamps are
    # compared lexicographically against it.
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(hours=window_hours)
    ).isoformat()
    max_causes = int(policy.get("defaults", {}).get("max_causes", 5))
    max_refs = int(policy.get("evidence_linking", {}).get("max_refs_per_cause", 10))
    risk_report = risk_report or {}
    alerts_24h = alerts_24h or []
    release_gate_results = release_gate_results or []
    incidents_24h = incidents_24h or []
    incident_events = incident_events or {}
    # Extract from risk_report.components when not explicitly provided
    if slo_violations == 0 and risk_report:
        slo_violations = (risk_report.get("components", {}).get("slo") or {}).get("violations", 0)
    if overdue_followup_count == 0 and risk_report:
        fu = risk_report.get("components", {}).get("followups") or {}
        overdue_followup_count = fu.get("P0", 0) + fu.get("P1", 0) + fu.get("other", 0)
    if loop_slo_violations == 0 and risk_report:
        loop_slo_violations = (
            risk_report.get("components", {}).get("alerts_loop") or {}
        ).get("violations", 0)
    incident_ids = [inc.get("id", "") for inc in incidents_24h if inc.get("id")]
    # ── Score each signal (now with refs) ────────────────────────────────────
    candidates: List[Dict] = []
    score, evid, refs = _detect_deploy(alerts_24h, cutoff, policy, max_refs)
    if score:
        candidates.append({"type": "deploy", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_dependency(release_gate_results, policy, max_refs)
    if score:
        candidates.append({"type": "dependency", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_drift(release_gate_results, policy, max_refs)
    if score:
        candidates.append({"type": "drift", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_incident_storm(
        occurrences_60m, escalations_24h, policy, incident_ids, max_refs
    )
    if score:
        candidates.append({"type": "incident_storm", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_slo(slo_violations, policy, slo_metrics, max_refs)
    if score:
        candidates.append({"type": "slo_violation", "score": score, "evidence": evid, "refs": refs})
    score, evid, refs = _detect_followups_overdue(
        overdue_followup_count, policy, followup_refs, max_refs
    )
    if score:
        candidates.append({"type": "followups_overdue", "score": score,
                           "evidence": evid, "refs": refs})
    score, evid, refs = _detect_alert_loop_degraded(loop_slo_violations, policy, max_refs)
    if score:
        candidates.append({"type": "alert_loop_degraded", "score": score,
                           "evidence": evid, "refs": refs})
    # Sort desc, cap, add confidence
    candidates.sort(key=lambda c: -c["score"])
    causes = candidates[:max_causes]
    for c in causes:
        c["confidence"] = _score_to_confidence(c["score"], policy)
    delta_24h = (risk_report.get("trend") or {}).get("delta_24h")
    summary = _build_summary(causes)
    # ── Timeline ──────────────────────────────────────────────────────────────
    tl_cfg = policy.get("timeline", {})
    deploy_kinds = set(policy.get("signals", {}).get("deploy", {}).get(
        "kinds", ["deploy", "deployment", "rollout", "canary"]
    ))
    raw_events: List[Dict] = []
    raw_events.extend(_make_timeline_events_from_alerts(alerts_24h, deploy_kinds, cutoff))
    raw_events.extend(_make_timeline_events_from_incidents(incidents_24h, incident_events, cutoff))
    raw_events.extend(_make_timeline_events_from_gates(release_gate_results))
    timeline = build_timeline(raw_events, policy) if tl_cfg.get("enabled", True) else []
    # ── Evidence refs ─────────────────────────────────────────────────────────
    evidence_refs: Dict = {}
    if policy.get("evidence_linking", {}).get("enabled", True):
        evidence_refs = build_evidence_refs(
            alerts_24h, incidents_24h, release_gate_results,
            followup_refs=followup_refs, policy=policy,
        )
    return {
        "service": service,
        "env": env,
        "window_hours": window_hours,
        "delta_24h": delta_24h,
        "causes": causes,
        "summary": summary,
        "timeline": timeline,
        # Placeholder: enrichment is off by default; a later stage may populate it.
        "evidence_refs": evidence_refs,
        "llm_enrichment": {"enabled": False, "text": None},
    }
# ─── Signal fetcher (for wiring in tool_manager/risk_engine) ─────────────────
def fetch_signals_from_stores(
    service: str,
    env: str,
    window_hours: int = 24,
    *,
    alert_store=None,
    incident_store=None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Fetches raw signals from existing stores.
    Returns a dict ready to unpack into compute_attribution().
    Always non-fatal per store: each store access is wrapped in its own
    try/except and failures are logged as warnings, leaving that signal
    at its empty default.
    """
    if policy is None:
        policy = load_attribution_policy()
    # Window lower bound (naive UTC, compared lexicographically to stored ts)
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(hours=window_hours)
    ).isoformat()
    # ── Deploy + other alerts ─────────────────────────────────────────────────
    # Alerts with no "service" field are kept (global alerts); otherwise the
    # service must match.
    alerts_24h: List[Dict] = []
    try:
        if alert_store is not None:
            all_alerts = alert_store.list_alerts(limit=200)
            alerts_24h = [
                a for a in all_alerts
                if a.get("created_at", "") >= cutoff
                and (not a.get("service") or a.get("service") == service)
            ]
    except Exception as e:
        logger.warning("attribution fetch alerts failed: %s", e)
    # ── Incidents in window + event maps ──────────────────────────────────────
    incidents_24h: List[Dict] = []
    incident_events: Dict[str, List[Dict]] = {}
    occurrences_60m = 0
    escalations_24h = 0
    try:
        if incident_store is not None:
            # NOTE(review): cutoff_60m is computed but never used — the 60m
            # windowing is delegated to top_signatures(window_minutes=60)
            # below.  Confirm before removing.
            cutoff_60m = (
                datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
            ).isoformat()
            # Count alert occurrences from alert_store top_signatures
            if alert_store is not None:
                try:
                    sigs = alert_store.top_signatures(window_minutes=60, limit=20)
                    occurrences_60m = sum(s.get("occurrences", 0) for s in sigs)
                except Exception:
                    pass
            incs = incident_store.list_incidents({"service": service}, limit=30)
            for inc in incs:
                inc_id = inc.get("id", "")
                inc_started = inc.get("started_at") or inc.get("created_at", "")
                try:
                    events = incident_store.get_events(inc_id, limit=50)
                    incident_events[inc_id] = events
                    # Escalations: decision events mentioning "Escalat…"
                    # inside the window.
                    for ev in events:
                        if (ev.get("type") == "decision"
                            and "Escalat" in (ev.get("message") or "")
                            and ev.get("ts", "") >= cutoff):
                            escalations_24h += 1
                except Exception:
                    pass
                # Include incident if started within window
                if inc_started >= cutoff:
                    incidents_24h.append(inc)
    except Exception as e:
        logger.warning("attribution fetch incident signals failed: %s", e)
    return {
        "alerts_24h": alerts_24h,
        "occurrences_60m": occurrences_60m,
        "escalations_24h": escalations_24h,
        "incidents_24h": incidents_24h,
        "incident_events": incident_events,
        "release_gate_results": [],  # caller can inject if persisted
    }

View File

@@ -0,0 +1,341 @@
"""
risk_digest.py — Daily Risk Digest generator (deterministic, no LLM).
Produces:
ops/reports/risk/YYYY-MM-DD.json
ops/reports/risk/YYYY-MM-DD.md
Content:
- Top risky services (score desc)
- Top regressions (delta_24h desc)
- SLO violation summary
- Deterministic action list based on risk state
"""
from __future__ import annotations
import datetime
import json
import logging
import math
import os
from pathlib import Path
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# Message templates for the deterministic action list, keyed by the risk
# condition detected in _build_action_list.  Placeholders ({service},
# {score}, {delta}, {count}) are filled via str.format; the text is
# user-facing markdown rendered verbatim in the digest.
_ACTION_TEMPLATES = {
    "regression_fail": "🚨 **Regression detected**: {service} score +{delta} in 24h. Freeze deployments; inspect recent incidents/followups immediately.",
    "regression_warn": "⚠️ **Score rising**: {service} +{delta} in 24h. Review open incidents and overdue follow-ups.",
    "critical_band": "🔴 **Critical risk**: {service} (score {score}). Oncall review required within 2h.",
    "high_band": "🟠 **High risk**: {service} (score {score}). Coordinate with oncall before next release.",
    "overdue_followups": "📋 **Overdue follow-ups**: {service} has {count} overdue follow-up(s). Close them to reduce risk score.",
    "slo_violation": "📉 **SLO violation**: {service} has {count} active SLO violation(s). Avoid deploying until clear.",
}
def _now_date() -> str:
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
def _clamp(text: str, max_chars: int) -> str:
if len(text) <= max_chars:
return text
truncated = text[:max_chars]
return truncated + "\n\n_[digest truncated to policy max_chars]_"
def _build_action_list(reports: List[Dict]) -> List[str]:
    """Build the deterministic action list from the top-10 reports.

    Per service the order is: regression (fail beats warn), band
    (critical/high), overdue follow-ups, SLO violations.  Output is
    capped at 20 actions.
    """
    actions: List[str] = []
    for report in reports[:10]:
        svc = report.get("service", "?")
        components = report.get("components", {})
        trend = report.get("trend") or {}
        delta = trend.get("delta_24h")
        regression = trend.get("regression", {})
        # 1. Regression actions — only for a positive 24h delta.
        if delta is not None and delta > 0:
            if regression.get("fail"):
                actions.append(_ACTION_TEMPLATES["regression_fail"].format(
                    service=svc, delta=delta))
            elif regression.get("warn"):
                actions.append(_ACTION_TEMPLATES["regression_warn"].format(
                    service=svc, delta=delta))
        # 2. Band actions.
        band = report.get("band", "low")
        if band == "critical":
            actions.append(_ACTION_TEMPLATES["critical_band"].format(
                service=svc, score=report.get("score", 0)))
        elif band == "high":
            actions.append(_ACTION_TEMPLATES["high_band"].format(
                service=svc, score=report.get("score", 0)))
        # 3. Overdue follow-ups (all priorities combined).
        followups = components.get("followups") or {}
        overdue_total = (
            followups.get("P0", 0) + followups.get("P1", 0) + followups.get("other", 0)
        )
        if overdue_total:
            actions.append(_ACTION_TEMPLATES["overdue_followups"].format(
                service=svc, count=overdue_total))
        # 4. SLO violations.
        slo_violations = (components.get("slo") or {}).get("violations", 0)
        if slo_violations:
            actions.append(_ACTION_TEMPLATES["slo_violation"].format(
                service=svc, count=slo_violations))
    return actions[:20]  # cap
def _build_markdown(
    date_str: str,
    env: str,
    reports: List[Dict],
    top_regressions: List[Dict],
    improving: List[Dict],
    actions: List[str],
    band_counts: Dict,
) -> str:
    """Render the digest as a markdown document.

    Sections, in order: band summary table, top risky services table,
    top regressions, likely causes (attribution) for the top regressing
    services, change timeline previews, improving services, and the
    action list.  Output is deterministic given the inputs; the caller
    clamps it to the policy's max length.
    """
    lines = [
        f"# Risk Digest — {date_str} ({env})",
        "",
        f"Generated: {datetime.datetime.utcnow().isoformat()} UTC",
        "",
        "## Band Summary",
        "",
        "| Band | Count |",
        "|------|-------|",
    ]
    for band in ("critical", "high", "medium", "low"):
        lines.append(f"| {band} | {band_counts.get(band, 0)} |")
    lines += [
        "",
        "## Top Risky Services",
        "",
        "| Service | Score | Band | Δ24h | Δ7d |",
        "|---------|-------|------|------|-----|",
    ]
    for r in reports:
        t = r.get("trend") or {}
        d24 = t.get("delta_24h")
        d7 = t.get("delta_7d")
        # Positive deltas get an explicit "+"; missing deltas render as "—"
        d24_str = (f"+{d24}" if d24 and d24 > 0 else str(d24)) if d24 is not None else "—"
        d7_str = (f"+{d7}" if d7 and d7 > 0 else str(d7)) if d7 is not None else "—"
        lines.append(
            f"| {r['service']} | {r.get('score', 0)} | {r.get('band', '?')} "
            f"| {d24_str} | {d7_str} |"
        )
    if top_regressions:
        lines += ["", "## Top Regressions (Δ24h)", ""]
        for item in top_regressions:
            delta = item.get("delta_24h", 0)
            lines.append(f"- **{item['service']}**: +{delta} points in 24h")
    # ── Likely Causes (Attribution) ───────────────────────────────────────────
    # Only regressing services that carry a non-empty attribution block.
    regressions_with_attribution = [
        r for r in reports
        if (r.get("trend") or {}).get("delta_24h") is not None
        and r["trend"]["delta_24h"] > 0
        and r.get("attribution") is not None
        and r["attribution"].get("causes")
    ]
    regressions_with_attribution = sorted(
        regressions_with_attribution,
        key=lambda r: -(r.get("trend") or {}).get("delta_24h", 0),
    )[:5]
    if regressions_with_attribution:
        lines += ["", "## Likely Causes (Top Regressions)", ""]
        for r in regressions_with_attribution:
            svc = r["service"]
            attr = r["attribution"]
            delta = r["trend"]["delta_24h"]
            summary = attr.get("summary", "")
            lines.append(f"### {svc} (+{delta} pts)")
            if summary:
                lines.append(f"> {summary}")
            causes = attr.get("causes", [])[:2]
            for c in causes:
                evid = "; ".join(c.get("evidence", []))
                lines.append(
                    f"- **{c['type']}** (confidence: {c.get('confidence', '?')}): {evid}"
                )
            # LLM text if available
            llm = attr.get("llm_enrichment") or {}
            if llm.get("enabled") and llm.get("text"):
                lines += ["", f"  _LLM insight_: {llm['text'][:400]}"]
            lines.append("")
    # ── Change Timeline (Top Regressions) ────────────────────────────────────
    regressions_with_timeline = [
        r for r in regressions_with_attribution
        if r.get("attribution") and r["attribution"].get("timeline")
    ]
    if regressions_with_timeline:
        lines += ["", "## Change Timeline (Top Regressions)", ""]
        for r in regressions_with_timeline:
            svc = r["service"]
            timeline = r["attribution"]["timeline"][:5]  # top 5 per service
            lines.append(f"### {svc}")
            for item in timeline:
                # ts trimmed to "YYYY-MM-DDTHH:MM" for compact display
                ts = (item.get("ts") or "")[:16]
                label = item.get("label", "")
                ev_type = item.get("type", "")
                lines.append(f"- `{ts}` [{ev_type}] {label}")
            lines.append("")
    if improving:
        lines += ["", "## Improving Services (Δ7d)", ""]
        for item in improving:
            delta = item.get("delta_7d", 0)
            lines.append(f"- **{item['service']}**: {delta} points over 7d")
    if actions:
        lines += ["", "## Action List", ""]
        for action in actions:
            lines.append(f"- {action}")
    lines += ["", "---", "_Generated by DAARION.city Risk Digest (deterministic, no LLM by default)_"]
    return "\n".join(lines)
def daily_digest(
    env: str = "prod",
    *,
    service_reports: Optional[List[Dict]] = None,
    policy: Optional[Dict] = None,
    date_str: Optional[str] = None,
    output_dir: Optional[str] = None,
    write_files: bool = True,
) -> Dict:
    """
    Build and optionally persist the daily risk digest.
    service_reports — pre-fetched+enriched list of RiskReports (with trend).
    Returns {json_path, md_path, json_data, markdown, date, env}

    The digest is deterministic: reports are sorted by score, the top N
    (policy.digest.top_n) are summarized, regressions/improvements are
    derived from trend deltas, and the markdown is clamped to
    policy.digest.markdown_max_chars.  File writes are best-effort: a
    failure is logged and the paths come back as None while the in-memory
    digest is still returned.
    """
    # Local import avoids a module-level import cycle with risk_engine.
    # NOTE(review): compute_risk_dashboard is imported but unused here —
    # confirm before removing.
    from risk_engine import load_risk_policy, compute_risk_dashboard
    if policy is None:
        policy = load_risk_policy()
    digest_cfg = policy.get("digest", {})
    top_n = int(digest_cfg.get("top_n", 10))
    max_chars = int(digest_cfg.get("markdown_max_chars", 8000))
    cfg_output_dir = digest_cfg.get("output_dir", "ops/reports/risk")
    effective_output_dir = output_dir or cfg_output_dir
    effective_date = date_str or _now_date()
    # Highest-risk first, capped at top_n
    reports = sorted(service_reports or [], key=lambda r: -r.get("score", 0))[:top_n]
    # Band counts
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1
    # Top regressions: positive 24h delta, largest first, max 5
    top_regressions = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_24h") is not None
         and r["trend"]["delta_24h"] > 0],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]
    top_regressions_out = [
        {"service": r["service"], "delta_24h": r["trend"]["delta_24h"],
         "attribution_causes": [
             {"type": c["type"], "score": c["score"],
              "confidence": c.get("confidence", "low"),
              "evidence": c.get("evidence", [])[:2],
              "refs": c.get("refs", [])[:3]}
             for c in (r.get("attribution") or {}).get("causes", [])[:2]
         ],
         "timeline_preview": (r.get("attribution") or {}).get("timeline", [])[:3],
         }
        for r in top_regressions
    ]
    # Improving services: negative 7d delta, most-improved first, max 5
    improving = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_7d") is not None
         and r["trend"]["delta_7d"] < 0],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]
    improving_out = [
        {"service": r["service"], "delta_7d": r["trend"]["delta_7d"]}
        for r in improving
    ]
    actions = _build_action_list(reports)
    markdown_raw = _build_markdown(
        date_str=effective_date,
        env=env,
        reports=reports,
        top_regressions=top_regressions_out,
        improving=improving_out,
        actions=actions,
        band_counts=band_counts,
    )
    markdown = _clamp(markdown_raw, max_chars)
    # Machine-readable counterpart of the markdown digest
    json_data = {
        "date": effective_date,
        "env": env,
        "generated_at": datetime.datetime.utcnow().isoformat(),
        "band_counts": band_counts,
        "top_services": [
            {
                "service": r.get("service"),
                "score": r.get("score"),
                "band": r.get("band"),
                "delta_24h": (r.get("trend") or {}).get("delta_24h"),
                "delta_7d": (r.get("trend") or {}).get("delta_7d"),
                "regression": (r.get("trend") or {}).get("regression"),
                "reasons": r.get("reasons", [])[:5],
                "attribution_summary": (r.get("attribution") or {}).get("summary"),
                "top_causes": [
                    {"type": c["type"], "score": c["score"],
                     "confidence": c.get("confidence", "low"),
                     "evidence": c.get("evidence", [])[:2],
                     "refs": c.get("refs", [])[:3]}
                    for c in (r.get("attribution") or {}).get("causes", [])[:2]
                ],
                "timeline_preview": (r.get("attribution") or {}).get("timeline", [])[:3],
                "evidence_refs": (r.get("attribution") or {}).get("evidence_refs", {}),
            }
            for r in reports
        ],
        "top_regressions": top_regressions_out,
        "improving_services": improving_out,
        "actions": actions,
    }
    json_path: Optional[str] = None
    md_path: Optional[str] = None
    if write_files:
        try:
            out = Path(effective_output_dir)
            out.mkdir(parents=True, exist_ok=True)
            json_path = str(out / f"{effective_date}.json")
            md_path = str(out / f"{effective_date}.md")
            with open(json_path, "w") as f:
                json.dump(json_data, f, indent=2)
            with open(md_path, "w") as f:
                f.write(markdown)
            logger.info("Risk digest written: %s, %s", json_path, md_path)
        except Exception as e:
            # Best-effort persistence: the digest is still returned in memory
            logger.warning("Risk digest write failed: %s", e)
            json_path = md_path = None
    return {
        "date": effective_date,
        "env": env,
        "json_path": json_path,
        "md_path": md_path,
        "json_data": json_data,
        "markdown": markdown,
    }

View File

@@ -0,0 +1,710 @@
"""
risk_engine.py — Service Risk Index Engine (deterministic, no LLM).
Provides:
compute_service_risk(service, env, ...) -> RiskReport
compute_risk_dashboard(env, top_n, ...) -> Dashboard
compute_trend(series) -> TrendReport
enrich_risk_report_with_trend(report, history_store, policy) -> report (mutated)
snapshot_all_services(env, compute_fn, history_store, policy) -> SnapshotResult
All inputs come from existing stores and tools.
The engine never calls external services directly — callers inject store references.
"""
from __future__ import annotations
import datetime
import logging
import math
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Policy ───────────────────────────────────────────────────────────────────
# Process-wide cache of the parsed policy; populated on first load and
# cleared via _reload_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy-file locations, checked in order; the second entry
# resolves relative to this source file's grandparent directory.
_POLICY_SEARCH_PATHS = [
    Path("config/risk_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_policy.yml",
]
def load_risk_policy() -> Dict:
    """Load and cache the risk policy, falling back to built-in defaults.

    The first readable file in _POLICY_SEARCH_PATHS wins; parse errors
    are logged and the search continues.  When no file loads, the
    built-in defaults are cached and returned.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is None:
        for candidate in _POLICY_SEARCH_PATHS:
            if not candidate.exists():
                continue
            try:
                with open(candidate) as fh:
                    _POLICY_CACHE = yaml.safe_load(fh) or {}
                return _POLICY_CACHE
            except Exception as exc:
                logger.warning("Failed to load risk_policy from %s: %s", candidate, exc)
        logger.warning("risk_policy.yml not found; using built-in defaults")
        _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
def _builtin_defaults() -> Dict:
return {
"defaults": {"window_hours": 24, "recurrence_windows_days": [7, 30],
"slo_window_minutes": 60},
"thresholds": {
"bands": {"low_max": 20, "medium_max": 50, "high_max": 80},
"risk_watch": {"warn_at": 50, "fail_at": 80},
},
"weights": {
"open_incidents": {"P0": 50, "P1": 25, "P2": 10, "P3": 5},
"recurrence": {
"signature_warn_7d": 10, "signature_high_7d": 20,
"kind_warn_7d": 8, "kind_high_7d": 15,
"signature_high_30d": 10, "kind_high_30d": 8,
},
"followups": {"overdue_P0": 20, "overdue_P1": 12, "overdue_other": 6},
"slo": {"violation": 10},
"alerts_loop": {"slo_violation": 10},
"escalation": {"escalations_24h": {"warn": 5, "high": 12}},
},
"service_overrides": {},
"p0_services": ["gateway", "router"],
}
def _reload_policy() -> None:
    """Drop the cached policy so the next load_risk_policy() re-reads disk."""
    global _POLICY_CACHE
    _POLICY_CACHE = None
# ─── Band classification ──────────────────────────────────────────────────────
def score_to_band(score: int, policy: Dict) -> str:
    """Map a numeric risk score to its band name using policy thresholds.

    Upper bounds are inclusive; anything above high_max is "critical".
    """
    bands = policy.get("thresholds", {}).get("bands", {})
    ladder = (
        ("low", int(bands.get("low_max", 20))),
        ("medium", int(bands.get("medium_max", 50))),
        ("high", int(bands.get("high_max", 80))),
    )
    for band_name, upper_bound in ladder:
        if score <= upper_bound:
            return band_name
    return "critical"
def get_service_thresholds(service: str, policy: Dict) -> Dict:
    """Resolve warn/fail risk-watch thresholds, honoring per-service overrides.

    Precedence: service_overrides[service].risk_watch, then
    thresholds.risk_watch, then the built-in 50/80 defaults.
    """
    base = policy.get("thresholds", {}).get("risk_watch", {})
    override = policy.get("service_overrides", {}).get(service, {}).get("risk_watch", {})

    def _pick(key: str, fallback: int) -> int:
        return int(override.get(key, base.get(key, fallback)))

    return {"warn_at": _pick("warn_at", 50), "fail_at": _pick("fail_at", 80)}
# ─── Individual scoring components ───────────────────────────────────────────
def _score_open_incidents(
open_incidents: List[Dict],
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score open incidents by severity."""
w = weights.get("open_incidents", {})
counts: Dict[str, int] = {"P0": 0, "P1": 0, "P2": 0, "P3": 0}
points = 0
for inc in open_incidents:
sev = inc.get("severity", "P3")
if sev in counts:
counts[sev] += 1
pts = int(w.get(sev, 0))
points += pts
reasons = []
if counts["P0"]:
reasons.append(f"Open P0 incident(s): {counts['P0']}")
if counts["P1"]:
reasons.append(f"Open P1 incident(s): {counts['P1']}")
if counts["P2"]:
reasons.append(f"Open P2 incident(s): {counts['P2']}")
return points, {**counts, "points": points}, reasons
def _score_recurrence(
recurrence_data: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score from recurrence detection stats."""
w = weights.get("recurrence", {})
high_rec = recurrence_data.get("high_recurrence", {})
warn_rec = recurrence_data.get("warn_recurrence", {})
high_sigs_7d = len(high_rec.get("signatures", []))
high_kinds_7d = len(high_rec.get("kinds", []))
warn_sigs_7d = len(warn_rec.get("signatures", []))
warn_kinds_7d = len(warn_rec.get("kinds", []))
# Note: 30d data comes from separate call; keep it optional
high_sigs_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("signatures", []))
high_kinds_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("kinds", []))
points = (
high_sigs_7d * int(w.get("signature_high_7d", 20))
+ warn_sigs_7d * int(w.get("signature_warn_7d", 10))
+ high_kinds_7d * int(w.get("kind_high_7d", 15))
+ warn_kinds_7d * int(w.get("kind_warn_7d", 8))
+ high_sigs_30d * int(w.get("signature_high_30d", 10))
+ high_kinds_30d * int(w.get("kind_high_30d", 8))
)
component = {
"high_signatures_7d": high_sigs_7d,
"warn_signatures_7d": warn_sigs_7d,
"high_kinds_7d": high_kinds_7d,
"warn_kinds_7d": warn_kinds_7d,
"high_signatures_30d": high_sigs_30d,
"high_kinds_30d": high_kinds_30d,
"points": points,
}
reasons = []
if high_sigs_7d:
reasons.append(f"High recurrence signatures (7d): {high_sigs_7d}")
if high_kinds_7d:
reasons.append(f"High recurrence kinds (7d): {high_kinds_7d}")
if warn_sigs_7d:
reasons.append(f"Warn recurrence signatures (7d): {warn_sigs_7d}")
return points, component, reasons
def _score_followups(
followups_data: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score overdue follow-ups by priority."""
w = weights.get("followups", {})
overdue = followups_data.get("overdue_followups", [])
counts: Dict[str, int] = {"P0": 0, "P1": 0, "other": 0}
points = 0
for fu in overdue:
prio = fu.get("priority", "other")
if prio == "P0":
counts["P0"] += 1
points += int(w.get("overdue_P0", 20))
elif prio == "P1":
counts["P1"] += 1
points += int(w.get("overdue_P1", 12))
else:
counts["other"] += 1
points += int(w.get("overdue_other", 6))
reasons = []
if counts["P0"]:
reasons.append(f"Overdue follow-ups (P0): {counts['P0']}")
if counts["P1"]:
reasons.append(f"Overdue follow-ups (P1): {counts['P1']}")
if counts["other"]:
reasons.append(f"Overdue follow-ups (other): {counts['other']}")
return points, {**counts, "points": points}, reasons
def _score_slo(
slo_data: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score SLO violations."""
w = weights.get("slo", {})
violations = slo_data.get("violations", [])
skipped = slo_data.get("skipped", False)
if skipped:
return 0, {"violations": 0, "skipped": True, "points": 0}, []
count = len(violations)
points = count * int(w.get("violation", 10))
reasons = []
if count:
reasons.append(f"Active SLO violation(s) in window: {count}")
return points, {"violations": count, "skipped": False, "points": points}, reasons
def _score_alerts_loop(
loop_slo: Dict,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score alert-loop SLO violations (self-monitoring)."""
w = weights.get("alerts_loop", {})
violations = loop_slo.get("violations", [])
count = len(violations)
points = count * int(w.get("slo_violation", 10))
reasons = []
if count:
reasons.append(f"Alert-loop SLO violation(s): {count}")
return points, {"violations": count, "points": points}, reasons
def _score_escalations(
escalation_count: int,
weights: Dict,
) -> Tuple[int, Dict, List[str]]:
"""Score escalations in last 24h."""
esc_w = weights.get("escalation", {}).get("escalations_24h", {})
warn_pts = int(esc_w.get("warn", 5))
high_pts = int(esc_w.get("high", 12))
if escalation_count >= 3:
points = high_pts
elif escalation_count >= 1:
points = warn_pts
else:
points = 0
reasons = []
if escalation_count:
reasons.append(f"Escalations in last 24h: {escalation_count}")
return points, {"count_24h": escalation_count, "points": points}, reasons
# ─── Main scoring function ────────────────────────────────────────────────────
def compute_service_risk(
    service: str,
    env: str = "prod",
    *,
    open_incidents: Optional[List[Dict]] = None,
    recurrence_7d: Optional[Dict] = None,
    recurrence_30d: Optional[Dict] = None,
    followups_data: Optional[Dict] = None,
    slo_data: Optional[Dict] = None,
    alerts_loop_slo: Optional[Dict] = None,
    escalation_count_24h: int = 0,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute the risk score report for a service.

    Callers fetch the raw signal dicts themselves and pass them in; every
    argument defaults to an empty/safe value so scoring never crashes on
    missing data. Returns the full RiskReport dict (score, band, per-component
    breakdown, reasons, recommendations).
    """
    if policy is None:
        policy = load_risk_policy()
    weights = policy.get("weights", _builtin_defaults()["weights"])

    # Fold the 30d recurrence buckets into the 7d dict under *_30d keys.
    recurrence = dict(recurrence_7d or {})
    if recurrence_30d:
        recurrence["high_recurrence_30d"] = recurrence_30d.get("high_recurrence", {})
        recurrence["warn_recurrence_30d"] = recurrence_30d.get("warn_recurrence", {})

    # Each scorer yields (points, component-dict, reasons); keep this order —
    # it fixes both the reason ordering and the component unpacking below.
    scored = [
        _score_open_incidents(open_incidents or [], weights),
        _score_recurrence(recurrence, weights),
        _score_followups(followups_data or {}, weights),
        _score_slo(slo_data or {}, weights),
        _score_alerts_loop(alerts_loop_slo or {}, weights),
        _score_escalations(escalation_count_24h, weights),
    ]
    total = max(0, sum(points for points, _, _ in scored))
    all_reasons: List[str] = []
    for _, _, notes in scored:
        all_reasons.extend(notes)
    comp_inc, comp_rec, comp_fu, comp_slo, comp_loop, comp_esc = (
        component for _, component, _ in scored
    )

    band = score_to_band(total, policy)
    # Deterministic recommendations derived from band + key components.
    recs = _build_recommendations(band, comp_inc, comp_rec, comp_fu, comp_slo)
    return {
        "service": service,
        "env": env,
        "score": total,
        "band": band,
        "thresholds": get_service_thresholds(service, policy),
        "components": {
            "open_incidents": comp_inc,
            "recurrence": comp_rec,
            "followups": comp_fu,
            "slo": comp_slo,
            "alerts_loop": comp_loop,
            "escalations": comp_esc,
        },
        "reasons": all_reasons,
        "recommendations": recs,
        "updated_at": datetime.datetime.utcnow().isoformat(),
    }
def _build_recommendations(
band: str,
comp_inc: Dict,
comp_rec: Dict,
comp_fu: Dict,
comp_slo: Dict,
) -> List[str]:
recs = []
if comp_inc.get("P0", 0) or comp_inc.get("P1", 0):
recs.append("Prioritize open P0/P1 incidents before deploying.")
if comp_rec.get("high_signatures_7d", 0) or comp_rec.get("high_kinds_7d", 0):
recs.append("Investigate recurring failure patterns (high recurrence buckets).")
if comp_fu.get("P0", 0) or comp_fu.get("P1", 0):
recs.append("Prioritize follow-up closure for recurring bucket(s).")
if comp_slo.get("violations", 0):
recs.append("Avoid risky deploys until SLO violation clears.")
if band in ("high", "critical"):
recs.append("Service is high-risk — coordinate with oncall before release.")
return recs[:6]
# ─── Dashboard ────────────────────────────────────────────────────────────────
# ─── Trend computation ────────────────────────────────────────────────────────
def compute_trend(
    series: List,  # List[RiskSnapshot] — most-recent first
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute trend metrics from a list of RiskSnapshot objects (or dicts).

    `series` must be ordered most-recent first; elements may be RiskSnapshot
    dataclasses or plain dicts with "ts"/"score" keys.

    Returns:
        delta_24h, delta_7d, slope_per_day, volatility, regression{warn, fail}
        (deltas/slope/volatility are None when not computable).
    """
    if policy is None:
        policy = load_risk_policy()
    trend_cfg = policy.get("trend", {})
    reg = trend_cfg.get("regression_threshold", {})
    warn_24h = int(reg.get("delta_24h_warn", 10))
    fail_24h = int(reg.get("delta_24h_fail", 20))
    warn_7d = int(reg.get("delta_7d_warn", 15))
    fail_7d = int(reg.get("delta_7d_fail", 30))
    if not series:
        return _empty_trend()

    # Normalise: accept both RiskSnapshot dataclasses and plain dicts
    def _score(s) -> int:
        return int(s.score if hasattr(s, "score") else s["score"])

    def _ts(s) -> str:
        return s.ts if hasattr(s, "ts") else s["ts"]

    now = datetime.datetime.utcnow()
    latest_score = _score(series[0])

    # ── delta_24h: latest minus first snapshot at/before the 24h cutoff ──────
    cutoff_24h = (now - datetime.timedelta(hours=24)).isoformat()
    base_24h = _find_baseline(series, cutoff_24h, _ts)
    delta_24h = (latest_score - _score(base_24h)) if base_24h is not None else None

    # ── delta_7d ──────────────────────────────────────────────────────────────
    cutoff_7d = (now - datetime.timedelta(hours=168)).isoformat()
    base_7d = _find_baseline(series, cutoff_7d, _ts)
    delta_7d = (latest_score - _score(base_7d)) if base_7d is not None else None

    # ── slope (simple linear regression over all available points) ────────────
    slope_per_day: Optional[float] = None
    if len(series) >= 2:
        # ages[i] = hours before `now`; re-express as hours since the oldest
        # point so x grows with time (0 = oldest, max = newest).
        ages = [(now - _parse_ts(_ts(s))).total_seconds() / 3600.0 for s in series]
        # Fix: hoist max(ages) — it was recomputed inside the comprehension
        # for every element, making this step accidentally O(n^2).
        oldest_age = max(ages)
        hours_from_oldest = [oldest_age - age for age in ages]
        scores = [_score(s) for s in series]
        slope_per_day = _linear_slope(hours_from_oldest, scores) * 24  # per day

    # ── volatility (stddev of daily last-score-per-day over 7d) ──────────────
    volatility: Optional[float] = None
    daily_scores = _daily_latest_scores(series, days=7, _ts_fn=_ts, _score_fn=_score)
    if len(daily_scores) >= 2:
        mean = sum(daily_scores) / len(daily_scores)
        variance = sum((x - mean) ** 2 for x in daily_scores) / len(daily_scores)
        volatility = round(math.sqrt(variance), 2)

    # ── regression flags (warn/fail when either window regressed enough) ─────
    reg_warn = (
        (delta_24h is not None and delta_24h >= warn_24h)
        or (delta_7d is not None and delta_7d >= warn_7d)
    )
    reg_fail = (
        (delta_24h is not None and delta_24h >= fail_24h)
        or (delta_7d is not None and delta_7d >= fail_7d)
    )
    return {
        "delta_24h": delta_24h,
        "delta_7d": delta_7d,
        "slope_per_day": round(slope_per_day, 2) if slope_per_day is not None else None,
        "volatility": volatility,
        "regression": {"warn": reg_warn, "fail": reg_fail},
    }
def _empty_trend() -> Dict:
return {
"delta_24h": None, "delta_7d": None,
"slope_per_day": None, "volatility": None,
"regression": {"warn": False, "fail": False},
}
def _find_baseline(series, cutoff_iso: str, ts_fn):
"""Return the first element whose ts <= cutoff (series is newest-first)."""
for s in series:
if ts_fn(s) <= cutoff_iso:
return s
return None
def _parse_ts(ts_str: str) -> datetime.datetime:
ts_str = ts_str.rstrip("Z")
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
try:
return datetime.datetime.strptime(ts_str, fmt)
except ValueError:
continue
return datetime.datetime.utcnow()
def _linear_slope(xs: List[float], ys: List[float]) -> float:
"""Simple least-squares slope (score per hour)."""
n = len(xs)
if n < 2:
return 0.0
x_mean = sum(xs) / n
y_mean = sum(ys) / n
num = sum((xs[i] - x_mean) * (ys[i] - y_mean) for i in range(n))
den = sum((xs[i] - x_mean) ** 2 for i in range(n))
return num / den if den != 0 else 0.0
def _daily_latest_scores(series, days: int, _ts_fn, _score_fn) -> List[float]:
"""Collect the latest score for each calendar day over last `days` days."""
now = datetime.datetime.utcnow()
day_scores: Dict[str, int] = {}
cutoff = (now - datetime.timedelta(days=days)).isoformat()
for s in series:
ts = _ts_fn(s)
if ts < cutoff:
break
day_key = ts[:10] # YYYY-MM-DD
if day_key not in day_scores: # series is newest-first, so first = latest
day_scores[day_key] = _score_fn(s)
return list(day_scores.values())
def enrich_risk_report_with_trend(
    report: Dict,
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
) -> Dict:
    """Add a `trend` key to `report` in place and return it.

    Non-fatal: any failure logs a warning and sets `trend` to None.
    """
    try:
        if policy is None:
            policy = load_risk_policy()
        window_hours = int(
            policy.get("trend", {}).get("volatility_window_hours", 168)
        )
        series = history_store.get_series(
            report.get("service", ""),
            report.get("env", "prod"),
            hours=window_hours,
            limit=500,
        )
        report["trend"] = compute_trend(series, policy=policy)
    except Exception as exc:
        logger.warning("enrich_risk_report_with_trend failed for %s: %s",
                       report.get("service"), exc)
        report["trend"] = None
    return report
def enrich_risk_report_with_attribution(
    report: Dict,
    *,
    alert_store=None,
    incident_store=None,
    attr_policy: Optional[Dict] = None,
) -> Dict:
    """Add an `attribution` key to `report` in place and return it.

    Non-fatal: any failure logs a warning and sets `attribution` to None.
    LLM enrichment is delegated to `maybe_enrich_attribution`, which is
    bounded and off unless policy.llm_mode enables it.
    """
    try:
        # Imported lazily so a missing attribution module degrades gracefully.
        from llm_enrichment import maybe_enrich_attribution
        from risk_attribution import (
            compute_attribution, fetch_signals_from_stores, load_attribution_policy,
        )

        policy = attr_policy if attr_policy is not None else load_attribution_policy()
        service = report.get("service", "")
        env = report.get("env", "prod")
        lookback = int((policy.get("defaults") or {}).get("lookback_hours", 24))

        # Fetch raw signals for the configured lookback window.
        signals = fetch_signals_from_stores(
            service, env,
            window_hours=lookback,
            alert_store=alert_store,
            incident_store=incident_store,
            policy=policy,
        )
        attribution = compute_attribution(
            service, env,
            risk_report=report,
            policy=policy,
            **signals,
        )
        # Optional, bounded LLM pass (off by default).
        attribution["llm_enrichment"] = maybe_enrich_attribution(
            attribution, report, policy
        )
        report["attribution"] = attribution
    except Exception as exc:
        logger.warning("enrich_risk_report_with_attribution failed for %s: %s",
                       report.get("service"), exc)
        report["attribution"] = None
    return report
# ─── Snapshot writer ──────────────────────────────────────────────────────────
def snapshot_all_services(
    env: str,
    compute_fn,  # Callable[[str, str], Dict] — returns RiskReport for (service, env)
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
    known_services: Optional[List[str]] = None,
) -> Dict:
    """
    Compute and persist a RiskSnapshot for every known service.

    `compute_fn(service, env)` must return a RiskReport dict. The service
    list is capped by policy.history.max_services_per_run. Failures are
    logged per service and never abort the run.

    Returns {written, skipped, errors, services, env, ts}.
    """
    if policy is None:
        policy = load_risk_policy()
    from risk_history_store import RiskSnapshot

    cap = int(policy.get("history", {}).get("max_services_per_run", 50))
    targets = (known_services or [])[:cap]

    written = 0
    errors = 0
    succeeded: List[str] = []
    for service in targets:
        try:
            report = compute_fn(service, env)
            snapshot = RiskSnapshot(
                ts=datetime.datetime.utcnow().isoformat(),
                service=service,
                env=env,
                score=int(report.get("score", 0)),
                band=report.get("band", "low"),
                components=report.get("components", {}),
                reasons=report.get("reasons", []),
            )
            history_store.write_snapshot([snapshot])
            written += 1
            succeeded.append(service)
        except Exception as exc:
            logger.warning("snapshot_all_services: error for %s/%s: %s", service, env, exc)
            errors += 1
    return {
        "written": written,
        # Nothing is skipped today; field reserved for future skip logic.
        "skipped": 0,
        "errors": errors,
        "services": succeeded,
        "env": env,
        "ts": datetime.datetime.utcnow().isoformat(),
    }
def compute_risk_dashboard(
    env: str = "prod",
    top_n: int = 10,
    *,
    service_reports: Optional[List[Dict]] = None,
    history_store=None,  # Optional[RiskHistoryStore] — if provided, enrich with trend
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Build the risk dashboard from pre-computed service reports.

    Reports are sorted by score (desc) and truncated to `top_n` before any
    summarisation, so band counts and trend summaries cover the top-N only.
    When `history_store` is given, each kept report is enriched with trend.
    """
    if policy is None:
        policy = load_risk_policy()

    ranked = sorted(service_reports or [], key=lambda rep: -rep.get("score", 0))
    reports = ranked[:top_n]

    if history_store is not None:
        for rep in reports:
            enrich_risk_report_with_trend(rep, history_store, policy)

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for rep in reports:
        band = rep.get("band", "low")
        band_counts[band] = band_counts.get(band, 0) + 1

    # P0 services currently in a risky band.
    p0_services = set(policy.get("p0_services", []))
    critical_p0 = [
        rep for rep in reports
        if rep["service"] in p0_services and rep["band"] in ("high", "critical")
    ]

    def _trend_metric(rep: Dict, key: str):
        # Trend may be absent or None on un-enriched reports.
        return (rep.get("trend") or {}).get(key)

    # Top regressions: largest positive 24h delta.
    regressing = [
        rep for rep in reports
        if _trend_metric(rep, "delta_24h") is not None and rep["trend"]["delta_24h"] > 0
    ]
    regressing.sort(key=lambda rep: -rep["trend"]["delta_24h"])
    top_regressions = regressing[:5]

    # Improving services: most negative 7d delta.
    recovering = [
        rep for rep in reports
        if _trend_metric(rep, "delta_7d") is not None and rep["trend"]["delta_7d"] < 0
    ]
    recovering.sort(key=lambda rep: rep["trend"]["delta_7d"])
    improving = recovering[:5]

    # Regression summaries, with top-2 causes when attribution is present.
    top_regression_summaries = []
    for rep in top_regressions:
        summary: Dict = {
            "service": rep["service"],
            "delta_24h": rep["trend"]["delta_24h"],
        }
        attribution = rep.get("attribution")
        if attribution and attribution.get("causes"):
            summary["causes"] = attribution["causes"][:2]
            summary["attribution_summary"] = attribution.get("summary", "")
        top_regression_summaries.append(summary)

    now_iso = datetime.datetime.utcnow().isoformat()
    return {
        "env": env,
        "generated_at": now_iso,
        "history_updated_at": now_iso,
        "total_services": len(reports),
        "band_counts": band_counts,
        "critical_p0_services": [rep["service"] for rep in critical_p0],
        "top_regressions": top_regression_summaries,
        "improving_services": [
            {"service": rep["service"], "delta_7d": rep["trend"]["delta_7d"]}
            for rep in improving
        ],
        "services": reports,
    }

View File

@@ -0,0 +1,409 @@
"""
risk_history_store.py — Storage layer for Risk Score snapshots.
Provides:
RiskSnapshot — dataclass for a single point-in-time risk record
RiskHistoryStore — abstract base
MemoryRiskHistoryStore — in-process (tests + fallback)
NullRiskHistoryStore — no-op (disabled)
PostgresRiskHistoryStore — Postgres primary (psycopg2 sync)
AutoRiskHistoryStore — Postgres → Memory fallback
Factory: get_risk_history_store() → AutoRiskHistoryStore by default
"""
from __future__ import annotations
import datetime
import json
import logging
import os
import threading
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Data model ───────────────────────────────────────────────────────────────
@dataclass
class RiskSnapshot:
    """One point-in-time risk record for a (service, env) pair."""
    ts: str  # ISO-8601 UTC
    service: str
    env: str
    score: int  # total risk points (non-negative)
    band: str  # one of low/medium/high/critical (see band counters downstream)
    components: Dict = field(default_factory=dict)  # per-component score breakdown
    reasons: List[str] = field(default_factory=list)  # human-readable reasons
    def to_dict(self) -> Dict:
        """Plain-dict form (used for dashboards / JSON serialization)."""
        return asdict(self)
    @staticmethod
    def from_dict(d: Dict) -> "RiskSnapshot":
        """Rebuild from a dict; env/band default to "prod"/"low" when absent."""
        return RiskSnapshot(
            ts=d["ts"], service=d["service"], env=d.get("env", "prod"),
            score=int(d["score"]), band=d.get("band", "low"),
            components=d.get("components", {}),
            reasons=d.get("reasons", []),
        )
# ─── Abstract base ────────────────────────────────────────────────────────────
class RiskHistoryStore(ABC):
    """Abstract storage interface for RiskSnapshot history.

    Concrete backends: Memory (tests/fallback), Null (disabled),
    Postgres (production), Auto (Postgres with memory fallback).
    """
    @abstractmethod
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written."""
    @abstractmethod
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env."""
    @abstractmethod
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots in descending time order within last `hours` hours."""
    def get_delta(self, service: str, env: str, hours: int = 24) -> Optional[int]:
        """
        latest.score - closest-to-(now-hours) score.
        Returns None if no baseline is available.
        """
        # Fetch twice the window so a baseline just before the cutoff is included.
        series = self.get_series(service, env, hours=hours * 2, limit=500)
        if not series:
            return None
        latest = series[0]
        cutoff_ts = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        # Find snapshot closest to cutoff (first one before or at cutoff)
        # NOTE: ISO-8601 timestamps compare chronologically as strings.
        baseline = None
        for snap in series:
            if snap.ts <= cutoff_ts:
                baseline = snap
                break
        if baseline is None:
            return None
        return latest.score - baseline.score
    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Return latest snapshot for each service in env, sorted by score desc."""
        # Optional capability: concrete backends override; default is unsupported.
        raise NotImplementedError
    @abstractmethod
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete records older than retention_days; returns count deleted."""
# ─── Memory backend (tests + fallback) ────────────────────────────────────────
class MemoryRiskHistoryStore(RiskHistoryStore):
    """Thread-safe in-process backend (tests, and the Auto fallback buffer)."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # (service, env) → snapshots kept newest-first by ts
        self._data: Dict = defaultdict(list)

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Append each record and keep its series sorted newest-first."""
        with self._lock:
            for snap in records:
                bucket = self._data[(snap.service, snap.env)]
                bucket.append(snap)
                bucket.sort(key=lambda r: r.ts, reverse=True)
        return len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Newest snapshot for service/env, or None when unseen."""
        with self._lock:
            bucket = self._data.get((service, env), [])
            if not bucket:
                return None
            return bucket[0]

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots newer than the cutoff, newest-first, capped at `limit`."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            kept = [
                snap for snap in self._data.get((service, env), [])
                if snap.ts >= horizon
            ]
            return kept[:limit]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in env, highest score first."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            newest: Dict[str, RiskSnapshot] = {}
            for (service, snap_env), snaps in self._data.items():
                if snap_env != env:
                    continue
                in_window = [s for s in snaps if s.ts >= horizon]
                if in_window:
                    newest[service] = in_window[0]  # series is newest-first
            ranked = sorted(
                (s.to_dict() for s in newest.values()),
                key=lambda row: -row["score"],
            )
            return ranked[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Drop snapshots older than retention; return how many were removed."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        removed = 0
        with self._lock:
            for key, snaps in list(self._data.items()):
                fresh = [s for s in snaps if s.ts >= horizon]
                removed += len(snaps) - len(fresh)
                self._data[key] = fresh
        return removed
# ─── Null backend ──────────────────────────────────────────────────────────────
class NullRiskHistoryStore(RiskHistoryStore):
    """No-op backend: all writes are discarded and every read is empty.

    Selected via RISK_HISTORY_BACKEND=null when history is disabled.
    """
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Discard `records`; nothing is ever persisted."""
        return 0
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Always None — there is no history."""
        return None
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Always empty."""
        return []
    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        # Fix: without this override the base class raises NotImplementedError,
        # contradicting "all reads return empty" and crashing dashboard callers
        # when history is disabled.
        return []
    def cleanup(self, retention_days: int = 90) -> int:
        """Nothing stored, nothing to delete."""
        return 0
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresRiskHistoryStore(RiskHistoryStore):
    """
    Production Postgres backend (psycopg2 sync, per-thread connection).
    Schema created by ops/scripts/migrate_risk_history_postgres.py.
    """
    def __init__(self, dsn: str) -> None:
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread
    def _conn(self):
        # Lazily (re)connect per thread; autocommit makes each statement atomic.
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Upsert each record on (ts, service, env); returns how many succeeded."""
        if not records:
            return 0
        cur = self._conn().cursor()
        written = 0
        for rec in records:
            try:
                cur.execute(
                    """INSERT INTO risk_history (ts, service, env, score, band, components, reasons)
                       VALUES (%s, %s, %s, %s, %s, %s, %s)
                       ON CONFLICT (ts, service, env) DO UPDATE
                       SET score=EXCLUDED.score, band=EXCLUDED.band,
                           components=EXCLUDED.components, reasons=EXCLUDED.reasons""",
                    (rec.ts, rec.service, rec.env, rec.score, rec.band,
                     json.dumps(rec.components), json.dumps(rec.reasons)),
                )
                written += 1
            except Exception as e:
                # Per-record best effort: one bad row must not drop the batch.
                logger.warning("risk_history write failed for %s/%s: %s", rec.service, rec.env, e)
        cur.close()
        return written
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None if absent."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
            "WHERE service=%s AND env=%s ORDER BY ts DESC LIMIT 1",
            (service, env),
        )
        row = cur.fetchone()
        cur.close()
        if not row:
            return None
        return self._row_to_snap(row)
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots within the last `hours`, newest-first, capped at `limit`."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        cur = self._conn().cursor()
        cur.execute(
            "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
            "WHERE service=%s AND env=%s AND ts >= %s ORDER BY ts DESC LIMIT %s",
            (service, env, cutoff, limit),
        )
        rows = cur.fetchall()
        cur.close()
        return [self._row_to_snap(r) for r in rows]
    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in env, highest score first."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        cur = self._conn().cursor()
        # Latest snapshot per service in env within window
        # (DISTINCT ON keeps the first row per service of the ts DESC ordering).
        cur.execute(
            """SELECT DISTINCT ON (service)
                   ts, service, env, score, band, components, reasons
               FROM risk_history
               WHERE env=%s AND ts >= %s
               ORDER BY service, ts DESC""",
            (env, cutoff),
        )
        rows = cur.fetchall()
        cur.close()
        snaps = [self._row_to_snap(r).to_dict() for r in rows]
        return sorted(snaps, key=lambda r: -r["score"])[:top_n]
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete rows older than retention_days; returns the deleted count."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        cur = self._conn().cursor()
        cur.execute("DELETE FROM risk_history WHERE ts < %s", (cutoff,))
        deleted = cur.rowcount
        cur.close()
        return deleted
    @staticmethod
    def _row_to_snap(row) -> RiskSnapshot:
        """Normalize a DB row (timestamps → ISO strings, JSON text → objects)."""
        ts, service, env, score, band, components, reasons = row
        if isinstance(ts, datetime.datetime):
            ts = ts.isoformat()
        if isinstance(components, str):
            components = json.loads(components)
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        return RiskSnapshot(
            ts=ts, service=service, env=env,
            score=int(score), band=band,
            components=components or {},
            reasons=reasons or [],
        )
# ─── Auto backend ─────────────────────────────────────────────────────────────
class AutoRiskHistoryStore(RiskHistoryStore):
    """
    Postgres-first store with a transparent in-memory fallback.

    Every write is mirrored into the memory buffer so reads keep working
    (within this process) while Postgres is down; reads always try
    Postgres first and fall back to the buffer on failure.
    """
    def __init__(self, pg_dsn: str) -> None:
        self._pg = PostgresRiskHistoryStore(pg_dsn)
        self._mem = MemoryRiskHistoryStore()
        self._pg_ok = True  # tracked so the outage is logged only once

    def _try_pg(self, method: str, *args, **kwargs):
        """Invoke `method` on the Postgres store; returns (ok, result)."""
        try:
            outcome = getattr(self._pg, method)(*args, **kwargs)
        except Exception as exc:
            if self._pg_ok:  # log only on the healthy→broken transition
                logger.warning("AutoRiskHistoryStore: Postgres unavailable (%s), using memory", exc)
            self._pg_ok = False
            return False, None
        self._pg_ok = True
        return True, outcome

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        pg_ok, count = self._try_pg("write_snapshot", records)
        # Mirror into memory regardless, so fallback reads stay fresh.
        self._mem.write_snapshot(records)
        return count if pg_ok else len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        pg_ok, snap = self._try_pg("get_latest", service, env)
        return snap if pg_ok else self._mem.get_latest(service, env)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        pg_ok, series = self._try_pg("get_series", service, env, hours, limit)
        return series if pg_ok else self._mem.get_series(service, env, hours, limit)

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        pg_ok, rows = self._try_pg("dashboard_series", env, hours, top_n)
        return rows if pg_ok else self._mem.dashboard_series(env, hours, top_n)

    def cleanup(self, retention_days: int = 90) -> int:
        pg_ok, deleted = self._try_pg("cleanup", retention_days)
        self._mem.cleanup(retention_days)
        return deleted if pg_ok else 0
# ─── Singleton factory ────────────────────────────────────────────────────────
# Process-wide singleton, lazily constructed under _store_lock.
_store: Optional[RiskHistoryStore] = None
_store_lock = threading.Lock()

def get_risk_history_store() -> RiskHistoryStore:
    """Return the shared store, building it on first use (double-checked lock)."""
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
def set_risk_history_store(store: Optional[RiskHistoryStore]) -> None:
    """Override the singleton (e.g. in tests); pass None to force re-creation
    on the next get_risk_history_store() call."""
    global _store
    with _store_lock:
        _store = store
def _create_store() -> RiskHistoryStore:
    """Pick a backend from RISK_HISTORY_BACKEND (memory | null | postgres | auto).

    The DSN comes from RISK_DATABASE_URL, falling back to DATABASE_URL.
    Unknown backend values behave like "auto".
    """
    backend = os.getenv("RISK_HISTORY_BACKEND", "auto").lower()
    dsn = os.getenv("RISK_DATABASE_URL") or os.getenv("DATABASE_URL") or ""

    if backend == "memory":
        logger.info("RiskHistoryStore: in-memory")
        return MemoryRiskHistoryStore()

    if backend == "null":
        logger.info("RiskHistoryStore: null (disabled)")
        return NullRiskHistoryStore()

    if backend == "postgres":
        if not dsn:
            logger.warning("RISK_HISTORY_BACKEND=postgres but no DATABASE_URL; falling back to memory")
            return MemoryRiskHistoryStore()
        logger.info("RiskHistoryStore: postgres dsn=%s", dsn[:30])
        return PostgresRiskHistoryStore(dsn)

    # Default: auto — Postgres with a memory fallback when a DSN is present.
    if not dsn:
        logger.info("RiskHistoryStore: auto — no DATABASE_URL, using memory")
        return MemoryRiskHistoryStore()
    logger.info("RiskHistoryStore: auto (postgres→memory fallback) dsn=%s", dsn[:30])
    return AutoRiskHistoryStore(pg_dsn=dsn)

View File

@@ -0,0 +1,376 @@
"""
signature_state_store.py — Cooldown tracking per incident signature.
Prevents triage from running too frequently for the same failure type.
A "signature" is the same one computed by alert_routing.compute_incident_signature.
Backends:
- MemorySignatureStateStore (tests / single-process)
- PostgresSignatureStateStore (production)
- AutoSignatureStateStore (Postgres → Memory fallback)
Table: incident_signature_state
signature text PK, last_triage_at timestamptz, last_alert_at timestamptz,
triage_count_24h int, updated_at timestamptz
DDL: ops/scripts/migrate_alerts_postgres.py
"""
from __future__ import annotations
import datetime
import logging
import os
import threading
import time
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
DEFAULT_COOLDOWN_MINUTES = 15
def _now_dt() -> datetime.datetime:
    """Current UTC time as a naive datetime (module-wide timestamp convention)."""
    return datetime.datetime.utcnow()
def _now_iso() -> str:
    """Current UTC time as an ISO-8601 string (naive, no timezone suffix)."""
    return datetime.datetime.utcnow().isoformat()
# ─── Abstract ─────────────────────────────────────────────────────────────────
class SignatureStateStore(ABC):
    """Abstract per-signature triage cooldown / occurrence tracker.

    Backends: Memory (tests/single-process), Postgres (production),
    Auto (Postgres with memory fallback).
    """
    @abstractmethod
    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True if cooldown has passed (triage may proceed)."""
    @abstractmethod
    def mark_alert_seen(self, signature: str) -> None:
        """Record that an alert with this signature was observed.
        Also updates occurrences_60m rolling bucket."""
    @abstractmethod
    def mark_triage_run(self, signature: str) -> None:
        """Record that triage was executed for this signature."""
    @abstractmethod
    def get_state(self, signature: str) -> Optional[Dict]:
        """Return raw state dict or None."""
    @abstractmethod
    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return signatures seen in last window_minutes, ordered by occurrences_60m desc."""
# ─── Memory backend ────────────────────────────────────────────────────────────
class MemorySignatureStateStore(SignatureStateStore):
    """Thread-safe in-process backend (tests / single-process deployments)."""

    BUCKET_MINUTES = 60  # rolling window for occurrences_60m

    def __init__(self):
        self._lock = threading.Lock()
        self._states: Dict[str, Dict] = {}

    def _update_bucket(self, state: Dict, now: str) -> None:
        """Advance the 60-min rolling occurrence bucket in place."""
        window_start = (
            _now_dt() - datetime.timedelta(minutes=self.BUCKET_MINUTES)
        ).isoformat()
        started = state.get("occurrences_60m_bucket_start") or ""
        if started < window_start:
            # Bucket expired — restart counting from this observation.
            state["occurrences_60m"] = 1
            state["occurrences_60m_bucket_start"] = now
        else:
            state["occurrences_60m"] = state.get("occurrences_60m", 0) + 1

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """True when no triage has run within the cooldown window."""
        with self._lock:
            entry = self._states.get(signature)
            if entry is None:
                return True
            last_run = entry.get("last_triage_at")
            if not last_run:
                return True
            threshold = (
                _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
            ).isoformat()
            return last_run < threshold

    def mark_alert_seen(self, signature: str) -> None:
        """Record an alert observation and bump the rolling 60m counter."""
        stamp = _now_iso()
        with self._lock:
            entry = self._states.get(signature)
            if entry is None:
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": None,
                    "last_alert_at": stamp,
                    "triage_count_24h": 0,
                    "occurrences_60m": 1,
                    "occurrences_60m_bucket_start": stamp,
                    "updated_at": stamp,
                }
                return
            entry["last_alert_at"] = stamp
            entry["updated_at"] = stamp
            self._update_bucket(entry, stamp)

    def mark_triage_run(self, signature: str) -> None:
        """Record a triage execution; the 24h counter restarts after a day idle."""
        stamp = _now_iso()
        day_ago = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        with self._lock:
            entry = self._states.get(signature)
            if entry is None:
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": stamp,
                    "last_alert_at": stamp,
                    "triage_count_24h": 1,
                    "occurrences_60m": 0,
                    "occurrences_60m_bucket_start": stamp,
                    "updated_at": stamp,
                }
                return
            previous = entry.get("last_triage_at") or ""
            entry["triage_count_24h"] = (
                1 if previous < day_ago else entry.get("triage_count_24h", 0) + 1
            )
            entry["last_triage_at"] = stamp
            entry["updated_at"] = stamp

    def get_state(self, signature: str) -> Optional[Dict]:
        """Shallow copy of the stored state, or None if unseen."""
        with self._lock:
            entry = self._states.get(signature)
            return dict(entry) if entry else None

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Signatures alerted within the window, busiest (occurrences) first."""
        since = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        with self._lock:
            recent = [
                dict(entry) for entry in self._states.values()
                if (entry.get("last_alert_at") or "") >= since
            ]
        recent.sort(key=lambda e: e.get("occurrences_60m", 0), reverse=True)
        return recent[:limit]
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresSignatureStateStore(SignatureStateStore):
    """Production backend: one autocommit psycopg2 connection per thread.

    Table DDL lives in ops/scripts/migrate_alerts_postgres.py.
    """
    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread
    def _conn(self):
        # Lazily (re)connect per thread; autocommit keeps each upsert atomic.
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn
    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """True when this signature has never been triaged or the cooldown passed."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT last_triage_at FROM incident_signature_state WHERE signature=%s",
            (signature,),
        )
        row = cur.fetchone()
        cur.close()
        if not row or row[0] is None:
            return True
        cutoff = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
        last = row[0]
        # The driver may return tz-aware timestamps; compare as naive UTC.
        if hasattr(last, "tzinfo") and last.tzinfo:
            last = last.replace(tzinfo=None)
        return last < cutoff
    def mark_alert_seen(self, signature: str) -> None:
        """Upsert: bump last_alert_at and the rolling 60-minute occurrence bucket."""
        now = _now_iso()
        cutoff_60m = (_now_dt() - datetime.timedelta(minutes=60)).isoformat()
        cur = self._conn().cursor()
        # The CASE arms restart the bucket when its start is missing/expired,
        # otherwise increment in place (mirrors MemorySignatureStateStore).
        cur.execute(
            """INSERT INTO incident_signature_state
               (signature, last_alert_at, triage_count_24h, updated_at,
                occurrences_60m, occurrences_60m_bucket_start)
               VALUES (%s, %s, 0, %s, 1, %s)
               ON CONFLICT (signature) DO UPDATE
               SET last_alert_at=EXCLUDED.last_alert_at,
                   updated_at=EXCLUDED.updated_at,
                   occurrences_60m = CASE
                       WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                            OR incident_signature_state.occurrences_60m_bucket_start < %s
                       THEN 1
                       ELSE incident_signature_state.occurrences_60m + 1
                   END,
                   occurrences_60m_bucket_start = CASE
                       WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
                            OR incident_signature_state.occurrences_60m_bucket_start < %s
                       THEN EXCLUDED.occurrences_60m_bucket_start
                       ELSE incident_signature_state.occurrences_60m_bucket_start
                   END""",
            (signature, now, now, now, cutoff_60m, cutoff_60m),
        )
        cur.close()
    def mark_triage_run(self, signature: str) -> None:
        """Upsert: stamp last_triage_at; the 24h counter resets after a day idle."""
        now = _now_iso()
        cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        cur = self._conn().cursor()
        # CASE restarts triage_count_24h when the previous run is older than 24h.
        cur.execute(
            """INSERT INTO incident_signature_state
               (signature, last_triage_at, last_alert_at, triage_count_24h, updated_at,
                occurrences_60m, occurrences_60m_bucket_start)
               VALUES (%s, %s, %s, 1, %s, 0, %s)
               ON CONFLICT (signature) DO UPDATE
               SET last_triage_at=EXCLUDED.last_triage_at,
                   triage_count_24h = CASE
                       WHEN incident_signature_state.last_triage_at IS NULL
                            OR incident_signature_state.last_triage_at < %s
                       THEN 1
                       ELSE incident_signature_state.triage_count_24h + 1
                   END,
                   updated_at=EXCLUDED.updated_at""",
            (signature, now, now, now, now, cutoff_24h),
        )
        cur.close()
    def get_state(self, signature: str) -> Optional[Dict]:
        """Return the row as a dict (timestamps ISO-formatted), or None."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state WHERE signature=%s",
            (signature,),
        )
        row = cur.fetchone()
        cur.close()
        if not row:
            return None
        sig, lta, laa, cnt, upd, occ60, occ_start = row
        return {
            "signature": sig,
            "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta,
            "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa,
            "triage_count_24h": cnt,
            "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd,
            "occurrences_60m": occ60 or 0,
            "occurrences_60m_bucket_start": (
                occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start
            ),
        }
    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Signatures alerted within the window, ordered by occurrences_60m desc."""
        cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state "
            "WHERE last_alert_at >= %s "
            "ORDER BY occurrences_60m DESC NULLS LAST LIMIT %s",
            (cutoff, limit),
        )
        rows = []
        for row in cur.fetchall():
            sig, lta, laa, cnt, upd, occ60, occ_start = row
            rows.append({
                "signature": sig,
                "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta,
                "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa,
                "triage_count_24h": cnt,
                "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd,
                "occurrences_60m": occ60 or 0,
                "occurrences_60m_bucket_start": (
                    occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start
                ),
            })
        cur.close()
        return rows
# ─── Auto backend ──────────────────────────────────────────────────────────────
class AutoSignatureStateStore(SignatureStateStore):
    """Postgres-first store that transparently degrades to in-memory.

    Any exception from the Postgres backend flips delegation to the memory
    fallback; after ``_RECOVERY_S`` seconds the primary backend is retried.
    """

    _RECOVERY_S = 300

    def __init__(self, pg_dsn: str):
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresSignatureStateStore] = None
        self._fallback = MemorySignatureStateStore()
        self._using_fallback = False
        self._since: float = 0.0
        self._lock = threading.Lock()

    def _get_primary(self) -> PostgresSignatureStateStore:
        """Lazily build the Postgres store (double-checked locking)."""
        if self._primary is None:
            with self._lock:
                if self._primary is None:
                    self._primary = PostgresSignatureStateStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self):
        """Leave fallback mode once the recovery window has elapsed."""
        elapsed = time.monotonic() - self._since
        if self._using_fallback and elapsed >= self._RECOVERY_S:
            self._using_fallback = False

    def _delegate(self, method: str, *args, **kwargs):
        """Invoke *method* on the primary, demoting to the fallback on failure."""
        self._maybe_recover()
        if self._using_fallback:
            return getattr(self._fallback, method)(*args, **kwargs)
        try:
            return getattr(self._get_primary(), method)(*args, **kwargs)
        except Exception as exc:
            logger.warning("AutoSignatureStateStore Postgres failed: %s", exc)
            self._using_fallback = True
            self._since = time.monotonic()
        return getattr(self._fallback, method)(*args, **kwargs)

    def should_run_triage(self, signature, cooldown_minutes=DEFAULT_COOLDOWN_MINUTES):
        return self._delegate("should_run_triage", signature, cooldown_minutes)

    def mark_alert_seen(self, signature):
        self._delegate("mark_alert_seen", signature)

    def mark_triage_run(self, signature):
        self._delegate("mark_triage_run", signature)

    def get_state(self, signature):
        return self._delegate("get_state", signature)

    def list_active_signatures(self, window_minutes=60, limit=100):
        return self._delegate("list_active_signatures", window_minutes, limit)
# ─── Singleton ────────────────────────────────────────────────────────────────
_sig_store: Optional[SignatureStateStore] = None
_sig_lock = threading.Lock()


def get_signature_state_store() -> SignatureStateStore:
    """Return the process-wide signature store, creating it on first use."""
    global _sig_store
    if _sig_store is not None:
        return _sig_store
    with _sig_lock:
        # Re-check under the lock so concurrent callers create only one store.
        if _sig_store is None:
            _sig_store = _create_sig_store()
    return _sig_store
def set_signature_state_store(store: Optional[SignatureStateStore]) -> None:
    """Replace the process-wide store (for tests / dependency injection).

    Passing ``None`` clears the singleton so the next
    ``get_signature_state_store()`` call re-creates it from the environment.
    """
    global _sig_store
    with _sig_lock:
        _sig_store = store
def _create_sig_store() -> SignatureStateStore:
    """Build a store from ALERT_BACKEND / DATABASE_URL environment settings.

    "postgres" and "auto" backends require a DSN; anything else (or a missing
    DSN) falls back to the in-memory store.
    """
    backend = os.getenv("ALERT_BACKEND", "memory").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL", "")
    if dsn:
        if backend == "postgres":
            return PostgresSignatureStateStore(dsn)
        if backend == "auto":
            return AutoSignatureStateStore(dsn)
    return MemorySignatureStateStore()

View File

@@ -0,0 +1,767 @@
"""Sofiia Smart Auto-Router — Cursor-style model selection for Sofiia agent.
Classifies incoming prompt by task type and selects the best available model,
balancing capability, speed, cost, and provider availability.
Full model catalog includes:
- Cloud: Anthropic Claude, xAI Grok, DeepSeek, Mistral AI, GLM-5 (Z.AI)
- Local Ollama (NODA2/MacBook): qwen3.5:35b-a3b, qwen3:14b, glm-4.7-flash:32k,
deepseek-r1:70b, deepseek-coder:33b, gemma3, mistral-nemo:12b,
starcoder2:3b, phi3, llava:13b
Task taxonomy (inspired by Cursor Auto mode):
code_gen, code_review, code_debug, code_refactor,
architecture, devops, security, analysis, quick_answer, creative, reasoning,
math_code, vision, chatbot
"""
from __future__ import annotations
import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ── Task taxonomy ──────────────────────────────────────────────────────────────
# Each pattern group uses multi-word or context-aware patterns to reduce false
# positives. Single common words (system, design, check, list, graph, tree) are
# avoided unless paired with a qualifier.
#
# FIX: several Ukrainian patterns contained mangled escapes such as r"\апиши"
# or r"\ому" — the original "\b" word boundary plus the word's first letter had
# been corrupted to a bare backslash. Those either matched as unanchored
# substrings ("тому" falsely triggered the reasoning pattern "ому") or could
# never match ("\ейкстр\b" never matches "Дейкстра"). The "\b<letter>"
# prefixes are restored below.
TASK_PATTERNS: List[Tuple[str, List[str], float]] = [
    # (task_type, patterns, base_weight) — weight scales final score
    ("code_gen", [
        r"\bнапиши\s+(функці|код|клас|скрипт|модуль|endpoint|api)",
        r"\bреалізуй\b", r"\bcreate\s+(function|class|module|endpoint|api|component)",
        r"\bimplement\b", r"\bgenerate\s+code\b", r"\bгенеруй\s+код\b",
        r"\bфункці[юя]\s+для\b", r"\bклас\s+для\b", r"\bнапиши\s+код\b",
        r"\bwrite\s+a?\s*(function|class|module|script|endpoint)\b",
        r"\bcontroller\b", r"\bendpoint\s+(для|for)\b",
    ], 1.0),
    ("code_debug", [
        r"\bвиправ\b", r"\bбаг\b", r"\bпомилк[аи]\b", r"\btraceback\b",
        r"\bexception\b", r"\bfailed\b", r"\bcrash(es|ed)?\b", r"\bне\s+працю",
        r"\bдебаг\b", r"\bdebug\b", r"\bfix\s+(the\s+)?(bug|error|issue|crash)\b",
        r"\bsyntax\s*error\b", r"\btype\s*error\b", r"\battribute\s*error\b",
        r"\bruntime\s*error\b", r"\bvalue\s*error\b",
    ], 1.0),
    ("code_review", [
        r"\breview\s+(the\s+)?(code|pr|pull\s+request|diff)\b",
        r"\bаудит\s+(код|сервіс|систем)\b", r"\baudit\s+(code|service)\b",
        r"\bперевір\w*\s+(код|якість)\b", r"\bcode\s+quality\b",
        r"\bcode\s+review\b", r"\brev'ю\b",
    ], 1.0),
    ("code_refactor", [
        r"\bрефактор\b", r"\brefactor\b",
        r"\bоптимізу[йї]\s+(код|функці|клас)\b", r"\boptimize\s+(the\s+)?(code|function|class)\b",
        r"\bclean\s+up\s+(the\s+)?code\b", r"\bpolish\s+(the\s+)?code\b",
        r"\bspeed\s+up\b", r"\bimprove\s+(the\s+)?code\b",
    ], 1.0),
    ("architecture", [
        r"\bархітектур\w+\b", r"\barchitecture\b",
        r"\bспроєктуй\b", r"\bsystem\s+design\b",
        r"\bmicroservice\s+(architect|design|pattern)\b",
        r"\bdatabase\s+design\b", r"\bapi\s+design\b",
        r"\bscalab(le|ility)\b", r"\bscaling\s+strateg\b",
        r"\bdesign\s+pattern\b", r"\bsystem\s+structure\b",
    ], 1.0),
    ("devops", [
        r"\bdeploy\b", r"\bdocker\s*(file|compose|-compose|ize)?\b",
        r"\bkubernetes\b", r"\bk8s\b", r"\bci[\s/]cd\b",
        r"\bpipeline\b", r"\bnginx\b", r"\bcaddy\b",
        r"\bнода\d?\b", r"\bnoda\d?\b", r"\bcontainer\s+(start|stop|restart|build|image)\b",
        r"\bдеплой\b", r"\bssh\s+(to|into|root|connect)\b",
        r"\bhelm\b", r"\bterraform\b", r"\binfrastructure\b",
        r"\bdocker\s+compose\s+up\b",
    ], 1.0),
    ("security", [
        r"\bvulnerability\b", r"\bCVE-\d+\b", r"\bsecurity\s+(audit|review|issue|scan)\b",
        r"\bauth(entication|orization)\b", r"\bencrypt(ion)?\b",
        r"\bRBAC\b", r"\bpermission\s+(model|system)\b",
        r"\bбезпек\w+\b", r"\bpentest\b", r"\b(sql|xss|csrf)\s*injection\b",
        r"\bthreat\s+model\b",
    ], 1.0),
    ("reasoning", [
        r"\bчому\s+\w+\b", r"\bwhy\s+(does|is|do|did|should|would)\b",
        r"\bpros\s+and\s+cons\b", r"\btrade[\s-]?off\b",
        r"\bпорівняй\b", r"\bcompare\s+\w+\s+(vs|and|with|to)\b",
        r"\bяк\s+краще\b", r"\bперевага\b", r"\bнедолік\b",
        r"\bdecision\s+(between|about)\b",
        r"\bversus\b", r"\b\w+\s+vs\s+\w+\b",
    ], 1.0),
    ("analysis", [
        r"\bпроаналізуй\b", r"\bаналіз\s+\w+\b",
        r"\banalyze\s+\w+\b", r"\binvestigate\b",
        r"\bexplain\s+(how|why|what)\b", r"\bsummariz(e|ation)\b",
        # Root match (no trailing \b): covers "дослідження", "досліджує", …
        r"\bдослідж", r"\bпоясни\s+(як|чому|що)\b",
        r"\bhow\s+does\s+\w+\s+work\b",
    ], 1.0),
    ("creative", [
        r"\bнапиши\s+(текст|стат|пост|лист|опис)\b",
        r"\bwrite\s+a\s+(blog|article|post|email|description|letter)\b",
        r"\bdraft\s+(a\s+)?(doc|email|message|proposal)\b",
        r"\breadme\b", r"\bchangelog\b", r"\bdocumentation\b",
    ], 1.0),
    ("quick_answer", [
        r"\bщо\s+таке\b", r"\bwhat\s+is\s+(a|an|the)?\b",
        r"\bhow\s+to\s+\w+\b", r"\bdefinition\s+of\b",
        r"\bшвидко\b", r"\bсинтаксис\s+\w+\b",
        r"\bgive\s+me\s+an?\s+example\b", r"\bexample\s+of\b",
    ], 0.9),
    ("vision", [
        r"\bзображен\w+\b", r"\bфото\b", r"\bimage\s+(analysis|recognition|detect)\b",
        r"\bскріншот\b", r"\bscreenshot\b",
        r"\bвізуальн\w+\s+аналіз\b", r"\bвідео\s+(аналіз|розпізна)\b",
    ], 1.0),
    ("math_code", [
        r"\bалгоритм\s+\w+\b", r"\balgorithm\s+(for|to)\b",
        r"\bсортуван\w+\b", r"\bsort(ing)?\s+algorithm\b",
        r"\bdynamic\s+programming\b", r"\bgraph\s+(algorithm|traversal|search)\b",
        r"\bmatrix\s+(mult|inver|decomp)\b",
        r"\bcalculate\s+\w+\b", r"\bcompute\s+\w+\b",
        # Root match: "Дейкстра"/"Дейкстри" never match with a trailing \b.
        r"\bformula\s+(for|to)\b", r"\bДейкстр", r"\bDijkstra\b",
    ], 1.0),
    # Chatbot / conversational — greetings, small talk, acknowledgements
    ("chatbot", [
        r"^(привіт|вітаю|добрий|доброго|hi|hello|hey)\b",
        r"^(дякую|спасибі|thank|thanks)\b",
        r"^(ок|добре|зрозумів|зрозуміло|so?|ok|yes|no|ні|так)\s*[,!.]?\s*$",
        r"\bяк\s+(справи|діла|ся маєш)\b", r"\bhow\s+are\s+you\b",
    ], 0.8),
]
# Pre-compile patterns once for performance
_COMPILED_PATTERNS: Optional[List[Tuple[str, List[re.Pattern], float]]] = None


def _get_compiled_patterns() -> List[Tuple[str, List[re.Pattern], float]]:
    """Compile TASK_PATTERNS once (case-insensitive) and memoize the result."""
    global _COMPILED_PATTERNS
    if _COMPILED_PATTERNS is None:
        _COMPILED_PATTERNS = [
            (name, [re.compile(expr, re.IGNORECASE) for expr in exprs], weight)
            for name, exprs, weight in TASK_PATTERNS
        ]
    return _COMPILED_PATTERNS
# ── Model catalog ──────────────────────────────────────────────────────────────
@dataclass
class ModelSpec:
    """Catalog entry describing one routable model (cloud API or local Ollama)."""

    profile_name: str        # unique key used in TASK_MODEL_PRIORITY lists
    provider: str            # e.g. "anthropic", "grok", "ollama"
    model_id: str            # provider-side model identifier / Ollama tag
    api_key_env: str = ""    # env var holding the API key (unused for local)
    strengths: List[str] = field(default_factory=list)  # task types this model suits
    cost_tier: int = 1  # 0=free(local), 1=cheap, 2=mid, 3=expensive
    speed_tier: int = 1  # 1=fast, 2=medium, 3=slow
    context_k: int = 8  # context window in thousands
    local: bool = False      # True → served by local Ollama daemon
    max_tokens: int = 4096   # generation cap passed to the provider
    vram_gb: float = 0.0     # approximate VRAM footprint (local models only)
    description: str = ""

    @property
    def available(self) -> bool:
        """Local models: currently listed by Ollama; cloud: API key env is set."""
        if self.local:
            return _is_ollama_model_available(self.model_id)
        return bool(os.getenv(self.api_key_env, "").strip())

    @property
    def has_credits(self) -> bool:
        """False while the provider is marked budget-exhausted (ProviderBudget)."""
        return ProviderBudget.is_available(self.provider)
# ── Ollama model availability cache ───────────────────────────────────────────
_ollama_available_models: Optional[List[str]] = None
_ollama_cache_ts: float = 0.0
_OLLAMA_CACHE_TTL = 60.0


def _is_ollama_model_available(model_id: str) -> bool:
    """True when the Ollama daemon lists *model_id* (exact tag or same base name).

    The tag list is cached for ``_OLLAMA_CACHE_TTL`` seconds and refreshed
    synchronously when stale.
    """
    global _ollama_available_models, _ollama_cache_ts
    if _ollama_available_models is None or (time.time() - _ollama_cache_ts) > _OLLAMA_CACHE_TTL:
        _refresh_ollama_models_sync()
    known = _ollama_available_models
    if not known:
        return False
    wanted = model_id.lower()
    wanted_base = wanted.split(":")[0]
    return any(
        name.lower() == wanted or name.lower().split(":")[0] == wanted_base
        for name in known
    )
def _refresh_ollama_models_sync() -> None:
    """Blocking refresh of the Ollama tag cache (2s timeout; errors → [])."""
    global _ollama_available_models, _ollama_cache_ts
    import json as _json
    import urllib.request
    base_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
    try:
        with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2) as resp:
            payload = _json.loads(resp.read())
        names = [entry["name"] for entry in payload.get("models", [])]
    except Exception:
        # Unreachable daemon or bad payload: cache an empty list so callers
        # fall back to cloud models until the TTL expires.
        names = []
    _ollama_available_models = names
    _ollama_cache_ts = time.time()
async def refresh_ollama_models_async() -> List[str]:
    """Async variant of the tag refresh (httpx, 2s timeout).

    On failure the prior cache is kept (or [] if none) and — unlike the sync
    refresh — the cache timestamp is NOT advanced, so a sync retry may follow.
    """
    global _ollama_available_models, _ollama_cache_ts
    try:
        import httpx
        base_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        async with httpx.AsyncClient(timeout=2.0) as client:
            resp = await client.get(f"{base_url}/api/tags")
            payload = resp.json()
            _ollama_available_models = [entry["name"] for entry in payload.get("models", [])]
            _ollama_cache_ts = time.time()
            return _ollama_available_models
    except Exception:
        _ollama_available_models = _ollama_available_models or []
        return _ollama_available_models
# ── Full model catalog ─────────────────────────────────────────────────────────
# Legend (see ModelSpec): cost_tier 0=free(local)…3=expensive; speed_tier
# 1=fast…3=slow; context_k = context window in thousands of tokens; vram_gb
# applies to local (Ollama) models only. "NODA2" in descriptions refers to the
# local Ollama host. Availability is computed at read time from env keys /
# the Ollama tag list, never hard-coded here.
SOFIIA_MODEL_CATALOG: List[ModelSpec] = [
    # ── Anthropic Claude ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_claude_sonnet",
        provider="anthropic", model_id="claude-sonnet-4-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "architecture", "security", "reasoning"],
        cost_tier=2, speed_tier=2, context_k=200, max_tokens=8192,
        description="Claude Sonnet 4.5 — найкращий для коду та архітектури",
    ),
    ModelSpec(
        profile_name="cloud_claude_haiku",
        provider="anthropic", model_id="claude-haiku-3-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["quick_answer", "code_review", "creative", "analysis", "chatbot"],
        cost_tier=1, speed_tier=1, context_k=200, max_tokens=4096,
        description="Claude Haiku 3.5 — швидкий та дешевий",
    ),
    # ── xAI Grok ─────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_grok",
        provider="grok", model_id="grok-4-1-fast-reasoning",
        api_key_env="GROK_API_KEY",
        strengths=["reasoning", "architecture", "analysis", "code_gen"],
        cost_tier=2, speed_tier=1, context_k=2000, max_tokens=8192,
        description="Grok 4.1 Fast — 2M контекст, кращий для reasoning",
    ),
    # ── DeepSeek API ─────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_deepseek",
        provider="deepseek", model_id="deepseek-chat",
        api_key_env="DEEPSEEK_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "devops", "quick_answer"],
        cost_tier=1, speed_tier=2, context_k=64, max_tokens=4096,
        description="DeepSeek Chat — дешевий і добре знає код/devops",
    ),
    # ── GLM-5 / Z.AI (API) ───────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_glm5",
        provider="glm", model_id="glm-4-plus",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "analysis", "code_gen", "chatbot"],
        cost_tier=1, speed_tier=1, context_k=128, max_tokens=4096,
        description="GLM-4 Plus (Z.AI) — швидкий, дешевий, гарно знає українську/CJK",
    ),
    ModelSpec(
        profile_name="cloud_glm5_flash",
        provider="glm", model_id="glm-4-flash",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "chatbot"],
        cost_tier=0, speed_tier=1, context_k=128, max_tokens=2048,
        description="GLM-4 Flash (Z.AI) — безкоштовний, найшвидший",
    ),
    # ── Mistral AI (API) ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_mistral",
        provider="mistral", model_id="mistral-large-latest",
        api_key_env="MISTRAL_API_KEY",
        strengths=["analysis", "creative", "reasoning", "architecture"],
        cost_tier=2, speed_tier=2, context_k=128, max_tokens=4096,
        description="Mistral Large — добрий для аналізу та creative",
    ),
    # ── Local: qwen3.5:35b-a3b (FLAGSHIP) ────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen35_35b",
        provider="ollama", model_id="qwen3.5:35b-a3b",
        strengths=["code_gen", "code_debug", "code_refactor", "reasoning", "architecture",
                   "analysis", "devops", "security", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=32, max_tokens=4096,
        local=True, vram_gb=24.0,
        description="Qwen3.5 35B MoE (NODA2) — флагман локально, якість ≈ cloud",
    ),
    # ── Local: qwen3:14b ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen3_14b",
        provider="ollama", model_id="qwen3:14b",
        strengths=["code_gen", "code_debug", "quick_answer", "devops", "analysis", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=32, max_tokens=2048,
        local=True, vram_gb=10.0,
        description="Qwen3 14B (NODA2) — швидкий локальний загальний",
    ),
    # ── Local: glm-4.7-flash:32k ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_glm47_32k",
        provider="ollama", model_id="glm-4.7-flash:32k",
        strengths=["quick_answer", "creative", "analysis", "code_review", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=32, max_tokens=2048,
        local=True, vram_gb=20.0,
        description="GLM-4.7 Flash 32K (NODA2) — локальний GLM, великий контекст",
    ),
    # ── Local: deepseek-r1:70b ────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_r1_70b",
        provider="ollama", model_id="deepseek-r1:70b",
        strengths=["reasoning", "math_code", "architecture", "analysis"],
        cost_tier=0, speed_tier=3, context_k=64, max_tokens=4096,
        local=True, vram_gb=48.0,
        description="DeepSeek-R1 70B (NODA2) — локальний reasoning як o1",
    ),
    # ── Local: deepseek-coder:33b ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_coder_33b",
        provider="ollama", model_id="deepseek-coder:33b",
        strengths=["code_gen", "code_debug", "code_refactor", "math_code"],
        cost_tier=0, speed_tier=2, context_k=16, max_tokens=2048,
        local=True, vram_gb=20.0,
        description="DeepSeek Coder 33B (NODA2) — спеціаліст по коду",
    ),
    # ── Local: gemma3:latest ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gemma3",
        provider="ollama", model_id="gemma3:latest",
        strengths=["quick_answer", "analysis", "creative", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=8, max_tokens=2048,
        local=True, vram_gb=8.0,
        description="Gemma3 (NODA2) — Google's ефективна модель",
    ),
    # ── Local: mistral-nemo:12b ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_mistral_nemo",
        provider="ollama", model_id="mistral-nemo:12b",
        strengths=["creative", "quick_answer", "analysis", "chatbot"],
        cost_tier=0, speed_tier=2, context_k=128, max_tokens=2048,
        local=True, vram_gb=8.0,
        description="Mistral Nemo 12B (NODA2) — 128K контекст локально",
    ),
    # ── Local: starcoder2:3b ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_starcoder2",
        provider="ollama", model_id="starcoder2:3b",
        strengths=["code_gen", "code_review"],
        cost_tier=0, speed_tier=1, context_k=16, max_tokens=2048,
        local=True, vram_gb=2.0,
        description="StarCoder2 3B (NODA2) — мікро-модель для code completion",
    ),
    # ── Local: phi3:latest ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_phi3",
        provider="ollama", model_id="phi3:latest",
        strengths=["quick_answer", "analysis", "chatbot"],
        cost_tier=0, speed_tier=1, context_k=128, max_tokens=2048,
        local=True, vram_gb=4.0,
        description="Phi-3 (NODA2) — Microsoft мала ефективна модель",
    ),
    # ── Local: llava:13b (vision) ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_llava_13b",
        provider="ollama", model_id="llava:13b",
        strengths=["vision"],
        cost_tier=0, speed_tier=2, context_k=4, max_tokens=2048,
        local=True, vram_gb=10.0,
        description="LLaVA 13B (NODA2) — vision модель для зображень",
    ),
    # ── Local: gpt-oss:latest ─────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gpt_oss",
        provider="ollama", model_id="gpt-oss:latest",
        strengths=["code_gen", "quick_answer"],
        cost_tier=0, speed_tier=2, context_k=8, max_tokens=2048,
        local=True, vram_gb=8.0,
        description="GPT-OSS (NODA2) — відкрита OSS GPT-like модель",
    ),
]
# ── Task → preferred model matrix ─────────────────────────────────────────────
# Keys are task types from TASK_PATTERNS (plus "unknown"); values are ordered
# lists of ModelSpec.profile_name. List position is the dominant scoring
# signal in select_model_auto — earlier entries win unless unavailable or
# budget-exhausted.
TASK_MODEL_PRIORITY: Dict[str, List[str]] = {
    # Principle: local-first for tasks where local quality is sufficient.
    # Cloud only when the task genuinely needs it (complex code, deep reasoning,
    # very long context, security audits).
    #
    # qwen3.5:35b-a3b is the flagship local — MoE with cloud-level quality.
    # It should be preferred over cloud APIs for most routine tasks.
    "code_gen": [
        "local_qwen35_35b", "cloud_claude_sonnet", "local_deepseek_coder_33b",
        "cloud_deepseek", "local_qwen3_14b", "cloud_grok",
    ],
    "code_debug": [
        "local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
        "cloud_deepseek", "local_qwen3_14b",
    ],
    "code_review": [
        "local_qwen35_35b", "cloud_claude_haiku", "local_deepseek_coder_33b",
        "cloud_claude_sonnet", "cloud_deepseek",
    ],
    "code_refactor": [
        "local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
        "cloud_deepseek", "local_qwen3_14b",
    ],
    "math_code": [
        "local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
        "cloud_claude_sonnet", "local_deepseek_coder_33b",
    ],
    "architecture": [
        "local_qwen35_35b", "cloud_grok", "cloud_claude_sonnet",
        "local_deepseek_r1_70b", "cloud_mistral",
    ],
    "devops": [
        "local_qwen35_35b", "local_qwen3_14b", "cloud_deepseek",
        "cloud_claude_sonnet", "local_glm47_32k",
    ],
    # Security audits are the one task that is cloud-first by design.
    "security": [
        "cloud_claude_sonnet", "local_qwen35_35b", "cloud_grok", "cloud_mistral",
    ],
    "reasoning": [
        "local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
        "cloud_claude_sonnet", "cloud_mistral",
    ],
    "analysis": [
        "local_qwen35_35b", "local_glm47_32k", "cloud_grok",
        "cloud_claude_haiku", "local_mistral_nemo", "cloud_mistral",
    ],
    "creative": [
        "local_qwen35_35b", "local_mistral_nemo", "cloud_claude_haiku",
        "local_glm47_32k", "cloud_mistral",
    ],
    "quick_answer": [
        "local_qwen3_14b", "local_qwen35_35b", "local_phi3",
        "local_gemma3", "cloud_deepseek", "cloud_glm5_flash",
    ],
    "chatbot": [
        "local_qwen3_14b", "local_qwen35_35b", "local_gemma3",
        "local_phi3", "local_mistral_nemo",
    ],
    "vision": [
        "local_llava_13b",
    ],
    "unknown": [
        "local_qwen35_35b", "local_qwen3_14b", "cloud_claude_sonnet",
        "cloud_grok", "cloud_deepseek",
    ],
}
# ── Budget integration ─────────────────────────────────────────────────────────
class ProviderBudget:
    """In-memory budget gate: marks providers exhausted until TTL expires."""

    # provider name → epoch seconds at which it was marked exhausted
    _exhausted: Dict[str, float] = {}
    _exhausted_ttl: int = 3600

    @classmethod
    def mark_exhausted(cls, provider: str) -> None:
        """Record *provider* as out of budget as of now."""
        cls._exhausted[provider] = time.time()
        logger.warning("💸 Provider %s marked as budget-exhausted", provider)

    @classmethod
    def is_available(cls, provider: str) -> bool:
        """True unless *provider* was marked exhausted within the TTL.

        Expired marks are lazily removed on lookup.
        """
        marked_at = cls._exhausted.get(provider)
        if marked_at is None:
            return True
        if time.time() - marked_at <= cls._exhausted_ttl:
            return False
        cls._exhausted.pop(provider, None)
        return True

    @classmethod
    def reset(cls, provider: str) -> None:
        """Clear any exhaustion mark for *provider* (no-op if absent)."""
        cls._exhausted.pop(provider, None)
# ── Task classification ────────────────────────────────────────────────────────
@dataclass
class ClassificationResult:
    """Outcome of classify_task_detailed()."""

    task_type: str                   # best-scoring task label ("unknown"/"chatbot" fallbacks)
    confidence: float                # 0..1, rounded to 3 decimals by the classifier
    all_scores: Dict[str, float]     # top (≤5) raw scores keyed by task type
    ambiguous: bool = False          # True when runner-up scores within 30% of the best
    runner_up: Optional[str] = None  # second-best task label when ambiguous
def classify_task(prompt: str, context_len: int = 0) -> Tuple[str, float]:
    """Classify prompt into a task type. Returns (task_type, confidence)."""
    detailed = classify_task_detailed(prompt, context_len)
    return (detailed.task_type, detailed.confidence)
def classify_task_detailed(prompt: str, context_len: int = 0) -> ClassificationResult:
    """Classify *prompt* with ambiguity detection and per-task scores.

    Empty prompts default to "chatbot"; prompts matching nothing default to
    "unknown". Confidence is scaled down for very short prompts and floored
    at 0.5 for long conversations.
    """
    if not prompt or not prompt.strip():
        return ClassificationResult("chatbot", 0.5, {}, ambiguous=False)
    text = prompt.strip()

    # Score each task type by the fraction of its patterns that match,
    # scaled by the group's base weight.
    scores: Dict[str, float] = {}
    for name, patterns, weight in _get_compiled_patterns():
        matched = [p for p in patterns if p.search(text)]
        if matched:
            scores[name] = (len(matched) / len(patterns)) * weight
    if not scores:
        return ClassificationResult("unknown", 0.3, {}, ambiguous=False)

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    top_task, top_score = ranked[0]
    confidence = min(top_score * 10, 1.0)

    # Short prompts carry fewer signals → scale confidence down.
    n_words = len(text.split())
    if n_words <= 3:
        confidence *= 0.6
    elif n_words <= 8:
        confidence *= 0.85

    # Ambiguous when the runner-up scores within 30% of the winner.
    ambiguous, runner_up = False, None
    if len(ranked) >= 2:
        second_score = ranked[1][1]
        if second_score > 0 and second_score / top_score > 0.7:
            ambiguous, runner_up = True, ranked[1][0]

    # Long conversations: floor confidence (influences scoring downstream,
    # not the chosen label).
    if context_len > 50:
        confidence = max(confidence, 0.5)

    return ClassificationResult(
        task_type=top_task,
        confidence=round(confidence, 3),
        all_scores={k: round(v, 4) for k, v in ranked[:5]},
        ambiguous=ambiguous,
        runner_up=runner_up,
    )
def _prompt_complexity(prompt: str) -> str:
    """Estimate prompt complexity: "simple", "medium", or "complex".

    Heuristic on word count, line count and ``` code-fence markers.
    """
    n_words = len(prompt.split())
    n_lines = prompt.count("\n")
    n_fences = prompt.count("```")
    if n_words < 20 and n_lines < 3 and n_fences == 0:
        return "simple"
    if n_words > 200 or n_fences >= 2 or n_lines > 20:
        return "complex"
    return "medium"
# ── Main selection function ────────────────────────────────────────────────────
@dataclass
class AutoRouteResult:
    """Final routing decision returned by select_model_auto()."""

    profile_name: str    # chosen ModelSpec.profile_name
    model_id: str        # provider-side model identifier
    provider: str        # e.g. "anthropic", "ollama"; "unknown" if unresolved
    task_type: str       # classified task label
    confidence: float    # classification confidence, 0..1
    complexity: str      # "simple" | "medium" | "complex"
    reason: str          # human-readable " | "-joined selection rationale
    fallback_used: bool = False  # True when winner was not in the top-2 priorities
    all_candidates: List[str] = field(default_factory=list)  # top ≤5 scored profiles
    ambiguous: bool = False          # propagated from ClassificationResult
    runner_up: Optional[str] = None  # propagated from ClassificationResult
    all_scores: Dict[str, float] = field(default_factory=dict)  # classifier scores
all_scores: Dict[str, float] = field(default_factory=dict)
def select_model_auto(
    prompt: str,
    force_fast: bool = False,
    force_capable: bool = False,
    prefer_local: bool = False,
    prefer_cheap: bool = False,
    budget_aware: bool = True,
    context_messages_len: int = 0,
) -> AutoRouteResult:
    """
    Cursor-style auto model selection for Sofiia.

    Logic:
        1. Classify task type from prompt (with ambiguity detection)
        2. Estimate complexity (simple/medium/complex)
        3. Apply modifiers (force_fast, force_capable, prefer_local, prefer_cheap)
        4. Score candidates from priority list factoring availability, budget, speed, cost
        5. For long conversations, prefer large-context models

    Args:
        prompt: user prompt to classify and route.
        force_fast: collapse non-code tasks to quick_answer and penalize slow models.
        force_capable: reward larger context windows in scoring.
        prefer_local: restrict to local (Ollama) candidates when any exist,
            and penalize/discount accordingly in scoring.
        prefer_cheap: discount low cost tiers; simple chat-like prompts also
            collapse to quick_answer.
        budget_aware: heavily penalize providers marked budget-exhausted.
        context_messages_len: number of prior conversation messages.

    Returns:
        AutoRouteResult describing the chosen profile and the rationale.
    """
    classification = classify_task_detailed(prompt, context_messages_len)
    task_type = classification.task_type
    confidence = classification.confidence
    complexity = _prompt_complexity(prompt)
    effective_task = task_type
    # Modifier overrides (parentheses fix for operator precedence)
    if force_fast and task_type not in ("code_gen", "code_debug", "math_code"):
        effective_task = "quick_answer"
    if (prefer_cheap or complexity == "simple") and task_type in ("quick_answer", "creative", "chatbot"):
        effective_task = "quick_answer"
    priority_list = TASK_MODEL_PRIORITY.get(effective_task, TASK_MODEL_PRIORITY["unknown"])
    catalog_map = {m.profile_name: m for m in SOFIIA_MODEL_CATALOG}
    candidates = [p for p in priority_list if p in catalog_map]
    if prefer_local:
        local_cands = [p for p in candidates if catalog_map[p].local]
        if local_cands:
            candidates = local_cands

    def _score(profile_name: str) -> float:
        # Lower score wins. Unavailability and budget exhaustion are large
        # additive penalties rather than hard filters, so something is always
        # selectable.
        spec = catalog_map[profile_name]
        score = 0.0
        if not spec.available:
            score += 1000
        if budget_aware and not spec.has_credits:
            score += 500
        # Priority-list position is the strongest signal
        try:
            pos = priority_list.index(profile_name)
            score += pos * 20
        except ValueError:
            score += 200
        if prefer_local and not spec.local:
            score += 200
        if force_fast:
            score += spec.speed_tier * 15
        if prefer_cheap or prefer_local:
            score -= spec.cost_tier * 20
        else:
            score += spec.cost_tier * 2
        if force_capable:
            score -= spec.context_k / 100
        # Complex prompts punish small context windows.
        if complexity == "complex" and spec.context_k < 32:
            score += 40
        # Long conversation bonus for large-context models
        if context_messages_len > 30 and spec.context_k >= 128:
            score -= 15
        elif context_messages_len > 50 and spec.context_k < 32:
            score += 25
        return score

    scored = sorted([c for c in candidates if c in catalog_map], key=_score)
    if not scored:
        # Last-resort fallbacks when the priority list yielded nothing.
        for fallback in ["local_qwen35_35b", "local_qwen3_14b", "local_phi3"]:
            if fallback in catalog_map:
                scored = [fallback]
                break
    best = scored[0] if scored else "local_qwen3_14b"
    spec = catalog_map.get(best)
    # "fallback" here means: winner was not one of the top-2 priority entries.
    fallback_used = best not in priority_list[:2]
    reasons: List[str] = [f"task={task_type} ({confidence:.0%})", f"complexity={complexity}"]
    if classification.ambiguous:
        reasons.append(f"ambiguous (runner_up={classification.runner_up})")
    if force_fast:
        reasons.append("force_fast")
    if prefer_local:
        reasons.append("prefer_local")
    if prefer_cheap:
        reasons.append("prefer_cheap")
    if force_capable:
        reasons.append("force_capable")
    if context_messages_len > 30:
        reasons.append(f"long_conversation({context_messages_len})")
    if fallback_used:
        reasons.append("fallback (top unavailable)")
    return AutoRouteResult(
        profile_name=best,
        model_id=spec.model_id if spec else best,
        provider=spec.provider if spec else "unknown",
        task_type=task_type,
        confidence=confidence,
        complexity=complexity,
        reason=" | ".join(reasons),
        fallback_used=fallback_used,
        all_candidates=scored[:5],
        ambiguous=classification.ambiguous,
        runner_up=classification.runner_up,
        all_scores=classification.all_scores,
    )
def explain_selection(result: AutoRouteResult) -> str:
    """Human-readable explanation of model selection (for debug/UI)."""
    parts = [
        f"Auto-selected **{result.model_id}** ({result.provider})",
        f"Task: `{result.task_type}` | Complexity: `{result.complexity}` | "
        f"Confidence: {result.confidence:.0%}",
        f"Reason: {result.reason}",
    ]
    if result.ambiguous:
        parts.append(f"Ambiguous: runner-up was `{result.runner_up}`")
    if result.all_scores:
        # Show at most the first three classifier scores.
        shown = list(result.all_scores.items())[:3]
        parts.append("Scores: " + ", ".join(f"{name}={val:.3f}" for name, val in shown))
    return "\n".join(parts)
def get_full_catalog() -> List[Dict[str, Any]]:
    """Return full model catalog with availability status for dashboard."""
    entries: List[Dict[str, Any]] = []
    for spec in SOFIIA_MODEL_CATALOG:
        entries.append({
            "profile_name": spec.profile_name,
            "provider": spec.provider,
            "model_id": spec.model_id,
            "description": spec.description,
            "strengths": spec.strengths,
            "cost_tier": spec.cost_tier,
            "speed_tier": spec.speed_tier,
            "context_k": spec.context_k,
            "local": spec.local,
            "vram_gb": spec.vram_gb,
            # Computed properties: env keys / Ollama tags and budget state.
            "available": spec.available,
            "has_credits": spec.has_credits,
        })
    return entries

View File

@@ -0,0 +1,473 @@
"""
Tool Governance: RBAC enforcement, Safety Middleware, Audit.
Applies to ALL /v1/tools/* dispatch.
Components:
1. RBAC Matrix enforcement deny without entitlement
2. Tool Safety Middleware limits, redaction, allowlist, audit
3. Audit events structured per-call events (no payload, only metadata)
Usage (in tool_manager.py execute_tool):
from tool_governance import ToolGovernance
governance = ToolGovernance()
# Pre-call
check = governance.pre_call(tool_name, action, agent_id, user_id, workspace_id, input_text)
if not check.allowed:
return ToolResult(success=False, error=check.reason)
# Execute actual tool handler ...
result = await _actual_handler(args)
# Post-call
governance.post_call(check.call_ctx, result, duration_ms)
"""
import hashlib
import ipaddress
import json
import logging
import os
import re
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Config Paths ─────────────────────────────────────────────────────────────
# Governance config is resolved relative to this module: three parent
# directories up, then "config/" (assumed repo layout — confirm if moved).
_CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
_RBAC_PATH = _CONFIG_DIR / "rbac_tools_matrix.yml"  # RBAC entitlement matrix
_LIMITS_PATH = _CONFIG_DIR / "tool_limits.yml"  # per-tool limits
_ALLOWLIST_PATH = _CONFIG_DIR / "network_allowlist.yml"  # outbound URL allowlist
# ─── Data Classes ─────────────────────────────────────────────────────────────
@dataclass
class CallContext:
    """Per-invocation metadata captured at pre_call and echoed by post_call."""

    req_id: str        # unique id for this tool invocation
    tool: str          # tool name being dispatched
    action: str        # action within the tool
    agent_id: str      # calling agent identity
    user_id: str       # end-user identity
    workspace_id: str  # tenant/workspace scope
    ts_start: float    # start timestamp (presumably time.time(); set in pre_call)
    input_hash: str    # hash of the input text (metadata only — no payload kept)
    input_chars: int   # input length in characters
    limits_applied: Dict[str, Any] = field(default_factory=dict)  # limits enforced for this call
@dataclass
class PreCallResult:
    """Verdict of the pre-call governance check."""

    allowed: bool  # False → caller must not execute the tool
    reason: str = ""  # denial reason (surfaced in the ToolResult error)
    call_ctx: Optional[CallContext] = None  # context to pass into post_call when allowed
@dataclass
class AuditEvent:
    """Structured per-call audit record (metadata only — never the payload)."""

    ts: str            # event timestamp (ISO string)
    req_id: str        # matches CallContext.req_id
    tool: str
    action: str
    workspace_id: str
    user_id: str
    agent_id: str
    status: str  # "pass" | "deny" | "error"
    duration_ms: float          # wall-clock duration of the tool call
    limits_applied: Dict[str, Any]  # limits that were in force
    input_hash: str             # hash of the input, not the input itself
    input_chars: int
    output_size_bytes: int      # size of the tool output (content not stored)
# ─── YAML Loader (lazy, cached) ───────────────────────────────────────────────
_yaml_cache: Dict[str, Any] = {}


def _load_yaml(path: Path) -> dict:
    """Load a YAML file once and cache the parsed dict keyed by str(path).

    Any failure (missing file, bad YAML, PyYAML not installed) is logged and
    cached as {} so repeated lookups stay cheap and governance keeps running.

    Fixes: explicit UTF-8 encoding on open (was platform-default) and lazy
    %-style logging args (was an eagerly-built f-string).
    """
    key = str(path)
    if key not in _yaml_cache:
        try:
            import yaml  # local import: PyYAML is treated as optional here
            with open(path, "r", encoding="utf-8") as f:
                _yaml_cache[key] = yaml.safe_load(f) or {}
        except Exception as e:
            logger.warning("Could not load %s: %s", path, e)
            _yaml_cache[key] = {}
    return _yaml_cache[key]


def _reload_yaml_cache():
    """Force reload all yaml caches (for tests / hot-reload)."""
    _yaml_cache.clear()
# ─── Secret Redaction ─────────────────────────────────────────────────────────
# Patterns applied in order; each masks a <label><separator><value> occurrence.
_SECRET_PATTERNS = [
    # API keys / tokens
    re.compile(
        r'(?i)(api[_-]?key|token|secret|password|passwd|pwd|auth|bearer|jwt|'
        r'oauth|private[_-]?key|sk-|ghp_|xoxb-|AKIA|client_secret)'
        r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?',
        re.MULTILINE,
    ),
    # Generic high-entropy strings after known labels
    re.compile(
        r'(?i)(credential|access[_-]?key|refresh[_-]?token|signing[_-]?key)'
        r'[\s=:]+[\'"`]?([a-zA-Z0-9/+]{20,}={0,2})[\'"`]?',
        re.MULTILINE,
    ),
]
def _mask_secret(match) -> str:
    """Collapse a matched secret to '<label>=***REDACTED***' (value is dropped)."""
    return f"{match.group(1)}=***REDACTED***"
def redact(text: str) -> str:
    """Mask secret values in text. Always enabled by default."""
    if not text:
        return text
    masked = text
    for pattern in _SECRET_PATTERNS:
        masked = pattern.sub(_mask_secret, masked)
    return masked
# ─── Network Allowlist Check ──────────────────────────────────────────────────
# RFC1918, loopback, link-local and unique-local ranges treated as "private".
_PRIVATE_RANGES = [
    ipaddress.ip_network(cidr)
    for cidr in (
        "10.0.0.0/8",
        "172.16.0.0/12",
        "192.168.0.0/16",
        "127.0.0.0/8",
        "169.254.0.0/16",
        "::1/128",
        "fc00::/7",
    )
]
def _is_private_ip(host: str) -> bool:
    """Return True when *host* is a literal IP inside a private/loopback range.

    Non-IP hostnames return False (they are vetted by the allowlist instead).
    """
    try:
        parsed = ipaddress.ip_address(host)
    except ValueError:
        return False
    return any(parsed in network for network in _PRIVATE_RANGES)
def check_url_allowed(tool: str, url: str) -> Tuple[bool, str]:
    """
    Check if a URL is allowed for a given tool per network_allowlist.yml.
    Returns (allowed, reason).
    """
    import urllib.parse
    parts = urllib.parse.urlparse(url)
    host = parts.hostname or ""
    scheme = parts.scheme or "https"
    tool_rules = _load_yaml(_ALLOWLIST_PATH).get(tool, {})
    if not tool_rules:
        # No config: deny by default (safe default)
        return False, f"No allowlist config for tool '{tool}'"
    # Scheme must be explicitly permitted (https-only unless configured otherwise).
    if scheme not in tool_rules.get("schemes", ["https"]):
        return False, f"Scheme '{scheme}' not allowed for tool '{tool}'"
    # Wide-open public access, optionally still blocking private ranges.
    if tool_rules.get("allow_any_public"):
        if tool_rules.get("block_private_ranges") and _is_private_ip(host):
            return False, f"Private IP blocked: {host}"
        return True, ""
    # Otherwise the host must appear in the explicit allowlist.
    if host in tool_rules.get("hosts", []):
        return True, ""
    return False, f"Host '{host}' not in allowlist for tool '{tool}'"
# ─── RBAC Matrix ──────────────────────────────────────────────────────────────
def _get_agent_role(agent_id: str) -> str:
    """Resolve agent role (delegates to agent_tools_config)."""
    try:
        import agent_tools_config
        return agent_tools_config.get_agent_role(agent_id)
    except Exception:
        # Config module missing or lookup failed: fall back to the default role.
        return "agent_default"
def _get_role_entitlements(role: str) -> List[str]:
    """Get entitlements for a role from RBAC matrix (falls back to agent_default)."""
    table = _load_yaml(_RBAC_PATH).get("role_entitlements", {})
    if role in table:
        return table[role]
    return table.get("agent_default", [])
def _get_required_entitlements(tool: str, action: str) -> List[str]:
    """Get required entitlements for tool+action from matrix."""
    matrix = _load_yaml(_RBAC_PATH)
    actions = matrix.get("tools", {}).get(tool, {}).get("actions", {})
    # Prefer the exact action entry; fall back to the tool's "_default" entry.
    selected = actions.get(action)
    if not selected:
        selected = actions.get("_default", {})
    if not selected:
        return []
    return selected.get("entitlements", [])
def check_rbac(agent_id: str, tool: str, action: str) -> Tuple[bool, str]:
    """
    Check RBAC: agent role → entitlements → required entitlements for tool+action.
    Returns (allowed, reason).
    """
    role = _get_agent_role(agent_id)
    granted = set(_get_role_entitlements(role))
    required = _get_required_entitlements(tool, action)
    # Tools/actions with no declared entitlements are open to every role.
    if not required:
        return True, ""
    missing = [e for e in required if e not in granted]
    if not missing:
        return True, ""
    return False, f"Missing entitlements: {missing} (agent={agent_id}, role={role})"
# ─── Limits ───────────────────────────────────────────────────────────────────
def _get_limits(tool: str) -> Dict[str, Any]:
    """Get effective limits for a tool (per-tool overrides merged with defaults)."""
    cfg = _load_yaml(_LIMITS_PATH)
    # Hard-coded fallbacks used when tool_limits.yml has no `defaults` section.
    base = cfg.get("defaults", {
        "timeout_ms": 30000,
        "max_chars_in": 200000,
        "max_bytes_out": 524288,
        "rate_limit_rpm": 60,
        "concurrency": 5,
    })
    effective = dict(base)
    effective.update(cfg.get("tools", {}).get(tool, {}))
    return effective
def check_input_limits(tool: str, input_text: str) -> Tuple[bool, str, Dict]:
    """
    Enforce max_chars_in limit.
    Returns (ok, reason, limits_applied).
    """
    limits = _get_limits(tool)
    cap = limits.get("max_chars_in", 200000)
    size = len(input_text) if input_text else 0
    if size <= cap:
        return True, "", limits
    return False, f"Input too large: {size} chars (max {cap} for {tool})", limits
# ─── Audit ────────────────────────────────────────────────────────────────────
def _emit_audit(event: AuditEvent):
    """
    Emit structured audit event.
    1. Writes to logger (structured, no payload).
    2. Persists to AuditStore (JSONL/Postgres/Memory) for FinOps analysis.
    Persistence is non-fatal: errors are logged as warnings without interrupting tool execution.
    """
    import datetime
    # Keep the caller-supplied timestamp when present; otherwise stamp UTC now.
    when = event.ts or datetime.datetime.now(datetime.timezone.utc).isoformat()
    payload = {
        "ts": when,
        "req_id": event.req_id,
        "tool": event.tool,
        "action": event.action,
        "workspace_id": event.workspace_id,
        "user_id": event.user_id,
        "agent_id": event.agent_id,
        "status": event.status,
        "duration_ms": round(event.duration_ms, 2),
        "limits_applied": event.limits_applied,
        "input_hash": event.input_hash,
        "input_chars": event.input_chars,
        "output_size_bytes": event.output_size_bytes,
    }
    logger.info(f"TOOL_AUDIT {json.dumps(payload)}")
    # Persist to audit store (non-fatal)
    try:
        from audit_store import get_audit_store
        get_audit_store().write(event)
    except Exception as store_err:
        logger.warning("audit_store.write failed (non-fatal): %s", store_err)
# ─── Main Governance Class ────────────────────────────────────────────────────
class ToolGovernance:
"""
Single entry point for tool governance.
Call pre_call() before executing any tool.
Call post_call() after execution to emit audit event.
"""
def __init__(self, *, enable_rbac: bool = True, enable_redaction: bool = True,
enable_limits: bool = True, enable_audit: bool = True,
enable_allowlist: bool = True):
self.enable_rbac = enable_rbac
self.enable_redaction = enable_redaction
self.enable_limits = enable_limits
self.enable_audit = enable_audit
self.enable_allowlist = enable_allowlist
def pre_call(
self,
tool: str,
action: str,
agent_id: str,
user_id: str = "unknown",
workspace_id: str = "unknown",
input_text: str = "",
) -> PreCallResult:
"""
Run all pre-call checks. Returns PreCallResult.
If allowed=False, caller must return error immediately.
"""
req_id = str(uuid.uuid4())[:12]
ts_start = time.monotonic()
# 1. RBAC check
if self.enable_rbac:
ok, reason = check_rbac(agent_id, tool, action)
if not ok:
if self.enable_audit:
_emit_audit(AuditEvent(
ts=_now_iso(), req_id=req_id, tool=tool, action=action,
workspace_id=workspace_id, user_id=user_id, agent_id=agent_id,
status="deny", duration_ms=0,
limits_applied={}, input_hash="", input_chars=0, output_size_bytes=0,
))
return PreCallResult(allowed=False, reason=f"RBAC denied: {reason}")
# 2. Input limits
limits_applied = {}
if self.enable_limits and input_text:
ok, reason, limits_applied = check_input_limits(tool, input_text)
if not ok:
if self.enable_audit:
_emit_audit(AuditEvent(
ts=_now_iso(), req_id=req_id, tool=tool, action=action,
workspace_id=workspace_id, user_id=user_id, agent_id=agent_id,
status="deny", duration_ms=0,
limits_applied=limits_applied,
input_hash="", input_chars=len(input_text), output_size_bytes=0,
))
return PreCallResult(allowed=False, reason=f"Limits exceeded: {reason}")
elif not limits_applied:
limits_applied = _get_limits(tool)
# Build call context
input_hash = hashlib.sha256(input_text.encode()).hexdigest()[:16] if input_text else ""
ctx = CallContext(
req_id=req_id,
tool=tool,
action=action,
agent_id=agent_id,
user_id=user_id,
workspace_id=workspace_id,
ts_start=ts_start,
input_hash=input_hash,
input_chars=len(input_text) if input_text else 0,
limits_applied=limits_applied,
)
return PreCallResult(allowed=True, call_ctx=ctx)
def post_call(self, ctx: CallContext, result_value: Any, error: Optional[str] = None):
"""
Emit audit event after tool execution.
result_value: raw result data (used only for size calculation, not logged).
"""
if not self.enable_audit or ctx is None:
return
duration_ms = (time.monotonic() - ctx.ts_start) * 1000
status = "error" if error else "pass"
# Calculate output size (bytes) without logging content
try:
out_bytes = len(json.dumps(result_value).encode()) if result_value is not None else 0
except Exception:
out_bytes = 0
_emit_audit(AuditEvent(
ts=_now_iso(),
req_id=ctx.req_id,
tool=ctx.tool,
action=ctx.action,
workspace_id=ctx.workspace_id,
user_id=ctx.user_id,
agent_id=ctx.agent_id,
status=status,
duration_ms=duration_ms,
limits_applied=ctx.limits_applied,
input_hash=ctx.input_hash,
input_chars=ctx.input_chars,
output_size_bytes=out_bytes,
))
def apply_redaction(self, text: str) -> str:
"""Apply secret redaction if enabled."""
if not self.enable_redaction:
return text
return redact(text)
def check_url(self, tool: str, url: str) -> Tuple[bool, str]:
"""Check URL against allowlist if enabled."""
if not self.enable_allowlist:
return True, ""
return check_url_allowed(tool, url)
def get_timeout_ms(self, tool: str) -> int:
"""Get configured timeout for a tool."""
limits = _get_limits(tool)
return limits.get("timeout_ms", 30000)
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _now_iso() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    from datetime import datetime, timezone
    return datetime.now(timezone.utc).isoformat()
# ─── Module-level singleton ───────────────────────────────────────────────────
# Lazily-created process-wide singleton; tests may swap it via reset_governance().
_governance: Optional[ToolGovernance] = None
def get_governance() -> ToolGovernance:
    """Get the shared ToolGovernance singleton."""
    global _governance
    instance = _governance
    if instance is None:
        instance = ToolGovernance()
        _governance = instance
    return instance
def reset_governance(instance: Optional[ToolGovernance] = None):
    """Reset singleton (for testing); pass an instance to inject a replacement."""
    global _governance
    _governance = instance