feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
138
services/router/alert_ingest.py
Normal file
138
services/router/alert_ingest.py
Normal file
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
alert_ingest.py — Alert ingestion business logic.
|
||||
|
||||
Handles:
|
||||
- AlertEvent validation and normalization
|
||||
- Dedupe-aware ingestion via AlertStore
|
||||
- list/get/ack helpers used by alert_ingest_tool handler
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from alert_store import (
|
||||
AlertStore,
|
||||
_compute_dedupe_key,
|
||||
_redact_text,
|
||||
_sanitize_alert,
|
||||
MAX_LOG_SAMPLES,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Validation ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Accepted values for incoming alert fields.
VALID_SEVERITIES = {"P0", "P1", "P2", "P3", "INFO"}
VALID_KINDS = {
    "slo_breach", "crashloop", "latency", "error_rate",
    "disk", "oom", "deploy", "security", "custom",
}
VALID_ENVS = {"prod", "staging", "dev", "any"}


def validate_alert(data: Dict) -> Optional[str]:
    """Validate a raw alert dict; return an error message, or None when valid."""
    # Required free-text fields.
    for required in ("service", "title"):
        if not data.get(required):
            return f"alert.{required} is required"
    # Enumerated fields fall back to their defaults before checking.
    if data.get("severity", "P2") not in VALID_SEVERITIES:
        return f"alert.severity must be one of {VALID_SEVERITIES}"
    if data.get("kind", "custom") not in VALID_KINDS:
        return f"alert.kind must be one of {VALID_KINDS}"
    return None
|
||||
|
||||
|
||||
def normalize_alert(data: Dict) -> Dict:
    """Sanitize an alert payload and fill in default fields.

    Log samples inside `evidence` are capped at MAX_LOG_SAMPLES and each
    sample is redacted to 300 characters.
    """
    safe = _sanitize_alert(data)
    defaults = {
        "kind": "custom",
        "env": "prod",
        "severity": "P2",
        "labels": {},
        "metrics": {},
        "links": [],
        "evidence": {},
    }
    for field, fallback in defaults.items():
        safe.setdefault(field, fallback)

    # Rebuild evidence with capped + redacted log samples.
    evidence = safe.get("evidence", {})
    samples = evidence.get("log_samples", [])
    rebuilt = dict(evidence)
    rebuilt["log_samples"] = [
        _redact_text(sample, 300) for sample in samples[:MAX_LOG_SAMPLES]
    ]
    safe["evidence"] = rebuilt
    return safe
|
||||
|
||||
|
||||
# ─── Ingest ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def ingest_alert(
    store: AlertStore,
    alert_data: Dict,
    dedupe_ttl_minutes: int = 30,
) -> Dict:
    """
    Validate, normalize, and ingest alert with dedupe.
    Returns the store result dict (or {"accepted": False, "error": ...}
    when validation fails).
    """
    error = validate_alert(alert_data)
    if error is not None:
        return {"accepted": False, "error": error}
    normalized = normalize_alert(alert_data)
    return store.ingest(normalized, dedupe_ttl_minutes=dedupe_ttl_minutes)
|
||||
|
||||
|
||||
# ─── List/Get/Ack ──────────────────────────────────────────────────────────────
|
||||
|
||||
def list_alerts(
    store: AlertStore,
    service: Optional[str] = None,
    env: Optional[str] = None,
    window_minutes: int = 240,
    limit: int = 50,
) -> List[Dict]:
    """List recent alerts, optionally filtered by service and env.

    env="any" disables the env filter; page size is hard-capped at 200.
    """
    filters: Dict = {}
    if service:
        filters["service"] = service
    if env and env != "any":
        filters["env"] = env
    filters["window_minutes"] = window_minutes
    capped_limit = min(limit, 200)
    return store.list_alerts(filters, limit=capped_limit)
|
||||
|
||||
|
||||
def get_alert(store: AlertStore, alert_ref: str) -> Optional[Dict]:
    """Fetch a single alert from the store by reference."""
    result = store.get_alert(alert_ref)
    return result
|
||||
|
||||
|
||||
def ack_alert(store: AlertStore, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
    """Acknowledge an alert; the note is redacted (500-char cap) before storage.

    Returns None when no alert_ref is given.
    """
    if not alert_ref:
        return None
    redacted_note = _redact_text(note, 500)
    return store.ack_alert(alert_ref, actor, redacted_note)
|
||||
|
||||
|
||||
# ─── Dedupe helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def build_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str:
    """Public wrapper around the store's dedupe-key derivation."""
    key = _compute_dedupe_key(service, env, kind, fingerprint)
    return key
|
||||
|
||||
|
||||
def map_alert_severity_to_incident(
    alert_severity: str,
    cap: str = "P1",
) -> str:
    """
    Map alert severity to incident severity, applying a cap.
    e.g. alert P0 with cap P1 → P1.

    Unknown severities fall back to P2; unknown caps fall back to P1.
    """
    rank = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
    severity = alert_severity if alert_severity in rank else "P2"
    ceiling = cap if cap in rank else "P1"
    # The result is whichever of the two is LESS critical (higher rank value):
    # the cap wins whenever the alert would be more critical than allowed.
    return max((severity, ceiling), key=lambda s: rank[s])
|
||||
1031
services/router/alert_store.py
Normal file
1031
services/router/alert_store.py
Normal file
File diff suppressed because it is too large
Load Diff
574
services/router/architecture_pressure.py
Normal file
574
services/router/architecture_pressure.py
Normal file
@@ -0,0 +1,574 @@
|
||||
"""
|
||||
architecture_pressure.py — Architecture Pressure Index (APIx) Engine.
|
||||
DAARION.city | deterministic, no LLM.
|
||||
|
||||
Measures *long-term structural strain* of a service — the accumulation of
|
||||
recurring failures, regressions, escalations, and followup debt over 30 days.
|
||||
|
||||
Contrast with Risk Engine (short-term operational health).
|
||||
|
||||
Public API:
|
||||
load_pressure_policy() -> Dict
|
||||
compute_pressure(service, env, ...) -> PressureReport
|
||||
compute_pressure_dashboard(env, services, ...) -> DashboardResult
|
||||
list_known_services(policy) -> List[str]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Policy ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide cache for the parsed pressure policy; populated lazily by
# load_pressure_policy() and cleared by _reload_pressure_policy().
_PRESSURE_POLICY_CACHE: Optional[Dict] = None
# Candidate policy file locations, tried in order: CWD-relative first, then
# relative to three directories above this module.
_PRESSURE_POLICY_PATHS = [
    Path("config/architecture_pressure_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "architecture_pressure_policy.yml",
]
|
||||
|
||||
|
||||
def load_pressure_policy() -> Dict:
    """Load the pressure policy YAML, caching the parsed result in-process.

    Tries each candidate path in order; falls back to built-in defaults
    when no policy file can be read.
    """
    global _PRESSURE_POLICY_CACHE
    if _PRESSURE_POLICY_CACHE is not None:
        return _PRESSURE_POLICY_CACHE

    for candidate in _PRESSURE_POLICY_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                parsed = yaml.safe_load(fh) or {}
        except Exception as exc:
            # Unreadable/invalid file: log and try the next candidate.
            logger.warning("Failed to load architecture_pressure_policy from %s: %s", candidate, exc)
            continue
        _PRESSURE_POLICY_CACHE = parsed
        return parsed

    _PRESSURE_POLICY_CACHE = _builtin_pressure_defaults()
    return _PRESSURE_POLICY_CACHE
|
||||
|
||||
|
||||
def _reload_pressure_policy() -> None:
    """Drop the cached policy so the next load_pressure_policy() re-reads disk."""
    global _PRESSURE_POLICY_CACHE
    _PRESSURE_POLICY_CACHE = None
|
||||
|
||||
|
||||
def _builtin_pressure_defaults() -> Dict:
    """Hard-coded fallback policy used when no YAML policy file is readable."""
    # Per-signal score weights (see _score_signals for how they combine).
    weights = {
        "recurrence_high_30d": 20,
        "recurrence_warn_30d": 10,
        "regressions_30d": 15,
        "escalations_30d": 12,
        "followups_created_30d": 8,
        "followups_overdue": 15,
        "drift_failures_30d": 10,
        "dependency_high_30d": 10,
    }
    # Architecture-review follow-up creation rules.
    priority_rules = {
        "require_arch_review_at": 70,
        "auto_create_followup": True,
        "followup_priority": "P1",
        "followup_due_days": 14,
        "followup_owner": "cto",
    }
    return {
        "defaults": {"lookback_days": 30, "top_n": 10},
        "weights": weights,
        "bands": {"low_max": 20, "medium_max": 45, "high_max": 70},
        "priority_rules": priority_rules,
        "release_gate": {
            "platform_review_required": {"enabled": True, "warn_at": 60, "fail_at": 85}
        },
        "digest": {
            "output_dir": "ops/reports/platform",
            "max_chars": 12000,
            "top_n_in_digest": 10,
        },
    }
|
||||
|
||||
|
||||
# ─── Band classifier ──────────────────────────────────────────────────────────
|
||||
|
||||
def classify_pressure_band(score: int, policy: Dict) -> str:
    """Translate a numeric pressure score into a band name using policy thresholds."""
    bands = policy.get("bands", {})
    # Checked in ascending order; first threshold the score fits under wins.
    thresholds = (
        ("low", int(bands.get("low_max", 20))),
        ("medium", int(bands.get("medium_max", 45))),
        ("high", int(bands.get("high_max", 70))),
    )
    for band_name, upper_bound in thresholds:
        if score <= upper_bound:
            return band_name
    return "critical"
|
||||
|
||||
|
||||
# ─── Signal scoring helpers ───────────────────────────────────────────────────
|
||||
|
||||
def _score_signals(components: Dict, policy: Dict) -> int:
    """
    Additive scoring:
      recurrence_high_30d, recurrence_warn_30d — boolean (1/0)
      regressions_30d, escalations_30d, ...    — counts (capped internally)
    Returns a non-negative integer score.
    """
    weights = policy.get("weights", {})
    total = 0

    # Boolean presence signals: flat weight when the flag is set at all.
    for flag in ("recurrence_high_30d", "recurrence_warn_30d"):
        if components.get(flag, 0):
            total += int(weights.get(flag, 0))

    # Count-based signals: the first occurrence costs the full weight,
    # each additional occurrence costs half weight (diminishing returns).
    counted_keys = (
        "regressions_30d", "escalations_30d", "followups_created_30d",
        "followups_overdue", "drift_failures_30d", "dependency_high_30d",
    )
    for key in counted_keys:
        occurrences = int(components.get(key, 0))
        if not occurrences:
            continue
        weight = int(weights.get(key, 0))
        total += weight + (occurrences - 1) * max(1, weight // 2)

    return max(0, total)
|
||||
|
||||
|
||||
def _signals_summary(components: Dict, policy: Dict) -> List[str]:
    """Generate human-readable signal descriptions."""
    lines: List[str] = []

    # Boolean signals use fixed phrasing.
    if components.get("recurrence_high_30d"):
        lines.append("High-recurrence alert buckets in last 30d")
    if components.get("recurrence_warn_30d"):
        lines.append("Warn-level recurrence in last 30d")

    # Count signals render as "<label>: <count>" when non-zero.
    counted = (
        ("regressions_30d", "Risk regressions in 30d"),
        ("escalations_30d", "Escalations in 30d"),
        ("followups_created_30d", "Follow-ups created in 30d"),
        ("followups_overdue", "Overdue follow-ups"),
        ("drift_failures_30d", "Drift gate failures in 30d"),
        ("dependency_high_30d", "Dependency HIGH/CRITICAL findings in 30d"),
    )
    for key, label in counted:
        value = int(components.get(key, 0))
        if value:
            lines.append(f"{label}: {value}")

    return lines
|
||||
|
||||
|
||||
# ─── Signal collection from stores ───────────────────────────────────────────
|
||||
|
||||
def fetch_pressure_signals(
    service: str,
    env: str,
    lookback_days: int = 30,
    *,
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Collect all signals needed for compute_pressure from existing stores.
    Always non-fatal per store: any store failure logs a warning and leaves
    that store's signals at 0.

    Args:
        service / env: target service identity.
        lookback_days: history window for the 30d-style signals.
        incident_store / alert_store / risk_history_store: optional stores;
            any may be None, in which case its signals stay 0.
        policy: pre-loaded pressure policy (loaded lazily when None).

    Returns a components dict ready to pass to compute_pressure.
    """
    if policy is None:
        policy = load_pressure_policy()

    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(days=lookback_days)
    ).isoformat()

    components: Dict = {
        "recurrence_high_30d": 0,
        "recurrence_warn_30d": 0,
        "regressions_30d": 0,
        "escalations_30d": 0,
        "followups_created_30d": 0,
        "followups_overdue": 0,
        "drift_failures_30d": 0,
        "dependency_high_30d": 0,
    }

    # ── Escalations + followups from incident_store ───────────────────────────
    try:
        if incident_store is not None:
            # Hoisted out of the event loop — one clock read per call.
            today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
            incs = incident_store.list_incidents({"service": service}, limit=100)
            for inc in incs:
                inc_id = inc.get("id", "")
                try:
                    events = incident_store.get_events(inc_id, limit=200)
                    for ev in events:
                        if ev.get("ts", "") < cutoff:
                            continue
                        ev_type = ev.get("type", "")
                        msg = ev.get("message") or ""
                        # Escalation decisions (message convention "Escalat…")
                        if ev_type == "decision" and "Escalat" in msg:
                            components["escalations_30d"] += 1
                        # Follow-up creation events
                        if ev_type in ("followup", "follow_up") or "followup" in msg.lower():
                            components["followups_created_30d"] += 1
                        # Overdue follow-ups: still open past their due date
                        if ev_type == "followup":
                            due = ev.get("due_date", "")
                            status = ev.get("status", "")
                            if status == "open" and due and due < today:
                                components["followups_overdue"] += 1
                except Exception as e:
                    logger.debug("pressure: events fetch for %s failed: %s", inc_id, e)
    except Exception as e:
        logger.warning("pressure: incident_store fetch failed: %s", e)

    # ── Regressions from risk_history_store ───────────────────────────────────
    try:
        if risk_history_store is not None:
            series = risk_history_store.get_series(service, env, limit=90)
            # BUGFIX: the previous version re-ran the consecutive-snapshot
            # comparison once per in-window snapshot (nested loop), inflating
            # regressions_30d by up to len(series)×. Compare each consecutive
            # pair exactly once: a regression is a score increase over the
            # previous snapshot, landing inside the lookback window.
            ordered = sorted(series, key=lambda s: s.get("ts", ""))
            for prev, curr in zip(ordered, ordered[1:]):
                if (curr.get("ts", "") >= cutoff
                        and curr.get("score", 0) > prev.get("score", 0)):
                    components["regressions_30d"] += 1
    except Exception as e:
        logger.warning("pressure: risk_history_store fetch failed: %s", e)

    # ── Recurrence from alert_store top_signatures ───────────────────────────
    try:
        if alert_store is not None:
            # Approximate the 30-day window via a large minutes window.
            sigs = alert_store.top_signatures(
                window_minutes=lookback_days * 24 * 60, limit=30
            )
            # Simplified recurrence thresholds: >=6 occurrences → high,
            # >=3 → warn (flags, not counts).
            for sig in sigs:
                occ = int(sig.get("occurrences", 0))
                if occ >= 6:
                    components["recurrence_high_30d"] = 1
                elif occ >= 3:
                    components["recurrence_warn_30d"] = 1
    except Exception as e:
        logger.warning("pressure: alert_store recurrence fetch failed: %s", e)

    return components
|
||||
|
||||
|
||||
# ─── Core engine ──────────────────────────────────────────────────────────────
|
||||
|
||||
def compute_pressure(
    service: str,
    env: str = "prod",
    *,
    components: Optional[Dict] = None,
    lookback_days: int = 30,
    policy: Optional[Dict] = None,
    # Optional stores for signal collection when components not pre-fetched
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
) -> Dict:
    """
    Compute Architecture Pressure score for a service.

    If `components` is provided, no stores are accessed.
    Otherwise, signals are collected from stores (non-fatal fallbacks).

    Returns a PressureReport dict.
    """
    if policy is None:
        policy = load_pressure_policy()

    # lookback_days=0/None falls back to the policy default.
    effective_days = lookback_days or int(
        policy.get("defaults", {}).get("lookback_days", 30)
    )

    if components is None:
        signal_set = fetch_pressure_signals(
            service, env, effective_days,
            incident_store=incident_store,
            alert_store=alert_store,
            risk_history_store=risk_history_store,
            policy=policy,
        )
    else:
        # Copy so the caller's dict is never mutated.
        signal_set = dict(components)

    # Backfill any missing component keys with zero.
    for key in (
        "recurrence_high_30d", "recurrence_warn_30d", "regressions_30d",
        "escalations_30d", "followups_created_30d", "followups_overdue",
        "drift_failures_30d", "dependency_high_30d",
    ):
        signal_set.setdefault(key, 0)

    score = _score_signals(signal_set, policy)

    # Architecture review required above the policy threshold.
    review_threshold = int(
        policy.get("priority_rules", {}).get("require_arch_review_at", 70)
    )

    return {
        "service": service,
        "env": env,
        "lookback_days": effective_days,
        "score": score,
        "band": classify_pressure_band(score, policy),
        "components": signal_set,
        "signals_summary": _signals_summary(signal_set, policy),
        "requires_arch_review": score >= review_threshold,
        "computed_at": datetime.datetime.utcnow().isoformat(),
    }
|
||||
|
||||
|
||||
# ─── Dashboard ────────────────────────────────────────────────────────────────
|
||||
|
||||
def compute_pressure_dashboard(
    env: str = "prod",
    services: Optional[List[str]] = None,
    top_n: int = 10,
    *,
    policy: Optional[Dict] = None,
    incident_store=None,
    alert_store=None,
    risk_history_store=None,
    risk_reports: Optional[Dict[str, Dict]] = None,
) -> Dict:
    """
    Compute Architecture Pressure for multiple services and return a dashboard.

    `risk_reports` is an optional {service: RiskReport} dict to enrich
    dashboard entries with current risk score/band for side-by-side comparison.
    Per-service failures are logged and skipped (non-fatal).
    """
    if policy is None:
        policy = load_pressure_policy()

    # top_n=0/None falls back to the policy default.
    page_size = top_n or int(policy.get("defaults", {}).get("top_n", 10))

    if not services:
        services = _list_services_from_stores(
            env=env, incident_store=incident_store, policy=policy
        )

    reports: List[Dict] = []
    for name in services:
        try:
            entry = compute_pressure(
                name, env,
                policy=policy,
                incident_store=incident_store,
                alert_store=alert_store,
                risk_history_store=risk_history_store,
            )
            # Attach the current short-term risk view when available.
            if risk_reports and name in risk_reports:
                rr = risk_reports[name]
                entry["risk_score"] = rr.get("score")
                entry["risk_band"] = rr.get("band")
                entry["risk_delta_24h"] = (rr.get("trend") or {}).get("delta_24h")
            reports.append(entry)
        except Exception as e:
            logger.warning("pressure dashboard: compute_pressure failed for %s: %s", name, e)

    # Highest pressure first.
    reports.sort(key=lambda r: -r.get("score", 0))

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for entry in reports:
        band = entry.get("band", "low")
        band_counts[band] = band_counts.get(band, 0) + 1

    return {
        "env": env,
        "computed_at": datetime.datetime.utcnow().isoformat(),
        "top_pressure_services": reports[:page_size],
        "band_counts": band_counts,
        "critical_services": [r["service"] for r in reports if r.get("band") == "critical"],
        "high_services": [r["service"] for r in reports if r.get("band") in ("high", "critical")],
        "arch_review_required": [r["service"] for r in reports if r.get("requires_arch_review")],
        "total_services_evaluated": len(reports),
    }
|
||||
|
||||
|
||||
def _list_services_from_stores(
    env: str,
    incident_store=None,
    policy: Optional[Dict] = None,
) -> List[str]:
    """Infer known services from incident store, falling back to SLO policy."""
    found: set = set()
    try:
        if incident_store is not None:
            for inc in incident_store.list_incidents({}, limit=200):
                name = inc.get("service")
                if name:
                    found.add(name)
    except Exception as e:
        logger.warning("pressure: list_services from incident_store failed: %s", e)

    if not found:
        # Fallback: take the service names declared in the SLO policy file.
        try:
            candidates = [
                Path("config/slo_policy.yml"),
                Path(__file__).resolve().parent.parent.parent / "config" / "slo_policy.yml",
            ]
            for candidate in candidates:
                if candidate.exists():
                    import yaml as _yaml
                    with open(candidate) as fh:
                        slo = _yaml.safe_load(fh) or {}
                    found.update(slo.get("services", {}).keys())
                    break
        except Exception:
            pass

    return sorted(found)
|
||||
|
||||
|
||||
# ─── Auto followup creation ───────────────────────────────────────────────────
|
||||
|
||||
def maybe_create_arch_review_followup(
    pressure_report: Dict,
    *,
    incident_store=None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
) -> Dict:
    """
    If pressure score >= require_arch_review_at and auto_create_followup=True,
    create an architecture-review follow-up on the latest open incident.

    Deduped by key: arch_review:{YYYY-WW}:{service}
    Returns: {"created": bool, "dedupe_key": str, "skipped_reason": str|None}
    (plus incident_id/due_date/priority on success).
    """
    if policy is None:
        policy = load_pressure_policy()

    service = pressure_report.get("service", "")
    score = int(pressure_report.get("score", 0))

    rules = policy.get("priority_rules", {})
    review_at = int(rules.get("require_arch_review_at", 70))
    auto_create = bool(rules.get("auto_create_followup", True))

    # Guard clauses: below threshold, disabled, or no store to write to.
    if score < review_at:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": f"score {score} < require_arch_review_at {review_at}"}

    if not auto_create:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": "auto_create_followup disabled"}

    if incident_store is None:
        return {"created": False, "dedupe_key": None,
                "skipped_reason": "incident_store not available"}

    if week_str is None:
        week_str = datetime.datetime.utcnow().strftime("%Y-W%V")

    dedupe_key = f"arch_review:{week_str}:{service}"
    priority = rules.get("followup_priority", "P1")
    owner = rules.get("followup_owner", "cto")
    due_days = int(rules.get("followup_due_days", 14))
    due_date = (
        datetime.datetime.utcnow() + datetime.timedelta(days=due_days)
    ).strftime("%Y-%m-%d")

    try:
        incs = incident_store.list_incidents({"service": service}, limit=50)

        # Attach target: first open-ish incident in the listing, if any.
        open_inc = next(
            (inc for inc in incs
             if inc.get("status") in ("open", "triaged", "escalated")),
            None,
        )

        # BUGFIX: the previous version checked events of whatever incident the
        # status-scan loop happened to end on (an undefined name when the list
        # was empty — the NameError was silently swallowed), so the dedupe key
        # was only checked on one arbitrary incident. Scan every listed
        # incident so a this-week follow-up is never duplicated.
        for inc in incs:
            try:
                events = incident_store.get_events(inc.get("id", ""), limit=100)
            except Exception:
                continue  # non-fatal per incident
            for ev in events:
                if ev.get("dedupe_key") == dedupe_key:
                    return {"created": False, "dedupe_key": dedupe_key,
                            "skipped_reason": f"already exists: {dedupe_key}"}

        if open_inc is None:
            # No open incident — create a synthetic architecture_review incident
            open_inc = incident_store.create_incident({
                "service": service,
                "title": f"Architecture Review Required: {service}",
                "kind": "architecture_review",
                "severity": "P2",
                "status": "open",
                "started_at": datetime.datetime.utcnow().isoformat(),
                "source": "architecture_pressure_engine",
            })

        inc_id = open_inc.get("id", "")

        # Build and write the follow-up event.
        followup_event = {
            "type": "followup",
            "ts": datetime.datetime.utcnow().isoformat(),
            "message": (
                f"[Architecture Pressure] Score={score} >= {review_at}. "
                f"Schedule architecture review for '{service}'."
            ),
            "owner": owner,
            "priority": priority,
            "due_date": due_date,
            "status": "open",
            "dedupe_key": dedupe_key,
            "source": "architecture_pressure_engine",
        }

        if hasattr(incident_store, "add_event"):
            incident_store.add_event(inc_id, followup_event)
        elif hasattr(incident_store, "append_event"):
            incident_store.append_event(inc_id, followup_event)
        else:
            # Store exposes no event-append API: log only. The result still
            # reports created=True, preserving the previous behavior.
            logger.info(
                "pressure: would create followup for %s (inc=%s, key=%s)",
                service, inc_id, dedupe_key
            )

        return {"created": True, "dedupe_key": dedupe_key, "skipped_reason": None,
                "incident_id": inc_id, "due_date": due_date, "priority": priority}

    except Exception as e:
        logger.warning("maybe_create_arch_review_followup failed for %s: %s", service, e)
        return {"created": False, "dedupe_key": dedupe_key,
                "skipped_reason": f"error: {e}"}
|
||||
573
services/router/audit_store.py
Normal file
573
services/router/audit_store.py
Normal file
@@ -0,0 +1,573 @@
|
||||
"""
|
||||
Audit Store — persistence layer for ToolGovernance audit events.
|
||||
|
||||
Backends:
|
||||
memory — in-process list (testing; not persistent)
|
||||
jsonl — append-only JSONL file with daily rotation (default, zero-config)
|
||||
postgres — asyncpg INSERT into tool_audit_events table
|
||||
|
||||
Selection: env var AUDIT_BACKEND=jsonl|postgres|memory (default: jsonl)
|
||||
|
||||
Security / Privacy:
|
||||
- Payload is NEVER written (only hash + sizes)
|
||||
- Each write is fire-and-forget: errors → log warning, do NOT raise
|
||||
- Postgres writes are non-blocking (asyncio task)
|
||||
|
||||
JSONL schema per line (matches AuditEvent fields):
|
||||
{ts, req_id, workspace_id, user_id, agent_id, tool, action,
|
||||
status, duration_ms, in_size, out_size, input_hash,
|
||||
graph_run_id?, graph_node?, job_id?}
|
||||
|
||||
Postgres DDL (run once — or apply via migration):
|
||||
See _POSTGRES_DDL constant below.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── DDL ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
_POSTGRES_DDL = """
|
||||
CREATE TABLE IF NOT EXISTS tool_audit_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
ts TIMESTAMPTZ NOT NULL,
|
||||
req_id TEXT NOT NULL,
|
||||
workspace_id TEXT NOT NULL,
|
||||
user_id TEXT NOT NULL,
|
||||
agent_id TEXT NOT NULL,
|
||||
tool TEXT NOT NULL,
|
||||
action TEXT NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
duration_ms INT NOT NULL,
|
||||
in_size INT NOT NULL,
|
||||
out_size INT NOT NULL,
|
||||
input_hash TEXT NOT NULL,
|
||||
graph_run_id TEXT,
|
||||
graph_node TEXT,
|
||||
job_id TEXT
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_ts ON tool_audit_events(ts);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_tool_ts ON tool_audit_events(tool, ts);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_agent_ts ON tool_audit_events(agent_id, ts);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_ws_ts ON tool_audit_events(workspace_id, ts);
|
||||
"""
|
||||
|
||||
|
||||
# ─── Canonical event dict ─────────────────────────────────────────────────────
|
||||
|
||||
def _event_to_dict(event: "AuditEventLike") -> Dict[str, Any]:
    """Convert an AuditEvent (dataclass) or dict to canonical storage dict."""
    if isinstance(event, dict):
        return event

    def attr(name: str, default: Any = "") -> Any:
        # Missing attributes fall back to the field's default.
        return getattr(event, name, default)

    return {
        "ts": attr("ts"),
        "req_id": attr("req_id"),
        "workspace_id": attr("workspace_id"),
        "user_id": attr("user_id"),
        "agent_id": attr("agent_id"),
        "tool": attr("tool"),
        "action": attr("action"),
        "status": attr("status"),
        # Duration is rounded to whole milliseconds; sizes stored as ints.
        "duration_ms": round(float(attr("duration_ms", 0))),
        "in_size": int(attr("input_chars", 0)),
        "out_size": int(attr("output_size_bytes", 0)),
        "input_hash": attr("input_hash"),
        "graph_run_id": attr("graph_run_id", None),
        "graph_node": attr("graph_node", None),
        "job_id": attr("job_id", None),
    }


# Type alias (avoid circular imports)
AuditEventLike = Any
|
||||
|
||||
|
||||
# ─── Interface ────────────────────────────────────────────────────────────────
|
||||
|
||||
class AuditStore(ABC):
    """Abstract persistence interface for tool-audit events.

    Concrete backends in this module: MemoryAuditStore (testing) and
    JsonlAuditStore; selection is driven by AUDIT_BACKEND (see module docstring).
    """

    @abstractmethod
    def write(self, event: AuditEventLike) -> None:
        """Non-blocking write. MUST NOT raise on error."""
        ...

    @abstractmethod
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict[str, Any]]:
        """Read events matching filters. Returns list of dicts.

        All filters are optional; implementations (e.g. MemoryAuditStore)
        combine the given filters with AND.
        """
        ...

    def close(self) -> None:
        # Optional hook for backends that hold resources; default is a no-op.
        pass
|
||||
|
||||
|
||||
# ─── Memory store ─────────────────────────────────────────────────────────────
|
||||
|
||||
class MemoryAuditStore(AuditStore):
    """In-process store for testing. Thread-safe."""

    def __init__(self, max_events: int = 100_000):
        # Most-recent events, trimmed whenever the buffer exceeds max_events.
        self._events: List[Dict] = []
        self._lock = threading.Lock()
        self._max = max_events

    def write(self, event: AuditEventLike) -> None:
        """Append one event; errors are logged, never raised."""
        try:
            record = _event_to_dict(event)
            with self._lock:
                self._events.append(record)
                if len(self._events) > self._max:
                    # Keep only the newest max_events entries.
                    self._events = self._events[-self._max:]
        except Exception as e:
            logger.warning("MemoryAuditStore.write error: %s", e)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Return up to `limit` newest events matching every given filter."""
        with self._lock:
            rows = list(self._events)

        def matches(row: Dict) -> bool:
            ts = row.get("ts", "")
            if from_ts and ts < from_ts:
                return False
            if to_ts and ts > to_ts:
                return False
            if tool and row.get("tool") != tool:
                return False
            if agent_id and row.get("agent_id") != agent_id:
                return False
            if workspace_id and row.get("workspace_id") != workspace_id:
                return False
            return True

        return [row for row in rows if matches(row)][-limit:]

    def clear(self) -> None:
        """Drop all buffered events (testing convenience)."""
        with self._lock:
            self._events.clear()
|
||||
|
||||
|
||||
# ─── JSONL store ──────────────────────────────────────────────────────────────
|
||||
|
||||
class JsonlAuditStore(AuditStore):
    """
    Append-only JSONL file with daily rotation.

    File pattern: ops/audit/tool_audit_YYYY-MM-DD.jsonl
    Writes are serialised through a threading.Lock (safe for multi-thread, not multi-process).
    """

    def __init__(self, directory: str = "ops/audit"):
        self._dir = Path(directory)
        self._dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        # Rotation state: the open handle for the current day's file.
        self._current_file: Optional[Path] = None
        self._current_date: Optional[str] = None
        self._fh = None

    def _get_fh(self, date_str: str):
        """Return a line-buffered handle for *date_str*, rotating on day change.

        Caller must hold self._lock.
        """
        if date_str != self._current_date:
            if self._fh:
                try:
                    self._fh.close()
                except Exception:
                    pass
            path = self._dir / f"tool_audit_{date_str}.jsonl"
            self._fh = open(path, "a", encoding="utf-8", buffering=1)  # line-buffered
            self._current_date = date_str
            self._current_file = path
        return self._fh

    def write(self, event: AuditEventLike) -> None:
        """Append one event as a JSON line; never raises (logs a warning)."""
        try:
            d = _event_to_dict(event)
            # Route to the file matching the event's own date; fall back to
            # today's file when the event carries no timestamp.
            date_str = (d.get("ts") or "")[:10] or datetime.date.today().isoformat()
            line = json.dumps(d, ensure_ascii=False)
            with self._lock:
                fh = self._get_fh(date_str)
                fh.write(line + "\n")
        except Exception as e:
            logger.warning("JsonlAuditStore.write error: %s", e)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Stream-read JSONL files in date range."""
        # Determine which files to read
        # File stems end in YYYY-MM-DD, so lexicographic compare == date compare.
        files = sorted(self._dir.glob("tool_audit_*.jsonl"))
        if from_ts:
            from_date = from_ts[:10]
            files = [f for f in files if f.stem[-10:] >= from_date]
        if to_ts:
            to_date = to_ts[:10]
            files = [f for f in files if f.stem[-10:] <= to_date]

        rows = []
        for fpath in files:
            try:
                with open(fpath, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            d = json.loads(line)
                        except Exception:
                            # Skip corrupt lines rather than failing the read.
                            continue
                        ts = d.get("ts", "")
                        if from_ts and ts < from_ts:
                            continue
                        if to_ts and ts > to_ts:
                            continue
                        if tool and d.get("tool") != tool:
                            continue
                        if agent_id and d.get("agent_id") != agent_id:
                            continue
                        if workspace_id and d.get("workspace_id") != workspace_id:
                            continue
                        rows.append(d)
                        if len(rows) >= limit:
                            break
            except Exception as e:
                logger.warning("JsonlAuditStore.read error %s: %s", fpath, e)
            # Stop scanning further files once the limit is reached.
            if len(rows) >= limit:
                break

        return rows

    def close(self) -> None:
        """Close the current day's file handle, if one is open."""
        with self._lock:
            if self._fh:
                try:
                    self._fh.close()
                except Exception:
                    pass
            self._fh = None
|
||||
|
||||
|
||||
# ─── Postgres store ───────────────────────────────────────────────────────────
|
||||
|
||||
class PostgresAuditStore(AuditStore):
    """
    Async Postgres store using asyncpg.
    Writes are enqueued to an asyncio queue and flushed in background.
    Falls back gracefully if Postgres is unavailable.
    """

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._pool = None  # asyncpg pool, created lazily in _get_pool()
        # Bounded queue: write() drops events when full rather than blocking.
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=10_000)
        self._task: Optional[asyncio.Task] = None
        self._started = False

    def _ensure_started(self):
        """Start the background flush task once a running loop is available.

        NOTE(review): asyncio.get_event_loop() with no running loop is
        deprecated since 3.10/3.12 — confirm whether get_running_loop()
        should be used here.
        """
        if self._started:
            return
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                self._task = loop.create_task(self._flush_loop())
                self._started = True
        except RuntimeError:
            # No event loop in this thread: writes will be silently dropped
            # until one exists (write() checks self._started).
            pass

    async def _get_pool(self):
        """Lazily create the asyncpg pool and ensure the table exists."""
        if self._pool is None:
            import asyncpg
            self._pool = await asyncpg.create_pool(self._dsn, min_size=1, max_size=3)
            async with self._pool.acquire() as conn:
                await conn.execute(_POSTGRES_DDL)
        return self._pool

    async def _flush_loop(self):
        """Background task: drain the queue in batches and bulk-insert."""
        while True:
            events = []
            try:
                # Collect up to 50 events or wait 2s
                evt = await asyncio.wait_for(self._queue.get(), timeout=2.0)
                events.append(evt)
                while not self._queue.empty() and len(events) < 50:
                    events.append(self._queue.get_nowait())
            except asyncio.TimeoutError:
                pass
            except Exception:
                pass

            if not events:
                continue

            try:
                pool = await self._get_pool()
                async with pool.acquire() as conn:
                    await conn.executemany(
                        """
                        INSERT INTO tool_audit_events
                          (ts, req_id, workspace_id, user_id, agent_id, tool, action,
                           status, duration_ms, in_size, out_size, input_hash,
                           graph_run_id, graph_node, job_id)
                        VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
                        """,
                        [
                            (
                                e["ts"], e["req_id"], e["workspace_id"], e["user_id"],
                                e["agent_id"], e["tool"], e["action"], e["status"],
                                e["duration_ms"], e["in_size"], e["out_size"],
                                e["input_hash"], e.get("graph_run_id"),
                                e.get("graph_node"), e.get("job_id"),
                            )
                            for e in events
                        ],
                    )
            except Exception as ex:
                # Batch is lost on failure; AutoAuditStore provides durability.
                logger.warning("PostgresAuditStore flush error: %s", ex)

    def write(self, event: AuditEventLike) -> None:
        """Enqueue an event for the background flusher; never raises.

        Events are dropped when the flusher has not started or the queue
        is full (best-effort by design).
        """
        try:
            d = _event_to_dict(event)
            self._ensure_started()
            if self._started and not self._queue.full():
                self._queue.put_nowait(d)
        except Exception as e:
            logger.warning("PostgresAuditStore.write error: %s", e)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Synchronous read via asyncio.run() — for analyzer queries."""
        try:
            return asyncio.run(self._async_read(from_ts, to_ts, tool, agent_id, workspace_id, limit))
        except Exception as e:
            logger.warning("PostgresAuditStore.read error: %s", e)
            return []

    async def _async_read(self, from_ts, to_ts, tool, agent_id, workspace_id, limit):
        """Build a parameterised SELECT from the active filters and run it."""
        pool = await self._get_pool()
        conditions = ["TRUE"]
        params = []
        p = 1  # asyncpg positional-parameter counter ($1, $2, ...)
        if from_ts:
            conditions.append(f"ts >= ${p}"); params.append(from_ts); p += 1
        if to_ts:
            conditions.append(f"ts <= ${p}"); params.append(to_ts); p += 1
        if tool:
            conditions.append(f"tool = ${p}"); params.append(tool); p += 1
        if agent_id:
            conditions.append(f"agent_id = ${p}"); params.append(agent_id); p += 1
        if workspace_id:
            conditions.append(f"workspace_id = ${p}"); params.append(workspace_id); p += 1

        # NOTE(review): `limit` is interpolated into the SQL text rather than
        # bound as a parameter — callers must always pass an int.
        sql = f"SELECT * FROM tool_audit_events WHERE {' AND '.join(conditions)} ORDER BY ts LIMIT {limit}"
        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)
        return [dict(r) for r in rows]
|
||||
|
||||
|
||||
# ─── Null store ───────────────────────────────────────────────────────────────
|
||||
|
||||
class NullAuditStore(AuditStore):
    """No-op store used when auditing is disabled entirely."""

    def write(self, event: AuditEventLike) -> None:
        # Intentionally discard the event.
        return None

    def read(self, **kwargs) -> List[Dict]:
        # Nothing is ever stored, so there is never anything to return.
        return []
|
||||
|
||||
|
||||
# ─── Global singleton ─────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide store instance, created lazily by get_audit_store().
_store: Optional[AuditStore] = None
# Guards lazy initialisation and test-time replacement of _store.
_store_lock = threading.Lock()
|
||||
|
||||
|
||||
def get_audit_store() -> AuditStore:
    """Lazily initialise and return the global audit store.

    Uses double-checked locking: the unlocked fast path serves the common
    case; the lock guarantees _create_store() runs at most once.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        # Re-check: another thread may have initialised while we waited.
        if _store is None:
            _store = _create_store()
    return _store
|
||||
|
||||
|
||||
def set_audit_store(store: AuditStore) -> None:
    """Override the global store (used in tests).

    Takes the same lock as get_audit_store() so a concurrent lazy
    initialisation cannot race with the replacement.
    """
    global _store
    with _store_lock:
        _store = store
|
||||
|
||||
|
||||
class AutoAuditStore(AuditStore):
    """
    Smart backend: tries Postgres first, falls back to JSONL on failure.

    Used when AUDIT_BACKEND=auto (or unset with DATABASE_URL present).
    - Writes go to whichever backend is currently healthy.
    - On Postgres failure, transparently falls back to JsonlAuditStore.
    - Recovers to Postgres on next health check (every ~5 min).

    Non-fatal: write errors are logged as warnings.
    """

    _RECOVERY_INTERVAL_S = 300  # retry Postgres after 5 minutes

    def __init__(self, pg_dsn: str, jsonl_dir: str):
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        # Both backends are created lazily on first use.
        self._primary: Optional[PostgresAuditStore] = None
        self._fallback: Optional[JsonlAuditStore] = None
        # Failover state; flipped on Postgres errors, reset by _maybe_recover().
        # NOTE(review): these flags are mutated from multiple threads without
        # holding _init_lock — confirm that is acceptable for this service.
        self._using_fallback = False
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()

    def _get_primary(self) -> Optional[PostgresAuditStore]:
        """Lazily create the Postgres backend (double-checked locking)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresAuditStore(self._pg_dsn)
        return self._primary

    def _get_fallback(self) -> JsonlAuditStore:
        """Lazily create the JSONL backend (double-checked locking)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlAuditStore(self._jsonl_dir)
        return self._fallback

    def _maybe_recover(self) -> None:
        """Try to switch back to Postgres if enough time has passed since fallback."""
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoAuditStore: attempting Postgres recovery")
                self._using_fallback = False
                self._fallback_since = 0.0

    def write(self, event: AuditEventLike) -> None:
        """Write via Postgres when healthy; on error flip to JSONL fallback."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    primary.write(event)
                    return
            except Exception as pg_err:
                logger.warning(
                    "AutoAuditStore: Postgres write failed (%s), switching to JSONL fallback", pg_err
                )
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        # Write to JSONL fallback
        try:
            self._get_fallback().write(event)
        except Exception as jl_err:
            logger.warning("AutoAuditStore: JSONL fallback write failed: %s", jl_err)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Read from Postgres if available, else JSONL."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    return primary.read(from_ts=from_ts, to_ts=to_ts, tool=tool,
                                        agent_id=agent_id, workspace_id=workspace_id, limit=limit)
            except Exception as pg_err:
                logger.warning("AutoAuditStore: Postgres read failed (%s), using JSONL", pg_err)
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        return self._get_fallback().read(
            from_ts=from_ts, to_ts=to_ts, tool=tool,
            agent_id=agent_id, workspace_id=workspace_id, limit=limit,
        )

    def active_backend(self) -> str:
        """Return the name of the currently active backend."""
        return "jsonl_fallback" if self._using_fallback else "postgres"

    def close(self) -> None:
        """Close whichever backends were actually instantiated."""
        if self._primary:
            try:
                self._primary.close()
            except Exception:
                pass
        if self._fallback:
            try:
                self._fallback.close()
            except Exception:
                pass
|
||||
|
||||
|
||||
def _create_store() -> AuditStore:
    """Instantiate the audit store selected by environment variables.

    AUDIT_BACKEND values:
      memory   — in-process list (testing only)
      postgres — asyncpg-backed; requires DATABASE_URL / POSTGRES_DSN
      auto     — Postgres with transparent JSONL fallback (needs a DSN)
      null     — auditing disabled
      jsonl    — (default) daily-rotated JSONL files under AUDIT_JSONL_DIR

    Unknown values, or postgres/auto without a DSN, fall through to jsonl.
    """
    backend = os.getenv("AUDIT_BACKEND", "jsonl").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("POSTGRES_DSN", "")
    audit_dir = os.getenv(
        "AUDIT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
    )

    # Security fix: a DSN like postgres://user:password@host/db must never be
    # logged verbatim (the old dsn[:30] could expose credentials). Log only
    # the host/database portion after the last '@'.
    dsn_for_log = dsn.rsplit("@", 1)[-1][:30]

    if backend == "memory":
        logger.info("AuditStore: in-memory (testing only)")
        return MemoryAuditStore()

    if backend == "postgres":
        if not dsn:
            logger.warning("AUDIT_BACKEND=postgres but DATABASE_URL not set; falling back to jsonl")
        else:
            logger.info("AuditStore: postgres dsn=%s…", dsn_for_log)
            return PostgresAuditStore(dsn)

    if backend == "auto":
        if dsn:
            logger.info("AuditStore: auto (postgres→jsonl fallback) dsn=%s…", dsn_for_log)
            return AutoAuditStore(pg_dsn=dsn, jsonl_dir=audit_dir)
        else:
            logger.info("AuditStore: auto — no DATABASE_URL, using jsonl")

    if backend == "null":
        return NullAuditStore()

    # Default / jsonl
    logger.info("AuditStore: jsonl dir=%s", audit_dir)
    return JsonlAuditStore(audit_dir)
|
||||
530
services/router/backlog_generator.py
Normal file
530
services/router/backlog_generator.py
Normal file
@@ -0,0 +1,530 @@
|
||||
"""
|
||||
backlog_generator.py — Auto-generation of Engineering Backlog items
|
||||
from Platform Priority / Risk digests.
|
||||
DAARION.city | deterministic, no LLM.
|
||||
|
||||
Public API:
|
||||
load_backlog_policy() -> Dict
|
||||
generate_from_pressure_digest(digest_data, env, ...) -> GenerateResult
|
||||
generate_from_risk_digest(digest_data, env, ...) -> GenerateResult
|
||||
_build_item_from_rule(service, rule, context, policy, week_str, env) -> BacklogItem | None
|
||||
_make_dedupe_key(prefix, week_str, env, service, category) -> str
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from backlog_store import (
|
||||
BacklogItem, BacklogEvent, BacklogStore,
|
||||
_new_id, _now_iso,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Policy ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# Cached parsed policy; populated by the first load_backlog_policy() call
# and dropped by _reload_backlog_policy().
_BACKLOG_POLICY_CACHE: Optional[Dict] = None
# Candidate policy locations, tried in order: CWD-relative first, then the
# repository-root config directory resolved relative to this file.
_BACKLOG_POLICY_PATHS = [
    Path("config/backlog_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "backlog_policy.yml",
]
|
||||
|
||||
|
||||
def load_backlog_policy() -> Dict:
    """Load and cache the backlog policy.

    Searches _BACKLOG_POLICY_PATHS in order; the first readable YAML file
    wins. Falls back to built-in defaults when no file can be loaded.
    The parsed result is cached for the process lifetime
    (see _reload_backlog_policy).
    """
    global _BACKLOG_POLICY_CACHE
    if _BACKLOG_POLICY_CACHE is not None:
        return _BACKLOG_POLICY_CACHE
    for p in _BACKLOG_POLICY_PATHS:
        if p.exists():
            try:
                # Fix: explicit encoding — config must parse identically
                # regardless of the platform's locale default encoding.
                with open(p, encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
                _BACKLOG_POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load backlog_policy from %s: %s", p, e)
    _BACKLOG_POLICY_CACHE = _builtin_backlog_defaults()
    return _BACKLOG_POLICY_CACHE
|
||||
|
||||
|
||||
def _reload_backlog_policy() -> None:
    """Drop the cached policy so the next load_backlog_policy() re-reads it."""
    global _BACKLOG_POLICY_CACHE
    _BACKLOG_POLICY_CACHE = None
|
||||
|
||||
|
||||
def _builtin_backlog_defaults() -> Dict:
|
||||
return {
|
||||
"defaults": {"env": "prod", "retention_days": 180, "max_items_per_run": 50},
|
||||
"dedupe": {
|
||||
"scheme": "YYYY-WW",
|
||||
"key_fields": ["service", "category", "env"],
|
||||
"key_prefix": "platform_backlog",
|
||||
},
|
||||
"categories": {
|
||||
"arch_review": {"priority": "P1", "due_days": 14},
|
||||
"refactor": {"priority": "P1", "due_days": 21},
|
||||
"slo_hardening": {"priority": "P2", "due_days": 30},
|
||||
"cleanup_followups": {"priority": "P2", "due_days": 14},
|
||||
"security": {"priority": "P0", "due_days": 7},
|
||||
},
|
||||
"generation": {
|
||||
"weekly_from_pressure_digest": True,
|
||||
"daily_from_risk_digest": False,
|
||||
"rules": [
|
||||
{
|
||||
"name": "arch_review_required",
|
||||
"when": {"pressure_requires_arch_review": True},
|
||||
"create": {
|
||||
"category": "arch_review",
|
||||
"title_template": "[ARCH] Review required: {service}",
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "high_pressure_refactor",
|
||||
"when": {
|
||||
"pressure_band_in": ["high", "critical"],
|
||||
"risk_band_in": ["high", "critical"],
|
||||
},
|
||||
"create": {
|
||||
"category": "refactor",
|
||||
"title_template": "[REF] Reduce pressure & risk: {service}",
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "slo_violations",
|
||||
"when": {"risk_has_slo_violations": True},
|
||||
"create": {
|
||||
"category": "slo_hardening",
|
||||
"title_template": "[SLO] Fix violations: {service}",
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "followup_backlog",
|
||||
"when": {"followups_overdue_gt": 0},
|
||||
"create": {
|
||||
"category": "cleanup_followups",
|
||||
"title_template": "[OPS] Close overdue followups: {service}",
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
"ownership": {
|
||||
"default_owner": "oncall",
|
||||
"overrides": {"gateway": "cto"},
|
||||
},
|
||||
"workflow": {
|
||||
"statuses": ["open", "in_progress", "blocked", "done", "canceled"],
|
||||
"allowed_transitions": {
|
||||
"open": ["in_progress", "blocked", "canceled"],
|
||||
"in_progress": ["blocked", "done", "canceled"],
|
||||
"blocked": ["open", "in_progress", "canceled"],
|
||||
"done": [],
|
||||
"canceled": [],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _now_week() -> str:
|
||||
return datetime.datetime.utcnow().strftime("%Y-W%V")
|
||||
|
||||
|
||||
def _make_dedupe_key(prefix: str, week_str: str, env: str,
|
||||
service: str, category: str) -> str:
|
||||
return f"{prefix}:{week_str}:{env}:{service}:{category}"
|
||||
|
||||
|
||||
def _due_date(due_days: int) -> str:
|
||||
return (
|
||||
datetime.datetime.utcnow() + datetime.timedelta(days=due_days)
|
||||
).strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def _owner_for(service: str, policy: Dict) -> str:
|
||||
overrides = policy.get("ownership", {}).get("overrides", {})
|
||||
return overrides.get(service, policy.get("ownership", {}).get("default_owner", "oncall"))
|
||||
|
||||
|
||||
def _match_rule(rule: Dict, ctx: Dict) -> bool:
|
||||
"""
|
||||
Evaluate a rule's `when` conditions against the service context dict.
|
||||
All conditions must hold (AND logic).
|
||||
"""
|
||||
when = rule.get("when", {})
|
||||
for key, expected in when.items():
|
||||
if key == "pressure_requires_arch_review":
|
||||
if bool(ctx.get("pressure_requires_arch_review")) is not bool(expected):
|
||||
return False
|
||||
|
||||
elif key == "pressure_band_in":
|
||||
if ctx.get("pressure_band") not in expected:
|
||||
return False
|
||||
|
||||
elif key == "risk_band_in":
|
||||
if ctx.get("risk_band") not in expected:
|
||||
return False
|
||||
|
||||
elif key == "risk_has_slo_violations":
|
||||
slo_v = int(ctx.get("slo_violations", 0))
|
||||
if (slo_v > 0) is not bool(expected):
|
||||
return False
|
||||
|
||||
elif key == "followups_overdue_gt":
|
||||
overdue = int(ctx.get("followups_overdue", 0))
|
||||
if not (overdue > int(expected)):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _build_description(service: str, ctx: Dict, rule: Dict) -> str:
|
||||
"""Generate deterministic bullet-list description from context."""
|
||||
lines = [f"Auto-generated by Engineering Backlog Bridge — rule: {rule.get('name', '?')}.", ""]
|
||||
p_score = ctx.get("pressure_score")
|
||||
p_band = ctx.get("pressure_band")
|
||||
r_score = ctx.get("risk_score")
|
||||
r_band = ctx.get("risk_band")
|
||||
r_delta = ctx.get("risk_delta_24h")
|
||||
|
||||
if p_score is not None:
|
||||
lines.append(f"- Architecture Pressure: {p_score} ({p_band})")
|
||||
if r_score is not None:
|
||||
lines.append(f"- Risk Score: {r_score} ({r_band})"
|
||||
+ (f" Δ24h: +{r_delta}" if r_delta else ""))
|
||||
slo_v = int(ctx.get("slo_violations", 0))
|
||||
if slo_v:
|
||||
lines.append(f"- Active SLO violations: {slo_v}")
|
||||
overdue = int(ctx.get("followups_overdue", 0))
|
||||
if overdue:
|
||||
lines.append(f"- Overdue follow-ups: {overdue}")
|
||||
if ctx.get("signals_summary"):
|
||||
lines.append(f"- Pressure signals: {'; '.join(ctx['signals_summary'][:3])}")
|
||||
if ctx.get("risk_reasons"):
|
||||
lines.append(f"- Risk signals: {'; '.join(ctx['risk_reasons'][:3])}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _build_item_from_rule(
    service: str,
    rule: Dict,
    ctx: Dict,
    policy: Dict,
    week_str: str,
    env: str,
) -> Optional[BacklogItem]:
    """Build a BacklogItem from a matched rule and service context."""
    create_cfg = rule.get("create", {})
    category = create_cfg.get("category", "arch_review")
    title = create_cfg.get("title_template", "[BACKLOG] {service}").format(service=service)

    # Priority/due-date come from the category's policy entry.
    cat_cfg = policy.get("categories", {}).get(category, {})
    dedupe_prefix = policy.get("dedupe", {}).get("key_prefix", "platform_backlog")

    return BacklogItem(
        id=_new_id("bl"),
        created_at=_now_iso(),
        updated_at=_now_iso(),
        env=env,
        service=service,
        category=category,
        title=title,
        description=_build_description(service, ctx, rule),
        priority=cat_cfg.get("priority", "P2"),
        status="open",
        owner=_owner_for(service, policy),
        due_date=_due_date(int(cat_cfg.get("due_days", 14))),
        source="digest",
        dedupe_key=_make_dedupe_key(dedupe_prefix, week_str, env, service, category),
        # Copy so later mutation of the item never leaks back into ctx.
        evidence_refs=dict(ctx.get("evidence_refs") or {}),
        tags=["auto", f"week:{week_str}", f"rule:{rule.get('name', '?')}"],
        meta={
            "rule_name": rule.get("name", ""),
            "pressure_score": ctx.get("pressure_score"),
            "risk_score": ctx.get("risk_score"),
            "week": week_str,
        },
    )
|
||||
|
||||
|
||||
# ─── Context builder from digest ──────────────────────────────────────────────
|
||||
|
||||
def _build_service_context(
|
||||
service_entry: Dict,
|
||||
risk_entry: Optional[Dict] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Build a unified service context dict from a platform_priority_digest
|
||||
top_pressure_services entry plus an optional risk_digest service entry.
|
||||
"""
|
||||
p_score = service_entry.get("score")
|
||||
p_band = service_entry.get("band", "low")
|
||||
requires_review = bool(service_entry.get("requires_arch_review", False))
|
||||
signals_summary = service_entry.get("signals_summary", [])
|
||||
comp = service_entry.get("components", {})
|
||||
followups_overdue = int(comp.get("followups_overdue", 0))
|
||||
evidence_refs = service_entry.get("evidence_refs") or {}
|
||||
|
||||
ctx: Dict[str, Any] = {
|
||||
"pressure_score": p_score,
|
||||
"pressure_band": p_band,
|
||||
"pressure_requires_arch_review": requires_review,
|
||||
"signals_summary": signals_summary,
|
||||
"followups_overdue": followups_overdue,
|
||||
"evidence_refs": dict(evidence_refs),
|
||||
}
|
||||
|
||||
# Merge risk data
|
||||
if risk_entry:
|
||||
ctx["risk_score"] = risk_entry.get("score")
|
||||
ctx["risk_band"] = risk_entry.get("band", "low")
|
||||
ctx["risk_delta_24h"] = (risk_entry.get("trend") or {}).get("delta_24h")
|
||||
slo_comp = (risk_entry.get("components") or {}).get("slo") or {}
|
||||
ctx["slo_violations"] = int(slo_comp.get("violations", 0))
|
||||
ctx["risk_reasons"] = risk_entry.get("reasons", [])
|
||||
# Merge evidence_refs from risk
|
||||
risk_attrs = risk_entry.get("attribution") or {}
|
||||
risk_erefs = risk_attrs.get("evidence_refs") or {}
|
||||
for k, v in risk_erefs.items():
|
||||
if k not in ctx["evidence_refs"]:
|
||||
ctx["evidence_refs"][k] = v
|
||||
else:
|
||||
ctx.setdefault("risk_band", service_entry.get("risk_band", "low"))
|
||||
ctx.setdefault("risk_score", service_entry.get("risk_score"))
|
||||
ctx.setdefault("risk_delta_24h", service_entry.get("risk_delta_24h"))
|
||||
ctx.setdefault("slo_violations", 0)
|
||||
|
||||
return ctx
|
||||
|
||||
|
||||
# ─── Main generation function ─────────────────────────────────────────────────
|
||||
|
||||
def generate_from_pressure_digest(
    digest_data: Dict,
    env: str = "prod",
    *,
    store: Optional[BacklogStore] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
    risk_digest_data: Optional[Dict] = None,
) -> Dict:
    """
    Generate backlog items from a weekly_platform_priority_digest JSON output.

    Args:
        digest_data: JSON dict from platform_priority_digest (top_pressure_services list)
        env: deployment environment
        store: backlog store (loaded from factory if None)
        policy: backlog_policy (loaded if None)
        week_str: override ISO week (defaults to digest's "week" field or current)
        risk_digest_data: optional daily risk digest JSON to enrich context

    Returns GenerateResult dict: created, updated, skipped, items
    """
    if policy is None:
        policy = load_backlog_policy()
    if store is None:
        # Local import avoids a circular dependency with backlog_store.
        from backlog_store import get_backlog_store
        store = get_backlog_store()

    gen_cfg = policy.get("generation", {})
    if not gen_cfg.get("weekly_from_pressure_digest", True):
        # Generation disabled by policy — report a no-op result.
        return {"created": 0, "updated": 0, "skipped": 0, "items": [],
                "skipped_reason": "weekly_from_pressure_digest disabled in policy"}

    effective_week = week_str or digest_data.get("week") or _now_week()
    # Hard cap on items written per run (safety valve against digest blow-ups).
    max_items = int(policy.get("defaults", {}).get("max_items_per_run", 50))
    rules = gen_cfg.get("rules", [])

    # Build risk_by_service lookup
    risk_by_service: Dict[str, Dict] = {}
    if risk_digest_data:
        for rs in (risk_digest_data.get("top_services") or []):
            svc = rs.get("service", "")
            if svc:
                risk_by_service[svc] = rs

    created = updated = skipped = 0
    items_out: List[Dict] = []
    total_written = 0

    for svc_entry in (digest_data.get("top_pressure_services") or []):
        service = svc_entry.get("service", "")
        if not service:
            continue
        if total_written >= max_items:
            # Cap reached: count the whole service as skipped.
            skipped += 1
            continue

        ctx = _build_service_context(svc_entry, risk_by_service.get(service))

        # Evaluate rules — one item per matched rule
        matched_categories: set = set()
        for rule in rules:
            # Per-rule try: a bad rule or store failure must not abort the run.
            try:
                if not _match_rule(rule, ctx):
                    continue
                category = rule.get("create", {}).get("category", "")
                if category in matched_categories:
                    continue  # dedupe same category within a service
                matched_categories.add(category)

                item = _build_item_from_rule(service, rule, ctx, policy,
                                             effective_week, env)
                if item is None:
                    continue

                # Upsert is dedupe-aware via the item's weekly dedupe_key.
                result = store.upsert(item)
                action = result["action"]
                upserted = result["item"]

                # Emit event
                ev_type = "created" if action == "created" else "auto_update"
                store.add_event(BacklogEvent(
                    id=_new_id("ev"),
                    item_id=upserted.id,
                    ts=_now_iso(),
                    type=ev_type,
                    message=f"Auto-generated by weekly digest — rule: {rule.get('name', '?')}",
                    actor="backlog_generator",
                    meta={"week": effective_week, "rule": rule.get("name", "")},
                ))

                if action == "created":
                    created += 1
                else:
                    updated += 1
                total_written += 1
                items_out.append({
                    "id": upserted.id,
                    "service": service,
                    "category": upserted.category,
                    "status": upserted.status,
                    "action": action,
                })
            except Exception as e:
                logger.warning("backlog_generator: skip rule %s for %s: %s",
                               rule.get("name"), service, e)
                skipped += 1

    return {
        "created": created,
        "updated": updated,
        "skipped": skipped,
        "items": items_out,
        "week": effective_week,
    }
|
||||
|
||||
|
||||
def generate_from_risk_digest(
    risk_digest_data: Dict,
    env: str = "prod",
    *,
    store: Optional[BacklogStore] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
) -> Dict:
    """
    Optional: generate backlog items from a daily risk digest JSON.

    Only active when generation.daily_from_risk_digest=true in the backlog
    policy; otherwise returns a zero-count result with a skipped_reason.

    Args:
        risk_digest_data: parsed risk digest (uses its "top_services" list).
        env: deployment environment tag stamped onto generated items.
        store: BacklogStore override (defaults to the process singleton).
        policy: backlog policy override (defaults to load_backlog_policy()).
        week_str: ISO week override (defaults to the current week).

    Returns:
        {"created", "updated", "skipped", "items", "week"} summary dict.
    """
    if policy is None:
        policy = load_backlog_policy()

    gen_cfg = policy.get("generation", {})
    if not gen_cfg.get("daily_from_risk_digest", False):
        return {"created": 0, "updated": 0, "skipped": 0, "items": [],
                "skipped_reason": "daily_from_risk_digest disabled in policy"}

    if store is None:
        from backlog_store import get_backlog_store
        store = get_backlog_store()

    effective_week = week_str or _now_week()
    max_items = int(policy.get("defaults", {}).get("max_items_per_run", 50))
    rules = gen_cfg.get("rules", [])

    created = updated = skipped = 0
    items_out: List[Dict] = []
    total_written = 0

    for svc_entry in (risk_digest_data.get("top_services") or []):
        service = svc_entry.get("service", "")
        # NOTE: once the write cap is hit, each remaining service counts as
        # one "skipped" entry (whole-service granularity).
        if not service or total_written >= max_items:
            skipped += 1
            continue

        # Build a minimal pressure-like context from risk data only; the
        # pressure_* fields are neutral placeholders so risk rules can reuse
        # the same _match_rule machinery as the weekly digest path.
        ctx: Dict = {
            "pressure_score": None,
            "pressure_band": "low",
            "pressure_requires_arch_review": False,
            "signals_summary": [],
            "followups_overdue": 0,
            "risk_score": svc_entry.get("score"),
            "risk_band": svc_entry.get("band", "low"),
            "risk_delta_24h": (svc_entry.get("trend") or {}).get("delta_24h"),
            "slo_violations": (svc_entry.get("components") or {}).get("slo", {}).get("violations", 0) if svc_entry.get("components") else 0,
            "risk_reasons": svc_entry.get("reasons", []),
            "evidence_refs": (svc_entry.get("attribution") or {}).get("evidence_refs") or {},
        }

        # At most one generated item per category per service per run.
        matched_categories: set = set()
        for rule in rules:
            try:
                if not _match_rule(rule, ctx):
                    continue
                category = rule.get("create", {}).get("category", "")
                if category in matched_categories:
                    continue
                matched_categories.add(category)

                item = _build_item_from_rule(service, rule, ctx, policy,
                                             effective_week, env)
                if item is None:
                    continue
                result = store.upsert(item)
                action = result["action"]
                upserted = result["item"]
                store.add_event(BacklogEvent(
                    id=_new_id("ev"),
                    item_id=upserted.id,
                    ts=_now_iso(),
                    type="created" if action == "created" else "auto_update",
                    message="Auto-generated from daily risk digest",
                    actor="backlog_generator",
                    # Include the rule name for parity with the weekly-digest path.
                    meta={"week": effective_week, "rule": rule.get("name", "")},
                ))
                if action == "created":
                    created += 1
                else:
                    updated += 1
                total_written += 1
                items_out.append({
                    "id": upserted.id, "service": service,
                    "category": upserted.category, "status": upserted.status,
                    "action": action,
                })
            except Exception as e:
                # A failing rule should never abort the whole run.
                logger.warning("backlog_generator(risk): skip rule %s for %s: %s",
                               rule.get("name"), service, e)
                skipped += 1

    return {"created": created, "updated": updated, "skipped": skipped,
            "items": items_out, "week": effective_week}
|
||||
705
services/router/backlog_store.py
Normal file
705
services/router/backlog_store.py
Normal file
@@ -0,0 +1,705 @@
|
||||
"""
|
||||
backlog_store.py — Engineering Backlog Storage Layer.
|
||||
DAARION.city | deterministic, no LLM.
|
||||
|
||||
Backends:
|
||||
MemoryBacklogStore — in-process (tests + fallback)
|
||||
JsonlBacklogStore — filesystem append-only JSONL (MVP)
|
||||
PostgresBacklogStore — Postgres primary (psycopg2 sync)
|
||||
AutoBacklogStore — Postgres → JSONL → Memory cascade
|
||||
|
||||
Factory: get_backlog_store() → respects BACKLOG_BACKEND env var.
|
||||
|
||||
BACKLOG_BACKEND: auto | postgres | jsonl | memory | null
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Data model ───────────────────────────────────────────────────────────────
|
||||
|
||||
# Closed vocabularies for BacklogItem.status / BacklogItem.priority.
# NOTE(review): not referenced anywhere in this module's visible code —
# presumably used by callers/validators; confirm before removing.
_VALID_STATUSES = {"open", "in_progress", "blocked", "done", "canceled"}
_VALID_PRIORITIES = {"P0", "P1", "P2", "P3"}
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.datetime.utcnow().isoformat()
|
||||
|
||||
|
||||
def _new_id(prefix: str = "bl") -> str:
|
||||
return f"{prefix}_{uuid.uuid4().hex[:12]}"
|
||||
|
||||
|
||||
@dataclass
class BacklogItem:
    """A single engineering-backlog entry persisted by a BacklogStore backend."""
    id: str
    created_at: str   # ISO-8601 string (see _now_iso)
    updated_at: str
    env: str
    service: str
    category: str     # arch_review / refactor / slo_hardening / cleanup_followups / security
    title: str
    description: str
    priority: str     # P0..P3
    status: str       # open / in_progress / blocked / done / canceled
    owner: str
    due_date: str     # YYYY-MM-DD
    source: str       # risk | pressure | digest | manual
    dedupe_key: str   # upsert identity key
    evidence_refs: Dict = field(default_factory=dict)  # alerts, incidents, release_checks, ...
    tags: List[str] = field(default_factory=list)
    meta: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Serialize all dataclass fields to a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict) -> "BacklogItem":
        """Build an item from a raw dict, filling defaults for missing keys.

        Scalar fields use dict defaults (so an explicit "" is preserved);
        container fields use `or` fallbacks (so None collapses to an empty
        container).
        """
        return cls(
            id=d.get("id", _new_id()),
            created_at=d.get("created_at", _now_iso()),
            updated_at=d.get("updated_at", _now_iso()),
            env=d.get("env", "prod"),
            service=d.get("service", ""),
            category=d.get("category", ""),
            title=d.get("title", ""),
            description=d.get("description", ""),
            priority=d.get("priority", "P2"),
            status=d.get("status", "open"),
            owner=d.get("owner", "oncall"),
            due_date=d.get("due_date", ""),
            source=d.get("source", "manual"),
            dedupe_key=d.get("dedupe_key", ""),
            evidence_refs=d.get("evidence_refs") or {},
            tags=d.get("tags") or [],
            meta=d.get("meta") or {},
        )
|
||||
|
||||
|
||||
@dataclass
class BacklogEvent:
    """A single audit-trail entry attached to a backlog item."""
    id: str
    item_id: str
    ts: str
    type: str  # created | status_change | comment | auto_update
    message: str
    actor: str
    meta: Dict = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Serialize all dataclass fields to a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: Dict) -> "BacklogEvent":
        """Build an event from a raw dict, filling defaults for missing keys."""
        kwargs = {
            "id": d.get("id", _new_id("ev")),
            "item_id": d.get("item_id", ""),
            "ts": d.get("ts", _now_iso()),
            "type": d.get("type", "comment"),
            "message": d.get("message", ""),
            "actor": d.get("actor", "system"),
            "meta": d.get("meta") or {},
        }
        return cls(**kwargs)
|
||||
|
||||
|
||||
# ─── Abstract base ────────────────────────────────────────────────────────────
|
||||
|
||||
class BacklogStore(ABC):
    """Abstract base for backlog persistence backends.

    Subclasses implement the CRUD/event primitives; ``upsert`` and
    ``dashboard`` are concrete helpers layered on top of them.
    """

    @abstractmethod
    def create(self, item: BacklogItem) -> BacklogItem: ...

    @abstractmethod
    def get(self, item_id: str) -> Optional[BacklogItem]: ...

    @abstractmethod
    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]: ...

    @abstractmethod
    def update(self, item: BacklogItem) -> BacklogItem: ...

    @abstractmethod
    def list_items(self, filters: Optional[Dict] = None, limit: int = 50,
                   offset: int = 0) -> List[BacklogItem]: ...

    @abstractmethod
    def add_event(self, event: BacklogEvent) -> BacklogEvent: ...

    @abstractmethod
    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]: ...

    @abstractmethod
    def cleanup(self, retention_days: int = 180) -> int: ...

    def upsert(self, item: BacklogItem) -> Dict:
        """Create or update by dedupe_key. Returns {"action": created|updated, "item": ...}

        On update only title/description/evidence_refs/tags/meta are
        refreshed; status and owner (human-owned workflow fields) are
        preserved.
        """
        existing = self.get_by_dedupe_key(item.dedupe_key)
        if existing is None:
            created = self.create(item)
            return {"action": "created", "item": created}
        # Update title/description/evidence_refs/tags/meta; preserve status/owner
        existing.title = item.title
        existing.description = item.description
        existing.evidence_refs = item.evidence_refs
        # Merge tags deterministically: keep the existing order and append
        # unseen new tags.  (Was list(set(...)), which produced a
        # run-dependent ordering.)
        merged_tags = list(existing.tags)
        for tag in item.tags:
            if tag not in merged_tags:
                merged_tags.append(tag)
        existing.tags = merged_tags
        existing.meta.update(item.meta or {})
        existing.updated_at = _now_iso()
        updated = self.update(existing)
        return {"action": "updated", "item": updated}

    def dashboard(self, env: str = "prod") -> Dict:
        """Return aggregated backlog counts for one env.

        Scans up to 1000 items; "overdue" = not done/canceled with a
        due_date strictly before today (UTC), capped at 20 entries in the
        output (overdue_count reflects the full total).
        """
        items = self.list_items({"env": env}, limit=1000)
        today = datetime.datetime.utcnow().strftime("%Y-%m-%d")
        status_counts: Dict[str, int] = {}
        priority_counts: Dict[str, int] = {}
        category_counts: Dict[str, int] = {}
        overdue: List[Dict] = []
        service_counts: Dict[str, int] = {}

        for it in items:
            status_counts[it.status] = status_counts.get(it.status, 0) + 1
            priority_counts[it.priority] = priority_counts.get(it.priority, 0) + 1
            category_counts[it.category] = category_counts.get(it.category, 0) + 1
            service_counts[it.service] = service_counts.get(it.service, 0) + 1
            if (it.status not in ("done", "canceled")
                    and it.due_date and it.due_date < today):
                overdue.append({
                    "id": it.id, "service": it.service,
                    "title": it.title, "priority": it.priority,
                    "due_date": it.due_date, "owner": it.owner,
                })

        # "P0" < "P1" lexicographically, so highest priority sorts first.
        overdue.sort(key=lambda x: (x["priority"], x["due_date"]))
        top_services = sorted(service_counts.items(), key=lambda x: -x[1])[:10]

        return {
            "env": env,
            "total": len(items),
            "status_counts": status_counts,
            "priority_counts": priority_counts,
            "category_counts": category_counts,
            "overdue": overdue[:20],
            "overdue_count": len(overdue),
            "top_services": [{"service": s, "count": c} for s, c in top_services],
        }
|
||||
|
||||
|
||||
# ─── Workflow helper ──────────────────────────────────────────────────────────
|
||||
|
||||
def validate_transition(current_status: str, new_status: str,
|
||||
policy: Optional[Dict] = None) -> bool:
|
||||
"""Return True if transition is allowed, False otherwise."""
|
||||
defaults = _builtin_workflow()
|
||||
if policy is None:
|
||||
allowed = defaults
|
||||
else:
|
||||
allowed = policy.get("workflow", {}).get("allowed_transitions", defaults)
|
||||
return new_status in allowed.get(current_status, [])
|
||||
|
||||
|
||||
def _builtin_workflow() -> Dict:
|
||||
return {
|
||||
"open": ["in_progress", "blocked", "canceled"],
|
||||
"in_progress": ["blocked", "done", "canceled"],
|
||||
"blocked": ["open", "in_progress", "canceled"],
|
||||
"done": [],
|
||||
"canceled": [],
|
||||
}
|
||||
|
||||
|
||||
# ─── Memory backend ───────────────────────────────────────────────────────────
|
||||
|
||||
class MemoryBacklogStore(BacklogStore):
    """In-process dict-backed store (tests + last-resort fallback).

    All access goes through one lock, so instances may be shared across
    threads within a process.  Nothing is persisted.
    """

    def __init__(self) -> None:
        self._items: Dict[str, BacklogItem] = {}
        self._events: List[BacklogEvent] = []
        self._lock = threading.Lock()

    def create(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._items[item.id] = item
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        with self._lock:
            return self._items.get(item_id)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        with self._lock:
            return next(
                (it for it in self._items.values() if it.dedupe_key == key),
                None,
            )

    def update(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._items[item.id] = item
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        with self._lock:
            snapshot = list(self._items.values())
        matching = _apply_filters(snapshot, filters or {})
        # Missing due_date sorts last ("9999" sentinel).
        matching.sort(key=lambda x: (x.priority, x.due_date or "9999"))
        return matching[offset: offset + limit]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        with self._lock:
            self._events.append(event)
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        with self._lock:
            matching = [e for e in self._events if e.item_id == item_id]
        return matching[-limit:]

    def cleanup(self, retention_days: int = 180) -> int:
        """Delete done/canceled items older than the retention window."""
        horizon = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._lock:
            stale = [
                iid for iid, it in self._items.items()
                if it.status in ("done", "canceled") and it.updated_at < horizon
            ]
            for iid in stale:
                self._items.pop(iid)
            return len(stale)
|
||||
|
||||
|
||||
# ─── JSONL backend ────────────────────────────────────────────────────────────
|
||||
|
||||
# Default JSONL file locations, relative to the process working directory.
_JSONL_ITEMS = "ops/backlog/items.jsonl"
_JSONL_EVENTS = "ops/backlog/events.jsonl"
# Scan budget for readers.  NOTE(review): not enforced anywhere in this
# module's visible code — confirm before relying on it.
_JSONL_CACHE_MAX = 50_000  # lines to scan
|
||||
|
||||
|
||||
class JsonlBacklogStore(BacklogStore):
    """
    Append-only JSONL filesystem store.
    Last-write-wins: items keyed by id, updates appended (read returns latest).

    Every accessor takes ``self._lock`` so readers never see a torn append
    from a writer in another thread of the same process.
    """
    def __init__(
        self,
        items_path: str = _JSONL_ITEMS,
        events_path: str = _JSONL_EVENTS,
    ) -> None:
        self._items_path = Path(items_path)
        self._events_path = Path(events_path)
        self._lock = threading.Lock()
        self._items_path.parent.mkdir(parents=True, exist_ok=True)
        self._events_path.parent.mkdir(parents=True, exist_ok=True)

    def _load_items(self) -> Dict[str, BacklogItem]:
        """Scan file, last-write-wins per id.  Malformed lines are skipped."""
        items: Dict[str, BacklogItem] = {}
        if not self._items_path.exists():
            return items
        try:
            with open(self._items_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        items[d["id"]] = BacklogItem.from_dict(d)
                    except Exception:
                        # Tolerate corrupt / partial lines (e.g. torn writes).
                        pass
        except Exception as e:
            logger.warning("JsonlBacklogStore: load_items error: %s", e)
        return items

    def _append_item(self, item: BacklogItem) -> None:
        """Append one item snapshot; callers must hold self._lock."""
        with open(self._items_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(item.to_dict(), default=str) + "\n")

    def create(self, item: BacklogItem) -> BacklogItem:
        with self._lock:
            self._append_item(item)
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        with self._lock:
            items = self._load_items()
        return items.get(item_id)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        with self._lock:
            items = self._load_items()
            for it in items.values():
                if it.dedupe_key == key:
                    return it
            return None

    def update(self, item: BacklogItem) -> BacklogItem:
        # Updates are plain appends; _load_items resolves the latest version.
        item.updated_at = _now_iso()
        with self._lock:
            self._append_item(item)
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        with self._lock:
            items = list(self._load_items().values())
            items = _apply_filters(items, filters or {})
            # Missing due_date sorts last ("9999" sentinel).
            items.sort(key=lambda x: (x.priority, x.due_date or "9999"))
            return items[offset: offset + limit]

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        with self._lock:
            if not self._events_path.parent.exists():
                self._events_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self._events_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(event.to_dict(), default=str) + "\n")
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        """Return up to `limit` most recent events for an item (file order).

        Reads under the store lock — previously this was the only accessor
        reading without it while add_event appended under the lock.
        """
        events: List[BacklogEvent] = []
        with self._lock:
            if not self._events_path.exists():
                return events
            try:
                with open(self._events_path, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            d = json.loads(line)
                            if d.get("item_id") == item_id:
                                events.append(BacklogEvent.from_dict(d))
                        except Exception:
                            pass
            except Exception as e:
                logger.warning("JsonlBacklogStore: get_events error: %s", e)
        return events[-limit:]

    def cleanup(self, retention_days: int = 180) -> int:
        """Compact the items file, dropping stale done/canceled entries.

        Returns the number of items removed.  The file is rewritten only
        when something was actually deleted.
        """
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        with self._lock:
            items = self._load_items()
            to_keep = {
                iid: it for iid, it in items.items()
                if not (it.status in ("done", "canceled") and it.updated_at < cutoff)
            }
            deleted = len(items) - len(to_keep)
            if deleted:
                # Rewrite the file
                with open(self._items_path, "w", encoding="utf-8") as f:
                    for it in to_keep.values():
                        f.write(json.dumps(it.to_dict(), default=str) + "\n")
            return deleted
|
||||
|
||||
|
||||
# ─── Postgres backend ─────────────────────────────────────────────────────────
|
||||
|
||||
class PostgresBacklogStore(BacklogStore):
    """
    Postgres-backed store using psycopg2 (sync).
    Tables: backlog_items, backlog_events (created by migration script).

    One connection is opened per operation and always closed afterwards.
    """
    def __init__(self, dsn: Optional[str] = None) -> None:
        # DSN resolution: explicit arg → BACKLOG_POSTGRES_DSN → POSTGRES_DSN
        # → localhost fallback.
        self._dsn = dsn or os.environ.get(
            "BACKLOG_POSTGRES_DSN",
            os.environ.get("POSTGRES_DSN", "postgresql://localhost/daarion")
        )
        # NOTE(review): this lock is not used by the methods below; each call
        # uses its own connection. Confirm before removing.
        self._lock = threading.Lock()

    def _conn(self):
        """Open a fresh psycopg2 connection (local import: optional dep)."""
        import psycopg2
        return psycopg2.connect(self._dsn)

    def _run(self, fn):
        """Run fn(cursor) in a transaction and ALWAYS close the connection.

        psycopg2's ``with conn:`` commits/rolls back the transaction but does
        NOT close the connection, so the previous per-method pattern leaked
        one connection per call.
        """
        conn = self._conn()
        try:
            with conn:
                with conn.cursor() as cur:
                    return fn(cur)
        finally:
            conn.close()

    def create(self, item: BacklogItem) -> BacklogItem:
        """Insert an item; silently no-ops on dedupe_key conflict."""
        sql = """
        INSERT INTO backlog_items
        (id, created_at, updated_at, env, service, category, title, description,
        priority, status, owner, due_date, source, dedupe_key,
        evidence_refs, tags, meta)
        VALUES
        (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        ON CONFLICT (dedupe_key) DO NOTHING
        """
        def _do(cur):
            cur.execute(sql, (
                item.id, item.created_at, item.updated_at,
                item.env, item.service, item.category,
                item.title, item.description, item.priority,
                item.status, item.owner, item.due_date or None,
                item.source, item.dedupe_key,
                json.dumps(item.evidence_refs),
                json.dumps(item.tags),
                json.dumps(item.meta),
            ))
        self._run(_do)
        return item

    def get(self, item_id: str) -> Optional[BacklogItem]:
        def _do(cur):
            cur.execute("SELECT * FROM backlog_items WHERE id=%s", (item_id,))
            row = cur.fetchone()
            if row:
                return self._row_to_item(row, cur.description)
            return None
        return self._run(_do)

    def get_by_dedupe_key(self, key: str) -> Optional[BacklogItem]:
        def _do(cur):
            cur.execute("SELECT * FROM backlog_items WHERE dedupe_key=%s", (key,))
            row = cur.fetchone()
            if row:
                return self._row_to_item(row, cur.description)
            return None
        return self._run(_do)

    def update(self, item: BacklogItem) -> BacklogItem:
        """Persist mutable fields by id; refreshes updated_at in-place."""
        item.updated_at = _now_iso()
        sql = """
        UPDATE backlog_items SET
        updated_at=%s, title=%s, description=%s, priority=%s,
        status=%s, owner=%s, due_date=%s, evidence_refs=%s, tags=%s, meta=%s
        WHERE id=%s
        """
        def _do(cur):
            cur.execute(sql, (
                item.updated_at, item.title, item.description,
                item.priority, item.status, item.owner,
                item.due_date or None,
                json.dumps(item.evidence_refs),
                json.dumps(item.tags),
                json.dumps(item.meta),
                item.id,
            ))
        self._run(_do)
        return item

    def list_items(self, filters: Optional[Dict] = None,
                   limit: int = 50, offset: int = 0) -> List[BacklogItem]:
        filters = filters or {}
        where, params = _pg_where_clause(filters)
        sql = f"""
        SELECT * FROM backlog_items {where}
        ORDER BY priority ASC, due_date ASC NULLS LAST
        LIMIT %s OFFSET %s
        """
        def _do(cur):
            cur.execute(sql, params + [limit, offset])
            rows = cur.fetchall()
            desc = cur.description
            return [self._row_to_item(r, desc) for r in rows]
        return self._run(_do)

    def add_event(self, event: BacklogEvent) -> BacklogEvent:
        sql = """
        INSERT INTO backlog_events (id, item_id, ts, type, message, actor, meta)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        """
        def _do(cur):
            cur.execute(sql, (
                event.id, event.item_id, event.ts,
                event.type, event.message, event.actor,
                json.dumps(event.meta),
            ))
        self._run(_do)
        return event

    def get_events(self, item_id: str, limit: int = 50) -> List[BacklogEvent]:
        def _do(cur):
            cur.execute(
                "SELECT * FROM backlog_events WHERE item_id=%s ORDER BY ts DESC LIMIT %s",
                (item_id, limit)
            )
            rows = cur.fetchall()
            desc = cur.description
            return [self._row_to_event(r, desc) for r in rows]
        return self._run(_do)

    def cleanup(self, retention_days: int = 180) -> int:
        """Delete done/canceled items older than the retention window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        def _do(cur):
            cur.execute(
                """DELETE FROM backlog_items
                WHERE status IN ('done','canceled') AND updated_at < %s""",
                (cutoff,)
            )
            return cur.rowcount
        return self._run(_do)

    @staticmethod
    def _row_to_item(row, description) -> BacklogItem:
        """Map a DB row to a BacklogItem, decoding JSON text columns."""
        d = {col.name: val for col, val in zip(description, row)}
        for json_key in ("evidence_refs", "tags", "meta"):
            v = d.get(json_key)
            if isinstance(v, str):
                try:
                    d[json_key] = json.loads(v)
                except Exception:
                    d[json_key] = {} if json_key != "tags" else []
        return BacklogItem.from_dict(d)

    @staticmethod
    def _row_to_event(row, description) -> BacklogEvent:
        """Map a DB row to a BacklogEvent, decoding the JSON meta column."""
        d = {col.name: val for col, val in zip(description, row)}
        if isinstance(d.get("meta"), str):
            try:
                d["meta"] = json.loads(d["meta"])
            except Exception:
                d["meta"] = {}
        return BacklogEvent.from_dict(d)
|
||||
|
||||
|
||||
def _pg_where_clause(filters: Dict):
|
||||
clauses, params = [], []
|
||||
if filters.get("env"):
|
||||
clauses.append("env=%s"); params.append(filters["env"])
|
||||
if filters.get("service"):
|
||||
clauses.append("service=%s"); params.append(filters["service"])
|
||||
if filters.get("status"):
|
||||
if isinstance(filters["status"], list):
|
||||
ph = ",".join(["%s"] * len(filters["status"]))
|
||||
clauses.append(f"status IN ({ph})"); params.extend(filters["status"])
|
||||
else:
|
||||
clauses.append("status=%s"); params.append(filters["status"])
|
||||
if filters.get("owner"):
|
||||
clauses.append("owner=%s"); params.append(filters["owner"])
|
||||
if filters.get("category"):
|
||||
clauses.append("category=%s"); params.append(filters["category"])
|
||||
if filters.get("due_before"):
|
||||
clauses.append("due_date < %s"); params.append(filters["due_before"])
|
||||
return ("WHERE " + " AND ".join(clauses)) if clauses else "", params
|
||||
|
||||
|
||||
# ─── Null backend ─────────────────────────────────────────────────────────────
|
||||
|
||||
class NullBacklogStore(BacklogStore):
    """Discard-everything backend (BACKLOG_BACKEND=null).

    Writes are accepted and dropped; reads come back empty.
    """

    def create(self, item):
        return item

    def get(self, item_id):
        return None

    def get_by_dedupe_key(self, key):
        return None

    def update(self, item):
        return item

    def list_items(self, filters=None, limit=50, offset=0):
        return []

    def add_event(self, event):
        return event

    def get_events(self, item_id, limit=50):
        return []

    def cleanup(self, retention_days=180):
        return 0
|
||||
|
||||
|
||||
# ─── Auto backend (Postgres → JSONL fallback) ─────────────────────────────────
|
||||
|
||||
class AutoBacklogStore(BacklogStore):
    """Postgres primary with JSONL fallback. Retries Postgres after 5 min."""
    _RETRY_SEC = 300

    def __init__(
        self,
        postgres_dsn: Optional[str] = None,
        jsonl_items: str = _JSONL_ITEMS,
        jsonl_events: str = _JSONL_EVENTS,
    ) -> None:
        self._pg: Optional[PostgresBacklogStore] = None
        self._jsonl = JsonlBacklogStore(jsonl_items, jsonl_events)
        self._dsn = postgres_dsn
        self._pg_failed_at: Optional[float] = None
        # NOTE(review): this lock is not used by the methods below — confirm.
        self._lock = threading.Lock()
        self._try_init_pg()

    def _try_init_pg(self) -> None:
        """Probe Postgres once; on failure record when, for retry throttling."""
        try:
            candidate = PostgresBacklogStore(self._dsn)
            candidate._conn().close()  # connectivity probe only
        except Exception as e:
            logger.warning("AutoBacklogStore: Postgres unavailable, using JSONL: %s", e)
            self._pg = None
            import time
            self._pg_failed_at = time.time()
        else:
            self._pg = candidate
            self._pg_failed_at = None
            logger.info("AutoBacklogStore: Postgres backend active")

    def _backend(self) -> BacklogStore:
        """Return the active backend, re-probing Postgres after the window."""
        if self._pg is not None:
            return self._pg
        import time
        retry_due = (
            self._pg_failed_at is None
            or time.time() - self._pg_failed_at >= self._RETRY_SEC
        )
        if retry_due:
            self._try_init_pg()
        if self._pg is not None:
            return self._pg
        return self._jsonl

    # Thin delegation of the BacklogStore interface to the active backend.

    def create(self, item):
        return self._backend().create(item)

    def get(self, item_id):
        return self._backend().get(item_id)

    def get_by_dedupe_key(self, key):
        return self._backend().get_by_dedupe_key(key)

    def update(self, item):
        return self._backend().update(item)

    def list_items(self, filters=None, limit=50, offset=0):
        return self._backend().list_items(filters, limit, offset)

    def add_event(self, event):
        return self._backend().add_event(event)

    def get_events(self, item_id, limit=50):
        return self._backend().get_events(item_id, limit)

    def cleanup(self, retention_days=180):
        return self._backend().cleanup(retention_days)
|
||||
|
||||
|
||||
# ─── Filters helper ───────────────────────────────────────────────────────────
|
||||
|
||||
def _apply_filters(items: List[BacklogItem], filters: Dict) -> List[BacklogItem]:
|
||||
result = []
|
||||
for it in items:
|
||||
if filters.get("env") and it.env != filters["env"]:
|
||||
continue
|
||||
if filters.get("service") and it.service != filters["service"]:
|
||||
continue
|
||||
if filters.get("status"):
|
||||
statuses = filters["status"] if isinstance(filters["status"], list) else [filters["status"]]
|
||||
if it.status not in statuses:
|
||||
continue
|
||||
if filters.get("owner") and it.owner != filters["owner"]:
|
||||
continue
|
||||
if filters.get("category") and it.category != filters["category"]:
|
||||
continue
|
||||
if filters.get("due_before") and it.due_date and it.due_date >= filters["due_before"]:
|
||||
continue
|
||||
result.append(it)
|
||||
return result
|
||||
|
||||
|
||||
# ─── Factory ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide singleton created lazily by get_backlog_store();
# _STORE_LOCK guards its initialization for thread safety.
_STORE_INSTANCE: Optional[BacklogStore] = None
_STORE_LOCK = threading.Lock()
|
||||
|
||||
|
||||
def get_backlog_store() -> BacklogStore:
    """Return the process-wide BacklogStore singleton.

    The backend is chosen once from the BACKLOG_BACKEND env var
    (auto | postgres | jsonl | memory | null); unknown values fall back
    to the auto cascade.
    """
    global _STORE_INSTANCE
    with _STORE_LOCK:
        if _STORE_INSTANCE is None:
            choice = os.environ.get("BACKLOG_BACKEND", "auto").lower()
            factories = {
                "memory": MemoryBacklogStore,
                "jsonl": JsonlBacklogStore,
                "postgres": PostgresBacklogStore,
                "null": NullBacklogStore,
            }
            _STORE_INSTANCE = factories.get(choice, AutoBacklogStore)()
            logger.info("backlog_store: using %s backend", type(_STORE_INSTANCE).__name__)
        return _STORE_INSTANCE
|
||||
|
||||
|
||||
def _reset_store_for_tests() -> None:
    """Drop the cached singleton so the next get_backlog_store() rebuilds it.

    Test-only helper; callers in production code should never need this.
    """
    global _STORE_INSTANCE
    with _STORE_LOCK:
        _STORE_INSTANCE = None
|
||||
595
services/router/cost_analyzer.py
Normal file
595
services/router/cost_analyzer.py
Normal file
@@ -0,0 +1,595 @@
|
||||
"""
|
||||
Cost & Resource Analyzer (FinOps MVP)
|
||||
|
||||
Reads audit events from AuditStore and computes:
|
||||
- Aggregated cost_units by tool/agent/workspace/status
|
||||
- Top spenders (tools, agents, users)
|
||||
- Anomalies (cost spikes, error rate spikes)
|
||||
- Cost model weights
|
||||
|
||||
"cost_units" = cost_per_call(tool) + duration_ms * cost_per_ms(tool)
|
||||
These are relative units, not real dollars.
|
||||
|
||||
No payload access — all inputs are aggregation parameters only.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Config loader ────────────────────────────────────────────────────────────
|
||||
|
||||
# Lazily-populated cache of the parsed cost weights config; cleared via
# reload_cost_weights().
_weights_cache: Optional[Dict] = None
# Config location: $REPO_ROOT/config/cost_weights.yml, with the repo root
# defaulting to three directories above this file.
_WEIGHTS_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "cost_weights.yml",
)
|
||||
|
||||
|
||||
def _load_weights() -> Dict:
|
||||
global _weights_cache
|
||||
if _weights_cache is not None:
|
||||
return _weights_cache
|
||||
try:
|
||||
import yaml
|
||||
with open(_WEIGHTS_PATH, "r") as f:
|
||||
_weights_cache = yaml.safe_load(f) or {}
|
||||
except Exception as e:
|
||||
logger.warning("cost_weights.yml not loaded: %s", e)
|
||||
_weights_cache = {}
|
||||
return _weights_cache
|
||||
|
||||
|
||||
def reload_cost_weights() -> None:
    """Force reload weights (for tests).

    Clears the module-level cache; the next _load_weights() call re-reads
    the config file from disk.
    """
    global _weights_cache
    _weights_cache = None
|
||||
|
||||
|
||||
def get_weights_for_tool(tool: str) -> Tuple[float, float]:
    """Return (cost_per_call, cost_per_ms) for a tool.

    Per-tool entries in the config's `tools` section take precedence,
    then the `defaults` section, then hard-coded fallbacks (1.0, 0.001).
    """
    cfg = _load_weights()
    fallback = cfg.get("defaults", {})
    overrides = (cfg.get("tools") or {}).get(tool, {})
    per_call = float(overrides.get("cost_per_call", fallback.get("cost_per_call", 1.0)))
    per_ms = float(overrides.get("cost_per_ms", fallback.get("cost_per_ms", 0.001)))
    return per_call, per_ms
|
||||
|
||||
|
||||
def compute_event_cost(event: Dict) -> float:
    """Compute cost_units for a single audit event.

    cost = cost_per_call(tool) + duration_ms * cost_per_ms(tool),
    rounded to 4 decimals.  These are relative units, not dollars.
    """
    per_call, per_ms = get_weights_for_tool(event.get("tool", ""))
    elapsed_ms = float(event.get("duration_ms", 0))
    return round(per_call + elapsed_ms * per_ms, 4)
|
||||
|
||||
|
||||
# ─── Time helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _now_utc() -> datetime.datetime:
    """Current timezone-aware UTC datetime."""
    return datetime.datetime.now(datetime.timezone.utc)


def _iso(dt: datetime.datetime) -> str:
    """Format a datetime as an ISO-8601 string."""
    return dt.isoformat()


def _parse_iso(s: str) -> datetime.datetime:
    """Parse an ISO-8601 string ('Z' suffix accepted).

    Falls back to the current UTC time when the input is unparseable —
    best-effort by design for dirty audit data.
    """
    s = s.replace("Z", "+00:00")
    try:
        return datetime.datetime.fromisoformat(s)
    except Exception:
        return _now_utc()


def _bucket_hour(ts: str) -> str:
    """Truncate an ISO ts to its hour bucket, e.g. '2026-02-23T10:00'.

    Keeps the first 13 characters (date + 'T' + hour) and appends ':00'.
    (The previous docstring example showed a full timestamp with offset,
    which is not what this returns.)
    """
    return ts[:13] + ":00"
|
||||
|
||||
|
||||
# ─── Aggregation helpers ──────────────────────────────────────────────────────
|
||||
|
||||
def _aggregate(
    events: List[Dict],
    group_keys: List[str],
) -> Dict[str, Dict]:
    """Group *events* by the composite of *group_keys* and accumulate totals.

    Returns {key_str: {count, cost_units, duration_ms_sum, failed_count,
    denied_count, in_size_sum, out_size_sum, avg_duration_ms,
    avg_cost_units, error_rate}}.
    """
    def _fresh_bucket() -> Dict:
        return {
            "count": 0,
            "cost_units": 0.0,
            "duration_ms_sum": 0.0,
            "failed_count": 0,
            "denied_count": 0,
            "in_size_sum": 0,
            "out_size_sum": 0,
        }

    buckets: Dict[str, Dict] = defaultdict(_fresh_bucket)

    for event in events:
        bucket_id = ":".join(str(event.get(k, "unknown")) for k in group_keys)
        bucket = buckets[bucket_id]
        bucket["count"] += 1
        # Round after every addition so totals match incremental reporting.
        bucket["cost_units"] = round(bucket["cost_units"] + compute_event_cost(event), 4)
        bucket["duration_ms_sum"] = round(
            bucket["duration_ms_sum"] + float(event.get("duration_ms", 0)), 2
        )
        bucket["in_size_sum"] += int(event.get("in_size", 0))
        bucket["out_size_sum"] += int(event.get("out_size", 0))
        outcome = event.get("status", "pass")
        if outcome in ("failed", "error"):
            bucket["failed_count"] += 1
        elif outcome == "denied":
            bucket["denied_count"] += 1

    # Derive per-bucket averages and error rate.
    for bucket in buckets.values():
        denom = bucket["count"] or 1
        bucket["avg_duration_ms"] = round(bucket["duration_ms_sum"] / denom, 1)
        bucket["avg_cost_units"] = round(bucket["cost_units"] / denom, 4)
        bucket["error_rate"] = round(bucket["failed_count"] / denom, 4)

    return dict(buckets)
|
||||
|
||||
def _top_n(aggregated: Dict[str, Dict], key_field: str, n: int, sort_by: str = "cost_units") -> List[Dict]:
|
||||
"""Sort aggregated dict by sort_by and return top N."""
|
||||
items = [
|
||||
{"key": k, key_field: k, **v}
|
||||
for k, v in aggregated.items()
|
||||
]
|
||||
items.sort(key=lambda x: x.get(sort_by, 0), reverse=True)
|
||||
return items[:n]
|
||||
|
||||
|
||||
# ─── Actions ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def action_report(
    store,
    time_range: Optional[Dict[str, str]] = None,
    group_by: Optional[List[str]] = None,
    top_n: int = 10,
    include_failed: bool = True,
    include_hourly: bool = False,
) -> Dict[str, Any]:
    """
    Generate aggregated cost report for a time range.

    Args:
        store: audit store exposing .read(from_ts=, to_ts=, limit=).
        time_range: {"from": iso, "to": iso}; defaults to the trailing 7 days.
        group_by: dimensions to break down by (default ["tool"]).
        top_n: rows kept per breakdown.
        include_failed: when False, failed/error events are excluded up front.
        include_hourly: when True, adds an hourly count/cost trend.

    Returns:
        totals, breakdowns by group_by keys, top spenders, optional hourly trend.
    """
    now = _now_utc()
    tr = time_range or {}
    # Default window: trailing 7 days ending now.
    from_ts = tr.get("from") or _iso(now - datetime.timedelta(days=7))
    to_ts = tr.get("to") or _iso(now)

    events = store.read(from_ts=from_ts, to_ts=to_ts, limit=200_000)
    if not include_failed:
        events = [e for e in events if e.get("status", "pass") not in ("failed", "error")]

    # Totals
    total_cost = sum(compute_event_cost(e) for e in events)
    total_calls = len(events)
    total_failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    total_denied = sum(1 for e in events if e.get("status") == "denied")

    # Breakdowns: one aggregation pass per requested dimension.
    by_key = group_by or ["tool"]
    breakdowns: Dict[str, List[Dict]] = {}
    for gk in by_key:
        agg = _aggregate(events, [gk])
        breakdowns[gk] = _top_n(agg, gk, top_n)

    # Hourly trend (optional, for last 7d max)
    hourly: List[Dict] = []
    if include_hourly and events:
        hourly_agg: Dict[str, Dict] = defaultdict(lambda: {"count": 0, "cost_units": 0.0})
        for ev in events:
            bucket = _bucket_hour(ev.get("ts", ""))
            hourly_agg[bucket]["count"] += 1
            # Incremental rounding keeps the running totals at 4 dp.
            hourly_agg[bucket]["cost_units"] = round(
                hourly_agg[bucket]["cost_units"] + compute_event_cost(ev), 4
            )
        hourly = [{"hour": k, **v} for k, v in sorted(hourly_agg.items())]

    return {
        "time_range": {"from": from_ts, "to": to_ts},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": total_failed,
            "denied": total_denied,
            "error_rate": round(total_failed / (total_calls or 1), 4),
        },
        "breakdowns": breakdowns,
        # Key present only when the hourly trend was requested.
        **({"hourly": hourly} if include_hourly else {}),
    }
|
||||
|
||||
def action_top(
    store,
    window_hours: int = 24,
    top_n: int = 10,
) -> Dict[str, Any]:
    """Quick top-N leaderboard (tools/agents/users/workspaces) over a trailing window."""
    now = _now_utc()
    window_start = _iso(now - datetime.timedelta(hours=window_hours))
    window_end = _iso(now)

    events = store.read(from_ts=window_start, to_ts=window_end, limit=100_000)

    def _leaders(dimension: str) -> List[Dict]:
        """Top rows for one grouping dimension."""
        return _top_n(_aggregate(events, [dimension]), dimension, top_n)

    return {
        "window_hours": window_hours,
        "time_range": {"from": window_start, "to": window_end},
        "total_calls": len(events),
        "top_tools": _leaders("tool"),
        "top_agents": _leaders("agent_id"),
        "top_users": _leaders("user_id"),
        "top_workspaces": _leaders("workspace_id"),
    }
|
||||
|
||||
def action_anomalies(
    store,
    window_minutes: int = 60,
    baseline_hours: int = 24,
    ratio_threshold: Optional[float] = None,
    min_calls: Optional[int] = None,
    tools_filter: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Detect cost/call spikes and elevated error rates.

    Algorithm:
    1. Compute per-tool metrics for window [now-window_minutes, now]
    2. Compute per-tool metrics for baseline [now-baseline_hours, now-window_minutes]
    3. Spike = window_rate / baseline_rate >= ratio_threshold AND calls >= min_calls
    4. Error spike = failed_rate > 10% AND calls >= min_calls

    Args:
        store: audit store exposing .read(from_ts=, to_ts=, limit=).
        window_minutes: size of the recent window under test.
        baseline_hours: size of the historical comparison window.
        ratio_threshold: spike multiplier; defaults from cost_weights.yml.
        min_calls: minimum window traffic before a tool is evaluated.
        tools_filter: restrict analysis to these tool names, if given.
    """
    cfg = _load_weights()
    anomaly_cfg = cfg.get("anomaly", {})

    # Unset thresholds fall back to the 'anomaly' section of the weights config.
    if ratio_threshold is None:
        ratio_threshold = float(anomaly_cfg.get("spike_ratio_threshold", 3.0))
    if min_calls is None:
        min_calls = int(anomaly_cfg.get("min_calls_threshold", 10))

    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(minutes=window_minutes))
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
    baseline_to = window_from  # non-overlapping

    # Fetch both windows
    window_events = store.read(from_ts=window_from, to_ts=_iso(now), limit=50_000)
    baseline_events = store.read(from_ts=baseline_from, to_ts=baseline_to, limit=200_000)

    if tools_filter:
        window_events = [e for e in window_events if e.get("tool") in tools_filter]
        baseline_events = [e for e in baseline_events if e.get("tool") in tools_filter]

    # Aggregate by tool
    window_by_tool = _aggregate(window_events, ["tool"])
    baseline_by_tool = _aggregate(baseline_events, ["tool"])

    # Normalise baseline to per-minute rate
    baseline_minutes = (baseline_hours * 60) - window_minutes
    baseline_minutes = max(baseline_minutes, 1)  # guard divide-by-zero
    window_minutes_actual = float(window_minutes)

    anomalies = []

    all_tools = set(window_by_tool.keys()) | set(baseline_by_tool.keys())
    for tool_key in sorted(all_tools):
        w = window_by_tool.get(tool_key, {})
        b = baseline_by_tool.get(tool_key, {})

        w_calls = w.get("count", 0)
        b_calls = b.get("count", 0)

        if w_calls < min_calls:
            continue  # Not enough traffic for meaningful anomaly

        # Per-minute rates
        w_rate = w_calls / window_minutes_actual
        b_rate = b_calls / baseline_minutes if b_calls > 0 else 0.0

        # Cost spike
        w_cost_pm = w.get("cost_units", 0) / window_minutes_actual
        b_cost_pm = b.get("cost_units", 0) / baseline_minutes if b_calls > 0 else 0.0

        # No baseline traffic yields an infinite ratio, which always trips the threshold.
        call_ratio = (w_rate / b_rate) if b_rate > 0 else float("inf")
        cost_ratio = (w_cost_pm / b_cost_pm) if b_cost_pm > 0 else float("inf")

        if call_ratio >= ratio_threshold or cost_ratio >= ratio_threshold:
            ratio_display = round(max(call_ratio, cost_ratio), 2)
            if ratio_display == float("inf"):
                ratio_display = "∞ (no baseline)"
            w_cost = w.get("cost_units", 0)
            b_cost = b.get("cost_units", 0)
            anomalies.append({
                "type": "cost_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "baseline": f"prev_{baseline_hours}h",
                "window_calls": w_calls,
                "baseline_calls": b_calls,
                "window_cost_units": round(w_cost, 2),
                "baseline_cost_units": round(b_cost, 2),
                "ratio": ratio_display,
                "recommendation": _spike_recommendation(tool_key, ratio_display, w_calls),
            })

        # Error rate spike
        w_err_rate = w.get("error_rate", 0)
        if w_err_rate > 0.10 and w_calls >= min_calls:
            anomalies.append({
                "type": "error_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "failed_calls": w.get("failed_count", 0),
                "total_calls": w_calls,
                "error_rate": round(w_err_rate, 4),
                "recommendation": f"Investigate failures for '{tool_key}': {w.get('failed_count',0)} failed / {w_calls} calls ({round(w_err_rate*100,1)}% error rate).",
            })

    # De-duplicate tool+type combos (error_spike already separate)
    seen = set()
    unique_anomalies = []
    for a in anomalies:
        key = (a["type"], a.get("tool", ""))
        if key not in seen:
            unique_anomalies.append(a)
            seen.add(key)

    return {
        "anomalies": unique_anomalies,
        "anomaly_count": len(unique_anomalies),
        "window_minutes": window_minutes,
        "baseline_hours": baseline_hours,
        "ratio_threshold": ratio_threshold,
        "min_calls": min_calls,
        "stats": {
            "window_calls": len(window_events),
            "baseline_calls": len(baseline_events),
        },
    }
|
||||
|
||||
def action_weights(repo_root: Optional[str] = None) -> Dict[str, Any]:
    """Return the current cost-weights configuration (always freshly re-read).

    Args:
        repo_root: Accepted for handler-signature compatibility; unused here.

    Returns:
        Dict with 'defaults', 'tools', 'anomaly' sections plus the config path.
    """
    # Use the shared reset helper instead of duplicating the global-cache
    # manipulation, so cache invalidation lives in exactly one place.
    reload_cost_weights()
    cfg = _load_weights()
    return {
        "defaults": cfg.get("defaults", {}),
        "tools": cfg.get("tools", {}),
        "anomaly": cfg.get("anomaly", {}),
        "config_path": _WEIGHTS_PATH,
    }
|
||||
|
||||
# ─── Recommendation templates ─────────────────────────────────────────────────
|
||||
|
||||
def _spike_recommendation(tool: str, ratio: Any, calls: int) -> str:
    """Category-aware remediation hint for a cost spike on *tool*."""
    weights_cfg = _load_weights()
    category = ((weights_cfg.get("tools") or {}).get(tool, {})).get("category", "")

    if category == "media":
        return (
            f"'{tool}' cost spike (ratio={ratio}, {calls} calls). "
            "Consider: rate-limit per workspace, queue with priority, review calling agents."
        )
    if category == "release":
        return (
            f"'{tool}' called more frequently than baseline (ratio={ratio}). "
            "Review if release_check is looping or being triggered too often."
        )
    if category == "web":
        return (
            f"'{tool}' spike (ratio={ratio}). Consider: result caching, dedup identical queries."
        )
    # Unknown category → generic guidance.
    return (
        f"'{tool}' cost spike (ratio={ratio}, {calls} calls in window). "
        "Review caller agents and apply rate limits if needed."
    )
|
||||
|
||||
# ─── backend=auto store resolver ─────────────────────────────────────────────
|
||||
|
||||
def _resolve_store(backend: str = "auto"):
    """
    Pick an AuditStore implementation by *backend* name.

    'auto' / '' / None → globally configured store (which may be AutoAuditStore,
    Postgres, or JSONL).
    'jsonl'            → JsonlAuditStore rooted at AUDIT_JSONL_DIR
                         (7-day window max recommended).
    'memory'           → MemoryAuditStore (testing).
    Anything else falls back to the global store.
    """
    from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore

    if backend == "jsonl":
        import os
        from pathlib import Path

        default_dir = Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"
        return JsonlAuditStore(os.getenv("AUDIT_JSONL_DIR", str(default_dir)))
    if backend == "memory":
        return MemoryAuditStore()
    # 'auto', empty, None, or any unrecognized value → global store.
    return get_audit_store()
|
||||
|
||||
# ─── Digest action ────────────────────────────────────────────────────────────
|
||||
|
||||
def action_digest(
    store,
    window_hours: int = 24,
    baseline_hours: int = 168,  # 7 days
    top_n: int = 10,
    max_markdown_chars: int = 3800,
) -> Dict:
    """
    Daily/weekly cost digest: top tools/agents + anomalies + recommendations.

    Args:
        store: audit store exposing .read(from_ts=, to_ts=, limit=).
        window_hours: digest window (24 = daily, 168 = weekly).
        baseline_hours: anomaly comparison window.
        top_n: rows kept in the structured top_tools/top_agents lists.
        max_markdown_chars: hard cap on the rendered markdown (Telegram limit).

    Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.
    """
    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(hours=window_hours))
    window_to = _iso(now)
    # NOTE: the previous version also computed a baseline_from timestamp that
    # was never read (action_anomalies derives its own); removed as dead code.

    # ── Top ──────────────────────────────────────────────────────────────────
    top_data = action_top(store, window_hours=window_hours, top_n=top_n)
    top_tools = top_data.get("top_tools") or []
    top_agents = top_data.get("top_agents") or []
    total_calls = top_data.get("total_calls", 0)

    # ── Anomalies ─────────────────────────────────────────────────────────────
    # Anomaly window = a quarter of the digest window, in minutes.
    anomaly_data = action_anomalies(
        store,
        window_minutes=int(window_hours * 60 / 4),
        baseline_hours=baseline_hours,
        min_calls=5,
    )
    anomalies = anomaly_data.get("anomalies") or []

    # ── Total cost ────────────────────────────────────────────────────────────
    events = store.read(from_ts=window_from, to_ts=window_to, limit=200_000)
    total_cost = sum(compute_event_cost(e) for e in events)
    failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    error_rate = round(failed / max(len(events), 1), 4)

    # ── Recommendations ───────────────────────────────────────────────────────
    recs = []
    for a in anomalies[:5]:
        r = a.get("recommendation", "")
        if r:
            recs.append(r)
    if error_rate > 0.05:
        recs.append(f"High error rate {round(error_rate*100,1)}% — investigate failing tools.")
    if top_tools and top_tools[0].get("cost_units", 0) > 500:
        tool_name = top_tools[0].get("tool", "?")
        recs.append(f"Top spender '{tool_name}' used {top_tools[0]['cost_units']:.0f} cost units — review frequency.")
    # dict.fromkeys de-duplicates while preserving order; cap at 8.
    recs = list(dict.fromkeys(recs))[:8]

    # ── Markdown ─────────────────────────────────────────────────────────────
    period_label = f"Last {window_hours}h" if window_hours <= 48 else f"Last {window_hours//24}d"
    lines = [
        f"📊 **Cost Digest** ({period_label})",
        f"Total calls: {total_calls} | Cost units: {total_cost:.0f} | Errors: {round(error_rate*100,1)}%",
        "",
        "**Top Tools:**",
    ]
    for t in top_tools[:5]:
        lines.append(f" • `{t.get('tool','?')}` — {t.get('cost_units',0):.1f}u, {t.get('count',0)} calls")
    lines.append("")
    lines.append("**Top Agents:**")
    for a in top_agents[:3]:
        lines.append(f" • `{a.get('agent_id','?')}` — {a.get('cost_units',0):.1f}u, {a.get('count',0)} calls")

    if anomalies:
        lines.append("")
        lines.append(f"⚠️ **{len(anomalies)} Anomaly(ies):**")
        for anm in anomalies[:3]:
            lines.append(f" • [{anm.get('type','?')}] `{anm.get('tool','?')}` ratio={anm.get('ratio','?')}")
    if recs:
        lines.append("")
        lines.append("💡 **Recommendations:**")
        for r in recs[:5]:
            lines.append(f" {r[:200]}")

    markdown = "\n".join(lines)
    if len(markdown) > max_markdown_chars:
        markdown = markdown[:max_markdown_chars] + "\n…[truncated]"

    return {
        "period": period_label,
        "window_hours": window_hours,
        "time_range": {"from": window_from, "to": window_to},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": failed,
            "error_rate": error_rate,
        },
        "top_tools": top_tools[:top_n],
        "top_agents": top_agents[:top_n],
        "anomalies": anomalies[:10],
        "anomaly_count": len(anomalies),
        "recommendations": recs,
        "markdown": markdown,
    }
|
||||
|
||||
# ─── Main entrypoint ─────────────────────────────────────────────────────────
|
||||
|
||||
def analyze_cost_dict(action: str, params: Optional[Dict] = None, store=None) -> Dict:
    """
    Entry point used by the tool_manager handler.

    Resolves a store (honouring params['backend']) unless one is injected,
    then dispatches to the matching action_* function.
    Returns a plain dict for ToolResult.
    """
    opts = params or {}
    if store is None:
        store = _resolve_store(opts.get("backend", "auto"))

    if action == "digest":
        return action_digest(
            store,
            window_hours=int(opts.get("window_hours", 24)),
            baseline_hours=int(opts.get("baseline_hours", 168)),
            top_n=int(opts.get("top_n", 10)),
            max_markdown_chars=int(opts.get("max_markdown_chars", 3800)),
        )
    if action == "report":
        return action_report(
            store,
            time_range=opts.get("time_range"),
            group_by=opts.get("group_by", ["tool"]),
            top_n=int(opts.get("top_n", 10)),
            include_failed=bool(opts.get("include_failed", True)),
            include_hourly=bool(opts.get("include_hourly", False)),
        )
    if action == "top":
        return action_top(
            store,
            window_hours=int(opts.get("window_hours", 24)),
            top_n=int(opts.get("top_n", 10)),
        )
    if action == "anomalies":
        return action_anomalies(
            store,
            window_minutes=int(opts.get("window_minutes", 60)),
            baseline_hours=int(opts.get("baseline_hours", 24)),
            ratio_threshold=opts.get("ratio_threshold"),
            min_calls=opts.get("min_calls"),
            tools_filter=opts.get("tools_filter"),
        )
    if action == "weights":
        return action_weights()

    return {"error": f"Unknown action '{action}'. Valid: digest, report, top, anomalies, weights"}
|
||||
1024
services/router/data_governance.py
Normal file
1024
services/router/data_governance.py
Normal file
File diff suppressed because it is too large
Load Diff
968
services/router/dependency_scanner.py
Normal file
968
services/router/dependency_scanner.py
Normal file
@@ -0,0 +1,968 @@
|
||||
"""
|
||||
Dependency & Supply Chain Scanner.
|
||||
|
||||
Scans Python and Node.js dependencies for:
|
||||
1. Known vulnerabilities (via OSV.dev API or offline cache)
|
||||
2. Outdated packages (lockfile_only mode, using OSV fixed_versions)
|
||||
3. License policy enforcement (optional, MVP: offline-only)
|
||||
|
||||
Ecosystems supported:
|
||||
Python → poetry.lock, pipfile.lock, requirements*.txt, pyproject.toml
|
||||
Node → package-lock.json, pnpm-lock.yaml, yarn.lock, package.json
|
||||
|
||||
Pass rule: pass=false if any vuln with severity in fail_on (default: CRITICAL, HIGH).
|
||||
MEDIUM → warning (not blocking by default). UNKNOWN → warning if not in fail_on.
|
||||
|
||||
Security:
|
||||
- Read-only: no file writes except cache update (explicit)
|
||||
- Evidence masked for secrets
|
||||
- Payload not logged; only hash + counts
|
||||
- Max files/deps enforced via limits
|
||||
- Timeout via deadline
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import fnmatch
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Directory names never descended into during repo walks (dependency trees,
# VCS metadata, build output, virtualenvs, caches, backups).
# NOTE(review): "docs/consolidation" contains a slash; _is_excluded compares
# single path components, so this entry can never match there — confirm intent.
EXCLUDED_DIRS: FrozenSet[str] = frozenset({
    "node_modules", ".git", "dist", "build", "vendor",
    ".venv", "venv", "venv_models", "sofia_venv",
    "__pycache__", ".pytest_cache", "rollback_backups",
    "docs/consolidation",
})

# OSV.dev batch-query endpoint and client-side limits.
OSV_API_URL = "https://api.osv.dev/v1/querybatch"
OSV_BATCH_SIZE = 100  # max per request
OSV_TIMEOUT_SEC = 15.0

# OSV ecosystems
ECOSYSTEM_PYPI = "PyPI"
ECOSYSTEM_NPM = "npm"

# Ranking for comparing/sorting severities (higher = worse).
SEVERITY_ORDER = {"CRITICAL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1, "UNKNOWN": 0}
||||
# ─── Data Structures ──────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class Package:
    """A single dependency discovered in a manifest or lock file."""
    name: str
    version: str  # empty string = unresolved/unpinned
    ecosystem: str  # "PyPI" | "npm"
    source_file: str
    pinned: bool = True

    @property
    def normalized_name(self) -> str:
        """Canonical matching form: lowercase with underscores turned into dashes."""
        return "-".join(self.name.lower().split("_"))

    @property
    def cache_key(self) -> str:
        """Stable identifier: '<ecosystem>:<normalized_name>:<version>'."""
        return ":".join((self.ecosystem, self.normalized_name, self.version))
||||
|
||||
@dataclass
class Vulnerability:
    """A single OSV advisory matched to one concrete package@version."""
    osv_id: str          # OSV record identifier
    ecosystem: str       # "PyPI" | "npm"
    package: str
    version: str         # affected version found in the repo
    severity: str  # CRITICAL | HIGH | MEDIUM | LOW | UNKNOWN
    fixed_versions: List[str]  # versions OSV reports as fixing this advisory
    aliases: List[str]  # CVE-XXXX-XXXX etc.
    evidence: Dict[str, str]   # supporting context; masked for secrets per module policy
    recommendation: str        # human-readable remediation hint
|
||||
|
||||
@dataclass
class OutdatedPackage:
    """A package whose pinned version lags behind a newer known release."""
    ecosystem: str         # "PyPI" | "npm"
    package: str
    current: str           # version currently locked/declared
    latest: Optional[str]  # newest known version (None when undeterminable offline)
    notes: str
|
||||
|
||||
@dataclass
class LicenseFinding:
    """License-policy verdict for a single package."""
    package: str
    license: str         # license string as discovered in the manifest
    policy: str  # "deny" | "warn" | "ok" | "unknown"
    recommendation: str  # suggested action for deny/warn outcomes
|
||||
|
||||
@dataclass
class ScanResult:
    """Aggregate outcome of one dependency/supply-chain scan."""
    pass_: bool  # overall verdict; trailing underscore because 'pass' is a keyword
    summary: str
    stats: Dict[str, Any]
    vulnerabilities: List[Dict]
    outdated: List[Dict]
    licenses: List[Dict]
    recommendations: List[str]
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
_SECRET_PAT = re.compile(
|
||||
r'(?i)(api[_-]?key|token|secret|password|bearer|jwt|private[_-]?key)'
|
||||
r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?'
|
||||
)
|
||||
|
||||
|
||||
def _redact(text: str) -> str:
|
||||
return _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***REDACTED***", text or "")
|
||||
|
||||
|
||||
def _is_excluded(path: str) -> bool:
    """True when any component of *path* is a directory we never scan."""
    return not EXCLUDED_DIRS.isdisjoint(Path(path).parts)
|
||||
|
||||
def _read_file(path: str, max_bytes: int = 524288) -> str:
|
||||
try:
|
||||
size = os.path.getsize(path)
|
||||
with open(path, "r", errors="replace") as f:
|
||||
return f.read(min(size, max_bytes))
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _normalize_pkg_name(name: str) -> str:
|
||||
"""Normalize: lowercase, underscores → dashes."""
|
||||
return name.strip().lower().replace("_", "-")
|
||||
|
||||
|
||||
def _compare_versions(v1: str, v2: str) -> int:
|
||||
"""
|
||||
Simple version comparison. Returns -1 / 0 / 1.
|
||||
Handles semver and PEP 440 in a best-effort way.
|
||||
"""
|
||||
def _parts(v: str) -> List[int]:
|
||||
nums = re.findall(r'\d+', v.split("+")[0].split("-")[0])
|
||||
return [int(x) for x in nums] if nums else [0]
|
||||
|
||||
p1, p2 = _parts(v1), _parts(v2)
|
||||
# Pad to equal length
|
||||
max_len = max(len(p1), len(p2))
|
||||
p1 += [0] * (max_len - len(p1))
|
||||
p2 += [0] * (max_len - len(p2))
|
||||
if p1 < p2:
|
||||
return -1
|
||||
if p1 > p2:
|
||||
return 1
|
||||
return 0
|
||||
|
||||
|
||||
# ─── Python Parsers ───────────────────────────────────────────────────────────
|
||||
|
||||
def _parse_poetry_lock(content: str, source_file: str) -> List[Package]:
    """Extract pinned (name, version) pairs from poetry.lock [[package]] sections."""
    found: List[Package] = []
    for chunk in re.split(r'\[\[package\]\]', content)[1:]:
        name_match = re.search(r'^name\s*=\s*"([^"]+)"', chunk, re.MULTILINE)
        version_match = re.search(r'^version\s*=\s*"([^"]+)"', chunk, re.MULTILINE)
        if not (name_match and version_match):
            continue  # malformed section — skip silently
        found.append(Package(
            name=name_match.group(1),
            version=version_match.group(1),
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file,
            pinned=True,
        ))
    return found
|
||||
|
||||
def _parse_pipfile_lock(content: str, source_file: str) -> List[Package]:
    """Extract pinned packages from Pipfile.lock ('default' and 'develop' sections)."""
    found: List[Package] = []
    try:
        lock = json.loads(content)
        for section in ("default", "develop"):
            for dep_name, dep_info in (lock.get(section) or {}).items():
                # Pipfile.lock stores versions as "==2.28.0" — strip the pin operator.
                pinned_version = re.sub(r'^==', '', dep_info.get("version", ""))
                if pinned_version:
                    found.append(Package(
                        name=dep_name,
                        version=pinned_version,
                        ecosystem=ECOSYSTEM_PYPI,
                        source_file=source_file,
                        pinned=True,
                    ))
    except Exception as e:
        logger.debug(f"Could not parse Pipfile.lock: {e}")
    return found
|
||||
|
||||
# Pinned requirement line: "name[extras] == version" (captures name, version).
_REQ_LINE_PAT = re.compile(
    r'^([A-Za-z0-9_\-\.]+)(?:\[.*?\])?\s*==\s*([^\s;#]+)',
    re.MULTILINE,
)
# Constrained-but-unpinned line (>=, <=, ~=, !=, ^=, ...) — captures name only;
# no concrete version can be derived from it.
_REQ_UNPINNED_PAT = re.compile(
    r'^([A-Za-z0-9_\-\.]+)(?:\[.*?\])?\s*[><!~^]=?\s*[^\s;#]+',
    re.MULTILINE,
)
|
||||
|
||||
def _parse_requirements_txt(content: str, source_file: str) -> List[Package]:
    """
    Parse requirements.txt.

    Only pinned (==) lines yield concrete versions; other constrained lines
    are recorded with an empty version so they can be reported as unresolved.
    First occurrence of a (normalized) name wins.
    """
    found: List[Package] = []
    recorded: Set[str] = set()

    def _record(pkg_name: str, pkg_version: str, is_pinned: bool) -> None:
        """Append one entry unless its normalized name was already seen."""
        norm = _normalize_pkg_name(pkg_name)
        if norm in recorded:
            return
        recorded.add(norm)
        found.append(Package(
            name=pkg_name, version=pkg_version,
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file, pinned=is_pinned,
        ))

    for match in _REQ_LINE_PAT.finditer(content):
        _record(match.group(1), match.group(2).strip(), True)

    # Unpinned constraints are reported but carry no concrete version (no vuln scan).
    for match in _REQ_UNPINNED_PAT.finditer(content):
        _record(match.group(1), "", False)

    return found
|
||||
|
||||
def _parse_pyproject_toml(content: str, source_file: str) -> List[Package]:
    """Collect declared (unresolved) deps from pyproject.toml dependency tables."""
    # [tool.poetry.dependencies] or [project.dependencies]
    section = re.search(
        r'\[(?:tool\.poetry\.dependencies|project)\]([^\[]*)', content, re.DOTALL
    )
    if section is None:
        return []
    declared: List[Package] = []
    for match in re.finditer(r'^([A-Za-z0-9_\-\.]+)\s*=', section.group(1), re.MULTILINE):
        dep_name = match.group(1).strip()
        # The interpreter requirement is not an installable package.
        if dep_name.lower() in ("python", "python-version"):
            continue
        declared.append(Package(
            name=dep_name, version="",
            ecosystem=ECOSYSTEM_PYPI,
            source_file=source_file, pinned=False,
        ))
    return declared
|
||||
|
||||
# ─── Node Parsers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _parse_package_lock_json(content: str, source_file: str) -> List[Package]:
    """
    Parse package-lock.json (npm v2/v3 'packages' map, v1 'dependencies' fallback).

    Returns pinned npm packages; malformed JSON yields an empty list.
    """
    packages: List[Package] = []
    try:
        data = json.loads(content)
        # v2/v3: flat "packages" object keyed by install path.
        pkg_map = data.get("packages") or {}
        for path_key, info in pkg_map.items():
            if path_key == "" or not path_key.startswith("node_modules/"):
                continue
            # The package name is everything after the LAST "node_modules/"
            # segment, which also handles nested installs:
            #   "node_modules/a/node_modules/@scope/b" → "@scope/b".
            # (The previous replace()-then-split approach stripped every
            # "node_modules/" occurrence first, so the split separator could
            # never match and nested paths produced wrong names like "a/b".)
            name = path_key.split("node_modules/")[-1]
            version = info.get("version", "")
            if name and version:
                packages.append(Package(
                    name=name, version=version,
                    ecosystem=ECOSYSTEM_NPM,
                    source_file=source_file, pinned=True,
                ))
        # v1 fallback: nested "dependencies" tree (top level only).
        if not packages:
            for name, info in (data.get("dependencies") or {}).items():
                version = info.get("version", "")
                if version:
                    packages.append(Package(
                        name=name, version=version,
                        ecosystem=ECOSYSTEM_NPM,
                        source_file=source_file, pinned=True,
                    ))
    except Exception as e:
        logger.debug(f"Could not parse package-lock.json: {e}")
    return packages
|
||||
|
||||
def _parse_pnpm_lock(content: str, source_file: str) -> List[Package]:
    """
    Parse pnpm-lock.yaml packages section.

    Entries look like "/name@version:" or "/@scope/name@version:".
    The previous pattern ([^@\\s]+) could not start at '@', so every scoped
    package was silently skipped; the optional scope group fixes that while
    leaving unscoped matches unchanged.
    """
    packages: List[Package] = []
    # Pattern: /[@scope/]name@version:
    for m in re.finditer(r'^/((?:@[^/\s]+/)?[^@\s]+)@([^\s:]+):', content, re.MULTILINE):
        name, version = m.group(1), m.group(2)
        packages.append(Package(
            name=name, version=version,
            ecosystem=ECOSYSTEM_NPM,
            source_file=source_file, pinned=True,
        ))
    return packages
|
||||
|
||||
def _parse_yarn_lock(content: str, source_file: str) -> List[Package]:
    """Parse yarn.lock v1: '"name@range":' blocks holding a 'version "X.Y.Z"' line."""
    block_pat = re.compile(
        r'^"?([^@"\s]+)@[^:]+:\n(?:\s+.*\n)*?\s+version "([^"]+)"',
        re.MULTILINE,
    )
    found: List[Package] = []
    dedupe: Set[str] = set()
    for m in block_pat.finditer(content):
        pkg, ver = m.group(1), m.group(2)
        marker = f"{pkg}@{ver}"
        if marker in dedupe:
            continue  # same resolution listed under several range keys
        dedupe.add(marker)
        found.append(Package(
            name=pkg, version=ver,
            ecosystem=ECOSYSTEM_NPM,
            source_file=source_file, pinned=True,
        ))
    return found
|
||||
|
||||
def _parse_package_json(content: str, source_file: str) -> List[Package]:
    """Collect declared deps from package.json (no lockfile → versions unresolved)."""
    declared: List[Package] = []
    try:
        manifest = json.loads(content)
        for section in ("dependencies", "devDependencies"):
            for dep_name in (manifest.get(section) or {}):
                declared.append(Package(
                    name=dep_name, version="",
                    ecosystem=ECOSYSTEM_NPM,
                    source_file=source_file, pinned=False,
                ))
    except Exception:
        pass  # unparseable manifest → no declared deps
    return declared
|
||||
|
||||
# ─── Dependency Discovery ─────────────────────────────────────────────────────
|
||||
|
||||
# Python lockfiles matched by exact filename.
_PYTHON_MANIFESTS = (
    "poetry.lock", "Pipfile.lock",
)
_PYTHON_REQUIREMENTS = ("requirements",)  # substring matched against *.txt filenames
_PYTHON_PYPROJECT = ("pyproject.toml",)
# Node lockfiles/manifests matched by exact filename.
_NODE_MANIFESTS = (
    "package-lock.json", "pnpm-lock.yaml", "yarn.lock", "package.json",
)
|
||||
|
||||
|
||||
def _find_and_parse_deps(
    repo_root: str,
    targets: List[str],
    max_files: int,
    deadline: float,
) -> List[Package]:
    """Walk repo and extract all packages from manifest files.

    Args:
        repo_root: directory to walk.
        targets: subset of {"python", "node"} selecting ecosystems to parse.
        max_files: hard cap on the number of manifest files parsed.
        deadline: time.monotonic() value after which the walk aborts.

    Returns:
        Deduplicated packages: for one (ecosystem, normalized name), a pinned
        entry wins over an unpinned one; otherwise the first seen wins.
    """
    all_packages: List[Package] = []
    files_scanned = 0
    want_python = "python" in targets
    want_node = "node" in targets

    for dirpath, dirnames, filenames in os.walk(repo_root):
        # Prune excluded/hidden dirs in-place so os.walk never descends.
        dirnames[:] = [
            d for d in dirnames
            if d not in EXCLUDED_DIRS and not d.startswith(".")
        ]
        if time.monotonic() > deadline:
            logger.warning("dependency_scanner: walk timeout")
            break
        if files_scanned >= max_files:
            # Fix: previously only the inner filename loop broke on the cap,
            # so the walk kept traversing every remaining directory doing no
            # useful work. Terminate the whole walk instead.
            break

        for fname in filenames:
            if files_scanned >= max_files:
                break
            full = os.path.join(dirpath, fname)
            if _is_excluded(full):
                continue

            rel = os.path.relpath(full, repo_root)

            if want_python:
                if fname in _PYTHON_MANIFESTS:
                    content = _read_file(full)
                    if fname == "poetry.lock":
                        all_packages.extend(_parse_poetry_lock(content, rel))
                    elif fname == "Pipfile.lock":
                        all_packages.extend(_parse_pipfile_lock(content, rel))
                    files_scanned += 1
                elif fname.endswith(".txt") and "requirements" in fname.lower():
                    content = _read_file(full)
                    all_packages.extend(_parse_requirements_txt(content, rel))
                    files_scanned += 1
                elif fname in _PYTHON_PYPROJECT:
                    content = _read_file(full)
                    all_packages.extend(_parse_pyproject_toml(content, rel))
                    files_scanned += 1

            if want_node and fname in _NODE_MANIFESTS:
                # Skip package.json when a lock file sibling exists — the
                # lock carries resolved (pinned) versions and supersedes it.
                if fname == "package.json":
                    lock_exists = any(
                        os.path.exists(os.path.join(dirpath, lock))
                        for lock in ("package-lock.json", "yarn.lock", "pnpm-lock.yaml")
                    )
                    if lock_exists:
                        continue
                content = _read_file(full)
                if fname == "package-lock.json":
                    all_packages.extend(_parse_package_lock_json(content, rel))
                elif fname == "pnpm-lock.yaml":
                    all_packages.extend(_parse_pnpm_lock(content, rel))
                elif fname == "yarn.lock":
                    all_packages.extend(_parse_yarn_lock(content, rel))
                elif fname == "package.json":
                    all_packages.extend(_parse_package_json(content, rel))
                files_scanned += 1

    # Deduplicate: prefer pinned over unpinned; otherwise first seen wins
    seen: Dict[str, Package] = {}
    for pkg in all_packages:
        key = f"{pkg.ecosystem}:{pkg.normalized_name}"
        if key not in seen or (not seen[key].pinned and pkg.pinned):
            seen[key] = pkg

    return list(seen.values())
|
||||
|
||||
|
||||
# ─── OSV Cache ────────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_osv_cache(cache_path: str) -> Dict[str, Any]:
|
||||
"""Load offline OSV cache from JSON file."""
|
||||
if not cache_path or not os.path.exists(cache_path):
|
||||
return {}
|
||||
try:
|
||||
with open(cache_path, "r") as f:
|
||||
data = json.load(f)
|
||||
return data.get("entries", {})
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not load OSV cache {cache_path}: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def _save_osv_cache(cache_path: str, entries: Dict[str, Any]):
|
||||
"""Persist updated cache entries to disk."""
|
||||
os.makedirs(os.path.dirname(os.path.abspath(cache_path)), exist_ok=True)
|
||||
existing = {}
|
||||
if os.path.exists(cache_path):
|
||||
try:
|
||||
with open(cache_path, "r") as f:
|
||||
existing = json.load(f)
|
||||
except Exception:
|
||||
pass
|
||||
existing_entries = existing.get("entries", {})
|
||||
existing_entries.update(entries)
|
||||
import datetime
|
||||
output = {
|
||||
"version": 1,
|
||||
"updated_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
|
||||
"entries": existing_entries,
|
||||
}
|
||||
with open(cache_path, "w") as f:
|
||||
json.dump(output, f, indent=2)
|
||||
|
||||
|
||||
# ─── OSV API ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _query_osv_online(
    packages: List[Package],
    new_cache: Dict[str, Any],
    deadline: float,
) -> Dict[str, List[Dict]]:
    """
    Query OSV.dev /v1/querybatch in batches.
    Returns {cache_key: [vuln_objects]}.

    Notes:
        - *new_cache* is an out-parameter: every successful lookup is also
          written into it (with a cached_at timestamp) for later persistence.
        - Unpinned packages and packages without a version are skipped and
          get no entry in the result.
        - A failed batch is logged and skipped (non-fatal); the overall
          *deadline* (time.monotonic() based) bounds both batching and each
          HTTP call's timeout.
    """
    try:
        import httpx
    except ImportError:
        # Online mode is optional — degrade to "no results" without httpx.
        logger.warning("httpx not available for OSV online query")
        return {}

    results: Dict[str, List[Dict]] = {}
    batches = [packages[i:i + OSV_BATCH_SIZE] for i in range(0, len(packages), OSV_BATCH_SIZE)]

    for batch in batches:
        if time.monotonic() > deadline:
            break
        queries = []
        batch_keys = []
        for pkg in batch:
            if not pkg.pinned or not pkg.version:
                continue
            queries.append({
                "package": {"name": pkg.normalized_name, "ecosystem": pkg.ecosystem},
                "version": pkg.version,
            })
            # batch_keys stays index-aligned with queries so the zip below
            # pairs each response entry with the right cache key.
            batch_keys.append(pkg.cache_key)

        if not queries:
            continue

        try:
            # Cap the per-request timeout by the remaining global budget
            # (at least 1s so httpx gets a sane value).
            remaining = max(1.0, deadline - time.monotonic())
            timeout = min(OSV_TIMEOUT_SEC, remaining)
            with httpx.Client(timeout=timeout) as client:
                resp = client.post(OSV_API_URL, json={"queries": queries})
                resp.raise_for_status()
                data = resp.json()
        except Exception as e:
            logger.warning(f"OSV query failed: {e}")
            continue

        # querybatch returns results in the same order as the queries.
        for key, result in zip(batch_keys, data.get("results", [])):
            vulns = result.get("vulns") or []
            results[key] = vulns
            new_cache[key] = {"vulns": vulns, "cached_at": _now_iso()}

    return results
|
||||
|
||||
|
||||
def _parse_osv_severity(vuln: Dict) -> str:
    """Extract best-effort severity from an OSV vuln object.

    Precedence: database_specific.severity → numeric severity[].score →
    ecosystem_specific.severity → "UNKNOWN".

    Returns one of the SEVERITY_ORDER keys or "UNKNOWN".
    """
    # 1) database_specific.severity (many databases provide this directly)
    db_specific = vuln.get("database_specific", {})
    sev = (db_specific.get("severity") or "").upper()
    if sev in SEVERITY_ORDER:
        return sev

    # 2) severity[].score — only trust plain numeric scores.
    # Fix: a CVSS vector string like "CVSS:3.1/AV:N/AC:L/..." carries no
    # base score; the previous regex extracted the first decimal it saw —
    # the spec version "3.1" — and misclassified every vector-scored vuln
    # as LOW. Vectors are now skipped instead of misread.
    for sev_entry in (vuln.get("severity") or []):
        score_str = str(sev_entry.get("score", "")).strip()
        if score_str.upper().startswith("CVSS"):
            continue  # vector string, not a base score
        num_m = re.fullmatch(r'\d+(?:\.\d+)?', score_str)
        if num_m:
            # Standard CVSS v3 qualitative bands.
            score = float(num_m.group(0))
            if score >= 9.0:
                return "CRITICAL"
            if score >= 7.0:
                return "HIGH"
            if score >= 4.0:
                return "MEDIUM"
            if score > 0:
                return "LOW"

    # 3) ecosystem_specific.severity as a last resort
    eco_specific = vuln.get("ecosystem_specific", {})
    sev = (eco_specific.get("severity") or "").upper()
    if sev in SEVERITY_ORDER:
        return sev

    return "UNKNOWN"
|
||||
|
||||
|
||||
def _extract_fixed_versions(vuln: Dict, pkg_name: str, ecosystem: str) -> List[str]:
    """Collect 'fixed' events for *pkg_name* from OSV affected[].ranges[]."""
    target = _normalize_pkg_name(pkg_name)
    eco_lower = ecosystem.lower()
    collected: Set[str] = set()
    for affected in (vuln.get("affected") or []):
        meta = affected.get("package", {})
        # Only ranges belonging to this exact package/ecosystem count.
        if (meta.get("ecosystem") or "").lower() != eco_lower:
            continue
        if _normalize_pkg_name(meta.get("name", "")) != target:
            continue
        for rng in (affected.get("ranges") or []):
            collected.update(
                event["fixed"]
                for event in (rng.get("events") or [])
                if "fixed" in event
            )
    return sorted(collected)
|
||||
|
||||
|
||||
def _lookup_vulnerability(
    pkg: Package,
    osv_vulns: List[Dict],
) -> List[Vulnerability]:
    """Convert raw OSV vuln objects for *pkg* into Vulnerability records."""
    converted: List[Vulnerability] = []
    for raw in osv_vulns:
        vuln_id = raw.get("id", "UNKNOWN")
        # Only CVE aliases are surfaced (GHSA/OSV ids stay in osv_id).
        cve_aliases = [a for a in (raw.get("aliases") or []) if a.startswith("CVE")]
        sev = _parse_osv_severity(raw)
        fixes = _extract_fixed_versions(raw, pkg.name, pkg.ecosystem)
        if fixes:
            advice = f"Upgrade {pkg.name} from {pkg.version} to {fixes[0]}"
        else:
            advice = f"No fix available for {pkg.name}@{pkg.version}. Monitor {vuln_id}."
        converted.append(Vulnerability(
            osv_id=vuln_id,
            ecosystem=pkg.ecosystem,
            package=pkg.name,
            version=pkg.version,
            severity=sev,
            fixed_versions=fixes,
            aliases=cve_aliases,
            evidence={
                "file": _redact(pkg.source_file),
                "details": f"{pkg.name}=={pkg.version} in {pkg.source_file}",
            },
            recommendation=advice,
        ))
    return converted
|
||||
|
||||
|
||||
# ─── Outdated Analysis ────────────────────────────────────────────────────────
|
||||
|
||||
def _analyze_outdated(
    packages: List[Package],
    vuln_results: Dict[str, List[Dict]],
) -> List[OutdatedPackage]:
    """
    Lockfile-only outdated analysis.
    Uses fixed_versions from OSV results as a hint for "newer version available".
    """
    def _numeric_key(version: str) -> List[int]:
        # Order versions by their numeric components only.
        return [int(x) for x in re.findall(r'\d+', version)]

    findings: List[OutdatedPackage] = []
    for pkg in packages:
        if not (pkg.pinned and pkg.version):
            continue
        for vuln in vuln_results.get(pkg.cache_key, []):
            fixed = _extract_fixed_versions(vuln, pkg.name, pkg.ecosystem)
            # Only fixed versions strictly newer than the current pin count.
            upgrades = [v for v in fixed if _compare_versions(v, pkg.version) > 0]
            if not upgrades:
                continue
            min_fix = min(upgrades, key=_numeric_key)
            findings.append(OutdatedPackage(
                ecosystem=pkg.ecosystem,
                package=pkg.name,
                current=pkg.version,
                latest=min_fix,
                notes=f"Security fix available (vuln: {vuln.get('id', '?')})",
            ))
            break  # one entry per package is enough
    return findings
|
||||
|
||||
|
||||
# ─── License Policy ───────────────────────────────────────────────────────────
|
||||
|
||||
def _apply_license_policy(
    packages: List[Package],
    policy_cfg: Dict,
) -> List[LicenseFinding]:
    """MVP: license data is rarely in lock files, so most will be UNKNOWN.

    NOTE(review): because ``license_str`` is hard-coded to "UNKNOWN" below,
    every package is skipped and this function currently always returns [].
    The deny/warn classification is scaffolding for when a real license
    source (registry metadata, SBOM) is wired in — confirm before relying
    on license gating.
    """
    if not policy_cfg.get("enabled", False):
        return []

    # Case-insensitive policy sets from config.
    deny_list = {l.upper() for l in (policy_cfg.get("deny") or [])}
    warn_list = {l.upper() for l in (policy_cfg.get("warn") or [])}
    findings = []

    for pkg in packages:
        # In MVP there's no way to get license from lockfile without network
        license_str = "UNKNOWN"
        if license_str == "UNKNOWN":
            continue  # skip unknown in MVP
        # Unreachable until license_str is sourced from real metadata:
        policy = "ok"
        if license_str.upper() in deny_list:
            policy = "deny"
        elif license_str.upper() in warn_list:
            policy = "warn"
        findings.append(LicenseFinding(
            package=pkg.name,
            license=license_str,
            policy=policy,
            recommendation=f"Review license {license_str} for {pkg.name}." if policy != "ok" else "",
        ))
    return findings
|
||||
|
||||
|
||||
# ─── Main Scanner ─────────────────────────────────────────────────────────────
|
||||
|
||||
def scan_dependencies(
    repo_root: str,
    targets: Optional[List[str]] = None,
    vuln_sources: Optional[Dict] = None,
    license_policy: Optional[Dict] = None,
    severity_thresholds: Optional[Dict] = None,
    outdated_cfg: Optional[Dict] = None,
    limits: Optional[Dict] = None,
    timeout_sec: float = 40.0,
) -> ScanResult:
    """
    Scan repo dependencies for vulnerabilities, outdated packages, license issues.

    Args:
        repo_root: absolute path to repo root
        targets: ["python", "node"] (default: both)
        vuln_sources: {"osv": {"enabled": true, "mode": "online|offline_cache", "cache_path": "..."}}
        license_policy: {"enabled": false, "deny": [...], "warn": [...]}
        severity_thresholds: {"fail_on": ["CRITICAL", "HIGH"], "warn_on": ["MEDIUM"]}
        outdated_cfg: {"enabled": true, "mode": "lockfile_only"}
        limits: {"max_files": 80, "max_deps": 2000, "max_vulns": 500}
        timeout_sec: hard deadline

    Returns:
        ScanResult with pass/fail verdict (fails when any fail_on-severity
        vuln or a denied license is found).
    """
    deadline = time.monotonic() + timeout_sec
    # Defaults are applied per-argument so callers can override any subset.
    targets = targets or ["python", "node"]
    vuln_sources = vuln_sources or {"osv": {"enabled": True, "mode": "offline_cache",
                                            "cache_path": "ops/cache/osv_cache.json"}}
    license_policy = license_policy or {"enabled": False}
    severity_thresholds = severity_thresholds or {"fail_on": ["CRITICAL", "HIGH"], "warn_on": ["MEDIUM"]}
    outdated_cfg = outdated_cfg or {"enabled": True, "mode": "lockfile_only"}
    limits = limits or {"max_files": 80, "max_deps": 2000, "max_vulns": 500}

    fail_on = {s.upper() for s in (severity_thresholds.get("fail_on") or ["CRITICAL", "HIGH"])}
    warn_on = {s.upper() for s in (severity_thresholds.get("warn_on") or ["MEDIUM"])}

    # ── Step 1: Extract dependencies ─────────────────────────────────────────
    all_packages = _find_and_parse_deps(
        repo_root, targets,
        max_files=limits.get("max_files", 80),
        deadline=deadline,
    )

    # Apply dep count limit
    max_deps = limits.get("max_deps", 2000)
    if len(all_packages) > max_deps:
        logger.warning(f"Dep count {len(all_packages)} > max {max_deps}, truncating")
        all_packages = all_packages[:max_deps]

    # Only pinned packages can be matched against OSV; unpinned ones are
    # reported in the recommendations instead.
    pinned = [p for p in all_packages if p.pinned and p.version]
    unpinned = [p for p in all_packages if not p.pinned or not p.version]

    # ── Step 2: Vulnerability lookup ─────────────────────────────────────────
    osv_cfg = vuln_sources.get("osv", {})
    osv_enabled = osv_cfg.get("enabled", True)
    osv_mode = osv_cfg.get("mode", "offline_cache")

    # Resolve cache path (absolute or relative to repo_root)
    cache_path_raw = osv_cfg.get("cache_path", "ops/cache/osv_cache.json")
    cache_path = (
        cache_path_raw if os.path.isabs(cache_path_raw)
        else os.path.join(repo_root, cache_path_raw)
    )

    cache_entries = _load_osv_cache(cache_path) if osv_enabled else {}
    new_cache: Dict[str, Any] = {}
    # vuln_results maps cache_key -> list of vulns, with None used as a
    # sentinel for "no data available" (distinct from [] = "no vulns found").
    vuln_results: Dict[str, List[Dict]] = {}

    if osv_enabled:
        # Populate from cache first
        cache_miss: List[Package] = []
        for pkg in pinned:
            key = pkg.cache_key
            if key in cache_entries:
                vuln_results[key] = (cache_entries[key] or {}).get("vulns", [])
            else:
                cache_miss.append(pkg)

        # Online query for cache misses
        if osv_mode == "online" and cache_miss and time.monotonic() < deadline:
            online_results = _query_osv_online(cache_miss, new_cache, deadline)
            vuln_results.update(online_results)
            # Mark remaining misses as UNKNOWN (no cache entry)
            for pkg in cache_miss:
                if pkg.cache_key not in vuln_results:
                    vuln_results[pkg.cache_key] = None  # type: ignore[assignment]
        else:
            # Offline: cache misses → UNKNOWN
            for pkg in cache_miss:
                vuln_results[pkg.cache_key] = None  # type: ignore[assignment]

    # Persist new cache entries if online mode
    if new_cache and osv_mode == "online":
        try:
            _save_osv_cache(cache_path, new_cache)
        except Exception as e:
            # Cache persistence is best-effort; scan results are unaffected.
            logger.warning(f"Could not save OSV cache: {e}")

    # ── Step 3: Build vulnerability findings ─────────────────────────────────
    all_vulns: List[Vulnerability] = []
    cache_miss_pkgs: List[Package] = []

    for pkg in pinned:
        key = pkg.cache_key
        raw_vulns = vuln_results.get(key)
        if raw_vulns is None:
            # Sentinel from Step 2: no OSV data for this package.
            cache_miss_pkgs.append(pkg)
            continue
        vulns = _lookup_vulnerability(pkg, raw_vulns)
        all_vulns.extend(vulns)

    # Apply vuln limit
    max_vulns = limits.get("max_vulns", 500)
    all_vulns = all_vulns[:max_vulns]

    # Sort by severity desc
    all_vulns.sort(key=lambda v: SEVERITY_ORDER.get(v.severity, 0), reverse=True)

    # ── Step 4: Outdated ──────────────────────────────────────────────────────
    outdated: List[OutdatedPackage] = []
    if outdated_cfg.get("enabled", True):
        # Drop the None sentinels — outdated analysis needs real vuln lists.
        outdated = _analyze_outdated(pinned, {
            k: v for k, v in vuln_results.items() if v is not None
        })

    # ── Step 5: License policy ────────────────────────────────────────────────
    licenses = _apply_license_policy(all_packages, license_policy)

    # ── Step 6: Compute pass/fail ─────────────────────────────────────────────
    by_severity: Dict[str, int] = {s: 0 for s in SEVERITY_ORDER}
    for v in all_vulns:
        by_severity[v.severity] = by_severity.get(v.severity, 0) + 1

    blocking_count = sum(by_severity.get(s, 0) for s in fail_on)
    warning_count = sum(by_severity.get(s, 0) for s in warn_on)

    # License denials also block
    denied_licenses = [lf for lf in licenses if lf.policy == "deny"]
    if denied_licenses:
        blocking_count += len(denied_licenses)

    pass_ = blocking_count == 0

    # ── Step 7: Build recommendations ────────────────────────────────────────
    recommendations: List[str] = []
    if blocking_count > 0:
        # Surface at most the top 3 blocking vulns (list is severity-sorted).
        top_crit = [v for v in all_vulns if v.severity in fail_on][:3]
        for v in top_crit:
            recommendations.append(v.recommendation)
    if warning_count > 0:
        recommendations.append(
            f"{warning_count} MEDIUM severity vulnerabilities found — review and upgrade where possible."
        )
    if cache_miss_pkgs:
        recommendations.append(
            f"{len(cache_miss_pkgs)} packages have no OSV cache entry (severity UNKNOWN). "
            "Run in online mode to populate cache: mode=online."
        )
    if unpinned:
        recommendations.append(
            f"{len(unpinned)} unpinned dependencies detected — cannot check for vulnerabilities. "
            "Pin versions in requirements.txt/lock files."
        )

    # ── Step 8: Summary ───────────────────────────────────────────────────────
    ecosystems_found = sorted({p.ecosystem for p in all_packages})
    # deadline - timeout_sec reconstructs the start timestamp.
    elapsed_ms = round((time.monotonic() - (deadline - timeout_sec)) * 1000, 1)

    if pass_:
        summary = (
            f"✅ Dependency scan PASSED. "
            f"{len(pinned)} deps scanned, {len(all_vulns)} vulns found "
            f"({by_severity.get('CRITICAL', 0)} critical, {by_severity.get('HIGH', 0)} high)."
        )
    else:
        summary = (
            f"❌ Dependency scan FAILED. "
            f"{blocking_count} blocking issue(s): "
            f"{by_severity.get('CRITICAL', 0)} CRITICAL, {by_severity.get('HIGH', 0)} HIGH"
            + (f", {len(denied_licenses)} denied licenses" if denied_licenses else "")
            + "."
        )

    stats = {
        "ecosystems": ecosystems_found,
        "files_scanned": len(set(p.source_file for p in all_packages)),
        "deps_total": len(all_packages),
        "deps_pinned": len(pinned),
        "deps_unresolved": len(cache_miss_pkgs),
        "vulns_total": len(all_vulns),
        "by_severity": by_severity,
        "outdated_total": len(outdated),
        "elapsed_ms": elapsed_ms,
    }

    return ScanResult(
        pass_=pass_,
        summary=summary,
        stats=stats,
        vulnerabilities=[_vuln_to_dict(v) for v in all_vulns],
        outdated=[_outdated_to_dict(o) for o in outdated],
        licenses=[_license_to_dict(lf) for lf in licenses],
        recommendations=list(dict.fromkeys(recommendations)),  # dedupe
    )
|
||||
|
||||
|
||||
def scan_dependencies_dict(repo_root: str, **kwargs) -> Dict:
    """Convenience wrapper returning plain dict for ToolResult."""
    scan = scan_dependencies(repo_root, **kwargs)
    # "pass" is a keyword, so the attribute is pass_; the rest map 1:1.
    payload: Dict = {"pass": scan.pass_}
    for attr in ("summary", "stats", "vulnerabilities",
                 "outdated", "licenses", "recommendations"):
        payload[attr] = getattr(scan, attr)
    return payload
|
||||
|
||||
|
||||
# ─── Serializers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _vuln_to_dict(v: Vulnerability) -> Dict:
    """Serialize a Vulnerability, redacting secrets from evidence values."""
    redacted_evidence = {key: _redact(value) for key, value in v.evidence.items()}
    return {
        "id": v.osv_id,
        "ecosystem": v.ecosystem,
        "package": v.package,
        "version": v.version,
        "severity": v.severity,
        "fixed_versions": v.fixed_versions,
        "aliases": v.aliases,
        "evidence": redacted_evidence,
        "recommendation": v.recommendation,
    }
|
||||
|
||||
|
||||
def _outdated_to_dict(o: OutdatedPackage) -> Dict:
|
||||
return {
|
||||
"ecosystem": o.ecosystem,
|
||||
"package": o.package,
|
||||
"current": o.current,
|
||||
"latest": o.latest,
|
||||
"notes": o.notes,
|
||||
}
|
||||
|
||||
|
||||
def _license_to_dict(lf: LicenseFinding) -> Dict:
|
||||
return {
|
||||
"package": lf.package,
|
||||
"license": lf.license,
|
||||
"policy": lf.policy,
|
||||
"recommendation": lf.recommendation,
|
||||
}
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
import datetime
|
||||
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
||||
898
services/router/drift_analyzer.py
Normal file
898
services/router/drift_analyzer.py
Normal file
@@ -0,0 +1,898 @@
|
||||
"""
|
||||
Drift Analyzer — знаходить розбіжності між "джерелами правди" та "фактом".
|
||||
|
||||
4 категорії перевірок (незалежні, кожна повертає findings):
|
||||
1. services — Service Catalog (inventory_services.csv / 01_SERVICE_CATALOG.md) vs docker-compose*.yml
|
||||
2. openapi — OpenAPI specs (docs/contracts/*.yaml) vs routes у коді (FastAPI decorators)
|
||||
3. nats — inventory_nats_topics.csv vs publish/subscribe usage у коді
|
||||
4. tools — tools_rollout.yml + rbac_tools_matrix.yml vs фактичні handlers у tool_manager.py
|
||||
|
||||
Формат findings:
|
||||
{ category, severity, id, title, evidence: {path, lines, details}, recommended_fix }
|
||||
|
||||
Pass rule: pass=false якщо errors > 0. Warnings/infos не валять gate.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import fnmatch
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import yaml
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, FrozenSet, List, Optional, Set, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Directory names skipped both when walking (pruned in-place) and when
# testing individual file paths.
EXCLUDED_DIRS: FrozenSet[str] = frozenset({
    "node_modules", ".git", "dist", "build", "vendor",
    ".venv", "venv", "venv_models", "sofia_venv",
    "__pycache__", ".pytest_cache", "rollback_backups",
    # NOTE(review): this entry is a two-segment path, but _is_excluded and
    # _walk_files compare individual path components against this set —
    # as written it can never match; confirm intended behavior.
    "docs/consolidation",
})

# Resource caps keeping a full drift analysis bounded.
MAX_FILES_PER_CATEGORY = 300
MAX_BYTES_PER_FILE = 262144  # 256KB
TIMEOUT_SEC = 25.0  # Hard deadline per full analysis
|
||||
# Known tool handlers (must be kept in sync with execute_tool dispatch in tool_manager.py)
|
||||
# Source: Priority 1–17 handlers in tool_manager.py
|
||||
# Tool names whose handlers are assumed to exist; the "tools" drift checks
# compare rollout/RBAC config entries against this set.
KNOWN_TOOL_HANDLERS: FrozenSet[str] = frozenset({
    "memory_search", "graph_query",
    "web_search", "web_extract",
    "image_generate", "comfy_generate_image", "comfy_generate_video",
    "remember_fact",
    "presentation_create", "presentation_status", "presentation_download",
    "crawl4ai_scrape", "tts_speak", "file_tool",
    "market_data",
    "crm_search_client", "crm_upsert_client", "crm_upsert_site",
    "crm_upsert_window_unit", "crm_create_quote", "crm_update_quote",
    "crm_create_job", "calc_window_quote",
    "docs_render_quote_pdf", "docs_render_invoice_pdf",
    "schedule_propose_slots", "schedule_confirm_slot",
    "repo_tool", "pr_reviewer_tool", "contract_tool",
    "oncall_tool", "observability_tool", "config_linter_tool",
    "threatmodel_tool", "job_orchestrator_tool", "kb_tool",
    "drift_analyzer_tool",  # self-registration
})
|
||||
|
||||
# ─── Data Structures ──────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class Finding:
    """One drift finding: a single mismatch between a source of truth and fact."""

    category: str  # one of the 4 check categories (services/openapi/nats/tools)
    severity: str  # "error" | "warning" | "info"
    id: str  # stable finding id, e.g. "DRIFT-SVC-001"
    title: str
    evidence: Dict[str, str] = field(default_factory=dict)
    recommended_fix: str = ""

    def to_dict(self) -> Dict:
        """Return a JSON-serializable view of this finding."""
        return dict(
            category=self.category,
            severity=self.severity,
            id=self.id,
            title=self.title,
            evidence=self.evidence,
            recommended_fix=self.recommended_fix,
        )
|
||||
|
||||
|
||||
@dataclass
class DriftReport:
    """Aggregate result of one full drift analysis run."""

    pass_: bool  # False when any error-severity finding exists
    summary: str  # human-readable one-liner
    stats: Dict[str, Any]  # per-category counters
    findings: List[Dict]  # Finding.to_dict() items
|
||||
|
||||
|
||||
# ─── Utility helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def _is_excluded(path: str) -> bool:
    """Check if *path* falls under any excluded directory.

    Single-name entries (e.g. "node_modules") match any path component.
    Multi-segment entries (e.g. "docs/consolidation") match a consecutive
    run of components. Fix: previously only individual components were
    compared against the set, so multi-segment entries could never match.
    """
    parts = Path(path).parts
    if any(p in EXCLUDED_DIRS for p in parts):
        return True
    for entry in EXCLUDED_DIRS:
        if "/" not in entry:
            continue
        sub = tuple(entry.split("/"))
        span = len(sub)
        if any(parts[i:i + span] == sub for i in range(len(parts) - span + 1)):
            return True
    return False
|
||||
|
||||
|
||||
def _walk_files(root: str, extensions: Tuple[str, ...],
                deadline: float) -> List[str]:
    """
    Walk *root* collecting files whose names end with *extensions*.
    Honours EXCLUDED_DIRS pruning, the MAX_FILES_PER_CATEGORY cap and the
    hard time.monotonic()-based *deadline*.
    """
    collected: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place pruning keeps os.walk from descending into excluded dirs.
        dirnames[:] = [
            d for d in dirnames
            if d not in EXCLUDED_DIRS and not d.startswith(".")
        ]
        if time.monotonic() > deadline:
            logger.warning("_walk_files: timeout reached")
            break
        for fname in filenames:
            if not fname.endswith(extensions):
                continue
            candidate = os.path.join(dirpath, fname)
            if not _is_excluded(candidate):
                collected.append(candidate)
            if len(collected) >= MAX_FILES_PER_CATEGORY:
                return collected
    return collected
|
||||
|
||||
|
||||
def _read_file(path: str) -> str:
    """Read *path* as text (decode errors replaced), truncated to
    MAX_BYTES_PER_FILE. Returns "" when the file cannot be read at all."""
    try:
        oversized = os.path.getsize(path) > MAX_BYTES_PER_FILE
        with open(path, "r", errors="replace") as fh:
            return fh.read(MAX_BYTES_PER_FILE) if oversized else fh.read()
    except Exception:
        return ""
|
||||
|
||||
|
||||
_SECRET_PAT = re.compile(
|
||||
r'(?i)(api[_-]?key|token|secret|password|bearer|jwt|private[_-]?key)'
|
||||
r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?'
|
||||
)
|
||||
|
||||
|
||||
def _redact_evidence(text: str) -> str:
|
||||
"""Mask potential secrets in evidence strings."""
|
||||
return _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***REDACTED***", text)
|
||||
|
||||
|
||||
def _rel(path: str, root: str) -> str:
|
||||
"""Return path relative to root, or absolute if outside."""
|
||||
try:
|
||||
return os.path.relpath(path, root)
|
||||
except ValueError:
|
||||
return path
|
||||
|
||||
|
||||
# ─── Category 1: Services ─────────────────────────────────────────────────────
|
||||
|
||||
def _load_service_catalog(repo_root: str) -> Dict[str, str]:
|
||||
"""
|
||||
Load services from inventory_services.csv.
|
||||
Returns {service_name: status}.
|
||||
"""
|
||||
csv_path = os.path.join(
|
||||
repo_root, "docs", "architecture_inventory", "inventory_services.csv"
|
||||
)
|
||||
services = {}
|
||||
if not os.path.exists(csv_path):
|
||||
# Fallback: scan 01_SERVICE_CATALOG.md for table rows
|
||||
md_path = os.path.join(
|
||||
repo_root, "docs", "architecture_inventory", "01_SERVICE_CATALOG.md"
|
||||
)
|
||||
if os.path.exists(md_path):
|
||||
content = _read_file(md_path)
|
||||
for line in content.splitlines():
|
||||
m = re.match(r'\|\s*([\w\-]+)\s*\|\s*(DEPLOYED|DEFINED|PLANNED[^\|]*)', line)
|
||||
if m:
|
||||
services[m.group(1).strip()] = m.group(2).strip()
|
||||
return services
|
||||
|
||||
try:
|
||||
with open(csv_path, "r", newline="", errors="replace") as f:
|
||||
reader = csv.DictReader(f)
|
||||
for row in reader:
|
||||
name = (row.get("service") or "").strip()
|
||||
status = (row.get("type") or "").strip() # csv has 'type' not 'status'
|
||||
if name:
|
||||
services[name] = status
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not load inventory_services.csv: {e}")
|
||||
return services
|
||||
|
||||
|
||||
def _load_compose_services(repo_root: str, deadline: float) -> Dict[str, str]:
    """
    Parse docker-compose*.yml files and return {service_name: compose_file}.
    Looks at repo-root compose files plus infra/compose/docker-compose.yml.
    """
    candidates = [
        os.path.join(repo_root, entry)
        for entry in os.listdir(repo_root)
        if fnmatch.fnmatch(entry, "docker-compose*.yml")
    ]
    # The infra/ tree keeps an additional compose file outside the root.
    infra_compose = os.path.join(repo_root, "infra", "compose", "docker-compose.yml")
    if os.path.exists(infra_compose):
        candidates.append(infra_compose)

    mapping: Dict[str, str] = {}
    for compose_path in candidates:
        if time.monotonic() > deadline:
            break
        try:
            parsed = yaml.safe_load(_read_file(compose_path)) or {}
            for svc_name in (parsed.get("services") or {}):
                # Later files win on name collision (same as before).
                mapping[svc_name] = _rel(compose_path, repo_root)
        except Exception as e:
            logger.debug(f"Could not parse {compose_path}: {e}")
    return mapping
|
||||
|
||||
|
||||
def _analyze_services(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict]:
    """Category 1: service catalog (csv/md) vs docker-compose drift.

    Emits:
        DRIFT-SVC-001 (error)   — DEPLOYED in catalog, absent from compose.
        DRIFT-SVC-002 (warning) — in compose, absent from catalog.

    Name comparison tolerates dash/underscore spelling variants in both
    directions.
    """
    findings: List[Finding] = []
    catalog = _load_service_catalog(repo_root)
    compose_svcs = _load_compose_services(repo_root, deadline)

    compose_names = set(compose_svcs.keys())
    catalog_names = set(catalog.keys())

    def _variants(name: str) -> Set[str]:
        # Treat dash and underscore spellings as the same service.
        return {name, name.replace("-", "_"), name.replace("_", "-")}

    # DEPLOYED in catalog but missing from ALL compose files
    for svc, status in catalog.items():
        if "DEPLOYED" in status.upper() and not _variants(svc).intersection(compose_names):
            findings.append(Finding(
                category="services",
                severity="error",
                id="DRIFT-SVC-001",
                title=f"Service '{svc}' marked DEPLOYED in catalog but absent from all docker-compose files",
                evidence={"path": "docs/architecture_inventory/inventory_services.csv",
                          "details": f"status={status}, not found in compose"},
                recommended_fix=f"Add '{svc}' to appropriate docker-compose*.yml or update catalog status to DEFINED.",
            ))

    # In compose but not mentioned in catalog at all.
    # Fix: the old check built its one alternate name via chained
    # .replace("-", "_").replace("_", "-"), which collapses to the all-dashes
    # spelling only — an underscore-named catalog entry was never recognized
    # for a dash-named compose service. Both variants are checked now.
    for svc, compose_file in compose_svcs.items():
        if not _variants(svc).intersection(catalog_names):
            findings.append(Finding(
                category="services",
                severity="warning",
                id="DRIFT-SVC-002",
                title=f"Service '{svc}' found in compose but not in service catalog",
                evidence={"path": compose_file, "details": f"defined in {compose_file}"},
                recommended_fix=f"Add '{svc}' to inventory_services.csv / 01_SERVICE_CATALOG.md.",
            ))

    stats = {
        "catalog_entries": len(catalog),
        "compose_services": len(compose_svcs),
        "findings": len(findings),
    }
    return findings, stats
|
||||
|
||||
|
||||
# ─── Category 2: OpenAPI ──────────────────────────────────────────────────────
|
||||
|
||||
def _load_openapi_paths(repo_root: str, deadline: float) -> Dict[str, Set[str]]:
    """
    Collect OpenAPI path definitions from docs/contracts/* plus any
    openapi*.yaml/yml/json file found elsewhere in the repository.

    Returns a mapping {"/path": {"get", "post", ...}}.
    """
    http_verbs = {"get", "post", "put", "patch", "delete", "head", "options"}

    candidates: List[str] = []
    contracts_dir = os.path.join(repo_root, "docs", "contracts")
    if os.path.isdir(contracts_dir):
        candidates.extend(
            os.path.join(contracts_dir, name)
            for name in os.listdir(contracts_dir)
            if name.endswith((".yaml", ".yml", ".json"))
        )

    # Pick up stray openapi*.{yaml,yml,json} specs anywhere in the tree,
    # honoring the shared deadline and skipping excluded/hidden dirs.
    for dirpath, dirnames, filenames in os.walk(repo_root):
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS and not d.startswith(".")]
        if time.monotonic() > deadline:
            break
        for name in filenames:
            if re.match(r'openapi.*\.(ya?ml|json)$', name, re.IGNORECASE):
                full = os.path.join(dirpath, name)
                if full not in candidates:
                    candidates.append(full)

    result: Dict[str, Set[str]] = {}
    for spec in candidates:
        if time.monotonic() > deadline:
            break
        try:
            raw = _read_file(spec)
            parsed = yaml.safe_load(raw) if spec.endswith((".yaml", ".yml")) else json.loads(raw)
            if not isinstance(parsed, dict) or "paths" not in parsed:
                continue
            for api_path, ops in (parsed.get("paths") or {}).items():
                if not isinstance(ops, dict):
                    continue
                verbs = {op.lower() for op in ops if op.lower() in http_verbs}
                result.setdefault(api_path, set()).update(verbs)
        except Exception as e:
            # Best-effort parse: a broken spec is logged and skipped.
            logger.debug(f"Could not parse OpenAPI spec {spec}: {e}")

    return result
|
||||
|
||||
|
||||
# Matches FastAPI decorator registrations: @app.get("/path"), @router.post('/path').
# Group 1 = HTTP verb, group 2 = path literal.
_FASTAPI_ROUTE_PAT = re.compile(
    r'@(?:app|router)\.(get|post|put|patch|delete|head|options)\(\s*[\'"]([^\'"]+)[\'"]',
    re.MULTILINE,
)
# Matches imperative registration: app.add_api_route("/path", ..., methods=["GET"]).
# DOTALL lets the `methods=` kwarg appear on a later line than the path literal.
_ADD_API_ROUTE_PAT = re.compile(
    r'\.add_api_route\(\s*[\'"]([^\'"]+)[\'"].*?methods\s*=\s*\[([^\]]+)\]',
    re.MULTILINE | re.DOTALL,
)
|
||||
|
||||
|
||||
def _load_code_routes(repo_root: str, deadline: float) -> Dict[str, Set[str]]:
    """
    Extract FastAPI route registrations from Python sources.

    Returns {"/path": {"get", "post", ...}} with trailing slashes stripped.
    """
    routes: Dict[str, Set[str]] = {}

    for source_path in _walk_files(repo_root, (".py",), deadline):
        if time.monotonic() > deadline:
            break
        # Skip vendored / virtualenv code.
        if ".venv" in source_path or "venv" in source_path or "node_modules" in source_path:
            continue
        text = _read_file(source_path)
        if not text:
            continue

        # Decorator style: @app.get("/x") / @router.post("/y").
        for verb, raw_path in _FASTAPI_ROUTE_PAT.findall(text):
            key = raw_path.rstrip("/") or "/"
            routes.setdefault(key, set()).add(verb.lower())

        # Imperative style: app.add_api_route("/x", handler, methods=["GET"]).
        for raw_path, raw_methods in _ADD_API_ROUTE_PAT.findall(text):
            verbs = {part.strip().strip('"\'').lower() for part in raw_methods.split(",")}
            key = raw_path.rstrip("/") or "/"
            routes.setdefault(key, set()).update(verbs)

    return routes
|
||||
|
||||
|
||||
def _normalize_path(path: str) -> str:
|
||||
"""Normalize OAS path for comparison: remove trailing slash, lowercase."""
|
||||
return path.rstrip("/").lower() or "/"
|
||||
|
||||
|
||||
# Paths that are infrastructure-level and expected to be missing from OAS specs.
|
||||
# Add /internal/* and /debug/* patterns if your project uses them.
|
||||
_OAS_IGNORE_PATH_PREFIXES: Tuple[str, ...] = (
|
||||
"/healthz", "/readyz", "/livez", "/metrics",
|
||||
"/internal/", "/debug/", "/__", "/favicon",
|
||||
)
|
||||
|
||||
|
||||
def _is_oas_ignored(path: str) -> bool:
|
||||
"""Return True if path is on the OAS ignore allowlist."""
|
||||
p = path.lower()
|
||||
return any(p == prefix.rstrip("/") or p.startswith(prefix)
|
||||
for prefix in _OAS_IGNORE_PATH_PREFIXES)
|
||||
|
||||
|
||||
def _load_openapi_deprecated(repo_root: str) -> Set[str]:
    """
    Return normalized paths marked as 'deprecated: true' in any OAS spec.

    Deprecated endpoints downgrade from error to warning (DRIFT-OAS-001).

    NOTE: unlike _load_openapi_paths, this walk takes no deadline; it is a
    secondary pass over the same small set of spec files.
    """
    deprecated: Set[str] = set()
    spec_files: List[str] = []
    for dirpath, dirnames, filenames in os.walk(repo_root):
        dirnames[:] = [d for d in dirnames if d not in EXCLUDED_DIRS and not d.startswith(".")]
        for f in filenames:
            if re.match(r'openapi.*\.(ya?ml|json)$', f, re.IGNORECASE):
                spec_files.append(os.path.join(dirpath, f))

    for sf in spec_files:
        try:
            content = _read_file(sf)
            data = yaml.safe_load(content) if sf.endswith((".yaml", ".yml")) else json.loads(content)
            if not isinstance(data, dict) or "paths" not in data:
                continue
            for path, methods in (data.get("paths") or {}).items():
                if not isinstance(methods, dict):
                    continue
                # A path counts as deprecated if ANY of its operations is.
                for method, operation in methods.items():
                    if isinstance(operation, dict) and operation.get("deprecated", False):
                        deprecated.add(_normalize_path(path))
        except Exception as e:
            # Was a bare `pass`: keep best-effort semantics (a broken spec must
            # not fail the analysis) but leave a trace for debugging.
            logger.debug(f"Could not scan OpenAPI spec for deprecations {sf}: {e}")
    return deprecated
|
||||
|
||||
|
||||
def _analyze_openapi(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict]:
    """Compare OpenAPI specs against FastAPI routes discovered in code.

    Emits:
      - DRIFT-OAS-001 (error; warning if deprecated): spec path absent in code
      - DRIFT-OAS-003 (warning): path exists but spec methods missing in code
      - DRIFT-OAS-002 (error): /v1/ code route not documented in any spec

    Returns (findings, stats).
    """
    findings = []
    spec_paths = _load_openapi_paths(repo_root, deadline)
    code_routes = _load_code_routes(repo_root, deadline)

    if not spec_paths:
        # No specs found at all — nothing to diff against; report counts only.
        return findings, {"spec_paths": 0, "code_routes": len(code_routes), "findings": 0}

    deprecated_paths = _load_openapi_deprecated(repo_root)

    # Normalize both sides (trailing slash, case) before comparing.
    spec_norm: Dict[str, Set[str]] = {
        _normalize_path(p): methods for p, methods in spec_paths.items()
    }
    code_norm: Dict[str, Set[str]] = {
        _normalize_path(p): methods for p, methods in code_routes.items()
    }

    # DRIFT-OAS-001: In spec but not in code
    for path, methods in sorted(spec_norm.items()):
        # Skip infra/health endpoints — they are expected to be absent from OAS
        if _is_oas_ignored(path):
            continue
        if path not in code_norm:
            # Deprecated spec paths → warning only, not blocking
            severity = "warning" if path in deprecated_paths else "error"
            dep_note = " (deprecated in spec)" if path in deprecated_paths else ""
            findings.append(Finding(
                category="openapi",
                severity=severity,
                id="DRIFT-OAS-001",
                title=f"OpenAPI path '{path}'{dep_note} not found in codebase routes",
                evidence={"path": "docs/contracts/",
                          "details": f"methods={sorted(methods)}, missing from FastAPI decorators"},
                recommended_fix=(
                    f"Mark '{path}' as removed in OpenAPI or implement the route."
                    if path in deprecated_paths
                    else f"Implement '{path}' route in code or remove from OpenAPI spec."
                ),
            ))
        else:
            # DRIFT-OAS-003: Method mismatch
            code_methods = code_norm[path]
            missing_in_code = methods - code_methods
            if missing_in_code:
                findings.append(Finding(
                    category="openapi",
                    severity="warning",
                    id="DRIFT-OAS-003",
                    title=f"Method mismatch for path '{path}': spec has {sorted(missing_in_code)}, code missing",
                    evidence={"path": "docs/contracts/",
                              "details": f"spec={sorted(methods)}, code={sorted(code_methods)}"},
                    recommended_fix=f"Add missing HTTP methods to code route for '{path}'.",
                ))

    # DRIFT-OAS-002: In code (/v1/ paths) but not in spec
    # Only versioned public routes are required to be documented.
    for path, methods in sorted(code_norm.items()):
        # Health/internal endpoints are expected to be absent from OAS
        if _is_oas_ignored(path):
            continue
        if not path.startswith("/v1/"):
            continue
        if path not in spec_norm:
            findings.append(Finding(
                category="openapi",
                severity="error",
                id="DRIFT-OAS-002",
                title=f"Code route '{path}' not documented in any OpenAPI spec",
                evidence={"path": "services/", "details": f"methods={sorted(methods)}"},
                recommended_fix=f"Add '{path}' to OpenAPI spec in docs/contracts/.",
            ))

    stats = {
        "spec_paths": len(spec_paths),
        "code_routes": len(code_routes),
        "findings": len(findings),
    }
    return findings, stats
|
||||
|
||||
|
||||
# ─── Category 3: NATS ─────────────────────────────────────────────────────────
|
||||
|
||||
_NATS_WILDCARD_PAT = re.compile(r'\{[^}]+\}|\*|>') # {agent_id}, *, >
|
||||
|
||||
def _normalize_nats_subject(subj: str) -> str:
|
||||
"""Replace wildcards with * for matching. Lowercase."""
|
||||
return _NATS_WILDCARD_PAT.sub("*", subj.strip()).lower()
|
||||
|
||||
|
||||
def _load_nats_inventory(repo_root: str) -> Optional[List[str]]:
    """
    Load documented NATS subjects from inventory_nats_topics.csv.

    Returns the normalized subject list, or None when the CSV is absent or
    unreadable (callers treat None as "skip NATS analysis").
    """
    csv_path = os.path.join(
        repo_root, "docs", "architecture_inventory", "inventory_nats_topics.csv"
    )
    if not os.path.exists(csv_path):
        return None

    normalized: List[str] = []
    try:
        with open(csv_path, "r", newline="", errors="replace") as fh:
            for row in csv.DictReader(fh):
                raw = (row.get("subject") or "").strip()
                if raw:
                    normalized.append(_normalize_nats_subject(raw))
    except Exception as e:
        logger.warning(f"Could not load nats inventory: {e}")
        return None
    return normalized
|
||||
|
||||
|
||||
# Regexes that pull candidate NATS subjects out of Python source, ordered
# roughly from most to least specific. The final pattern is a broad
# dotted-string heuristic and relies on _NATS_SUBJECT_VALIDATE (plus the
# "must contain a dot" check at the call site) to filter noise.
_NATS_USAGE_PATTERNS = [
    re.compile(r'(?:nc|nats|js|jetstream)\.publish\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'(?:nc|nats|js|jetstream)\.subscribe\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'nc\.subscribe\([\'"]([a-zA-Z0-9._{}*>-]+)[\'"]', re.IGNORECASE),
    re.compile(r'subject\s*=\s*[\'"]([a-zA-Z0-9._{}*>-]{4,})[\'"]', re.IGNORECASE),
    re.compile(r'SUBJECT\s*=\s*[\'"]([a-zA-Z0-9._{}*>-]{4,})[\'"]'),
    re.compile(r'[\'"]([a-z][a-z0-9_]+\.[a-z][a-z0-9_]+(?:\.[a-zA-Z0-9_{}_.*>-]+){0,4})[\'"]'),
]

# Sanity filter for extracted candidates: leading letter, then at least two
# more subject-ish characters (letters, digits, ., _, {, }, *, >, -).
_NATS_SUBJECT_VALIDATE = re.compile(r'^[a-zA-Z][a-zA-Z0-9._{}*>-]{2,}$')
|
||||
|
||||
|
||||
# Cheap substring hints: a file containing none of these cannot have a NATS
# publish/subscribe call, so the expensive regex pass is skipped entirely.
# Hoisted to module level — it was previously rebuilt on every loop iteration.
_NATS_CALL_HINTS = ("nc.", "nats.", "js.", "jetstream.", "subject=", "SUBJECT=", ".publish(", ".subscribe(")


def _load_nats_code_subjects(repo_root: str, deadline: float) -> Set[str]:
    """Extract NATS subjects from Python sources via regex patterns.

    Best-effort static scan: honors `deadline`, skips vendored directories,
    and validates candidates (must contain a dot and pass
    _NATS_SUBJECT_VALIDATE) before normalizing and collecting them.
    """
    py_files = _walk_files(repo_root, (".py",), deadline)
    found: Set[str] = set()

    for pf in py_files:
        if time.monotonic() > deadline:
            break
        # "venv" also covers ".venv" as a substring.
        if "venv" in pf or "node_modules" in pf:
            continue
        content = _read_file(pf)
        if not content:
            continue
        # Quick pre-filter: must contain at least one NATS-like call pattern
        if not any(hint in content for hint in _NATS_CALL_HINTS):
            continue

        for pat in _NATS_USAGE_PATTERNS:
            for m in pat.finditer(content):
                subj = m.group(1).strip()
                # Basic subject validation (must contain a dot)
                if "." in subj and _NATS_SUBJECT_VALIDATE.match(subj):
                    found.add(_normalize_nats_subject(subj))

    return found
|
||||
|
||||
|
||||
def _nats_subject_matches(code_subj: str, inventory_subjects: List[str]) -> bool:
    """
    Check whether a code subject matches any inventory subject.

    Wildcard-aware: supports * (one segment) and > (trailing segments). The
    comparison is tried in both directions because either side may carry
    wildcards.
    """
    code_parts = code_subj.split(".")
    return any(
        _nats_match(code_parts, inv.split(".")) or _nats_match(inv.split("."), code_parts)
        for inv in inventory_subjects
    )
|
||||
|
||||
|
||||
def _nats_match(a_parts: List[str], b_parts: List[str]) -> bool:
|
||||
"""Match NATS subject a against pattern b (with * and > wildcards)."""
|
||||
if not b_parts:
|
||||
return not a_parts
|
||||
if b_parts[-1] == ">":
|
||||
return len(a_parts) >= len(b_parts) - 1
|
||||
if len(a_parts) != len(b_parts):
|
||||
return False
|
||||
for a, b in zip(a_parts, b_parts):
|
||||
if b == "*" or a == "*":
|
||||
continue
|
||||
if a != b:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _analyze_nats(repo_root: str, deadline: float) -> Tuple[List[Finding], Dict, bool]:
    """Returns (findings, stats, skipped).

    skipped=True (with empty findings) when inventory_nats_topics.csv is
    absent — without the documented baseline there is nothing to diff.

    Emits:
      - DRIFT-NATS-001 (warning): subject used in code, missing in inventory
      - DRIFT-NATS-002 (info): documented subject with no code usage
    """
    inventory = _load_nats_inventory(repo_root)
    if inventory is None:
        return [], {"skipped": True}, True

    code_subjects = _load_nats_code_subjects(repo_root, deadline)
    findings = []

    # DRIFT-NATS-001: Used in code but not in inventory
    for subj in sorted(code_subjects):
        if not _nats_subject_matches(subj, inventory):
            findings.append(Finding(
                category="nats",
                severity="warning",
                id="DRIFT-NATS-001",
                title=f"NATS subject '{subj}' used in code but not in inventory",
                evidence={"path": "docs/architecture_inventory/inventory_nats_topics.csv",
                          "details": f"subject '{subj}' not found (wildcard-aware match)"},
                recommended_fix=f"Add '{subj}' to inventory_nats_topics.csv.",
            ))

    # DRIFT-NATS-002: In inventory but not used in code (info — may be legacy)
    for inv_subj in inventory:
        if inv_subj.endswith(".*") or inv_subj.endswith(".>"):
            continue  # wildcard subscriptions — skip
        if not _nats_subject_matches(inv_subj, list(code_subjects)):
            findings.append(Finding(
                category="nats",
                severity="info",
                id="DRIFT-NATS-002",
                title=f"Documented NATS subject '{inv_subj}' not found in code (possibly legacy)",
                evidence={"path": "docs/architecture_inventory/inventory_nats_topics.csv",
                          "details": "no matching publish/subscribe call found"},
                recommended_fix="Verify if subject is still active; mark as deprecated in inventory if not.",
            ))

    stats = {
        "inventory_subjects": len(inventory),
        "code_subjects": len(code_subjects),
        "findings": len(findings),
    }
    return findings, stats, False
|
||||
|
||||
|
||||
# ─── Category 4: Tools ────────────────────────────────────────────────────────
|
||||
|
||||
def _load_rollout_tools(repo_root: str) -> Set[str]:
    """Extract all tool names mentioned in tools_rollout.yml groups.

    Group references ("@name") are expanded recursively. Each group is
    expanded at most once via a visited set — previously a cyclic or
    self-referencing group caused unbounded recursion (RecursionError).
    Returns an empty set when the config is absent or unparsable.
    """
    rollout_path = os.path.join(repo_root, "config", "tools_rollout.yml")
    tools: Set[str] = set()
    try:
        with open(rollout_path, "r") as f:
            data = yaml.safe_load(f) or {}
    except Exception:
        return tools

    seen_groups: Set[str] = set()

    # Collect all values from group lists (non-@group entries are tool names)
    def _collect(obj):
        if isinstance(obj, list):
            for item in obj:
                if isinstance(item, str) and not item.startswith("@"):
                    tools.add(item)
                elif isinstance(item, str) and item.startswith("@"):
                    group_name = item[1:]
                    # Expand each group at most once (cycle/duplicate guard).
                    if group_name in data and group_name not in seen_groups:
                        seen_groups.add(group_name)
                        _collect(data[group_name])
        elif isinstance(obj, dict):
            for v in obj.values():
                _collect(v)

    for key, value in data.items():
        if key not in ("role_map", "agent_roles"):  # these are role configs, not tool lists
            _collect(value)

    # Also scan role_map tool lists
    role_map = data.get("role_map", {})
    for role_cfg in role_map.values():
        # Guard against malformed entries (non-dict role config).
        if isinstance(role_cfg, dict):
            _collect(role_cfg.get("tools", []))

    return tools
|
||||
|
||||
|
||||
def _load_rbac_tools(repo_root: str) -> Dict[str, Set[str]]:
    """Load tool→{actions} from rbac_tools_matrix.yml.

    Returns an empty mapping when the matrix file is absent or unparsable;
    callers treat that as "no RBAC data".
    """
    matrix_path = os.path.join(repo_root, "config", "rbac_tools_matrix.yml")
    result: Dict[str, Set[str]] = {}
    try:
        with open(matrix_path, "r") as f:
            data = yaml.safe_load(f) or {}
        for tool, cfg in (data.get("tools") or {}).items():
            actions = set((cfg.get("actions") or {}).keys())
            result[tool] = actions
    except Exception as e:
        # Was a bare `pass` — keep best-effort semantics but leave a trace.
        logger.debug(f"Could not load rbac tools matrix: {e}")
    return result
|
||||
|
||||
|
||||
def _get_effective_tools_for_roles(repo_root: str) -> Dict[str, Set[str]]:
    """Get effective tools for agent_default and agent_cto roles.

    Imports agent_tools_config from services/router at runtime, mutating
    sys.path so the module and its repo-level dependencies resolve, then
    queries effective tool lists via representative agent names. Returns an
    empty dict on any failure (missing module, bad config, ...), which
    callers treat as "role data unavailable".
    """
    result: Dict[str, Set[str]] = {}
    try:
        import sys
        router_path = os.path.join(repo_root, "services", "router")
        if router_path not in sys.path:
            sys.path.insert(0, router_path)
        # Inserted second so repo_root ends up FIRST on sys.path.
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)

        from agent_tools_config import get_agent_tools, reload_rollout_config
        # Force a fresh read of the rollout config so stale state is not reported.
        reload_rollout_config()

        # Use representative agents per role
        # NOTE(review): assumes "brand_new_agent_xyz_test" is unknown to the
        # config (falls through to agent_default) and "sofiia" maps to
        # agent_cto — confirm against agent_tools_config.
        result["agent_default"] = set(get_agent_tools("brand_new_agent_xyz_test"))
        result["agent_cto"] = set(get_agent_tools("sofiia"))
    except Exception as e:
        logger.warning(f"Could not load effective tools: {e}")
    return result
|
||||
|
||||
|
||||
def _analyze_tools(repo_root: str) -> Tuple[List[Finding], Dict]:
    """Cross-check tool rollout config, RBAC matrix, handlers and role tools.

    Emits:
      - DRIFT-TOOLS-001 (error): tool in rollout but no handler registered
      - DRIFT-TOOLS-002 (error if rolled out, else warning): handler without
        an rbac_tools_matrix.yml entry
      - DRIFT-TOOLS-003 (warning): RBAC entry never granted to any role

    Returns (findings, stats).
    """
    findings = []

    rollout_tools = _load_rollout_tools(repo_root)
    rbac_tools = _load_rbac_tools(repo_root)
    role_tools = _get_effective_tools_for_roles(repo_root)

    # Union of every role's effective tools; empty when role loading failed.
    all_role_tools: Set[str] = set()
    for tools in role_tools.values():
        all_role_tools.update(tools)

    # DRIFT-TOOLS-001: Tool in rollout but no handler in tool_manager.py
    for tool in sorted(rollout_tools):
        if tool not in KNOWN_TOOL_HANDLERS:
            findings.append(Finding(
                category="tools",
                severity="error",
                id="DRIFT-TOOLS-001",
                title=f"Tool '{tool}' in tools_rollout.yml but no handler in tool_manager.py",
                evidence={"path": "config/tools_rollout.yml",
                          "details": f"'{tool}' referenced in rollout groups but missing from KNOWN_TOOL_HANDLERS"},
                recommended_fix=f"Add handler for '{tool}' in tool_manager.py execute_tool dispatch, or remove from rollout.",
            ))

    # DRIFT-TOOLS-002: Handler exists but not in RBAC matrix
    # Severity = error if tool is in rollout/standard_stack (actively used, no RBAC gate)
    # Severity = warning if tool appears experimental / not yet rolled out
    for tool in sorted(KNOWN_TOOL_HANDLERS):
        if tool not in rbac_tools:
            # Escalate to error if tool is actively distributed to agents
            is_rollouted = tool in rollout_tools or tool in all_role_tools
            severity = "error" if is_rollouted else "warning"
            findings.append(Finding(
                category="tools",
                severity=severity,
                id="DRIFT-TOOLS-002",
                title=f"Tool '{tool}' has a handler but is absent from rbac_tools_matrix.yml",
                evidence={"path": "config/rbac_tools_matrix.yml",
                          "details": (
                              f"'{tool}' not found in matrix.tools section. "
                              + ("In rollout → no RBAC gate applied." if is_rollouted
                                 else "Not in rollout (experimental/legacy).")
                          )},
                recommended_fix=f"Add '{tool}' with actions and entitlements to rbac_tools_matrix.yml.",
            ))

    # DRIFT-TOOLS-003: Tool in RBAC matrix but never appears in effective_tools
    # Only meaningful when role tools loaded successfully (non-empty union);
    # otherwise every matrix entry would be a false positive.
    if all_role_tools:
        for tool in sorted(rbac_tools.keys()):
            if tool not in all_role_tools:
                findings.append(Finding(
                    category="tools",
                    severity="warning",
                    id="DRIFT-TOOLS-003",
                    title=f"Tool '{tool}' is in RBAC matrix but never appears in effective_tools (dead config?)",
                    evidence={"path": "config/rbac_tools_matrix.yml",
                              "details": f"'{tool}' in matrix but not in any role's effective tool list"},
                    recommended_fix=f"Add '{tool}' to a role in tools_rollout.yml or remove from matrix.",
                ))

    stats = {
        "rollout_tools": len(rollout_tools),
        "rbac_tools": len(rbac_tools),
        "handlers": len(KNOWN_TOOL_HANDLERS),
        "role_tools": {role: len(tools) for role, tools in role_tools.items()},
        "findings": len(findings),
    }
    return findings, stats
|
||||
|
||||
|
||||
# ─── Main Analyzer ────────────────────────────────────────────────────────────
|
||||
|
||||
def analyze_drift(
    repo_root: str,
    categories: Optional[List[str]] = None,
    timeout_sec: float = TIMEOUT_SEC,
) -> DriftReport:
    """
    Run drift analysis across requested categories.

    Args:
        repo_root: absolute path to repository root
        categories: subset of ["services", "openapi", "nats", "tools"] (all if None)
        timeout_sec: hard deadline for full analysis

    Returns:
        DriftReport with pass/fail verdict (pass iff zero error-severity findings)
    """
    all_categories = {"services", "openapi", "nats", "tools"}
    if categories:
        # Unknown category names are silently dropped.
        run_cats = {c for c in categories if c in all_categories}
    else:
        run_cats = all_categories

    # Single wall-clock budget shared by every category scan.
    deadline = time.monotonic() + timeout_sec
    all_findings: List[Finding] = []
    skipped: List[str] = []

    items_checked: Dict[str, int] = {}
    cat_stats: Dict[str, Any] = {}

    if "services" in run_cats:
        findings, stats = _analyze_services(repo_root, deadline)
        all_findings.extend(findings)
        cat_stats["services"] = stats
        items_checked["services"] = stats.get("catalog_entries", 0) + stats.get("compose_services", 0)

    if "openapi" in run_cats:
        findings, stats = _analyze_openapi(repo_root, deadline)
        all_findings.extend(findings)
        cat_stats["openapi"] = stats
        items_checked["openapi"] = stats.get("spec_paths", 0) + stats.get("code_routes", 0)

    if "nats" in run_cats:
        # NATS analysis is skipped entirely when no inventory CSV exists.
        findings, stats, was_skipped = _analyze_nats(repo_root, deadline)
        if was_skipped:
            skipped.append("nats")
        else:
            all_findings.extend(findings)
            cat_stats["nats"] = stats
            items_checked["nats"] = stats.get("inventory_subjects", 0) + stats.get("code_subjects", 0)

    if "tools" in run_cats:
        # Tools analysis is config-file based and does not take the deadline.
        findings, stats = _analyze_tools(repo_root)
        all_findings.extend(findings)
        cat_stats["tools"] = stats
        items_checked["tools"] = stats.get("rollout_tools", 0) + stats.get("rbac_tools", 0)

    # Sort findings: severity desc (error > warning > info), then category, then id
    severity_order = {"error": 0, "warning": 1, "info": 2}
    all_findings.sort(key=lambda f: (severity_order.get(f.severity, 9), f.category, f.id))

    # Redact evidence
    for f in all_findings:
        if f.evidence.get("details"):
            f.evidence["details"] = _redact_evidence(f.evidence["details"])

    errors = sum(1 for f in all_findings if f.severity == "error")
    warnings = sum(1 for f in all_findings if f.severity == "warning")
    infos = sum(1 for f in all_findings if f.severity == "info")

    # Verdict: only errors block; warnings/infos are advisory.
    pass_ = errors == 0

    if pass_:
        summary = f"✅ Drift analysis PASSED. {len(all_findings)} findings ({warnings} warnings, {infos} infos)."
    else:
        summary = (
            f"❌ Drift analysis FAILED. {errors} error(s), {warnings} warning(s). "
            f"Categories checked: {sorted(run_cats - {'nats'} if 'nats' in skipped else run_cats)}."
        )
    if skipped:
        summary += f" Skipped (no inventory): {skipped}."

    # (deadline - timeout_sec) reconstructs the start timestamp.
    elapsed_ms = round((time.monotonic() - (deadline - timeout_sec)) * 1000, 1)

    return DriftReport(
        pass_=pass_,
        summary=summary,
        stats={
            "errors": errors,
            "warnings": warnings,
            "infos": infos,
            "skipped": skipped,
            "items_checked": items_checked,
            "elapsed_ms": elapsed_ms,
            "by_category": cat_stats,
        },
        findings=[f.to_dict() for f in all_findings],
    )
|
||||
|
||||
|
||||
def analyze_drift_dict(repo_root: str, **kwargs) -> Dict:
    """Convenience wrapper that returns a plain dict (for ToolResult)."""
    report = analyze_drift(repo_root, **kwargs)
    payload: Dict = {"pass": report.pass_}
    payload["summary"] = report.summary
    payload["stats"] = report.stats
    payload["findings"] = report.findings
    return payload
|
||||
106
services/router/incident_artifacts.py
Normal file
106
services/router/incident_artifacts.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
incident_artifacts.py — File-based artifact storage for incidents.
|
||||
|
||||
Layout: ops/incidents/<incident_id>/<filename>
|
||||
|
||||
Security:
|
||||
- Path traversal guard (realpath must stay within base_dir)
|
||||
- Max 2MB per artifact
|
||||
- Only allowed formats: json, md, txt
|
||||
- Atomic writes (temp + rename)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)

# Hard cap on a single artifact payload, enforced before any disk write.
MAX_ARTIFACT_BYTES = 2 * 1024 * 1024  # 2MB
# Permitted file extensions; anything else is rejected by write_artifact().
ALLOWED_FORMATS = {"json", "md", "txt"}

# Default base directory, resolved once at import time from the environment.
_ARTIFACTS_BASE = os.getenv(
    "INCIDENT_ARTIFACTS_DIR",
    str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
)


def _base_dir() -> Path:
    """Return the incidents base directory.

    Re-reads INCIDENT_ARTIFACTS_DIR on every call (so tests can repoint it
    after import), falling back to the import-time default.
    """
    return Path(os.getenv("INCIDENT_ARTIFACTS_DIR", _ARTIFACTS_BASE))
|
||||
|
||||
|
||||
def _safe_filename(name: str) -> str:
|
||||
"""Strip path separators and dangerous chars."""
|
||||
safe = "".join(c for c in name if c.isalnum() or c in (".", "_", "-"))
|
||||
return safe or "artifact"
|
||||
|
||||
|
||||
def write_artifact(
    incident_id: str,
    filename: str,
    content_bytes: bytes,
    *,
    base_dir: Optional[str] = None,
) -> Dict:
    """
    Write an artifact file atomically.

    Args:
        incident_id: directory key under the incidents base; must not contain
            path separators (either style) or '..'.
        filename: desired artifact name; sanitized via _safe_filename().
        content_bytes: raw payload (at most MAX_ARTIFACT_BYTES).
        base_dir: optional override for the incidents base directory (tests).

    Returns: {"path": str, "sha256": str, "size_bytes": int}
    Raises: ValueError on validation failure, OSError on write failure.
    """
    # Reject both separator styles so a crafted id cannot escape base_dir.
    if not incident_id or "/" in incident_id or "\\" in incident_id or ".." in incident_id:
        raise ValueError(f"Invalid incident_id: {incident_id}")

    if len(content_bytes) > MAX_ARTIFACT_BYTES:
        raise ValueError(f"Artifact too large: {len(content_bytes)} bytes (max {MAX_ARTIFACT_BYTES})")

    safe_name = _safe_filename(filename)
    ext = safe_name.rsplit(".", 1)[-1].lower() if "." in safe_name else ""
    if ext not in ALLOWED_FORMATS:
        raise ValueError(f"Format '{ext}' not allowed. Allowed: {ALLOWED_FORMATS}")

    bd = Path(base_dir) if base_dir else _base_dir()
    inc_dir = bd / incident_id
    inc_dir.mkdir(parents=True, exist_ok=True)

    target = inc_dir / safe_name
    real_base = bd.resolve()
    real_target = target.resolve()
    # Containment check. A plain startswith() prefix test would accept sibling
    # directories such as "/base-evil" for base "/base"; commonpath compares
    # whole path components.
    if os.path.commonpath([str(real_base), str(real_target)]) != str(real_base):
        raise ValueError("Path traversal detected")

    sha = hashlib.sha256(content_bytes).hexdigest()

    # Atomic write: temp file in the SAME directory → os.replace (atomic on
    # POSIX; also avoids cross-filesystem rename failures).
    fd, tmp_path = tempfile.mkstemp(dir=str(inc_dir), suffix=f".{ext}.tmp")
    try:
        # fdopen takes ownership of fd and closes it exactly once. The old
        # cleanup path re-probed/closed an fd that os.close() may already have
        # closed, raising a spurious OSError and masking the real failure.
        with os.fdopen(fd, "wb") as tmp:
            tmp.write(content_bytes)
        os.replace(tmp_path, str(target))
    except Exception:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise

    # Prefer a repo-relative path when the layout allows it.
    rel_path = str(target.relative_to(bd.parent.parent)) if bd.parent.parent.exists() else str(target)

    logger.info("Artifact written: %s (%d bytes, sha256=%s…)", rel_path, len(content_bytes), sha[:12])
    return {
        "path": rel_path,
        "sha256": sha,
        "size_bytes": len(content_bytes),
    }
|
||||
|
||||
|
||||
def decode_content(content_base64: str) -> bytes:
    """Decode base64-encoded content.

    Uses validate=True so inputs containing non-alphabet characters are
    rejected — without it, b64decode silently DISCARDS invalid characters
    and "Invalid base64" was almost never raised.

    Raises:
        ValueError: if the input is not valid base64 (original error chained).
    """
    try:
        return base64.b64decode(content_base64, validate=True)
    except Exception as exc:
        raise ValueError(f"Invalid base64 content: {exc}") from exc
|
||||
379
services/router/incident_escalation.py
Normal file
379
services/router/incident_escalation.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""
|
||||
incident_escalation.py — Deterministic Incident Escalation Engine.
|
||||
|
||||
Actions (exposed via incident_escalation_tool):
|
||||
evaluate — check active signatures against escalation thresholds
|
||||
auto_resolve_candidates — find open incidents with no recent alerts
|
||||
|
||||
No LLM usage; all logic is policy-driven.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Severity ordering ────────────────────────────────────────────────────────
|
||||
|
||||
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
|
||||
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
|
||||
|
||||
|
||||
def _sev_higher(a: str, b: str) -> bool:
|
||||
"""Return True if a is more severe (lower P number) than b."""
|
||||
return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99)
|
||||
|
||||
|
||||
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
|
||||
"""Return next higher severity, or None if already at/above cap."""
|
||||
idx = _SEV_ORDER.get(current)
|
||||
if idx is None or idx == 0:
|
||||
return None
|
||||
target = _SEV_NAMES[idx - 1]
|
||||
if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
|
||||
return None # would exceed cap
|
||||
return target
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.datetime.utcnow().isoformat()
|
||||
|
||||
|
||||
def _plus_hours(hours: int) -> str:
|
||||
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
|
||||
|
||||
|
||||
# ─── Policy loading ───────────────────────────────────────────────────────────

# Process-wide policy cache; populated on first load_escalation_policy() call
# and never invalidated for the lifetime of the process.
_POLICY_CACHE: Optional[Dict] = None
# Candidate locations, tried in order: CWD-relative first, then relative to
# this file (<repo_root>/config) for callers running from another directory.
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
|
||||
|
||||
|
||||
def load_escalation_policy() -> Dict:
    """Load and cache the escalation policy, falling back to built-in defaults."""
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for candidate in _POLICY_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                loaded = yaml.safe_load(fh) or {}
        except Exception as e:
            # Unreadable candidate: log and try the next location.
            logger.warning("Failed to load escalation policy from %s: %s", candidate, e)
            continue
        _POLICY_CACHE = loaded
        return loaded
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
||||
|
||||
|
||||
def _builtin_defaults() -> Dict:
|
||||
return {
|
||||
"defaults": {"window_minutes": 60},
|
||||
"escalation": {
|
||||
"occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
|
||||
"triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
|
||||
"severity_cap": "P0",
|
||||
"create_followup_on_escalate": True,
|
||||
"followup": {
|
||||
"priority": "P1", "due_hours": 24, "owner": "oncall",
|
||||
"message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
|
||||
},
|
||||
},
|
||||
"auto_resolve": {
|
||||
"no_alerts_minutes_for_candidate": 60,
|
||||
"close_allowed_severities": ["P2", "P3"],
|
||||
"auto_close": False,
|
||||
"candidate_event_type": "note",
|
||||
"candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
|
||||
},
|
||||
"alert_loop_slo": {
|
||||
"claim_to_ack_p95_seconds": 60,
|
||||
"failed_rate_pct": 5,
|
||||
"processing_stuck_minutes": 15,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ─── Escalation thresholds helper ────────────────────────────────────────────
|
||||
|
||||
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return target severity if escalation is needed, else None.

    An incident escalates one level (P2→P1 or P1→P0) when either its
    60-minute occurrence count or its 24-hour triage count reaches the
    corresponding policy threshold; the result is clamped to `severity_cap`.
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")  # most severe level escalation may reach

    # Build escalation rules in priority order (most → least severe)
    # Each entry: (from_severity, to_severity, occurrence_limit, triage_limit).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]

    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m >= occ_limit or triage_count_24h >= triage_limit:
            # Check cap
            # NOTE(review): clamp relies on _sev_higher (defined elsewhere in
            # this module) — presumably _sev_higher(a, b) means "a is more
            # severe than b"; confirm before changing this branch.
            if not _sev_higher(cap, to_sev) and to_sev != cap:
                # to_sev is more severe than cap — not allowed
                if _sev_higher(to_sev, cap):
                    return cap
            return to_sev
    return None
|
||||
|
||||
|
||||
# ─── Core evaluate function ───────────────────────────────────────────────────
|
||||
|
||||
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """
    Main escalation evaluation. Returns structured summary.

    Args:
        params: optional overrides — "env" (None = any), "window_minutes",
            "limit".
        alert_store: unused here; kept for signature symmetry with the
            tool-handler call site.
        sig_state_store: provides list_active_signatures().
        incident_store: provides list_incidents() / append_event().
        policy: escalation policy dict; loaded via load_escalation_policy()
            when None.
        dry_run: when True only report candidates — no events are written.

    Returns:
        Dict with evaluated/escalated/followups_created counts, the
        candidate list, human-readable recommendations, and the dry_run flag.
    """
    if policy is None:
        policy = load_escalation_policy()

    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get("window_minutes",
                                    policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))

    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})

    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    # Perf fix: previously list_incidents() was queried inside the
    # per-signature loop (up to 2 store calls × N signatures). Escalation only
    # appends events — it never changes incident status — so status-filtered
    # snapshots fetched once stay valid for the whole loop.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None  # fetched lazily, once

    def _matching(pool: List[Dict], signature: str) -> List[Dict]:
        """Incidents in `pool` carrying `signature` (and env, when filtered)."""
        return [
            i for i in pool
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []

    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)

        # Find open incident with this signature; fall back to mitigating.
        matching = _matching(open_incidents, signature)
        if not matching:
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matching(mitigating_incidents, signature)

        if not matching:
            evaluated += 1
            continue

        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")

        evaluated += 1

        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })

        if dry_run:
            continue

        # Append escalation decision event
        esc_msg = (
            f"Escalated {current_sev} → {target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1

        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1

        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )

    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
|
||||
|
||||
|
||||
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
|
||||
|
||||
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """
    Find open incidents where no alerts have been seen in the last N minutes.
    Returns list of candidate incidents.
    By default dry_run=True — no state changes.

    Fix over the previous version: "now" was a naive datetime.utcnow() while
    signature-state timestamps are timezone-aware ISO strings (stores emit
    "+00:00" offsets), so `now_dt - fromisoformat(last_alert)` raised
    TypeError and the string cutoff comparison mixed naive/aware formats.
    Comparisons are now done on aware UTC datetimes, with naive stored
    timestamps assumed to be UTC.
    """
    if policy is None:
        policy = load_escalation_policy()

    ar = policy.get("auto_resolve", {})
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )

    # Aware UTC "now" so subtraction against aware store timestamps is valid.
    now_dt = datetime.datetime.now(datetime.timezone.utc)
    cutoff_dt = now_dt - datetime.timedelta(minutes=no_alerts_minutes)

    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]

    candidates: List[Dict] = []
    closed: List[str] = []

    for incident in all_open:
        inc_id = incident["id"]
        signature = incident.get("meta", {}).get("incident_signature")
        if not signature:
            continue

        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue

        last_alert = sig_state.get("last_alert_at") or ""
        last_dt: Optional[datetime.datetime] = None
        if last_alert:
            try:
                last_dt = datetime.datetime.fromisoformat(last_alert)
            except ValueError:
                last_dt = None  # unparseable timestamp → treat as "no recent alert"
            if last_dt is not None and last_dt.tzinfo is None:
                # Naive stored timestamps are assumed UTC.
                last_dt = last_dt.replace(tzinfo=datetime.timezone.utc)
        if last_dt is not None and last_dt >= cutoff_dt:
            continue  # alert seen recently → not a candidate

        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            "minutes_without_alerts": (
                round((now_dt - last_dt).total_seconds() / 60)
                if last_dt is not None else no_alerts_minutes
            ),
            "auto_close_eligible": can_close and auto_close,
        })

        if dry_run:
            continue

        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })

        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)

    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }
|
||||
143
services/router/incident_intel_utils.py
Normal file
143
services/router/incident_intel_utils.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""
|
||||
incident_intel_utils.py — Data helpers for Incident Intelligence Layer.
|
||||
|
||||
Provides:
|
||||
- kind extraction from incident (signature, meta, title heuristics)
|
||||
- normalized key fields dict
|
||||
- time-proximity helpers
|
||||
- safe truncation/masking
|
||||
|
||||
No external dependencies beyond stdlib.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import re
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
# ─── Kind heuristics ──────────────────────────────────────────────────────────

# Ordered (compiled pattern, kind) pairs; first title match wins.
_TITLE_KIND_PATTERNS = [
    (re.compile(r'\b(latency|slow|timeout|p9[5-9]|p100)\b', re.I), "latency"),
    (re.compile(r'\b(error.?rate|5xx|http.?error|exception)\b', re.I), "error_rate"),
    (re.compile(r'\b(slo.?breach|slo)\b', re.I), "slo_breach"),
    (re.compile(r'\b(oom|out.?of.?memory|memory.?pressure)\b', re.I), "oom"),
    (re.compile(r'\b(disk|storage|volume.?full|inode)\b', re.I), "disk"),
    (re.compile(r'\b(security|intrusion|cve|vuln|unauthorized)\b', re.I), "security"),
    (re.compile(r'\b(deploy|rollout|release|canary)\b', re.I), "deploy"),
    (re.compile(r'\b(crash.?loop|crashloop|restart)\b', re.I), "crashloop"),
    (re.compile(r'\b(queue|lag|consumer|backlog)\b', re.I), "queue"),
    (re.compile(r'\b(network|connectivity|dns|unreachable)\b', re.I), "network"),
]

# Closed vocabulary of incident kinds accepted from explicit meta fields.
_KNOWN_KINDS = frozenset([
    "slo_breach", "crashloop", "latency", "error_rate",
    "disk", "oom", "deploy", "security", "custom", "network", "queue",
])


def extract_kind(incident: Dict) -> str:
    """
    Best-effort kind extraction. Priority:
      1. incident.meta.kind (if present and in the known-kind vocabulary)
      2. incident.meta.alert_kind (same constraint)
      3. Title heuristics (_TITLE_KIND_PATTERNS, first match wins)
      4. 'custom'
    """
    meta = incident.get("meta") or {}

    # Explicit meta fields — accepted only when they name a known kind.
    explicit = next(
        (meta.get(field) for field in ("kind", "alert_kind")
         if meta.get(field) in _KNOWN_KINDS),
        None,
    )
    if explicit:
        return explicit

    # Fall back to keyword heuristics over the incident title.
    title = incident.get("title", "") or ""
    heuristic = next(
        (kind for pattern, kind in _TITLE_KIND_PATTERNS if pattern.search(title)),
        None,
    )
    return heuristic or "custom"
|
||||
|
||||
|
||||
def incident_key_fields(incident: Dict) -> Dict:
    """Return a normalized dict of the key fields used for correlation."""
    meta = incident.get("meta") or {}
    field_defaults = (
        ("id", ""),
        ("service", ""),
        ("env", "prod"),
        ("severity", "P2"),
        ("status", "open"),
        ("started_at", ""),
    )
    normalized = {name: incident.get(name, default) for name, default in field_defaults}
    normalized["signature"] = meta.get("incident_signature", "")
    normalized["kind"] = extract_kind(incident)
    return normalized
|
||||
|
||||
|
||||
# ─── Time helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_iso(ts: str) -> Optional[datetime.datetime]:
    """Parse ISO timestamp string to a naive UTC datetime; None on failure.

    Fix: the old `ts.rstrip("Z").split("+")[0]` trick dropped positive UTC
    offsets but silently kept negative ones ("-05:00"), returning an *aware*
    datetime that made naive/aware subtraction in minutes_apart() and
    format_duration() raise TypeError. Aware inputs are now converted to UTC
    and stripped of tzinfo, so all return values are comparable naive UTC.
    """
    if not ts:
        return None
    try:
        # Normalize trailing "Z" for fromisoformat compatibility on < 3.11.
        dt = datetime.datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        return None
    if dt.tzinfo is not None:
        dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return dt
|
||||
|
||||
|
||||
def minutes_apart(ts_a: str, ts_b: str) -> Optional[float]:
    """Absolute distance in minutes between two ISO timestamps, or None if
    either one fails to parse."""
    first = parse_iso(ts_a)
    second = parse_iso(ts_b)
    if first is None or second is None:
        return None
    delta_seconds = (first - second).total_seconds()
    return abs(delta_seconds) / 60.0
|
||||
|
||||
|
||||
def incidents_within_minutes(inc_a: Dict, inc_b: Dict, within: float) -> bool:
    """True when both incidents have parseable start times no more than
    `within` minutes apart; False otherwise (including unparseable times)."""
    started_a = inc_a.get("started_at", "")
    started_b = inc_b.get("started_at", "")
    gap = minutes_apart(started_a, started_b)
    if gap is None:
        return False
    return gap <= within
|
||||
|
||||
|
||||
# ─── Text helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def safe_truncate(text: str, max_chars: int = 200) -> str:
    """Clip `text` to `max_chars`, appending an ellipsis when anything was cut."""
    if not text:
        return ""
    if len(text) > max_chars:
        return text[:max_chars] + "…"
    return text
|
||||
|
||||
|
||||
def mask_signature(sig: str, prefix_len: int = 8) -> str:
    """Show only first N chars of a SHA-256 signature for readability."""
    return sig[:prefix_len] if sig else ""
|
||||
|
||||
|
||||
def severity_rank(sev: str) -> int:
    """Map a severity label to an integer rank; lower = more severe.
    Unknown labels rank last (5)."""
    ordering = ("P0", "P1", "P2", "P3", "INFO")
    try:
        return ordering.index(sev)
    except ValueError:
        return 5
|
||||
|
||||
|
||||
def format_duration(started_at: str, ended_at: Optional[str]) -> str:
    """Human-readable duration: "Ns"/"Nm"/"N.Nh", "ongoing" when no parseable
    end time, "unknown" when the start time fails to parse."""
    start = parse_iso(started_at)
    if start is None:
        return "unknown"
    end = parse_iso(ended_at) if ended_at else None
    if end is None:
        return "ongoing"
    total_seconds = (end - start).total_seconds()
    if total_seconds < 60:
        return f"{int(total_seconds)}s"
    if total_seconds < 3600:
        return f"{int(total_seconds / 60)}m"
    return f"{total_seconds / 3600:.1f}h"
|
||||
1149
services/router/incident_intelligence.py
Normal file
1149
services/router/incident_intelligence.py
Normal file
File diff suppressed because it is too large
Load Diff
690
services/router/incident_store.py
Normal file
690
services/router/incident_store.py
Normal file
@@ -0,0 +1,690 @@
|
||||
"""
|
||||
incident_store.py — Incident Log storage abstraction.
|
||||
|
||||
Backends:
|
||||
- MemoryIncidentStore (testing)
|
||||
- JsonlIncidentStore (MVP/fallback — ops/incidents/ directory)
|
||||
- PostgresIncidentStore(production — psycopg2 sync)
|
||||
- AutoIncidentStore (Postgres primary → JSONL fallback)
|
||||
|
||||
All writes are non-fatal: exceptions are logged as warnings.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SECRET_PAT = re.compile(r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+')
|
||||
|
||||
|
||||
def _redact_text(text: str, max_len: int = 4000) -> str:
|
||||
"""Mask secrets, truncate."""
|
||||
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
|
||||
return text[:max_len] if len(text) > max_len else text
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _generate_incident_id() -> str:
|
||||
now = datetime.datetime.now(datetime.timezone.utc)
|
||||
rand = uuid.uuid4().hex[:6]
|
||||
return f"inc_{now.strftime('%Y%m%d_%H%M')}_{rand}"
|
||||
|
||||
|
||||
# ─── Abstract interface ──────────────────────────────────────────────────────
|
||||
|
||||
class IncidentStore(ABC):
    """Abstract interface for incident persistence backends.

    Implementations below: MemoryIncidentStore (tests), JsonlIncidentStore
    (file-based MVP/fallback), PostgresIncidentStore (production).
    """

    @abstractmethod
    def create_incident(self, data: Dict) -> Dict:
        """Create a new incident from `data`; returns the stored record."""
        ...

    @abstractmethod
    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return one incident with recent events/artifacts, or None."""
        ...

    @abstractmethod
    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Return incidents matching `filters`, newest first, up to `limit`."""
        ...

    @abstractmethod
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark an incident closed; returns the updated record or None if missing."""
        ...

    @abstractmethod
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event; returns it, or None if the incident is missing."""
        ...

    @abstractmethod
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to `limit` timeline events for an incident."""
        ...

    @abstractmethod
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Register an artifact reference; returns it, or None if incident missing."""
        ...

    @abstractmethod
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact references recorded for an incident."""
        ...
|
||||
|
||||
|
||||
# ─── In-memory (testing) ─────────────────────────────────────────────────────
|
||||
|
||||
class MemoryIncidentStore(IncidentStore):
    """In-memory backend for tests: plain dicts guarded by a single lock.

    NOTE(review): reads (get_incident/list_incidents/get_events) are not
    performed under the lock — acceptable for tests, but not a guarantee of
    consistent snapshots under concurrent mutation.
    """

    def __init__(self):
        # incident_id → incident record
        self._incidents: Dict[str, Dict] = {}
        # incident_id → ordered timeline events (oldest first)
        self._events: Dict[str, List[Dict]] = {}
        # incident_id → artifact references
        self._artifacts: Dict[str, List[Dict]] = {}
        self._lock = threading.Lock()

    def create_incident(self, data: Dict) -> Dict:
        """Create and store a new incident; only `service` is required in `data`."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],  # required — raises KeyError if absent
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            # Free-text fields are secret-masked and length-capped on write.
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        with self._lock:
            self._incidents[inc_id] = inc
            self._events[inc_id] = []
            self._artifacts[inc_id] = []
        return inc

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident plus its 20 most recent events and all artifacts."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        events = self._events.get(incident_id, [])[-20:]  # tail = most recent
        artifacts = self._artifacts.get(incident_id, [])
        return {**inc, "events": events, "artifacts": artifacts}

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest first, capped at `limit`."""
        filters = filters or {}
        result = list(self._incidents.values())
        if filters.get("status"):
            result = [i for i in result if i["status"] == filters["status"]]
        if filters.get("service"):
            result = [i for i in result if i["service"] == filters["service"]]
        if filters.get("env"):
            result = [i for i in result if i["env"] == filters["env"]]
        if filters.get("severity"):
            result = [i for i in result if i["severity"] == filters["severity"]]
        result.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return result[:limit]

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark closed, record the resolution as summary, log a status_change event."""
        inc = self._incidents.get(incident_id)
        if not inc:
            return None
        with self._lock:
            inc["status"] = "closed"
            inc["ended_at"] = ended_at
            # Empty/None resolution keeps the existing summary untouched.
            inc["summary"] = _redact_text(resolution, 2000) if resolution else inc.get("summary")
            inc["updated_at"] = _now_iso()
            self._events.setdefault(incident_id, []).append({
                "ts": _now_iso(),
                "type": "status_change",
                "message": f"Incident closed: {_redact_text(resolution, 500)}",
                "meta": None,
            })
        return inc

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event and bump updated_at; None if incident unknown."""
        if incident_id not in self._incidents:
            return None
        ev = {
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),  # secret-mask + cap
            "meta": meta,
        }
        with self._lock:
            self._events.setdefault(incident_id, []).append(ev)
            self._incidents[incident_id]["updated_at"] = _now_iso()
        return ev

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return the first `limit` events in insertion (oldest-first) order."""
        return self._events.get(incident_id, [])[:limit]

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record an artifact reference; None if the incident is unknown."""
        if incident_id not in self._incidents:
            return None
        art = {
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        with self._lock:
            self._artifacts.setdefault(incident_id, []).append(art)
        return art

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact references for the incident (empty list if none)."""
        return self._artifacts.get(incident_id, [])
|
||||
|
||||
|
||||
# ─── JSONL (MVP file backend) ────────────────────────────────────────────────
|
||||
|
||||
class JsonlIncidentStore(IncidentStore):
    """
    Stores incidents/events/artifacts as separate JSONL files in a directory.
    Layout:
        <base_dir>/incidents.jsonl
        <base_dir>/events.jsonl
        <base_dir>/artifacts.jsonl

    Reads re-scan the whole file each call (O(file size)); fine for the
    MVP/fallback volumes this backend targets. Updates to an incident rewrite
    incidents.jsonl entirely. The lock serializes writes within this process
    only — there is no cross-process file locking.
    """

    def __init__(self, base_dir: str):
        # Directory is created eagerly so first write cannot fail on a
        # missing parent.
        self._dir = Path(base_dir)
        self._dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()

    def _incidents_path(self) -> Path:
        return self._dir / "incidents.jsonl"

    def _events_path(self) -> Path:
        return self._dir / "events.jsonl"

    def _artifacts_path(self) -> Path:
        return self._dir / "artifacts.jsonl"

    def _read_jsonl(self, path: Path) -> List[Dict]:
        """Read all records from a JSONL file; malformed lines and read
        errors are silently skipped (best-effort, non-fatal by design)."""
        if not path.exists():
            return []
        items = []
        try:
            with open(path, "r", encoding="utf-8") as fh:
                for line in fh:
                    line = line.strip()
                    if line:
                        try:
                            items.append(json.loads(line))
                        except json.JSONDecodeError:
                            pass  # skip corrupt line, keep the rest
        except Exception:
            pass  # unreadable file → behave as empty
        return items

    def _append_jsonl(self, path: Path, record: Dict) -> None:
        """Append one record as a JSON line (lock held for the write)."""
        with self._lock:
            with open(path, "a", encoding="utf-8") as fh:
                fh.write(json.dumps(record, ensure_ascii=False, default=str) + "\n")

    def _rewrite_jsonl(self, path: Path, items: List[Dict]) -> None:
        """Replace the file's full contents with `items` (lock held).

        NOTE(review): rewrite is in place, not atomic — a crash mid-write can
        truncate the file; consider write-to-temp + rename if that matters.
        """
        with self._lock:
            with open(path, "w", encoding="utf-8") as fh:
                for item in items:
                    fh.write(json.dumps(item, ensure_ascii=False, default=str) + "\n")

    def create_incident(self, data: Dict) -> Dict:
        """Create a new incident and append it to incidents.jsonl;
        only `service` is required in `data`."""
        inc_id = data.get("id") or _generate_incident_id()
        now = _now_iso()
        inc = {
            "id": inc_id,
            "workspace_id": data.get("workspace_id", "default"),
            "service": data["service"],  # required — raises KeyError if absent
            "env": data.get("env", "prod"),
            "severity": data.get("severity", "P2"),
            "status": "open",
            # Free-text fields are secret-masked and length-capped on write.
            "title": _redact_text(data.get("title", ""), 500),
            "summary": _redact_text(data.get("summary", "") or "", 2000),
            "started_at": data.get("started_at", now),
            "ended_at": None,
            "created_by": data.get("created_by", "unknown"),
            "created_at": now,
            "updated_at": now,
            "meta": data.get("meta") or {},
        }
        self._append_jsonl(self._incidents_path(), inc)
        return inc

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        """Return the incident plus its 20 most recent events and all artifacts."""
        incidents = self._read_jsonl(self._incidents_path())
        inc = next((i for i in incidents if i.get("id") == incident_id), None)
        if not inc:
            return None
        events = [e for e in self._read_jsonl(self._events_path())
                  if e.get("incident_id") == incident_id][-20:]
        artifacts = [a for a in self._read_jsonl(self._artifacts_path())
                     if a.get("incident_id") == incident_id]
        return {**inc, "events": events, "artifacts": artifacts}

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Filter by status/service/env/severity; newest first, capped at `limit`."""
        filters = filters or {}
        incidents = self._read_jsonl(self._incidents_path())
        if filters.get("status"):
            incidents = [i for i in incidents if i.get("status") == filters["status"]]
        if filters.get("service"):
            incidents = [i for i in incidents if i.get("service") == filters["service"]]
        if filters.get("env"):
            incidents = [i for i in incidents if i.get("env") == filters["env"]]
        if filters.get("severity"):
            incidents = [i for i in incidents if i.get("severity") == filters["severity"]]
        incidents.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return incidents[:limit]

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark closed (rewriting incidents.jsonl) and log a status_change event."""
        incidents = self._read_jsonl(self._incidents_path())
        found = None
        for inc in incidents:
            if inc.get("id") == incident_id:
                inc["status"] = "closed"
                inc["ended_at"] = ended_at
                if resolution:
                    # Non-empty resolution replaces the summary (masked/capped).
                    inc["summary"] = _redact_text(resolution, 2000)
                inc["updated_at"] = _now_iso()
                found = inc
                break
        if not found:
            return None
        self._rewrite_jsonl(self._incidents_path(), incidents)
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return found

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Append a timeline event to events.jsonl; None if incident unknown.

        Note: existence check re-reads incidents.jsonl on every call.
        """
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        ev = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "type": event_type,
            "message": _redact_text(message, 4000),  # secret-mask + cap
            "meta": meta,
        }
        self._append_jsonl(self._events_path(), ev)
        return ev

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return the first `limit` events for the incident in file order."""
        events = self._read_jsonl(self._events_path())
        return [e for e in events if e.get("incident_id") == incident_id][:limit]

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record an artifact reference in artifacts.jsonl; None if incident unknown."""
        incidents = self._read_jsonl(self._incidents_path())
        if not any(i.get("id") == incident_id for i in incidents):
            return None
        art = {
            "incident_id": incident_id,
            "ts": _now_iso(),
            "kind": kind,
            "format": fmt,
            "path": path,
            "sha256": sha256,
            "size_bytes": size_bytes,
        }
        self._append_jsonl(self._artifacts_path(), art)
        return art

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact references recorded for the incident."""
        artifacts = self._read_jsonl(self._artifacts_path())
        return [a for a in artifacts if a.get("incident_id") == incident_id]
|
||||
|
||||
|
||||
# ─── Postgres backend ─────────────────────────────────────────────────────────
|
||||
|
||||
class PostgresIncidentStore(IncidentStore):
|
||||
"""
|
||||
Production backend using psycopg2 (sync).
|
||||
Tables created by ops/scripts/migrate_incidents_postgres.py.
|
||||
"""
|
||||
|
||||
def __init__(self, dsn: str):
|
||||
self._dsn = dsn
|
||||
self._local = threading.local()
|
||||
|
||||
def _conn(self):
|
||||
"""Get or create a per-thread connection."""
|
||||
conn = getattr(self._local, "conn", None)
|
||||
if conn is None or conn.closed:
|
||||
import psycopg2 # type: ignore
|
||||
conn = psycopg2.connect(self._dsn)
|
||||
conn.autocommit = True
|
||||
self._local.conn = conn
|
||||
return conn
|
||||
|
||||
def create_incident(self, data: Dict) -> Dict:
|
||||
inc_id = data.get("id") or _generate_incident_id()
|
||||
now = _now_iso()
|
||||
cur = self._conn().cursor()
|
||||
cur.execute(
|
||||
"""INSERT INTO incidents (id,workspace_id,service,env,severity,status,
|
||||
title,summary,started_at,created_by,created_at,updated_at)
|
||||
VALUES (%s,%s,%s,%s,%s,'open',%s,%s,%s,%s,%s,%s)""",
|
||||
(inc_id, data.get("workspace_id", "default"),
|
||||
data["service"], data.get("env", "prod"),
|
||||
data.get("severity", "P2"),
|
||||
_redact_text(data.get("title", ""), 500),
|
||||
_redact_text(data.get("summary", "") or "", 2000),
|
||||
data.get("started_at") or now,
|
||||
data.get("created_by", "unknown"), now, now),
|
||||
)
|
||||
cur.close()
|
||||
return {"id": inc_id, "status": "open", "service": data["service"],
|
||||
"severity": data.get("severity", "P2"),
|
||||
"started_at": data.get("started_at") or now,
|
||||
"created_at": now}
|
||||
|
||||
def get_incident(self, incident_id: str) -> Optional[Dict]:
|
||||
cur = self._conn().cursor()
|
||||
cur.execute("SELECT id,workspace_id,service,env,severity,status,title,summary,"
|
||||
"started_at,ended_at,created_by,created_at,updated_at "
|
||||
"FROM incidents WHERE id=%s", (incident_id,))
|
||||
row = cur.fetchone()
|
||||
if not row:
|
||||
cur.close()
|
||||
return None
|
||||
cols = [d[0] for d in cur.description]
|
||||
inc = {c: (v.isoformat() if isinstance(v, datetime.datetime) else v) for c, v in zip(cols, row)}
|
||||
# Events
|
||||
cur.execute("SELECT ts,type,message,meta FROM incident_events "
|
||||
"WHERE incident_id=%s ORDER BY ts DESC LIMIT 200", (incident_id,))
|
||||
events = []
|
||||
for r in cur.fetchall():
|
||||
events.append({"ts": r[0].isoformat() if r[0] else "", "type": r[1],
|
||||
"message": r[2], "meta": r[3]})
|
||||
events.reverse()
|
||||
# Artifacts
|
||||
cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
|
||||
"WHERE incident_id=%s ORDER BY ts", (incident_id,))
|
||||
artifacts = []
|
||||
for r in cur.fetchall():
|
||||
artifacts.append({"ts": r[0].isoformat() if r[0] else "", "kind": r[1],
|
||||
"format": r[2], "path": r[3], "sha256": r[4], "size_bytes": r[5]})
|
||||
cur.close()
|
||||
return {**inc, "events": events, "artifacts": artifacts}
|
||||
|
||||
def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
|
||||
filters = filters or {}
|
||||
clauses = []
|
||||
params: list = []
|
||||
for k in ("status", "service", "env", "severity"):
|
||||
if filters.get(k):
|
||||
clauses.append(f"{k}=%s")
|
||||
params.append(filters[k])
|
||||
if filters.get("window_days"):
|
||||
clauses.append("created_at >= NOW() - INTERVAL '%s days'")
|
||||
params.append(int(filters["window_days"]))
|
||||
where = ("WHERE " + " AND ".join(clauses)) if clauses else ""
|
||||
params.append(min(limit, 200))
|
||||
cur = self._conn().cursor()
|
||||
cur.execute(f"SELECT id,workspace_id,service,env,severity,status,title,summary,"
|
||||
f"started_at,ended_at,created_by,created_at,updated_at "
|
||||
f"FROM incidents {where} ORDER BY created_at DESC LIMIT %s", params)
|
||||
cols = [d[0] for d in cur.description]
|
||||
rows = []
|
||||
for row in cur.fetchall():
|
||||
rows.append({c: (v.isoformat() if isinstance(v, datetime.datetime) else v)
|
||||
for c, v in zip(cols, row)})
|
||||
cur.close()
|
||||
return rows
|
||||
|
||||
    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        """Mark an incident closed and log a status_change event.

        Args:
            incident_id: target incident; if it does not exist, returns None.
            ended_at: ISO timestamp; falls back to now when falsy.
            resolution: free-text summary; redacted and truncated to 2000 chars.

        Returns:
            {"id": ..., "status": "closed"} on success, None when no row matched.
        """
        cur = self._conn().cursor()
        # RETURNING id lets us distinguish "updated" from "no such incident"
        # without a second query.
        cur.execute("UPDATE incidents SET status='closed', ended_at=%s, summary=%s, updated_at=%s "
                    "WHERE id=%s RETURNING id",
                    (ended_at or _now_iso(), _redact_text(resolution, 2000) if resolution else None,
                     _now_iso(), incident_id))
        if not cur.fetchone():
            cur.close()
            return None
        cur.close()
        # Timeline entry uses a shorter redaction budget than the summary column.
        self.append_event(incident_id, "status_change",
                          f"Incident closed: {_redact_text(resolution or '', 500)}")
        return {"id": incident_id, "status": "closed"}
|
||||
|
||||
    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        """Insert a timeline event for an incident.

        Args:
            incident_id: owning incident id (not existence-checked here).
            event_type: short machine-readable type, e.g. "status_change".
            message: free text; redacted and truncated to 4000 chars.
            meta: optional structured payload, stored as JSON.

        Returns:
            The event as a dict (meta returned as the original object, not JSON).
        """
        now = _now_iso()
        cur = self._conn().cursor()
        # default=str keeps non-JSON-serializable meta values (datetimes, etc.)
        # from raising during dumps.
        meta_json = json.dumps(meta, default=str) if meta else None
        cur.execute("INSERT INTO incident_events (incident_id,ts,type,message,meta) "
                    "VALUES (%s,%s,%s,%s,%s)",
                    (incident_id, now, event_type, _redact_text(message, 4000), meta_json))
        cur.close()
        return {"ts": now, "type": event_type, "message": _redact_text(message, 4000), "meta": meta}
|
||||
|
||||
    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        """Return up to `limit` timeline events for an incident, oldest first.

        Timestamps are ISO-8601 strings; a NULL ts becomes "".
        """
        cur = self._conn().cursor()
        cur.execute("SELECT ts,type,message,meta FROM incident_events "
                    "WHERE incident_id=%s ORDER BY ts LIMIT %s", (incident_id, limit))
        events = [{"ts": r[0].isoformat() if r[0] else "", "type": r[1],
                   "message": r[2], "meta": r[3]} for r in cur.fetchall()]
        cur.close()
        return events
|
||||
|
||||
    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        """Record a reference to an artifact file (the file itself is written elsewhere).

        Args:
            incident_id: owning incident id.
            kind: artifact category (e.g. log dump, report).
            fmt: file format label (stored in the "format" column).
            path: filesystem or object-store path to the artifact.
            sha256: content hash for integrity checks.
            size_bytes: artifact size.

        Returns:
            The stored artifact row as a dict, timestamped with now.
        """
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute("INSERT INTO incident_artifacts (incident_id,ts,kind,format,path,sha256,size_bytes) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s)",
                    (incident_id, now, kind, fmt, path, sha256, size_bytes))
        cur.close()
        return {"ts": now, "kind": kind, "format": fmt, "path": path,
                "sha256": sha256, "size_bytes": size_bytes}
|
||||
|
||||
    def get_artifacts(self, incident_id: str) -> List[Dict]:
        """Return all artifact rows for an incident, oldest first.

        Timestamps are ISO-8601 strings; a NULL ts becomes "".
        """
        cur = self._conn().cursor()
        cur.execute("SELECT ts,kind,format,path,sha256,size_bytes FROM incident_artifacts "
                    "WHERE incident_id=%s ORDER BY ts", (incident_id,))
        artifacts = [{"ts": r[0].isoformat() if r[0] else "", "kind": r[1], "format": r[2],
                      "path": r[3], "sha256": r[4], "size_bytes": r[5]} for r in cur.fetchall()]
        cur.close()
        return artifacts
|
||||
|
||||
    def close(self):
        """Close this thread's cached DB connection, if any and still open.

        Connections appear to be cached per-thread on self._local (presumably
        by _conn(), defined above this chunk — confirm); only the calling
        thread's connection is closed here.
        """
        conn = getattr(self._local, "conn", None)
        if conn and not conn.closed:
            conn.close()
|
||||
|
||||
|
||||
# ─── Auto backend (Postgres → JSONL fallback) ────────────────────────────────
|
||||
|
||||
class AutoIncidentStore(IncidentStore):
    """
    Incident store that prefers Postgres and degrades to JSONL.

    Every public call is routed to the Postgres backend; on any exception the
    store fails over to the JSONL backend and stays there for
    _RECOVERY_INTERVAL_S seconds, after which Postgres is retried. Both
    backends are created lazily under a lock.
    """

    # Seconds to remain on the JSONL fallback before retrying Postgres.
    _RECOVERY_INTERVAL_S = 300

    def __init__(self, pg_dsn: str, jsonl_dir: str):
        """
        Args:
            pg_dsn: Postgres DSN for the primary backend.
            jsonl_dir: directory for the JSONL fallback backend.
        """
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        self._primary: Optional[PostgresIncidentStore] = None
        self._fallback: Optional[JsonlIncidentStore] = None
        self._using_fallback = False
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()

    def _get_primary(self) -> PostgresIncidentStore:
        """Lazily create the Postgres backend (double-checked locking)."""
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresIncidentStore(self._pg_dsn)
        return self._primary

    def _get_fallback(self) -> JsonlIncidentStore:
        """Lazily create the JSONL backend (double-checked locking)."""
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlIncidentStore(self._jsonl_dir)
        return self._fallback

    def _maybe_recover(self) -> None:
        """Exit fallback mode once the recovery interval has elapsed."""
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoIncidentStore: attempting Postgres recovery")
                self._using_fallback = False
                self._fallback_since = 0.0

    def _switch_to_fallback(self, err: Exception) -> None:
        """Enter fallback mode after a Postgres failure."""
        logger.warning("AutoIncidentStore: Postgres failed (%s), using JSONL fallback", err)
        self._using_fallback = True
        self._fallback_since = time.monotonic()

    def active_backend(self) -> str:
        """Return "postgres" or "jsonl_fallback" (for health/diagnostics)."""
        return "jsonl_fallback" if self._using_fallback else "postgres"

    # ── Delegation ────────────────────────────────────────────────────────────
    # The failover pattern is identical for every operation, so it lives in
    # one helper instead of being hand-copied into eight methods.

    def _delegate(self, method: str, *args):
        """Run `method(*args)` on Postgres when healthy; fail over to JSONL on error."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                return getattr(self._get_primary(), method)(*args)
            except Exception as e:
                self._switch_to_fallback(e)
        return getattr(self._get_fallback(), method)(*args)

    def create_incident(self, data: Dict) -> Dict:
        return self._delegate("create_incident", data)

    def get_incident(self, incident_id: str) -> Optional[Dict]:
        return self._delegate("get_incident", incident_id)

    def list_incidents(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        return self._delegate("list_incidents", filters, limit)

    def close_incident(self, incident_id: str, ended_at: str, resolution: str) -> Optional[Dict]:
        return self._delegate("close_incident", incident_id, ended_at, resolution)

    def append_event(self, incident_id: str, event_type: str, message: str,
                     meta: Optional[Dict] = None) -> Optional[Dict]:
        return self._delegate("append_event", incident_id, event_type, message, meta)

    def get_events(self, incident_id: str, limit: int = 100) -> List[Dict]:
        return self._delegate("get_events", incident_id, limit)

    def add_artifact(self, incident_id: str, kind: str, fmt: str,
                     path: str, sha256: str, size_bytes: int) -> Optional[Dict]:
        return self._delegate("add_artifact", incident_id, kind, fmt, path, sha256, size_bytes)

    def get_artifacts(self, incident_id: str) -> List[Dict]:
        return self._delegate("get_artifacts", incident_id)
|
||||
|
||||
|
||||
# ─── Singleton ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide singleton; reads/writes are guarded by _store_lock.
_store: Optional[IncidentStore] = None
_store_lock = threading.Lock()


def get_incident_store() -> IncidentStore:
    """Return the process-wide IncidentStore singleton, creating it lazily.

    Thread-safe via double-checked locking: the outer check avoids taking the
    lock on the hot path; the inner check ensures only one thread constructs
    the store.
    """
    global _store
    if _store is None:
        with _store_lock:
            if _store is None:
                _store = _create_store()
    return _store
|
||||
|
||||
|
||||
def set_incident_store(store: Optional[IncidentStore]) -> None:
    """Replace the singleton (mainly for tests).

    Passing None resets it so the next get_incident_store() re-creates the
    store from environment configuration.
    """
    global _store
    with _store_lock:
        _store = store
|
||||
|
||||
|
||||
def _create_store() -> IncidentStore:
    """Build the IncidentStore selected by the INCIDENT_BACKEND env var.

    Backends:
        memory / null → MemoryIncidentStore (testing only; nothing persisted)
        postgres      → PostgresIncidentStore (requires DATABASE_URL)
        auto          → AutoIncidentStore (Postgres with JSONL fallback)
        jsonl (default, and every fallback path) → JsonlIncidentStore

    Env:
        DATABASE_URL / INCIDENT_DATABASE_URL: Postgres DSN.
        INCIDENT_JSONL_DIR: JSONL directory (default: $REPO_ROOT/ops/incidents).
    """
    backend = os.getenv("INCIDENT_BACKEND", "jsonl").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("INCIDENT_DATABASE_URL", "")
    jsonl_dir = os.getenv(
        "INCIDENT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "incidents"),
    )

    # Security: never log a raw DSN prefix — "postgresql://user:password@host"
    # fits credentials inside the first 30 characters. Log the scheme only.
    dsn_scheme = dsn.split("://", 1)[0] if "://" in dsn else "dsn"

    if backend == "memory":
        logger.info("IncidentStore: in-memory (testing only)")
        return MemoryIncidentStore()

    if backend == "postgres":
        if dsn:
            logger.info("IncidentStore: postgres (%s)", dsn_scheme)
            return PostgresIncidentStore(dsn)
        # Misconfiguration: fall through to the JSONL default below.
        logger.warning("INCIDENT_BACKEND=postgres but no DATABASE_URL; falling back to jsonl")

    if backend == "auto":
        if dsn:
            logger.info("IncidentStore: auto (postgres→jsonl fallback, %s)", dsn_scheme)
            return AutoIncidentStore(pg_dsn=dsn, jsonl_dir=jsonl_dir)
        logger.info("IncidentStore: auto — no DATABASE_URL, using jsonl")

    if backend == "null":
        return MemoryIncidentStore()

    # Default: JSONL
    logger.info("IncidentStore: jsonl dir=%s", jsonl_dir)
    return JsonlIncidentStore(jsonl_dir)
|
||||
261
services/router/llm_enrichment.py
Normal file
261
services/router/llm_enrichment.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
llm_enrichment.py — Optional LLM enrichment for Risk Attribution (strictly bounded).
|
||||
|
||||
Design constraints:
|
||||
- LLM output is explanatory ONLY — never changes scores or decisions.
|
||||
- Default mode is OFF (llm_mode="off").
|
||||
- Local mode calls a local HTTP model runner (Ollama-compatible by default).
|
||||
- Triggers are checked before every call: off if delta < warn OR band not high/critical.
|
||||
- Input is hard-truncated to llm_max_chars_in.
|
||||
- Output is hard-truncated to llm_max_chars_out.
|
||||
- Any error → graceful skip, returns {enabled: false, text: null}.
|
||||
|
||||
Hardening guards (new):
|
||||
- model_allowlist: model must be in allowlist or call is skipped.
|
||||
- max_calls_per_digest: caller passes a mutable counter dict; stops after limit.
|
||||
- per_day_dedupe: in-memory key per (date, service, env) prevents duplicate calls.
|
||||
|
||||
Usage:
|
||||
from llm_enrichment import maybe_enrich_attribution
|
||||
call_counter = {"count": 0}
|
||||
report["llm_enrichment"] = maybe_enrich_attribution(
|
||||
attribution_report, risk_report, attr_policy,
|
||||
call_counter=call_counter,
|
||||
)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Per-day dedupe store (module-level in-memory) ───────────────────────────
|
||||
# key: "risk_enrich:{YYYY-MM-DD}:{service}:{env}" → True
|
||||
_dedupe_store: Dict[str, bool] = {}
|
||||
|
||||
|
||||
def _dedupe_key(service: str, env: str) -> str:
|
||||
date = datetime.datetime.utcnow().strftime("%Y-%m-%d")
|
||||
return f"risk_enrich:{date}:{service}:{env}"
|
||||
|
||||
|
||||
def _is_deduped(service: str, env: str) -> bool:
|
||||
return _dedupe_store.get(_dedupe_key(service, env), False)
|
||||
|
||||
|
||||
def _mark_deduped(service: str, env: str) -> None:
|
||||
_dedupe_store[_dedupe_key(service, env)] = True
|
||||
|
||||
|
||||
def _clear_dedupe_store() -> None:
|
||||
"""Test helper to reset per-day dedup state."""
|
||||
_dedupe_store.clear()
|
||||
|
||||
# ─── Trigger guard ────────────────────────────────────────────────────────────
|
||||
|
||||
def _should_trigger(risk_report: Dict, attr_policy: Dict) -> bool:
|
||||
"""
|
||||
Returns True only if triggers are met:
|
||||
delta_24h >= risk_delta_warn OR band in band_in
|
||||
Both conditions are OR — either is enough.
|
||||
"""
|
||||
triggers = attr_policy.get("llm_triggers", {})
|
||||
delta_warn = int(triggers.get("risk_delta_warn", 10))
|
||||
band_in = set(triggers.get("band_in", ["high", "critical"]))
|
||||
|
||||
band = risk_report.get("band", "low")
|
||||
delta_24h = (risk_report.get("trend") or {}).get("delta_24h")
|
||||
|
||||
if band in band_in:
|
||||
return True
|
||||
if delta_24h is not None and delta_24h >= delta_warn:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# ─── Prompt builder ───────────────────────────────────────────────────────────
|
||||
|
||||
def _build_prompt(
|
||||
attribution_report: Dict,
|
||||
risk_report: Dict,
|
||||
max_chars: int,
|
||||
) -> str:
|
||||
"""Build a compact prompt for local LLM enrichment."""
|
||||
service = attribution_report.get("service", "?")
|
||||
env = attribution_report.get("env", "prod")
|
||||
score = risk_report.get("score", 0)
|
||||
band = risk_report.get("band", "?")
|
||||
delta = attribution_report.get("delta_24h")
|
||||
causes = attribution_report.get("causes", [])[:3]
|
||||
reasons = risk_report.get("reasons", [])[:4]
|
||||
|
||||
causes_text = "\n".join(
|
||||
f" - {c['type']} (score={c['score']}, confidence={c['confidence']}): "
|
||||
+ "; ".join(c.get("evidence", []))
|
||||
for c in causes
|
||||
)
|
||||
reasons_text = "\n".join(f" - {r}" for r in reasons)
|
||||
|
||||
prompt = (
|
||||
f"You are a platform reliability assistant. Provide a 2-3 sentence human-readable "
|
||||
f"explanation for a risk spike in service '{service}' (env={env}).\n\n"
|
||||
f"Risk score: {score} ({band}). "
|
||||
+ (f"Delta 24h: +{delta}.\n\n" if delta is not None else "\n\n")
|
||||
+ f"Risk signals:\n{reasons_text}\n\n"
|
||||
f"Attributed causes:\n{causes_text}\n\n"
|
||||
f"Write a concise explanation (max 3 sentences). Do NOT include scores or numbers "
|
||||
f"from above verbatim. Focus on actionable insight."
|
||||
)
|
||||
return prompt[:max_chars]
|
||||
|
||||
|
||||
# ─── Local model call ─────────────────────────────────────────────────────────
|
||||
|
||||
def _is_model_allowed(model: str, attr_policy: Dict) -> bool:
|
||||
"""Return True if model is in llm_local.model_allowlist (or list is empty/absent)."""
|
||||
allowlist = attr_policy.get("llm_local", {}).get("model_allowlist")
|
||||
if not allowlist:
|
||||
return True # no restriction configured
|
||||
return model in allowlist
|
||||
|
||||
|
||||
def _call_local_llm(
    prompt: str,
    attr_policy: Dict,
    max_out: int,
) -> Optional[str]:
    """
    Call an Ollama-compatible local endpoint and return the generated text.

    Args:
        prompt: fully built prompt (already truncated by the caller).
        attr_policy: attribution policy; reads the "llm_local" section
            (endpoint, model, timeout_seconds, model_allowlist).
        max_out: hard cap on the number of characters returned.

    Returns:
        Generated text truncated to max_out, or None when the model is not
        allowlisted, the response is empty, or any error occurs (graceful skip
        by design — this call must never take the digest down).
    """
    llm_cfg = attr_policy.get("llm_local", {})
    endpoint = llm_cfg.get("endpoint", "http://localhost:11434/api/generate")
    model = llm_cfg.get("model", "llama3")
    timeout = int(llm_cfg.get("timeout_seconds", 15))

    if not _is_model_allowed(model, attr_policy):
        logger.warning("llm_enrichment: model '%s' not in allowlist; skipping", model)
        return None

    try:
        import urllib.request
        payload = json.dumps({
            "model": model,
            "prompt": prompt,
            "stream": False,
            # Rough chars→tokens conversion to bound the model's output budget.
            "options": {"num_predict": max_out // 4},
        }).encode()
        req = urllib.request.Request(
            endpoint,
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read())
        text = body.get("response", "") or ""
        return text[:max_out] if text else None
    except Exception as e:
        # OSError and ConnectionError are Exception subclasses, so the
        # original three-way tuple was redundant; one clause covers all.
        logger.warning("llm_enrichment: local LLM call failed: %s", e)
        return None
|
||||
|
||||
|
||||
# ─── Public interface ─────────────────────────────────────────────────────────
|
||||
|
||||
def maybe_enrich_attribution(
    attribution_report: Dict,
    risk_report: Dict,
    attr_policy: Optional[Dict] = None,
    *,
    call_counter: Optional[Dict] = None,
) -> Dict:
    """
    Conditionally enrich attribution_report with LLM text.

    Hardening guards (checked in order):
      1. llm_mode must be "local" (not "off" or "remote")
      2. triggers must be met (delta >= warn OR band in high/critical)
      3. model must be in model_allowlist
      4. max_calls_per_digest not exceeded (via mutable `call_counter` dict)
      5. per-day dedupe: (service, env) pair not already enriched today

    Returns:
        {"enabled": True/False, "text": str|None, "mode": str}
        plus "skipped_reason" when a guard short-circuited the call.

    Never raises. LLM output does NOT alter scores.
    """
    # Lazy policy load; any failure means enrichment is simply off.
    if attr_policy is None:
        try:
            from risk_attribution import load_attribution_policy
            attr_policy = load_attribution_policy()
        except Exception:
            return {"enabled": False, "text": None, "mode": "off"}

    mode = (attr_policy.get("defaults") or {}).get("llm_mode", "off")

    if mode == "off":
        return {"enabled": False, "text": None, "mode": "off"}

    # Guard: triggers
    if not _should_trigger(risk_report, attr_policy):
        return {"enabled": False, "text": None, "mode": mode,
                "skipped_reason": "triggers not met"}

    service = attribution_report.get("service", "")
    env = attribution_report.get("env", "prod")

    # Guard: model allowlist (checked early so tests can assert without calling LLM)
    if mode == "local":
        llm_local_cfg_early = attr_policy.get("llm_local", {})
        model_cfg = llm_local_cfg_early.get("model", "llama3")
        if not _is_model_allowed(model_cfg, attr_policy):
            logger.warning("llm_enrichment: model '%s' not in allowlist; skipping", model_cfg)
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": f"model '{model_cfg}' not in allowlist"}

    # Guard: per-day dedupe
    llm_local_cfg = attr_policy.get("llm_local", {})
    if llm_local_cfg.get("per_day_dedupe", True):
        if _is_deduped(service, env):
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": "per_day_dedupe: already enriched today"}

    # Guard: max_calls_per_digest — only enforced when the caller supplies a
    # counter; the dict is mutated in place so it accumulates across calls.
    if call_counter is not None:
        max_calls = int(llm_local_cfg.get("max_calls_per_digest", 3))
        if call_counter.get("count", 0) >= max_calls:
            return {"enabled": False, "text": None, "mode": mode,
                    "skipped_reason": f"max_calls_per_digest={max_calls} reached"}

    defaults = attr_policy.get("defaults", {})
    max_in = int(defaults.get("llm_max_chars_in", 3500))
    max_out = int(defaults.get("llm_max_chars_out", 800))
    prompt = _build_prompt(attribution_report, risk_report, max_in)

    if mode == "local":
        try:
            text = _call_local_llm(prompt, attr_policy, max_out)
        except Exception as e:
            # Belt-and-braces: _call_local_llm already catches internally.
            logger.warning("llm_enrichment: local call raised: %s", e)
            text = None

        if text is not None:
            # Update guards on success only, so failed calls can be retried.
            _mark_deduped(service, env)
            if call_counter is not None:
                call_counter["count"] = call_counter.get("count", 0) + 1

        return {
            "enabled": text is not None,
            "text": text,
            "mode": "local",
        }

    # mode == "remote" — not implemented; stub for future extensibility
    logger.debug("llm_enrichment: remote mode not implemented; skipping")
    return {"enabled": False, "text": None, "mode": "remote",
            "skipped_reason": "remote not implemented"}
|
||||
340
services/router/platform_priority_digest.py
Normal file
340
services/router/platform_priority_digest.py
Normal file
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
platform_priority_digest.py — Weekly Platform Priority Digest.
|
||||
DAARION.city | deterministic, no LLM.
|
||||
|
||||
Generates a Markdown + JSON report prioritising services by Architecture Pressure,
|
||||
optionally correlated with Risk score/delta.
|
||||
|
||||
Outputs:
|
||||
ops/reports/platform/{YYYY-WW}.md
|
||||
ops/reports/platform/{YYYY-WW}.json
|
||||
|
||||
Public API:
|
||||
weekly_platform_digest(env, ...) -> DigestResult
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from architecture_pressure import load_pressure_policy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Action templates ─────────────────────────────────────────────────────────
|
||||
|
||||
# Markdown action lines keyed by recommendation type; placeholders are filled
# via str.format in _build_priority_actions.
_ACTION_TEMPLATES = {
    "arch_review": (
        "📋 **Schedule architecture review**: '{service}' pressure={score} "
        "({band}). Review structural debt and recurring failure patterns."
    ),
    "refactor_sprint": (
        "🔧 **Allocate refactor sprint**: '{service}' has {regressions} regressions "
        "and {escalations} escalations in 30d — structural instability requires investment."
    ),
    "freeze_features": (
        "🚫 **Freeze non-critical features**: '{service}' is critical-pressure + "
        "risk-high. Stabilise before new feature work."
    ),
    "reduce_backlog": (
        "📌 **Reduce followup backlog**: '{service}' has {overdue} overdue follow-ups. "
        "Address before next release cycle."
    ),
}
|
||||
|
||||
|
||||
def _now_week() -> str:
|
||||
"""Return ISO week string: YYYY-WNN."""
|
||||
return datetime.datetime.utcnow().strftime("%Y-W%V")
|
||||
|
||||
|
||||
def _now_date() -> str:
|
||||
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def _clamp(text: str, max_chars: int) -> str:
|
||||
if max_chars and len(text) > max_chars:
|
||||
return text[:max_chars - 3] + "…"
|
||||
return text
|
||||
|
||||
|
||||
# ─── Action list builder ──────────────────────────────────────────────────────
|
||||
|
||||
def _build_priority_actions(pressure_reports: List[Dict], risk_reports: Optional[Dict] = None) -> List[str]:
    """Derive Markdown action recommendations from pressure (and risk) reports.

    Per service, up to four independent rules can each emit one action:
    arch review, refactor sprint, feature freeze, backlog reduction.
    The combined list is capped at 20 entries.
    """
    actions = []
    risk_reports = risk_reports or {}

    for r in pressure_reports:
        svc = r["service"]
        score = r.get("score", 0)
        band = r.get("band", "low")
        comp = r.get("components", {})

        # Rule 1: explicit arch-review flag set by the pressure analyzer.
        if r.get("requires_arch_review"):
            actions.append(
                _ACTION_TEMPLATES["arch_review"].format(
                    service=svc, score=score, band=band
                )
            )

        # Rule 2: repeated regressions AND escalations within 30 days.
        regressions = int(comp.get("regressions_30d", 0))
        escalations = int(comp.get("escalations_30d", 0))
        if regressions >= 3 and escalations >= 2:
            actions.append(
                _ACTION_TEMPLATES["refactor_sprint"].format(
                    service=svc, regressions=regressions, escalations=escalations
                )
            )

        # Rule 3: critical pressure combined with elevated risk.
        # Prefer the live risk report; fall back to the band embedded in the
        # pressure report when no risk report is available.
        rr = risk_reports.get(svc, {})
        risk_band = rr.get("band", "low") if rr else r.get("risk_band", "low")
        if band == "critical" and risk_band in ("high", "critical"):
            actions.append(
                _ACTION_TEMPLATES["freeze_features"].format(service=svc)
            )

        # Rule 4: overdue follow-up backlog.
        overdue = int(comp.get("followups_overdue", 0))
        if overdue >= 2:
            actions.append(
                _ACTION_TEMPLATES["reduce_backlog"].format(service=svc, overdue=overdue)
            )

    return actions[:20]  # cap
|
||||
|
||||
|
||||
# ─── Markdown builder ─────────────────────────────────────────────────────────
|
||||
|
||||
def _build_markdown(
    week_str: str,
    env: str,
    pressure_reports: List[Dict],
    investment_list: List[Dict],
    actions: List[str],
    band_counts: Dict[str, int],
) -> str:
    """Render the digest as a Markdown document.

    Sections (each emitted only when non-empty): band summary table, critical
    pressure details, high-pressure one-liners, investment priority list,
    action recommendations, footer.
    """
    lines = [
        f"# Platform Priority Digest — {env.upper()} | {week_str}",
        f"_Generated: {_now_date()} | Deterministic | No LLM_",
        "",
        "## Pressure Band Summary",
        "",
        f"| Band | Services |",
        f"|------|---------|",
        f"| 🔴 Critical | {band_counts.get('critical', 0)} |",
        f"| 🟠 High | {band_counts.get('high', 0)} |",
        f"| 🟡 Medium | {band_counts.get('medium', 0)} |",
        f"| 🟢 Low | {band_counts.get('low', 0)} |",
        "",
    ]

    # Critical pressure: one sub-section per service with up to 3 signals.
    critical = [r for r in pressure_reports if r.get("band") == "critical"]
    if critical:
        lines += ["## 🔴 Critical Structural Pressure", ""]
        for r in critical:
            svc = r["service"]
            score = r.get("score", 0)
            summary = "; ".join(r.get("signals_summary", [])[:3])
            arch_flag = " ⚠️ ARCH REVIEW REQUIRED" if r.get("requires_arch_review") else ""
            lines.append(f"### {svc} (score={score}){arch_flag}")
            lines.append(f"> {summary}")
            # Risk correlation (only when the pressure report carries a risk score).
            if r.get("risk_score") is not None:
                lines.append(
                    f"> Risk: {r['risk_score']} ({r.get('risk_band', '?')})"
                    + (f" Δ24h: +{r['risk_delta_24h']}" if r.get("risk_delta_24h") else "")
                )
            lines.append("")

    # High pressure: compact bullet list, first signal only.
    high = [r for r in pressure_reports if r.get("band") == "high"]
    if high:
        lines += ["## 🟠 High Pressure Services", ""]
        for r in high:
            svc = r["service"]
            score = r.get("score", 0)
            summary = (r.get("signals_summary") or [""])[0]
            lines.append(
                f"- **{svc}** (score={score}): {summary}"
            )
        lines.append("")

    # Investment priority list
    if investment_list:
        lines += ["## 📊 Investment Priority List", ""]
        lines.append("Services where Pressure ≥ require_arch_review_at AND risk is elevated:")
        lines.append("")
        for i, item in enumerate(investment_list, 1):
            lines.append(
                f"{i}. **{item['service']}** — Pressure: {item['pressure_score']} "
                f"({item['pressure_band']}) | Risk: {item.get('risk_score', 'N/A')} "
                f"({item.get('risk_band', 'N/A')})"
            )
        lines.append("")

    # Action recommendations
    if actions:
        lines += ["## ✅ Action Recommendations", ""]
        for action in actions:
            lines.append(f"- {action}")
        lines.append("")

    lines += [
        "---",
        "_Generated by DAARION.city Platform Priority Digest (deterministic, no LLM)_",
    ]
    return "\n".join(lines)
|
||||
|
||||
|
||||
# ─── Main digest function ─────────────────────────────────────────────────────
|
||||
|
||||
def weekly_platform_digest(
    env: str = "prod",
    *,
    pressure_reports: Optional[List[Dict]] = None,
    risk_reports: Optional[Dict[str, Dict]] = None,
    policy: Optional[Dict] = None,
    week_str: Optional[str] = None,
    output_dir: Optional[str] = None,
    date_str: Optional[str] = None,
    write_files: bool = True,
    auto_followup: bool = True,
    incident_store=None,
) -> Dict:
    """
    Generate Weekly Platform Priority Digest.

    Args:
        env: environment label for the report header.
        pressure_reports: pre-computed pressure reports list (sorted by score desc)
        risk_reports: {service: RiskReport} for side-by-side correlation
        policy: architecture_pressure_policy (loaded if None)
        week_str: ISO week for filenames (defaults to current week)
        output_dir: override output directory
        date_str: override the report date (defaults to today, UTC)
        write_files: write .md and .json to disk
        auto_followup: call maybe_create_arch_review_followup for each requiring review
        incident_store: needed for auto_followup

    Returns: DigestResult dict with markdown, json_data, files_written, followups_created.
    """
    if policy is None:
        policy = load_pressure_policy()

    effective_week = week_str or _now_week()
    effective_date = date_str or _now_date()
    cfg_output_dir = policy.get("digest", {}).get("output_dir", "ops/reports/platform")
    effective_output_dir = output_dir or cfg_output_dir
    max_chars = int(policy.get("digest", {}).get("max_chars", 12000))
    top_n = int(policy.get("digest", {}).get("top_n_in_digest", 10))

    # Keep only the top-N reports by descending pressure score.
    pressure_reports = sorted(pressure_reports or [], key=lambda r: -r.get("score", 0))[:top_n]
    risk_reports = risk_reports or {}

    # Band counts over the services shown in the digest.
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in pressure_reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1

    # Investment priority list: requires_arch_review AND (risk high/critical OR delta > 0).
    # (An unused `review_at` read of priority_rules.require_arch_review_at was
    # removed here; the flag on each report is the source of truth.)
    investment_list = []
    for r in pressure_reports:
        if not r.get("requires_arch_review"):
            continue
        svc = r["service"]
        rr = risk_reports.get(svc, {})
        # Prefer the live risk report; fall back to values embedded in the
        # pressure report when none was supplied.
        risk_band = rr.get("band", "low") if rr else r.get("risk_band", "low") or "low"
        risk_delta = (rr.get("trend") or {}).get("delta_24h") if rr else r.get("risk_delta_24h")
        if risk_band in ("high", "critical") or (risk_delta is not None and risk_delta > 0):
            investment_list.append({
                "service": svc,
                "pressure_score": r.get("score"),
                "pressure_band": r.get("band"),
                "risk_score": rr.get("score") if rr else r.get("risk_score"),
                "risk_band": risk_band,
                "risk_delta_24h": risk_delta,
            })

    actions = _build_priority_actions(pressure_reports, risk_reports)

    markdown_raw = _build_markdown(
        week_str=effective_week,
        env=env,
        pressure_reports=pressure_reports,
        investment_list=investment_list,
        actions=actions,
        band_counts=band_counts,
    )
    markdown = _clamp(markdown_raw, max_chars)

    json_data = {
        "week": effective_week,
        "date": effective_date,
        "env": env,
        "generated_at": datetime.datetime.utcnow().isoformat(),
        "band_counts": band_counts,
        "top_pressure_services": [
            {
                "service": r.get("service"),
                "score": r.get("score"),
                "band": r.get("band"),
                "requires_arch_review": r.get("requires_arch_review"),
                "signals_summary": r.get("signals_summary", [])[:4],
                "components": r.get("components", {}),
                "risk_score": r.get("risk_score"),
                "risk_band": r.get("risk_band"),
                "risk_delta_24h": r.get("risk_delta_24h"),
            }
            for r in pressure_reports
        ],
        "investment_priority_list": investment_list,
        "actions": actions,
    }

    # ── Auto followup creation ────────────────────────────────────────────────
    followups_created = []
    if auto_followup and incident_store is not None:
        # Imported lazily to avoid a hard dependency when followups are off.
        from architecture_pressure import maybe_create_arch_review_followup
        for r in pressure_reports:
            if r.get("requires_arch_review"):
                fu_result = maybe_create_arch_review_followup(
                    r,
                    incident_store=incident_store,
                    policy=policy,
                    week_str=effective_week,
                )
                if fu_result.get("created"):
                    followups_created.append({
                        "service": r["service"],
                        "dedupe_key": fu_result.get("dedupe_key"),
                        "incident_id": fu_result.get("incident_id"),
                    })

    # ── Write files ───────────────────────────────────────────────────────────
    files_written: List[str] = []
    if write_files:
        try:
            out_path = Path(effective_output_dir)
            out_path.mkdir(parents=True, exist_ok=True)
            md_file = out_path / f"{effective_week}.md"
            json_file = out_path / f"{effective_week}.json"
            md_file.write_text(markdown, encoding="utf-8")
            json_file.write_text(json.dumps(json_data, indent=2, default=str), encoding="utf-8")
            files_written = [str(md_file), str(json_file)]
            logger.info("platform_priority_digest: wrote %s and %s", md_file, json_file)
        except Exception as e:
            # Best-effort persistence: the digest is still returned in-memory.
            logger.warning("platform_priority_digest: failed to write files: %s", e)

    return {
        "week": effective_week,
        "env": env,
        "markdown": markdown,
        "json_data": json_data,
        "files_written": files_written,
        "followups_created": followups_created,
        "band_counts": band_counts,
    }
|
||||
419
services/router/provider_budget.py
Normal file
419
services/router/provider_budget.py
Normal file
@@ -0,0 +1,419 @@
|
||||
"""Provider Budget Tracker — real-money token usage accounting.
|
||||
|
||||
Tracks:
|
||||
- Tokens used (input/output) per provider per model
|
||||
- Estimated USD cost based on published pricing
|
||||
- Approximate balance (if configured via env var)
|
||||
- Rolling 24h / 7d / 30d windows
|
||||
|
||||
Pricing table: updated Feb 2026 (USD per 1M tokens)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Pricing catalog (USD / 1M tokens) ─────────────────────────────────────────

# provider → model key (exact name or prefix) → {"input": $, "output": $}.
# Each provider table carries a "_default" entry used when no model matches.
# Fix: the annotation previously claimed Dict[str, Dict[str, float]], but the
# values are nested per-model dicts — corrected to the real 3-level shape.
PRICING: Dict[str, Dict[str, Dict[str, float]]] = {
    "anthropic": {
        "claude-sonnet-4-5": {"input": 3.0, "output": 15.0},
        "claude-opus-4-5": {"input": 15.0, "output": 75.0},
        "claude-haiku-3-5": {"input": 0.8, "output": 4.0},
        "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
        "_default": {"input": 3.0, "output": 15.0},
    },
    "grok": {
        "grok-4-1-fast-reasoning": {"input": 5.0, "output": 15.0},
        "grok-3": {"input": 5.0, "output": 25.0},
        "grok-2-1212": {"input": 2.0, "output": 10.0},
        "_default": {"input": 5.0, "output": 15.0},
    },
    "deepseek": {
        "deepseek-chat": {"input": 0.27, "output": 1.10},
        "deepseek-reasoner": {"input": 0.55, "output": 2.19},
        "_default": {"input": 0.27, "output": 1.10},
    },
    "mistral": {
        "mistral-large-latest": {"input": 2.0, "output": 6.0},
        "mistral-small-latest": {"input": 0.2, "output": 0.6},
        "_default": {"input": 2.0, "output": 6.0},
    },
    "openai": {
        "gpt-4o": {"input": 2.5, "output": 10.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "_default": {"input": 2.5, "output": 10.0},
    },
    "glm": {
        "glm-4-plus": {"input": 0.05, "output": 0.05},
        "glm-4-flash": {"input": 0.0, "output": 0.0},  # free tier
        "glm-4.7-flash": {"input": 0.0, "output": 0.0},
        "glm-z1-plus": {"input": 0.07, "output": 0.07},
        "_default": {"input": 0.05, "output": 0.05},
    },
    "ollama": {
        "_default": {"input": 0.0, "output": 0.0},  # local models are free
    },
}


def get_price(provider: str, model: str) -> Dict[str, float]:
    """Return the {"input", "output"} USD-per-1M-token prices for a model.

    Lookup order: exact model key, then the first declared key that is a
    prefix of *model*, then the provider's "_default". Unknown providers
    fall back to the anthropic table (a conservative, expensive default).
    """
    table = PRICING.get(provider.lower(), PRICING.get("anthropic"))
    if model in table:
        return table[model]
    for key, price in table.items():
        if key != "_default" and model.startswith(key):
            return price
    return table.get("_default", {"input": 3.0, "output": 15.0})


def calc_cost_usd(provider: str, model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimated USD cost of one call (catalog prices are per 1M tokens)."""
    price = get_price(provider, model)
    return (input_tokens * price["input"] + output_tokens * price["output"]) / 1_000_000
|
||||
|
||||
|
||||
# ── Usage record ──────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class UsageRecord:
    """One LLM call's token usage, persisted as a JSON line in usage.jsonl."""

    ts: float  # unix timestamp (time.time()) when the call was recorded
    provider: str  # key into PRICING, e.g. "anthropic", "grok"
    model: str  # model identifier as reported by the caller
    agent: str  # logical agent/component that issued the request
    input_tokens: int
    output_tokens: int
    cost_usd: float  # precomputed via calc_cost_usd at record time
    latency_ms: int = 0  # 0 is treated downstream as "latency not measured"
    task_type: str = ""
    fallback_used: bool = False  # True when a fallback provider served the call
|
||||
|
||||
|
||||
# ── Storage ────────────────────────────────────────────────────────────────────

# Data directory; overridable via BUDGET_DATA_DIR, defaults to ~/.sofiia/budget.
_BUDGET_DIR = Path(os.getenv("BUDGET_DATA_DIR", os.path.expanduser("~/.sofiia/budget")))
# Append-only JSONL of UsageRecord rows.
_USAGE_FILE = _BUDGET_DIR / "usage.jsonl"
# Manually configured per-provider limits/balances (JSON object).
_LIMITS_FILE = _BUDGET_DIR / "limits.json"

# Serializes all reads/writes of the two files within this process.
_lock = threading.Lock()
|
||||
|
||||
|
||||
def _ensure_dir() -> None:
    """Make sure the budget data directory (and parents) exists."""
    _BUDGET_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _append_usage(rec: UsageRecord) -> None:
    """Serialize one record and append it to usage.jsonl under the lock."""
    _ensure_dir()
    line = json.dumps(asdict(rec))  # serialize outside the lock
    with _lock:
        with _USAGE_FILE.open("a", encoding="utf-8") as fh:
            fh.write(line + "\n")
|
||||
|
||||
|
||||
def _load_usage(since_ts: float = 0.0) -> List[UsageRecord]:
    """Read usage.jsonl and return all records with ts >= since_ts.

    Best-effort: malformed lines are skipped; an unreadable file logs a
    warning and returns whatever was parsed before the failure.
    """
    if not _USAGE_FILE.exists():
        return []
    out: List[UsageRecord] = []
    with _lock:
        try:
            with _USAGE_FILE.open("r", encoding="utf-8") as fh:
                for raw in fh:
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        payload = json.loads(raw)
                        if payload.get("ts", 0) >= since_ts:
                            out.append(UsageRecord(**payload))
                    except Exception:
                        continue  # skip corrupt line
        except Exception as exc:
            logger.warning("budget: failed to load usage: %s", exc)
    return out
|
||||
|
||||
|
||||
# ── Manual balance config ──────────────────────────────────────────────────────
|
||||
|
||||
def _load_limits() -> Dict[str, Any]:
    """Return the manual limits config, or {} if missing or unreadable."""
    try:
        return json.loads(_LIMITS_FILE.read_text())
    except Exception:
        # Missing file and parse errors alike yield an empty config.
        return {}
|
||||
|
||||
|
||||
def _save_limits(data: Dict[str, Any]) -> None:
    """Persist the limits config to limits.json under the lock."""
    _ensure_dir()
    payload = json.dumps(data, indent=2)
    with _lock:
        _LIMITS_FILE.write_text(payload)
|
||||
|
||||
|
||||
# ── Public API ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def track_usage(
    provider: str,
    model: str,
    agent: str,
    input_tokens: int,
    output_tokens: int,
    latency_ms: int = 0,
    task_type: str = "",
    fallback_used: bool = False,
) -> float:
    """Record one call's token usage to the JSONL log.

    Returns:
        The estimated cost in USD (also stored on the persisted record).
    """
    cost = calc_cost_usd(provider, model, input_tokens, output_tokens)
    _append_usage(UsageRecord(
        ts=time.time(),
        provider=provider,
        model=model,
        agent=agent,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost_usd=cost,
        latency_ms=latency_ms,
        task_type=task_type,
        fallback_used=fallback_used,
    ))
    logger.debug(
        "💰 tracked: provider=%s model=%s tokens=%d+%d cost=$%.5f",
        provider, model, input_tokens, output_tokens, cost,
    )
    return cost
|
||||
|
||||
|
||||
@dataclass
class ProviderStats:
    """Aggregated usage for one provider over a time window."""

    provider: str
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_cost_usd: float = 0.0
    call_count: int = 0
    avg_latency_ms: float = 0.0  # mean over records that reported latency
    top_models: List[Dict[str, Any]] = field(default_factory=list)  # up to 3, by cost
    # Configured limits (from limits.json)
    monthly_limit_usd: Optional[float] = None
    topup_balance_usd: Optional[float] = None
    estimated_remaining_usd: Optional[float] = None  # top-up minus window cost
|
||||
|
||||
|
||||
def get_stats(window_hours: int = 720) -> Dict[str, ProviderStats]:
    """Aggregate per-provider usage stats for the given time window.

    The default window of 720h equals 30 days. Limits configured in
    limits.json are folded into each provider's stats.
    """
    cutoff = time.time() - window_hours * 3600
    stats = _aggregate_records(_load_usage(cutoff))

    limits = _load_limits()
    for provider, st in stats.items():
        cfg = limits.get(provider, {})
        if "monthly_limit_usd" in cfg:
            st.monthly_limit_usd = cfg["monthly_limit_usd"]
        if "topup_balance_usd" in cfg:
            st.topup_balance_usd = cfg["topup_balance_usd"]
            # Remaining ≈ configured top-up minus what this window spent.
            st.estimated_remaining_usd = round(cfg["topup_balance_usd"] - st.total_cost_usd, 4)

    return stats
|
||||
|
||||
|
||||
def get_dashboard_data() -> Dict[str, Any]:
    """
    Returns structured data for the budget dashboard UI.
    Includes 24h, 7d, 30d windows.
    Single file read + in-memory filtering for all three windows.
    """
    now = time.time()
    # Window cutoffs: 30d ⊇ 7d ⊇ 24h (720h / 168h / 24h before now).
    ts_30d = now - 720 * 3600
    ts_7d = now - 168 * 3600
    ts_24h = now - 24 * 3600

    # One disk read for the widest window; narrower windows filter in memory.
    all_records = _load_usage(since_ts=ts_30d)
    records_7d = [r for r in all_records if r.ts >= ts_7d]
    records_24h = [r for r in records_7d if r.ts >= ts_24h]

    stats_30d = _aggregate_records(all_records)
    stats_7d = _aggregate_records(records_7d)
    stats_24h = _aggregate_records(records_24h)

    limits = _load_limits()

    # Apply limits to 30d stats
    for p, s in stats_30d.items():
        lim = limits.get(p, {})
        if "monthly_limit_usd" in lim:
            s.monthly_limit_usd = lim["monthly_limit_usd"]
        if "topup_balance_usd" in lim:
            s.topup_balance_usd = lim["topup_balance_usd"]
            s.estimated_remaining_usd = round(lim["topup_balance_usd"] - s.total_cost_usd, 4)

    # Every priced provider (except local ollama) plus anything actually seen
    # in the 30d usage log, so configured-but-idle providers still show up.
    all_providers = sorted({
        *(k for k in PRICING if k != "ollama"),
        *stats_30d.keys(),
    })

    providers_data = []
    for p in all_providers:
        # Providers absent from a window get zeroed stats.
        s30 = stats_30d.get(p, ProviderStats(provider=p))
        s7 = stats_7d.get(p, ProviderStats(provider=p))
        s24 = stats_24h.get(p, ProviderStats(provider=p))
        plim = limits.get(p, {})

        providers_data.append({
            "provider": p,
            "display_name": _provider_display_name(p),
            "icon": _provider_icon(p),
            # "available" = the provider's API-key env var is set and non-blank.
            "available": bool(os.getenv(_provider_env_key(p), "").strip()),
            "cost_24h": round(s24.total_cost_usd, 5),
            "cost_7d": round(s7.total_cost_usd, 5),
            "cost_30d": round(s30.total_cost_usd, 5),
            "calls_24h": s24.call_count,
            "calls_30d": s30.call_count,
            "tokens_24h": s24.total_input_tokens + s24.total_output_tokens,
            "tokens_30d": s30.total_input_tokens + s30.total_output_tokens,
            "avg_latency_ms": round(s30.avg_latency_ms),
            "monthly_limit_usd": s30.monthly_limit_usd,
            "topup_balance_usd": plim.get("topup_balance_usd"),
            "estimated_remaining_usd": s30.estimated_remaining_usd,
            "top_models": s30.top_models,
        })

    total_24h = sum(s.total_cost_usd for s in stats_24h.values())
    total_7d = sum(s.total_cost_usd for s in stats_7d.values())
    total_30d = sum(s.total_cost_usd for s in stats_30d.values())

    return {
        "providers": providers_data,
        "summary": {
            "total_cost_24h": round(total_24h, 5),
            "total_cost_7d": round(total_7d, 5),
            "total_cost_30d": round(total_30d, 5),
            "total_calls_30d": sum(s.call_count for s in stats_30d.values()),
        },
        "generated_at": now,
    }
|
||||
|
||||
|
||||
def _aggregate_records(records: List[UsageRecord]) -> Dict[str, ProviderStats]:
    """Aggregate a list of records into per-provider stats.

    avg_latency_ms is the mean over records that reported a non-zero
    latency (0 is treated as "not measured").

    Fix: the previous running-average formula divided by the provider's
    total call_count while skipping zero-latency records, so the average
    was diluted and order-dependent (e.g. latencies [0, 100] averaged to
    50, while [100, 0] averaged to 100). Latency sum/count are now
    tracked explicitly.
    """
    by_provider: Dict[str, ProviderStats] = {}
    latency_sum: Dict[str, float] = defaultdict(float)
    latency_n: Dict[str, int] = defaultdict(int)
    # provider → model → {"calls", "cost", "tokens"} accumulator
    model_usage: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(
        lambda: defaultdict(lambda: {"calls": 0, "cost": 0.0, "tokens": 0})
    )

    for rec in records:
        p = rec.provider
        s = by_provider.setdefault(p, ProviderStats(provider=p))
        s.total_input_tokens += rec.input_tokens
        s.total_output_tokens += rec.output_tokens
        s.total_cost_usd += rec.cost_usd
        s.call_count += 1
        if rec.latency_ms:
            latency_sum[p] += rec.latency_ms
            latency_n[p] += 1
        mu = model_usage[p][rec.model]
        mu["calls"] += 1
        mu["cost"] += rec.cost_usd
        mu["tokens"] += rec.input_tokens + rec.output_tokens

    for p, s in by_provider.items():
        if latency_n[p]:
            s.avg_latency_ms = latency_sum[p] / latency_n[p]
        # Top 3 models by accumulated cost.
        top = sorted(model_usage[p].items(), key=lambda x: x[1]["cost"], reverse=True)[:3]
        s.top_models = [{"model": k, **v} for k, v in top]

    return by_provider
|
||||
|
||||
|
||||
def rotate_usage_log(max_age_days: int = 90) -> int:
    """Drop records older than max_age_days from usage.jsonl.

    Unparseable lines are also dropped. Returns the number of removed lines.
    """
    if not _USAGE_FILE.exists():
        return 0
    cutoff = time.time() - max_age_days * 86400
    kept: List[str] = []
    removed = 0
    with _lock:
        try:
            with _USAGE_FILE.open("r", encoding="utf-8") as fh:
                for raw in fh:
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        fresh = json.loads(raw).get("ts", 0) >= cutoff
                    except Exception:
                        fresh = False  # corrupt line → drop it
                    if fresh:
                        kept.append(raw)
                    else:
                        removed += 1
            # Rewrite the file with only the surviving lines.
            with _USAGE_FILE.open("w", encoding="utf-8") as fh:
                fh.writelines(line + "\n" for line in kept)
        except Exception as exc:
            logger.warning("budget: rotate failed: %s", exc)
    if removed:
        logger.info("budget: rotated %d old records (>%dd)", removed, max_age_days)
    return removed
|
||||
|
||||
|
||||
def set_provider_limit(provider: str, monthly_limit_usd: Optional[float] = None, topup_balance_usd: Optional[float] = None) -> None:
    """Configure budget limits for a provider (persisted to limits.json)."""
    limits = _load_limits()
    entry = limits.setdefault(provider, {})
    if monthly_limit_usd is not None:
        entry["monthly_limit_usd"] = monthly_limit_usd
    if topup_balance_usd is not None:
        entry["topup_balance_usd"] = topup_balance_usd
    _save_limits(limits)
    logger.info("budget: set limits for %s: %s", provider, limits[provider])
|
||||
|
||||
|
||||
def _provider_display_name(p: str) -> str:
|
||||
return {
|
||||
"anthropic": "Anthropic Claude",
|
||||
"grok": "xAI Grok",
|
||||
"deepseek": "DeepSeek",
|
||||
"mistral": "Mistral AI",
|
||||
"openai": "OpenAI",
|
||||
"glm": "GLM / Z.AI",
|
||||
"ollama": "Local (Ollama)",
|
||||
}.get(p, p.title())
|
||||
|
||||
|
||||
def _provider_icon(p: str) -> str:
|
||||
return {
|
||||
"anthropic": "🟣",
|
||||
"grok": "⚡",
|
||||
"deepseek": "🔵",
|
||||
"mistral": "🌊",
|
||||
"openai": "🟢",
|
||||
"glm": "🐉",
|
||||
"ollama": "🖥️",
|
||||
}.get(p, "🤖")
|
||||
|
||||
|
||||
def _provider_env_key(p: str) -> str:
|
||||
return {
|
||||
"anthropic": "ANTHROPIC_API_KEY",
|
||||
"grok": "GROK_API_KEY",
|
||||
"deepseek": "DEEPSEEK_API_KEY",
|
||||
"mistral": "MISTRAL_API_KEY",
|
||||
"openai": "OPENAI_API_KEY",
|
||||
"glm": "GLM5_API_KEY",
|
||||
}.get(p, f"{p.upper()}_API_KEY")
|
||||
1363
services/router/release_check_runner.py
Normal file
1363
services/router/release_check_runner.py
Normal file
File diff suppressed because it is too large
Load Diff
731
services/router/risk_attribution.py
Normal file
731
services/router/risk_attribution.py
Normal file
@@ -0,0 +1,731 @@
|
||||
"""
|
||||
risk_attribution.py — Change Impact Attribution Engine (deterministic, no LLM by default).
|
||||
|
||||
Given a service + env, explains WHY risk spiked by correlating signals:
|
||||
deploy activity, dependency scan findings, drift errors, incident storms,
|
||||
SLO violations, overdue follow-ups, alert-loop degradation.
|
||||
|
||||
New in this revision:
|
||||
- Change Timeline: ordered event stream (deploy, incident, slo, followup, …)
|
||||
- Evidence refs: alert_ref[], incident_id[], release_check_run_id, artifact paths
|
||||
- Per-cause refs (clickthrough IDs for UI)
|
||||
|
||||
Provides:
|
||||
load_attribution_policy() -> Dict
|
||||
compute_attribution(service, env, ...) -> AttributionReport (includes timeline + evidence_refs)
|
||||
build_timeline(events, policy) -> List[TimelineItem]
|
||||
fetch_signals_from_stores(service, env, ...) -> SignalsData
|
||||
|
||||
LLM enrichment is separate (llm_enrichment.py) and off by default.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Policy ───────────────────────────────────────────────────────────────────

# Cached parsed policy; populated lazily by load_attribution_policy().
_ATTR_POLICY_CACHE: Optional[Dict] = None
# Candidate policy locations: CWD-relative first, then <repo root>/config
# resolved relative to this module's path.
_ATTR_POLICY_SEARCH_PATHS = [
    Path("config/risk_attribution_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_attribution_policy.yml",
]
|
||||
|
||||
|
||||
def load_attribution_policy() -> Dict:
    """Load and cache the attribution policy YAML.

    The first existing path in _ATTR_POLICY_SEARCH_PATHS that parses wins;
    a failed read logs a warning and the next path is tried. When nothing
    loads, built-in defaults are cached instead.
    """
    global _ATTR_POLICY_CACHE
    if _ATTR_POLICY_CACHE is not None:
        return _ATTR_POLICY_CACHE
    for candidate in _ATTR_POLICY_SEARCH_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                loaded = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("Failed to load risk_attribution_policy from %s: %s", candidate, exc)
            continue
        _ATTR_POLICY_CACHE = loaded
        return loaded
    _ATTR_POLICY_CACHE = _builtin_attr_defaults()
    return _ATTR_POLICY_CACHE
|
||||
|
||||
|
||||
def _reload_attribution_policy() -> None:
    """Drop the cached policy so the next load re-reads from disk."""
    global _ATTR_POLICY_CACHE
    _ATTR_POLICY_CACHE = None
|
||||
|
||||
|
||||
def _builtin_attr_defaults() -> Dict:
    """Fallback policy used when no risk_attribution_policy.yml can be loaded."""
    return {
        # Core knobs: analysis window, cause cap, LLM enrichment mode/limits.
        "defaults": {"lookback_hours": 24, "max_causes": 5, "llm_mode": "off",
                     "llm_max_chars_in": 3500, "llm_max_chars_out": 800},
        # Conditions that would trigger LLM enrichment (when enabled).
        "llm_triggers": {"risk_delta_warn": 10, "risk_delta_fail": 20,
                         "band_in": ["high", "critical"]},
        # Per-signal attribution scores consumed by the _detect_* helpers.
        "weights": {"deploy": 30, "dependency": 25, "drift": 25, "incident_storm": 20,
                    "slo_violation": 15, "followups_overdue": 10, "alert_loop_degraded": 10},
        # Signal-specific detection parameters.
        "signals": {
            "deploy": {"kinds": ["deploy", "deployment", "rollout", "canary"]},
            "dependency": {"release_gate_names": ["dependency_scan", "deps"]},
            "drift": {"release_gate_names": ["drift", "config_drift"]},
            "incident_storm": {"thresholds": {"occurrences_60m_warn": 10,
                                              "escalations_24h_warn": 2}},
            "slo": {"require_active_violation": True},
        },
        # Score thresholds for high/medium bands (see _score_to_confidence).
        "output": {"confidence_bands": {"high": 60, "medium": 35}},
        # Change Timeline construction (see build_timeline).
        "timeline": {
            "enabled": True,
            "lookback_hours": 24,
            "max_items": 30,
            "include_types": ["deploy", "dependency", "drift", "incident", "slo",
                              "followup", "alert_loop", "release_gate"],
            "time_bucket_minutes": 5,
        },
        "evidence_linking": {"enabled": True, "max_refs_per_cause": 10},
        # Local (Ollama-style) LLM settings for optional enrichment.
        "llm_local": {
            "endpoint": "http://localhost:11434/api/generate",
            "model": "llama3",
            "timeout_seconds": 15,
            "model_allowlist": ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"],
            "max_calls_per_digest": 3,
            "per_day_dedupe": True,
        },
    }
|
||||
|
||||
|
||||
# ─── Confidence ───────────────────────────────────────────────────────────────
|
||||
|
||||
def _score_to_confidence(score: int, policy: Dict) -> str:
|
||||
bands = policy.get("output", {}).get("confidence_bands", {})
|
||||
high_t = int(bands.get("high", 60))
|
||||
med_t = int(bands.get("medium", 35))
|
||||
if score >= high_t:
|
||||
return "high"
|
||||
if score >= med_t:
|
||||
return "medium"
|
||||
return "low"
|
||||
|
||||
|
||||
# ─── Signal detection helpers (now also return refs) ──────────────────────────
|
||||
|
||||
def _cap_refs(refs: List[Any], max_refs: int) -> List[Any]:
|
||||
return refs[:max_refs]
|
||||
|
||||
|
||||
def _detect_deploy(
|
||||
alerts: List[Dict],
|
||||
cutoff_iso: str,
|
||||
policy: Dict,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
"""Returns (score, evidence_list, refs)."""
|
||||
kinds = set(policy.get("signals", {}).get("deploy", {}).get(
|
||||
"kinds", ["deploy", "deployment", "rollout", "canary"]
|
||||
))
|
||||
deploy_alerts = [
|
||||
a for a in alerts
|
||||
if a.get("kind", "").lower() in kinds and a.get("created_at", "") >= cutoff_iso
|
||||
]
|
||||
if not deploy_alerts:
|
||||
return 0, [], []
|
||||
weight = int(policy.get("weights", {}).get("deploy", 30))
|
||||
last_seen = max(a.get("created_at", "") for a in deploy_alerts)
|
||||
evidence = [
|
||||
f"deploy alerts: {len(deploy_alerts)} in last 24h",
|
||||
f"last seen: {last_seen[:16] if last_seen else 'unknown'}",
|
||||
]
|
||||
refs = _cap_refs(
|
||||
[{"alert_ref": a["alert_ref"], "kind": a.get("kind", "deploy"),
|
||||
"ts": a.get("created_at", "")}
|
||||
for a in deploy_alerts if a.get("alert_ref")],
|
||||
max_refs,
|
||||
)
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
def _detect_dependency(
|
||||
release_gate_results: List[Dict],
|
||||
policy: Dict,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
gate_names = set(policy.get("signals", {}).get("dependency", {}).get(
|
||||
"release_gate_names", ["dependency_scan", "deps"]
|
||||
))
|
||||
failing = [
|
||||
g for g in release_gate_results
|
||||
if g.get("gate") in gate_names and g.get("status") in ("fail", "warn")
|
||||
]
|
||||
if not failing:
|
||||
return 0, [], []
|
||||
weight = int(policy.get("weights", {}).get("dependency", 25))
|
||||
evidence = [f"dependency_scan gate: {g['gate']} = {g['status']}" for g in failing[:3]]
|
||||
refs = _cap_refs(
|
||||
[{"release_check_run_id": g.get("run_id"), "gate": g["gate"],
|
||||
"artifact": g.get("artifact")}
|
||||
for g in failing if g.get("run_id") or g.get("artifact")],
|
||||
max_refs,
|
||||
)
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
def _detect_drift(
|
||||
release_gate_results: List[Dict],
|
||||
policy: Dict,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
gate_names = set(policy.get("signals", {}).get("drift", {}).get(
|
||||
"release_gate_names", ["drift", "config_drift"]
|
||||
))
|
||||
failing = [
|
||||
g for g in release_gate_results
|
||||
if g.get("gate") in gate_names and g.get("status") in ("fail", "warn")
|
||||
]
|
||||
if not failing:
|
||||
return 0, [], []
|
||||
weight = int(policy.get("weights", {}).get("drift", 25))
|
||||
evidence = [f"drift gate: {g['gate']} = {g['status']}" for g in failing[:3]]
|
||||
refs = _cap_refs(
|
||||
[{"release_check_run_id": g.get("run_id"), "gate": g["gate"],
|
||||
"artifact": g.get("artifact")}
|
||||
for g in failing if g.get("run_id") or g.get("artifact")],
|
||||
max_refs,
|
||||
)
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
def _detect_incident_storm(
|
||||
occurrences_60m: int,
|
||||
escalations_24h: int,
|
||||
policy: Dict,
|
||||
incident_ids: Optional[List[str]] = None,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
storm_cfg = policy.get("signals", {}).get("incident_storm", {}).get("thresholds", {})
|
||||
occ_warn = int(storm_cfg.get("occurrences_60m_warn", 10))
|
||||
esc_warn = int(storm_cfg.get("escalations_24h_warn", 2))
|
||||
|
||||
triggered = (occurrences_60m >= occ_warn) or (escalations_24h >= esc_warn)
|
||||
if not triggered:
|
||||
return 0, [], []
|
||||
|
||||
weight = int(policy.get("weights", {}).get("incident_storm", 20))
|
||||
evidence = []
|
||||
if occurrences_60m >= occ_warn:
|
||||
evidence.append(f"occurrences_60m={occurrences_60m} (≥{occ_warn})")
|
||||
if escalations_24h >= esc_warn:
|
||||
evidence.append(f"escalations_24h={escalations_24h} (≥{esc_warn})")
|
||||
refs = _cap_refs(
|
||||
[{"incident_id": iid} for iid in (incident_ids or [])],
|
||||
max_refs,
|
||||
)
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
def _detect_slo(
|
||||
slo_violations: int,
|
||||
policy: Dict,
|
||||
slo_metrics: Optional[List[str]] = None,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
require_active = policy.get("signals", {}).get("slo", {}).get("require_active_violation", True)
|
||||
if require_active and slo_violations == 0:
|
||||
return 0, [], []
|
||||
if slo_violations == 0:
|
||||
return 0, [], []
|
||||
weight = int(policy.get("weights", {}).get("slo_violation", 15))
|
||||
evidence = [f"active SLO violations: {slo_violations}"]
|
||||
refs = _cap_refs(
|
||||
[{"metric": m} for m in (slo_metrics or [])],
|
||||
max_refs,
|
||||
)
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
def _detect_followups_overdue(
|
||||
overdue_count: int,
|
||||
policy: Dict,
|
||||
followup_refs: Optional[List[Dict]] = None,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
if overdue_count == 0:
|
||||
return 0, [], []
|
||||
weight = int(policy.get("weights", {}).get("followups_overdue", 10))
|
||||
evidence = [f"overdue follow-ups: {overdue_count}"]
|
||||
refs = _cap_refs(followup_refs or [], max_refs)
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
def _detect_alert_loop_degraded(
|
||||
loop_slo_violations: int,
|
||||
policy: Dict,
|
||||
max_refs: int = 10,
|
||||
) -> Tuple[int, List[str], List[Dict]]:
|
||||
if loop_slo_violations == 0:
|
||||
return 0, [], []
|
||||
weight = int(policy.get("weights", {}).get("alert_loop_degraded", 10))
|
||||
evidence = [f"alert-loop SLO violations: {loop_slo_violations}"]
|
||||
refs: List[Dict] = []
|
||||
return weight, evidence, refs
|
||||
|
||||
|
||||
# ─── Timeline builder ────────────────────────────────────────────────────────
|
||||
|
||||
def _bucket_key(ts_iso: str, bucket_minutes: int) -> str:
|
||||
"""Round timestamp down to the nearest bucket boundary."""
|
||||
try:
|
||||
dt = datetime.datetime.fromisoformat(ts_iso.rstrip("Z"))
|
||||
total_mins = dt.hour * 60 + dt.minute
|
||||
bucket_start = (total_mins // bucket_minutes) * bucket_minutes
|
||||
return f"{dt.strftime('%Y-%m-%d')}T{bucket_start // 60:02d}:{bucket_start % 60:02d}"
|
||||
except Exception:
|
||||
return ts_iso[:13] # fallback: truncate to hour
|
||||
|
||||
|
||||
def build_timeline(
    raw_events: List[Dict],
    policy: Optional[Dict] = None,
) -> List[Dict]:
    """
    Build an ordered Change Timeline from raw event dicts.

    raw_events is a list of:
        {ts, type, label, refs, ...}

    Returns newest-first list, bucketed and capped at max_items.
    Multiple same-type events in the same time bucket are coalesced into
    one "xN" item.
    """
    if policy is None:
        policy = load_attribution_policy()

    tl_cfg = policy.get("timeline", {})
    if not tl_cfg.get("enabled", True):
        return []

    max_items = int(tl_cfg.get("max_items", 30))
    bucket_minutes = int(tl_cfg.get("time_bucket_minutes", 5))
    include_types = set(tl_cfg.get("include_types", []))

    # Filter by allowed types
    # An empty include_types set means "accept everything".
    filtered = [
        e for e in raw_events
        if not include_types or e.get("type") in include_types
    ]

    # Sort newest-first
    filtered.sort(key=lambda e: e.get("ts", ""), reverse=True)

    # Bucket coalescing: same type + same bucket → single item with count
    seen: Dict[str, Dict] = {}  # key → accumulated item
    order: List[str] = []  # preserve insertion order

    for ev in filtered:
        bk = _bucket_key(ev.get("ts", ""), bucket_minutes)
        key = f"{ev.get('type', 'unknown')}:{bk}"
        if key not in seen:
            # refs are normalized to a list of (k, v) tuples for merging;
            # they are converted back to a dict at the end if applicable.
            seen[key] = {
                "ts": ev.get("ts", ""),
                "type": ev.get("type", "unknown"),
                "label": ev.get("label", ""),
                "refs": list(ev.get("refs", {}).items() if isinstance(ev.get("refs"), dict)
                             else ev.get("refs", [])),
                "_count": 1,
                "_latest_ts": ev.get("ts", ""),
            }
            order.append(key)
        else:
            seen[key]["_count"] += 1
            # Keep latest ts
            if ev.get("ts", "") > seen[key]["_latest_ts"]:
                seen[key]["_latest_ts"] = ev.get("ts", "")
                seen[key]["ts"] = ev.get("ts", "")
            # Merge refs (up to 5 per bucket)
            new_refs = (list(ev.get("refs", {}).items()) if isinstance(ev.get("refs"), dict)
                        else ev.get("refs", []))
            if len(seen[key]["refs"]) < 5:
                seen[key]["refs"].extend(new_refs[:5 - len(seen[key]["refs"])])

    # Build final items
    items = []
    for key in order:
        item = seen[key]
        # Strip the private accumulators before emitting the item.
        count = item.pop("_count", 1)
        item.pop("_latest_ts", None)
        if count > 1:
            item["label"] = f"{item['label']} (×{count})"
        # Convert refs back to dict if needed
        if isinstance(item["refs"], list) and item["refs"] and isinstance(item["refs"][0], tuple):
            item["refs"] = dict(item["refs"])
        items.append(item)

    return items[:max_items]
|
||||
|
||||
|
||||
def _make_timeline_events_from_alerts(
|
||||
alerts: List[Dict],
|
||||
deploy_kinds: set,
|
||||
cutoff_iso: str,
|
||||
) -> List[Dict]:
|
||||
"""Convert alert records to raw timeline events."""
|
||||
events = []
|
||||
for a in alerts:
|
||||
if a.get("created_at", "") < cutoff_iso:
|
||||
continue
|
||||
kind = a.get("kind", "").lower()
|
||||
ev_type = "deploy" if kind in deploy_kinds else "alert"
|
||||
refs = {}
|
||||
if a.get("alert_ref"):
|
||||
refs["alert_ref"] = a["alert_ref"]
|
||||
if a.get("service"):
|
||||
refs["service"] = a["service"]
|
||||
events.append({
|
||||
"ts": a.get("created_at", ""),
|
||||
"type": ev_type,
|
||||
"label": f"Alert: {kind}" + (f" ({a.get('title', '')})"
|
||||
if a.get("title") else ""),
|
||||
"refs": refs,
|
||||
})
|
||||
return events
|
||||
|
||||
|
||||
def _make_timeline_events_from_incidents(
|
||||
incidents: List[Dict],
|
||||
events_by_id: Dict[str, List[Dict]],
|
||||
cutoff_iso: str,
|
||||
) -> List[Dict]:
|
||||
"""Convert incident + escalation events to raw timeline events."""
|
||||
timeline_events = []
|
||||
for inc in incidents:
|
||||
inc_id = inc.get("id", "")
|
||||
started = inc.get("started_at") or inc.get("created_at", "")
|
||||
if started >= cutoff_iso:
|
||||
timeline_events.append({
|
||||
"ts": started,
|
||||
"type": "incident",
|
||||
"label": f"Incident started: {inc.get('title', inc_id)[:80]}",
|
||||
"refs": {"incident_id": inc_id},
|
||||
})
|
||||
for ev in events_by_id.get(inc_id, []):
|
||||
if (ev.get("type") == "decision"
|
||||
and "Escalat" in (ev.get("message") or "")
|
||||
and ev.get("ts", "") >= cutoff_iso):
|
||||
timeline_events.append({
|
||||
"ts": ev["ts"],
|
||||
"type": "incident",
|
||||
"label": f"Incident escalated: {inc_id}",
|
||||
"refs": {"incident_id": inc_id,
|
||||
"event_type": ev.get("type", "")},
|
||||
})
|
||||
return timeline_events
|
||||
|
||||
|
||||
def _make_timeline_events_from_gates(
|
||||
release_gate_results: List[Dict],
|
||||
) -> List[Dict]:
|
||||
"""Convert release gate results to raw timeline events."""
|
||||
events = []
|
||||
for g in release_gate_results:
|
||||
if g.get("status") not in ("fail", "warn"):
|
||||
continue
|
||||
gate_type = "dependency" if "dep" in g.get("gate", "").lower() else "release_gate"
|
||||
if "drift" in g.get("gate", "").lower():
|
||||
gate_type = "drift"
|
||||
refs: Dict = {}
|
||||
if g.get("run_id"):
|
||||
refs["release_check_run_id"] = g["run_id"]
|
||||
if g.get("artifact"):
|
||||
refs["artifact"] = g["artifact"]
|
||||
events.append({
|
||||
"ts": g.get("ts", datetime.datetime.utcnow().isoformat()),
|
||||
"type": gate_type,
|
||||
"label": f"Gate {g['gate']} = {g['status']}",
|
||||
"refs": refs,
|
||||
})
|
||||
return events
|
||||
|
||||
|
||||
# ─── Evidence refs builder ────────────────────────────────────────────────────
|
||||
|
||||
def build_evidence_refs(
    alerts_24h: List[Dict],
    incidents_24h: List[Dict],
    release_gate_results: List[Dict],
    followup_refs: Optional[List[Dict]] = None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Collect top-level evidence_refs: alert_refs, incident_ids,
    release_check_run_ids, artifacts.

    Each ref list is deduplicated where appropriate and capped at the
    policy's evidence_linking.max_refs_per_cause limit.
    """
    policy = policy if policy is not None else load_attribution_policy()
    cap = int(policy.get("evidence_linking", {}).get("max_refs_per_cause", 10))

    alert_refs = _cap_refs(
        [a["alert_ref"] for a in alerts_24h if a.get("alert_ref")], cap
    )
    incident_ids = _cap_refs(
        list({inc.get("id", "") for inc in incidents_24h if inc.get("id")}), cap
    )
    run_ids = _cap_refs(
        list({g.get("run_id") for g in release_gate_results if g.get("run_id")}), cap
    )
    artifact_refs = _cap_refs(
        list({g.get("artifact") for g in release_gate_results if g.get("artifact")}), cap
    )
    followups = _cap_refs([ref for ref in (followup_refs or []) if ref], cap)

    return {
        "alerts": alert_refs,
        "incidents": incident_ids,
        # Drop falsy entries that may survive capping.
        "release_checks": [x for x in run_ids if x],
        "artifacts": [x for x in artifact_refs if x],
        "followups": followups,
    }
|
||||
|
||||
|
||||
# ─── Summary builder ──────────────────────────────────────────────────────────
|
||||
|
||||
_TYPE_LABELS = {
|
||||
"deploy": "deploy activity",
|
||||
"dependency": "dependency change",
|
||||
"drift": "config/infrastructure drift",
|
||||
"incident_storm": "incident storm",
|
||||
"slo_violation": "SLO violation",
|
||||
"followups_overdue": "overdue follow-ups",
|
||||
"alert_loop_degraded": "alert-loop degradation",
|
||||
}
|
||||
|
||||
|
||||
def _build_summary(causes: List[Dict]) -> str:
|
||||
if not causes:
|
||||
return "No significant attribution signals detected."
|
||||
labels = [_TYPE_LABELS.get(c["type"], c["type"]) for c in causes[:3]]
|
||||
return "Likely causes: " + " + ".join(labels) + "."
|
||||
|
||||
|
||||
# ─── Main attribution function ────────────────────────────────────────────────
|
||||
|
||||
def compute_attribution(
    service: str,
    env: str,
    *,
    risk_report: Optional[Dict] = None,
    # Signals (pre-fetched)
    alerts_24h: Optional[List[Dict]] = None,
    occurrences_60m: int = 0,
    escalations_24h: int = 0,
    release_gate_results: Optional[List[Dict]] = None,
    slo_violations: int = 0,
    slo_metrics: Optional[List[str]] = None,
    overdue_followup_count: int = 0,
    followup_refs: Optional[List[Dict]] = None,
    loop_slo_violations: int = 0,
    # For evidence + timeline
    incidents_24h: Optional[List[Dict]] = None,
    incident_events: Optional[Dict[str, List[Dict]]] = None,
    window_hours: int = 24,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Deterministic attribution: causes with evidence, refs, timeline, evidence_refs.

    All signal arguments default to safe empty values.
    Never raises (returns minimal report on any error).

    NOTE(review): there is no try/except in this body — the "never raises"
    guarantee relies on the _detect_* helpers and build_timeline being
    non-throwing; confirm against their implementations.

    Args:
        service, env: identity of the service under analysis.
        risk_report: optional RiskReport; used to backfill slo_violations,
            overdue_followup_count and loop_slo_violations when those are 0,
            and read for trend.delta_24h in the result.
        alerts_24h / release_gate_results / incidents_24h / incident_events:
            pre-fetched raw signal data (see fetch_signals_from_stores).
        occurrences_60m / escalations_24h / slo_violations /
        overdue_followup_count / loop_slo_violations: pre-aggregated counters.
        window_hours: lookback window used to compute the cutoff timestamp.
        policy: attribution policy dict; loaded lazily when omitted.

    Returns:
        Dict with keys: service, env, window_hours, delta_24h, causes,
        summary, timeline, evidence_refs, llm_enrichment (disabled stub).
    """
    if policy is None:
        policy = load_attribution_policy()

    # ISO-format cutoff; signals are compared lexicographically against it.
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(hours=window_hours)
    ).isoformat()

    max_causes = int(policy.get("defaults", {}).get("max_causes", 5))
    max_refs = int(policy.get("evidence_linking", {}).get("max_refs_per_cause", 10))
    risk_report = risk_report or {}
    alerts_24h = alerts_24h or []
    release_gate_results = release_gate_results or []
    incidents_24h = incidents_24h or []
    incident_events = incident_events or {}

    # Extract from risk_report.components when not explicitly provided
    if slo_violations == 0 and risk_report:
        slo_violations = (risk_report.get("components", {}).get("slo") or {}).get("violations", 0)
    if overdue_followup_count == 0 and risk_report:
        fu = risk_report.get("components", {}).get("followups") or {}
        overdue_followup_count = fu.get("P0", 0) + fu.get("P1", 0) + fu.get("other", 0)
    if loop_slo_violations == 0 and risk_report:
        loop_slo_violations = (
            risk_report.get("components", {}).get("alerts_loop") or {}
        ).get("violations", 0)

    incident_ids = [inc.get("id", "") for inc in incidents_24h if inc.get("id")]

    # ── Score each signal (now with refs) ────────────────────────────────────
    # Each detector returns (score, evidence, refs); zero score means "no signal".
    candidates: List[Dict] = []

    score, evid, refs = _detect_deploy(alerts_24h, cutoff, policy, max_refs)
    if score:
        candidates.append({"type": "deploy", "score": score, "evidence": evid, "refs": refs})

    score, evid, refs = _detect_dependency(release_gate_results, policy, max_refs)
    if score:
        candidates.append({"type": "dependency", "score": score, "evidence": evid, "refs": refs})

    score, evid, refs = _detect_drift(release_gate_results, policy, max_refs)
    if score:
        candidates.append({"type": "drift", "score": score, "evidence": evid, "refs": refs})

    score, evid, refs = _detect_incident_storm(
        occurrences_60m, escalations_24h, policy, incident_ids, max_refs
    )
    if score:
        candidates.append({"type": "incident_storm", "score": score, "evidence": evid, "refs": refs})

    score, evid, refs = _detect_slo(slo_violations, policy, slo_metrics, max_refs)
    if score:
        candidates.append({"type": "slo_violation", "score": score, "evidence": evid, "refs": refs})

    score, evid, refs = _detect_followups_overdue(
        overdue_followup_count, policy, followup_refs, max_refs
    )
    if score:
        candidates.append({"type": "followups_overdue", "score": score,
                           "evidence": evid, "refs": refs})

    score, evid, refs = _detect_alert_loop_degraded(loop_slo_violations, policy, max_refs)
    if score:
        candidates.append({"type": "alert_loop_degraded", "score": score,
                           "evidence": evid, "refs": refs})

    # Sort desc, cap, add confidence
    candidates.sort(key=lambda c: -c["score"])
    causes = candidates[:max_causes]
    for c in causes:
        c["confidence"] = _score_to_confidence(c["score"], policy)

    delta_24h = (risk_report.get("trend") or {}).get("delta_24h")
    summary = _build_summary(causes)

    # ── Timeline ──────────────────────────────────────────────────────────────
    tl_cfg = policy.get("timeline", {})
    deploy_kinds = set(policy.get("signals", {}).get("deploy", {}).get(
        "kinds", ["deploy", "deployment", "rollout", "canary"]
    ))
    raw_events: List[Dict] = []
    raw_events.extend(_make_timeline_events_from_alerts(alerts_24h, deploy_kinds, cutoff))
    raw_events.extend(_make_timeline_events_from_incidents(incidents_24h, incident_events, cutoff))
    raw_events.extend(_make_timeline_events_from_gates(release_gate_results))
    timeline = build_timeline(raw_events, policy) if tl_cfg.get("enabled", True) else []

    # ── Evidence refs ─────────────────────────────────────────────────────────
    evidence_refs: Dict = {}
    if policy.get("evidence_linking", {}).get("enabled", True):
        evidence_refs = build_evidence_refs(
            alerts_24h, incidents_24h, release_gate_results,
            followup_refs=followup_refs, policy=policy,
        )

    return {
        "service": service,
        "env": env,
        "window_hours": window_hours,
        "delta_24h": delta_24h,
        "causes": causes,
        "summary": summary,
        "timeline": timeline,
        "evidence_refs": evidence_refs,
        # LLM enrichment is a disabled stub here; filled elsewhere if enabled.
        "llm_enrichment": {"enabled": False, "text": None},
    }
|
||||
|
||||
|
||||
# ─── Signal fetcher (for wiring in tool_manager/risk_engine) ─────────────────
|
||||
|
||||
def fetch_signals_from_stores(
    service: str,
    env: str,
    window_hours: int = 24,
    *,
    alert_store=None,
    incident_store=None,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Fetches raw signals from existing stores.
    Returns a dict ready to unpack into compute_attribution().
    Always non-fatal per store.

    Args:
        service: service name used to filter alerts and incidents.
        env: environment label. NOTE(review): env is accepted but not used
            in this body — confirm whether store-level filtering is intended.
        window_hours: lookback window for the cutoff timestamp.
        alert_store: optional store exposing list_alerts() and top_signatures().
        incident_store: optional store exposing list_incidents() and get_events().
        policy: attribution policy; loaded lazily when omitted.
    """
    if policy is None:
        policy = load_attribution_policy()

    # ISO cutoff; timestamps are compared as strings (lexicographic ISO order).
    cutoff = (
        datetime.datetime.utcnow() - datetime.timedelta(hours=window_hours)
    ).isoformat()

    # ── Deploy + other alerts ─────────────────────────────────────────────────
    alerts_24h: List[Dict] = []
    try:
        if alert_store is not None:
            all_alerts = alert_store.list_alerts(limit=200)
            # Keep alerts inside the window; alerts with no service tag match any.
            alerts_24h = [
                a for a in all_alerts
                if a.get("created_at", "") >= cutoff
                and (not a.get("service") or a.get("service") == service)
            ]
    except Exception as e:
        logger.warning("attribution fetch alerts failed: %s", e)

    # ── Incidents in window + event maps ──────────────────────────────────────
    incidents_24h: List[Dict] = []
    incident_events: Dict[str, List[Dict]] = {}
    occurrences_60m = 0
    escalations_24h = 0

    try:
        if incident_store is not None:
            # NOTE(review): cutoff_60m is computed but never used below —
            # top_signatures already takes window_minutes; confirm intent.
            cutoff_60m = (
                datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
            ).isoformat()

            # Count alert occurrences from alert_store top_signatures
            if alert_store is not None:
                try:
                    sigs = alert_store.top_signatures(window_minutes=60, limit=20)
                    occurrences_60m = sum(s.get("occurrences", 0) for s in sigs)
                except Exception:
                    # Best-effort: occurrence count stays 0 on failure.
                    pass

            incs = incident_store.list_incidents({"service": service}, limit=30)
            for inc in incs:
                inc_id = inc.get("id", "")
                inc_started = inc.get("started_at") or inc.get("created_at", "")
                try:
                    events = incident_store.get_events(inc_id, limit=50)
                    incident_events[inc_id] = events
                    # An escalation is a "decision" event whose message
                    # mentions "Escalat" (Escalate/Escalated/Escalation).
                    for ev in events:
                        if (ev.get("type") == "decision"
                                and "Escalat" in (ev.get("message") or "")
                                and ev.get("ts", "") >= cutoff):
                            escalations_24h += 1
                except Exception:
                    # Best-effort per incident: skip its events on failure.
                    pass
                # Include incident if started within window
                if inc_started >= cutoff:
                    incidents_24h.append(inc)
    except Exception as e:
        logger.warning("attribution fetch incident signals failed: %s", e)

    return {
        "alerts_24h": alerts_24h,
        "occurrences_60m": occurrences_60m,
        "escalations_24h": escalations_24h,
        "incidents_24h": incidents_24h,
        "incident_events": incident_events,
        "release_gate_results": [],  # caller can inject if persisted
    }
|
||||
341
services/router/risk_digest.py
Normal file
341
services/router/risk_digest.py
Normal file
@@ -0,0 +1,341 @@
|
||||
"""
|
||||
risk_digest.py — Daily Risk Digest generator (deterministic, no LLM).
|
||||
|
||||
Produces:
|
||||
ops/reports/risk/YYYY-MM-DD.json
|
||||
ops/reports/risk/YYYY-MM-DD.md
|
||||
|
||||
Content:
|
||||
- Top risky services (score desc)
|
||||
- Top regressions (delta_24h desc)
|
||||
- SLO violation summary
|
||||
- Deterministic action list based on risk state
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Message templates for the deterministic action list. Each template is
# filled via str.format with the keyword args named in its placeholders
# (service, delta, score, count) by _build_action_list.
_ACTION_TEMPLATES = {
    "regression_fail": "🚨 **Regression detected**: {service} score +{delta} in 24h. Freeze deployments; inspect recent incidents/followups immediately.",
    "regression_warn": "⚠️ **Score rising**: {service} +{delta} in 24h. Review open incidents and overdue follow-ups.",
    "critical_band": "🔴 **Critical risk**: {service} (score {score}). Oncall review required within 2h.",
    "high_band": "🟠 **High risk**: {service} (score {score}). Coordinate with oncall before next release.",
    "overdue_followups": "📋 **Overdue follow-ups**: {service} has {count} overdue follow-up(s). Close them to reduce risk score.",
    "slo_violation": "📉 **SLO violation**: {service} has {count} active SLO violation(s). Avoid deploying until clear.",
}
|
||||
|
||||
|
||||
def _now_date() -> str:
|
||||
return datetime.datetime.utcnow().strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def _clamp(text: str, max_chars: int) -> str:
|
||||
if len(text) <= max_chars:
|
||||
return text
|
||||
truncated = text[:max_chars]
|
||||
return truncated + "\n\n_[digest truncated to policy max_chars]_"
|
||||
|
||||
|
||||
def _build_action_list(reports: List[Dict]) -> List[str]:
    """Derive deterministic action items from up to 10 reports (max 20 actions)."""
    actions: List[str] = []
    for report in reports[:10]:
        svc = report.get("service", "?")
        score = report.get("score", 0)
        band = report.get("band", "low")
        trend = report.get("trend") or {}
        components = report.get("components", {})

        delta = trend.get("delta_24h")
        regression = trend.get("regression", {})
        rising = delta is not None and delta > 0

        # Regression actions: "fail" wins over "warn"; only positive deltas count.
        if regression.get("fail") and rising:
            actions.append(
                _ACTION_TEMPLATES["regression_fail"].format(service=svc, delta=delta)
            )
        elif regression.get("warn") and rising:
            actions.append(
                _ACTION_TEMPLATES["regression_warn"].format(service=svc, delta=delta)
            )

        # Band actions for the two most severe bands only.
        if band in ("critical", "high"):
            actions.append(
                _ACTION_TEMPLATES[f"{band}_band"].format(service=svc, score=score)
            )

        fu = components.get("followups") or {}
        overdue = fu.get("P0", 0) + fu.get("P1", 0) + fu.get("other", 0)
        if overdue:
            actions.append(
                _ACTION_TEMPLATES["overdue_followups"].format(service=svc, count=overdue)
            )

        slo_count = (components.get("slo") or {}).get("violations", 0)
        if slo_count:
            actions.append(
                _ACTION_TEMPLATES["slo_violation"].format(service=svc, count=slo_count)
            )

    return actions[:20]  # hard cap
|
||||
|
||||
|
||||
def _build_markdown(
    date_str: str,
    env: str,
    reports: List[Dict],
    top_regressions: List[Dict],
    improving: List[Dict],
    actions: List[str],
    band_counts: Dict,
) -> str:
    """Render the digest as a Markdown document.

    Sections (in order): header, band summary table, top risky services
    table, top regressions, likely causes, change timeline, improving
    services, action list, footer. Empty inputs skip their section.

    Args:
        date_str: digest date (YYYY-MM-DD) for the title.
        env: environment label for the title.
        reports: sorted RiskReport dicts (score desc, already capped).
        top_regressions: pre-built {service, delta_24h, ...} entries.
        improving: pre-built {service, delta_7d} entries.
        actions: pre-formatted action strings.
        band_counts: counts per band name for the summary table.
    """
    lines = [
        f"# Risk Digest — {date_str} ({env})",
        "",
        f"Generated: {datetime.datetime.utcnow().isoformat()} UTC",
        "",
        "## Band Summary",
        "",
        "| Band | Count |",
        "|------|-------|",
    ]
    for band in ("critical", "high", "medium", "low"):
        lines.append(f"| {band} | {band_counts.get(band, 0)} |")

    lines += [
        "",
        "## Top Risky Services",
        "",
        "| Service | Score | Band | Δ24h | Δ7d |",
        "|---------|-------|------|------|-----|",
    ]
    for r in reports:
        t = r.get("trend") or {}
        d24 = t.get("delta_24h")
        d7 = t.get("delta_7d")
        # "—" for missing deltas; explicit "+" prefix on positive deltas.
        d24_str = (f"+{d24}" if d24 and d24 > 0 else str(d24)) if d24 is not None else "—"
        d7_str = (f"+{d7}" if d7 and d7 > 0 else str(d7)) if d7 is not None else "—"
        lines.append(
            f"| {r['service']} | {r.get('score', 0)} | {r.get('band', '?')} "
            f"| {d24_str} | {d7_str} |"
        )

    if top_regressions:
        lines += ["", "## Top Regressions (Δ24h)", ""]
        for item in top_regressions:
            delta = item.get("delta_24h", 0)
            lines.append(f"- **{item['service']}**: +{delta} points in 24h")

    # ── Likely Causes (Attribution) ───────────────────────────────────────────
    # Only positive-delta reports that carry non-empty attribution causes.
    regressions_with_attribution = [
        r for r in reports
        if (r.get("trend") or {}).get("delta_24h") is not None
        and r["trend"]["delta_24h"] > 0
        and r.get("attribution") is not None
        and r["attribution"].get("causes")
    ]
    regressions_with_attribution = sorted(
        regressions_with_attribution,
        key=lambda r: -(r.get("trend") or {}).get("delta_24h", 0),
    )[:5]

    if regressions_with_attribution:
        lines += ["", "## Likely Causes (Top Regressions)", ""]
        for r in regressions_with_attribution:
            svc = r["service"]
            attr = r["attribution"]
            delta = r["trend"]["delta_24h"]
            summary = attr.get("summary", "")
            lines.append(f"### {svc} (+{delta} pts)")
            if summary:
                lines.append(f"> {summary}")
            causes = attr.get("causes", [])[:2]
            for c in causes:
                evid = "; ".join(c.get("evidence", []))
                lines.append(
                    f"- **{c['type']}** (confidence: {c.get('confidence', '?')}): {evid}"
                )
            # LLM text if available
            llm = attr.get("llm_enrichment") or {}
            if llm.get("enabled") and llm.get("text"):
                lines += ["", f" _LLM insight_: {llm['text'][:400]}"]
            lines.append("")

    # ── Change Timeline (Top Regressions) ────────────────────────────────────
    regressions_with_timeline = [
        r for r in regressions_with_attribution
        if r.get("attribution") and r["attribution"].get("timeline")
    ]
    if regressions_with_timeline:
        lines += ["", "## Change Timeline (Top Regressions)", ""]
        for r in regressions_with_timeline:
            svc = r["service"]
            timeline = r["attribution"]["timeline"][:5]  # top 5 per service
            lines.append(f"### {svc}")
            for item in timeline:
                # Trim timestamps to minute precision (YYYY-MM-DDTHH:MM).
                ts = (item.get("ts") or "")[:16]
                label = item.get("label", "")
                ev_type = item.get("type", "")
                lines.append(f"- `{ts}` [{ev_type}] {label}")
            lines.append("")

    if improving:
        lines += ["", "## Improving Services (Δ7d)", ""]
        for item in improving:
            delta = item.get("delta_7d", 0)
            lines.append(f"- **{item['service']}**: {delta} points over 7d")

    if actions:
        lines += ["", "## Action List", ""]
        for action in actions:
            lines.append(f"- {action}")

    lines += ["", "---", "_Generated by DAARION.city Risk Digest (deterministic, no LLM by default)_"]
    return "\n".join(lines)
|
||||
|
||||
|
||||
def daily_digest(
    env: str = "prod",
    *,
    service_reports: Optional[List[Dict]] = None,
    policy: Optional[Dict] = None,
    date_str: Optional[str] = None,
    output_dir: Optional[str] = None,
    write_files: bool = True,
) -> Dict:
    """
    Build and optionally persist the daily risk digest.

    service_reports — pre-fetched+enriched list of RiskReports (with trend).
    Returns {json_path, md_path, json_data, markdown, date, env}

    Args:
        env: environment label for the digest header and result.
        service_reports: RiskReport dicts; sorted by score desc and capped
            at the policy's digest.top_n.
        policy: risk policy; loaded lazily from risk_engine when omitted.
        date_str: override the digest date (defaults to today, UTC).
        output_dir: override the policy's digest.output_dir.
        write_files: when True, write <date>.json and <date>.md to disk;
            write failures are logged, not raised (paths become None).
    """
    # NOTE(review): compute_risk_dashboard is imported but unused in this
    # body — confirm whether it belongs here or should be dropped.
    from risk_engine import load_risk_policy, compute_risk_dashboard

    if policy is None:
        policy = load_risk_policy()

    digest_cfg = policy.get("digest", {})
    top_n = int(digest_cfg.get("top_n", 10))
    max_chars = int(digest_cfg.get("markdown_max_chars", 8000))
    cfg_output_dir = digest_cfg.get("output_dir", "ops/reports/risk")

    effective_output_dir = output_dir or cfg_output_dir
    effective_date = date_str or _now_date()

    reports = sorted(service_reports or [], key=lambda r: -r.get("score", 0))[:top_n]

    # Band counts
    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1

    # Top regressions
    top_regressions = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_24h") is not None
         and r["trend"]["delta_24h"] > 0],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]
    top_regressions_out = [
        {"service": r["service"], "delta_24h": r["trend"]["delta_24h"],
         "attribution_causes": [
             {"type": c["type"], "score": c["score"],
              "confidence": c.get("confidence", "low"),
              "evidence": c.get("evidence", [])[:2],
              "refs": c.get("refs", [])[:3]}
             for c in (r.get("attribution") or {}).get("causes", [])[:2]
         ],
         "timeline_preview": (r.get("attribution") or {}).get("timeline", [])[:3],
         }
        for r in top_regressions
    ]

    # Improving services
    improving = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_7d") is not None
         and r["trend"]["delta_7d"] < 0],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]
    improving_out = [
        {"service": r["service"], "delta_7d": r["trend"]["delta_7d"]}
        for r in improving
    ]

    actions = _build_action_list(reports)

    markdown_raw = _build_markdown(
        date_str=effective_date,
        env=env,
        reports=reports,
        top_regressions=top_regressions_out,
        improving=improving_out,
        actions=actions,
        band_counts=band_counts,
    )
    markdown = _clamp(markdown_raw, max_chars)

    # Machine-readable twin of the markdown document.
    json_data = {
        "date": effective_date,
        "env": env,
        "generated_at": datetime.datetime.utcnow().isoformat(),
        "band_counts": band_counts,
        "top_services": [
            {
                "service": r.get("service"),
                "score": r.get("score"),
                "band": r.get("band"),
                "delta_24h": (r.get("trend") or {}).get("delta_24h"),
                "delta_7d": (r.get("trend") or {}).get("delta_7d"),
                "regression": (r.get("trend") or {}).get("regression"),
                "reasons": r.get("reasons", [])[:5],
                "attribution_summary": (r.get("attribution") or {}).get("summary"),
                "top_causes": [
                    {"type": c["type"], "score": c["score"],
                     "confidence": c.get("confidence", "low"),
                     "evidence": c.get("evidence", [])[:2],
                     "refs": c.get("refs", [])[:3]}
                    for c in (r.get("attribution") or {}).get("causes", [])[:2]
                ],
                "timeline_preview": (r.get("attribution") or {}).get("timeline", [])[:3],
                "evidence_refs": (r.get("attribution") or {}).get("evidence_refs", {}),
            }
            for r in reports
        ],
        "top_regressions": top_regressions_out,
        "improving_services": improving_out,
        "actions": actions,
    }

    json_path: Optional[str] = None
    md_path: Optional[str] = None

    if write_files:
        try:
            out = Path(effective_output_dir)
            out.mkdir(parents=True, exist_ok=True)
            json_path = str(out / f"{effective_date}.json")
            md_path = str(out / f"{effective_date}.md")
            with open(json_path, "w") as f:
                json.dump(json_data, f, indent=2)
            with open(md_path, "w") as f:
                f.write(markdown)
            logger.info("Risk digest written: %s, %s", json_path, md_path)
        except Exception as e:
            # Non-fatal: the digest is still returned in-memory.
            logger.warning("Risk digest write failed: %s", e)
            json_path = md_path = None

    return {
        "date": effective_date,
        "env": env,
        "json_path": json_path,
        "md_path": md_path,
        "json_data": json_data,
        "markdown": markdown,
    }
|
||||
710
services/router/risk_engine.py
Normal file
710
services/router/risk_engine.py
Normal file
@@ -0,0 +1,710 @@
|
||||
"""
|
||||
risk_engine.py — Service Risk Index Engine (deterministic, no LLM).
|
||||
|
||||
Provides:
|
||||
compute_service_risk(service, env, ...) -> RiskReport
|
||||
compute_risk_dashboard(env, top_n, ...) -> Dashboard
|
||||
compute_trend(series) -> TrendReport
|
||||
enrich_risk_report_with_trend(report, history_store, policy) -> report (mutated)
|
||||
snapshot_all_services(env, compute_fn, history_store, policy) -> SnapshotResult
|
||||
|
||||
All inputs come from existing stores and tools.
|
||||
The engine never calls external services directly — callers inject store references.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import math
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Policy ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide cache for the loaded policy; populated lazily by
# load_risk_policy() and cleared by _reload_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate locations for risk_policy.yml, checked in order:
# CWD-relative first, then relative to this file's repo root.
_POLICY_SEARCH_PATHS = [
    Path("config/risk_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_policy.yml",
]
|
||||
|
||||
|
||||
def load_risk_policy() -> Dict:
    """Load (and cache) the risk policy YAML, falling back to built-in defaults.

    Search paths are tried in order; unreadable files are logged and skipped.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE

    for candidate in _POLICY_SEARCH_PATHS:
        if not candidate.exists():
            continue
        try:
            with open(candidate) as fh:
                loaded = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("Failed to load risk_policy from %s: %s", candidate, exc)
            continue
        _POLICY_CACHE = loaded
        return loaded

    logger.warning("risk_policy.yml not found; using built-in defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
||||
|
||||
|
||||
def _builtin_defaults() -> Dict:
|
||||
return {
|
||||
"defaults": {"window_hours": 24, "recurrence_windows_days": [7, 30],
|
||||
"slo_window_minutes": 60},
|
||||
"thresholds": {
|
||||
"bands": {"low_max": 20, "medium_max": 50, "high_max": 80},
|
||||
"risk_watch": {"warn_at": 50, "fail_at": 80},
|
||||
},
|
||||
"weights": {
|
||||
"open_incidents": {"P0": 50, "P1": 25, "P2": 10, "P3": 5},
|
||||
"recurrence": {
|
||||
"signature_warn_7d": 10, "signature_high_7d": 20,
|
||||
"kind_warn_7d": 8, "kind_high_7d": 15,
|
||||
"signature_high_30d": 10, "kind_high_30d": 8,
|
||||
},
|
||||
"followups": {"overdue_P0": 20, "overdue_P1": 12, "overdue_other": 6},
|
||||
"slo": {"violation": 10},
|
||||
"alerts_loop": {"slo_violation": 10},
|
||||
"escalation": {"escalations_24h": {"warn": 5, "high": 12}},
|
||||
},
|
||||
"service_overrides": {},
|
||||
"p0_services": ["gateway", "router"],
|
||||
}
|
||||
|
||||
|
||||
def _reload_policy() -> None:
|
||||
global _POLICY_CACHE
|
||||
_POLICY_CACHE = None
|
||||
|
||||
|
||||
# ─── Band classification ──────────────────────────────────────────────────────
|
||||
|
||||
def score_to_band(score: int, policy: Dict) -> str:
    """Map a numeric risk score onto its band name using policy thresholds."""
    cfg = policy.get("thresholds", {}).get("bands", {})
    # Inclusive upper bounds, checked from lowest band to highest.
    ladder = (
        ("low", int(cfg.get("low_max", 20))),
        ("medium", int(cfg.get("medium_max", 50))),
        ("high", int(cfg.get("high_max", 80))),
    )
    for band, upper in ladder:
        if score <= upper:
            return band
    return "critical"
|
||||
|
||||
|
||||
def get_service_thresholds(service: str, policy: Dict) -> Dict:
    """Effective risk-watch thresholds for *service*.

    Resolution order per key: service override -> global default -> builtin.
    """
    global_rw = policy.get("thresholds", {}).get("risk_watch", {})
    service_rw = (
        policy.get("service_overrides", {}).get(service, {}).get("risk_watch", {})
    )

    def pick(key: str, builtin: int) -> int:
        return int(service_rw.get(key, global_rw.get(key, builtin)))

    return {"warn_at": pick("warn_at", 50), "fail_at": pick("fail_at", 80)}
|
||||
|
||||
|
||||
# ─── Individual scoring components ───────────────────────────────────────────
|
||||
|
||||
def _score_open_incidents(
|
||||
open_incidents: List[Dict],
|
||||
weights: Dict,
|
||||
) -> Tuple[int, Dict, List[str]]:
|
||||
"""Score open incidents by severity."""
|
||||
w = weights.get("open_incidents", {})
|
||||
counts: Dict[str, int] = {"P0": 0, "P1": 0, "P2": 0, "P3": 0}
|
||||
points = 0
|
||||
for inc in open_incidents:
|
||||
sev = inc.get("severity", "P3")
|
||||
if sev in counts:
|
||||
counts[sev] += 1
|
||||
pts = int(w.get(sev, 0))
|
||||
points += pts
|
||||
|
||||
reasons = []
|
||||
if counts["P0"]:
|
||||
reasons.append(f"Open P0 incident(s): {counts['P0']}")
|
||||
if counts["P1"]:
|
||||
reasons.append(f"Open P1 incident(s): {counts['P1']}")
|
||||
if counts["P2"]:
|
||||
reasons.append(f"Open P2 incident(s): {counts['P2']}")
|
||||
|
||||
return points, {**counts, "points": points}, reasons
|
||||
|
||||
|
||||
def _score_recurrence(
|
||||
recurrence_data: Dict,
|
||||
weights: Dict,
|
||||
) -> Tuple[int, Dict, List[str]]:
|
||||
"""Score from recurrence detection stats."""
|
||||
w = weights.get("recurrence", {})
|
||||
high_rec = recurrence_data.get("high_recurrence", {})
|
||||
warn_rec = recurrence_data.get("warn_recurrence", {})
|
||||
|
||||
high_sigs_7d = len(high_rec.get("signatures", []))
|
||||
high_kinds_7d = len(high_rec.get("kinds", []))
|
||||
warn_sigs_7d = len(warn_rec.get("signatures", []))
|
||||
warn_kinds_7d = len(warn_rec.get("kinds", []))
|
||||
|
||||
# Note: 30d data comes from separate call; keep it optional
|
||||
high_sigs_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("signatures", []))
|
||||
high_kinds_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("kinds", []))
|
||||
|
||||
points = (
|
||||
high_sigs_7d * int(w.get("signature_high_7d", 20))
|
||||
+ warn_sigs_7d * int(w.get("signature_warn_7d", 10))
|
||||
+ high_kinds_7d * int(w.get("kind_high_7d", 15))
|
||||
+ warn_kinds_7d * int(w.get("kind_warn_7d", 8))
|
||||
+ high_sigs_30d * int(w.get("signature_high_30d", 10))
|
||||
+ high_kinds_30d * int(w.get("kind_high_30d", 8))
|
||||
)
|
||||
|
||||
component = {
|
||||
"high_signatures_7d": high_sigs_7d,
|
||||
"warn_signatures_7d": warn_sigs_7d,
|
||||
"high_kinds_7d": high_kinds_7d,
|
||||
"warn_kinds_7d": warn_kinds_7d,
|
||||
"high_signatures_30d": high_sigs_30d,
|
||||
"high_kinds_30d": high_kinds_30d,
|
||||
"points": points,
|
||||
}
|
||||
reasons = []
|
||||
if high_sigs_7d:
|
||||
reasons.append(f"High recurrence signatures (7d): {high_sigs_7d}")
|
||||
if high_kinds_7d:
|
||||
reasons.append(f"High recurrence kinds (7d): {high_kinds_7d}")
|
||||
if warn_sigs_7d:
|
||||
reasons.append(f"Warn recurrence signatures (7d): {warn_sigs_7d}")
|
||||
return points, component, reasons
|
||||
|
||||
|
||||
def _score_followups(
|
||||
followups_data: Dict,
|
||||
weights: Dict,
|
||||
) -> Tuple[int, Dict, List[str]]:
|
||||
"""Score overdue follow-ups by priority."""
|
||||
w = weights.get("followups", {})
|
||||
overdue = followups_data.get("overdue_followups", [])
|
||||
counts: Dict[str, int] = {"P0": 0, "P1": 0, "other": 0}
|
||||
points = 0
|
||||
|
||||
for fu in overdue:
|
||||
prio = fu.get("priority", "other")
|
||||
if prio == "P0":
|
||||
counts["P0"] += 1
|
||||
points += int(w.get("overdue_P0", 20))
|
||||
elif prio == "P1":
|
||||
counts["P1"] += 1
|
||||
points += int(w.get("overdue_P1", 12))
|
||||
else:
|
||||
counts["other"] += 1
|
||||
points += int(w.get("overdue_other", 6))
|
||||
|
||||
reasons = []
|
||||
if counts["P0"]:
|
||||
reasons.append(f"Overdue follow-ups (P0): {counts['P0']}")
|
||||
if counts["P1"]:
|
||||
reasons.append(f"Overdue follow-ups (P1): {counts['P1']}")
|
||||
if counts["other"]:
|
||||
reasons.append(f"Overdue follow-ups (other): {counts['other']}")
|
||||
|
||||
return points, {**counts, "points": points}, reasons
|
||||
|
||||
|
||||
def _score_slo(
|
||||
slo_data: Dict,
|
||||
weights: Dict,
|
||||
) -> Tuple[int, Dict, List[str]]:
|
||||
"""Score SLO violations."""
|
||||
w = weights.get("slo", {})
|
||||
violations = slo_data.get("violations", [])
|
||||
skipped = slo_data.get("skipped", False)
|
||||
|
||||
if skipped:
|
||||
return 0, {"violations": 0, "skipped": True, "points": 0}, []
|
||||
|
||||
count = len(violations)
|
||||
points = count * int(w.get("violation", 10))
|
||||
reasons = []
|
||||
if count:
|
||||
reasons.append(f"Active SLO violation(s) in window: {count}")
|
||||
return points, {"violations": count, "skipped": False, "points": points}, reasons
|
||||
|
||||
|
||||
def _score_alerts_loop(
|
||||
loop_slo: Dict,
|
||||
weights: Dict,
|
||||
) -> Tuple[int, Dict, List[str]]:
|
||||
"""Score alert-loop SLO violations (self-monitoring)."""
|
||||
w = weights.get("alerts_loop", {})
|
||||
violations = loop_slo.get("violations", [])
|
||||
count = len(violations)
|
||||
points = count * int(w.get("slo_violation", 10))
|
||||
reasons = []
|
||||
if count:
|
||||
reasons.append(f"Alert-loop SLO violation(s): {count}")
|
||||
return points, {"violations": count, "points": points}, reasons
|
||||
|
||||
|
||||
def _score_escalations(
|
||||
escalation_count: int,
|
||||
weights: Dict,
|
||||
) -> Tuple[int, Dict, List[str]]:
|
||||
"""Score escalations in last 24h."""
|
||||
esc_w = weights.get("escalation", {}).get("escalations_24h", {})
|
||||
warn_pts = int(esc_w.get("warn", 5))
|
||||
high_pts = int(esc_w.get("high", 12))
|
||||
|
||||
if escalation_count >= 3:
|
||||
points = high_pts
|
||||
elif escalation_count >= 1:
|
||||
points = warn_pts
|
||||
else:
|
||||
points = 0
|
||||
|
||||
reasons = []
|
||||
if escalation_count:
|
||||
reasons.append(f"Escalations in last 24h: {escalation_count}")
|
||||
|
||||
return points, {"count_24h": escalation_count, "points": points}, reasons
|
||||
|
||||
|
||||
# ─── Main scoring function ────────────────────────────────────────────────────
|
||||
|
||||
def compute_service_risk(
    service: str,
    env: str = "prod",
    *,
    open_incidents: Optional[List[Dict]] = None,
    recurrence_7d: Optional[Dict] = None,
    recurrence_30d: Optional[Dict] = None,
    followups_data: Optional[Dict] = None,
    slo_data: Optional[Dict] = None,
    alerts_loop_slo: Optional[Dict] = None,
    escalation_count_24h: int = 0,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute risk score for a service.

    Accepts pre-fetched data dicts (callers are responsible for fetching
    from stores/tools). All args default to empty/safe values so the engine
    never crashes due to missing data.

    Args:
        service: Service name the score is attributed to.
        env: Deployment environment label (default "prod").
        open_incidents: Open incident dicts, scored by _score_open_incidents.
        recurrence_7d / recurrence_30d: Recurrence stats; merged into one dict
            before scoring (30d buckets get "*_30d" suffixed keys).
        followups_data: Overdue follow-up info (see _score_followups).
        slo_data: SLO violation info (see _score_slo).
        alerts_loop_slo: Self-monitoring SLO info (see _score_alerts_loop).
        escalation_count_24h: Number of escalations in the last 24 hours.
        policy: Risk policy dict; loaded via load_risk_policy() when None.

    Returns:
        RiskReport dict: score, band, per-service thresholds, per-component
        breakdown, reasons, deterministic recommendations, and a naive-UTC
        `updated_at` timestamp.
    """
    if policy is None:
        policy = load_risk_policy()

    # The weight table drives every component scorer; fall back to built-ins.
    weights = policy.get("weights", _builtin_defaults()["weights"])

    # ── Compute each component ────────────────────────────────────────────────
    open_incs = open_incidents or []
    pts_inc, comp_inc, reasons_inc = _score_open_incidents(open_incs, weights)

    # Merge 7d + 30d recurrence into a single dict
    rec_merged = dict(recurrence_7d or {})
    if recurrence_30d:
        rec_merged["high_recurrence_30d"] = recurrence_30d.get("high_recurrence", {})
        rec_merged["warn_recurrence_30d"] = recurrence_30d.get("warn_recurrence", {})
    pts_rec, comp_rec, reasons_rec = _score_recurrence(rec_merged, weights)

    pts_fu, comp_fu, reasons_fu = _score_followups(followups_data or {}, weights)
    pts_slo, comp_slo, reasons_slo = _score_slo(slo_data or {}, weights)
    pts_loop, comp_loop, reasons_loop = _score_alerts_loop(alerts_loop_slo or {}, weights)
    pts_esc, comp_esc, reasons_esc = _score_escalations(escalation_count_24h, weights)

    # Total is additive across components; clamped at 0 defensively.
    total = max(0, pts_inc + pts_rec + pts_fu + pts_slo + pts_loop + pts_esc)
    band = score_to_band(total, policy)
    svc_thresholds = get_service_thresholds(service, policy)

    # Reason strings keep component order: incidents, recurrence, follow-ups,
    # SLO, alert-loop, escalations.
    all_reasons = reasons_inc + reasons_rec + reasons_fu + reasons_slo + reasons_loop + reasons_esc

    # Deterministic recommendations
    recs = _build_recommendations(band, comp_inc, comp_rec, comp_fu, comp_slo)

    return {
        "service": service,
        "env": env,
        "score": total,
        "band": band,
        "thresholds": svc_thresholds,
        "components": {
            "open_incidents": comp_inc,
            "recurrence": comp_rec,
            "followups": comp_fu,
            "slo": comp_slo,
            "alerts_loop": comp_loop,
            "escalations": comp_esc,
        },
        "reasons": all_reasons,
        "recommendations": recs,
        "updated_at": datetime.datetime.utcnow().isoformat(),
    }
|
||||
|
||||
|
||||
def _build_recommendations(
|
||||
band: str,
|
||||
comp_inc: Dict,
|
||||
comp_rec: Dict,
|
||||
comp_fu: Dict,
|
||||
comp_slo: Dict,
|
||||
) -> List[str]:
|
||||
recs = []
|
||||
if comp_inc.get("P0", 0) or comp_inc.get("P1", 0):
|
||||
recs.append("Prioritize open P0/P1 incidents before deploying.")
|
||||
if comp_rec.get("high_signatures_7d", 0) or comp_rec.get("high_kinds_7d", 0):
|
||||
recs.append("Investigate recurring failure patterns (high recurrence buckets).")
|
||||
if comp_fu.get("P0", 0) or comp_fu.get("P1", 0):
|
||||
recs.append("Prioritize follow-up closure for recurring bucket(s).")
|
||||
if comp_slo.get("violations", 0):
|
||||
recs.append("Avoid risky deploys until SLO violation clears.")
|
||||
if band in ("high", "critical"):
|
||||
recs.append("Service is high-risk — coordinate with oncall before release.")
|
||||
return recs[:6]
|
||||
|
||||
|
||||
# ─── Dashboard ────────────────────────────────────────────────────────────────
|
||||
|
||||
# ─── Trend computation ────────────────────────────────────────────────────────
|
||||
|
||||
def compute_trend(
    series: List,  # List[RiskSnapshot] — most-recent first
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute trend metrics from a list of RiskSnapshot objects (or dicts).

    Args:
        series: Snapshots ordered most-recent first. Each element is either a
            RiskSnapshot-like object (with .ts/.score) or a dict with
            "ts"/"score" keys.
        policy: Risk policy dict; loaded via load_risk_policy() when None.

    Returns:
        delta_24h, delta_7d, slope_per_day, volatility, regression{warn, fail}
        (metrics are None when not enough data is available).
    """
    if policy is None:
        policy = load_risk_policy()

    # Regression thresholds come from policy with conservative defaults.
    trend_cfg = policy.get("trend", {})
    reg = trend_cfg.get("regression_threshold", {})
    warn_24h = int(reg.get("delta_24h_warn", 10))
    fail_24h = int(reg.get("delta_24h_fail", 20))
    warn_7d = int(reg.get("delta_7d_warn", 15))
    fail_7d = int(reg.get("delta_7d_fail", 30))

    if not series:
        return _empty_trend()

    # Normalise: accept both RiskSnapshot dataclasses and plain dicts
    def _score(s) -> int:
        return int(s.score if hasattr(s, "score") else s["score"])

    def _ts(s) -> str:
        return s.ts if hasattr(s, "ts") else s["ts"]

    now = datetime.datetime.utcnow()
    latest_score = _score(series[0])

    # ── delta_24h ─────────────────────────────────────────────────────────────
    # Baseline = first snapshot at or before (now - 24h); None → no delta.
    cutoff_24h = (now - datetime.timedelta(hours=24)).isoformat()
    base_24h = _find_baseline(series, cutoff_24h, _ts)
    delta_24h = (latest_score - _score(base_24h)) if base_24h is not None else None

    # ── delta_7d ──────────────────────────────────────────────────────────────
    cutoff_7d = (now - datetime.timedelta(hours=168)).isoformat()
    base_7d = _find_baseline(series, cutoff_7d, _ts)
    delta_7d = (latest_score - _score(base_7d)) if base_7d is not None else None

    # ── slope (simple linear regression over all available points) ────────────
    slope_per_day: Optional[float] = None
    if len(series) >= 2:
        # xs = age in hours from oldest point
        pairs = [(now - _parse_ts(_ts(s))).total_seconds() / 3600.0 for s in series]
        hours_from_oldest = [max(pairs) - p for p in pairs]  # 0=oldest, max=newest
        scores = [_score(s) for s in series]
        slope_per_day = _linear_slope(hours_from_oldest, scores) * 24  # per day

    # ── volatility (stddev of daily last-score-per-day over 7d) ──────────────
    volatility: Optional[float] = None
    daily_scores = _daily_latest_scores(series, days=7, _ts_fn=_ts, _score_fn=_score)
    if len(daily_scores) >= 2:
        mean = sum(daily_scores) / len(daily_scores)
        # Population variance (divide by N, not N-1).
        variance = sum((x - mean) ** 2 for x in daily_scores) / len(daily_scores)
        volatility = round(math.sqrt(variance), 2)

    # ── regression flags ──────────────────────────────────────────────────────
    # Either window crossing its threshold trips the flag.
    reg_warn = (
        (delta_24h is not None and delta_24h >= warn_24h)
        or (delta_7d is not None and delta_7d >= warn_7d)
    )
    reg_fail = (
        (delta_24h is not None and delta_24h >= fail_24h)
        or (delta_7d is not None and delta_7d >= fail_7d)
    )

    return {
        "delta_24h": delta_24h,
        "delta_7d": delta_7d,
        "slope_per_day": round(slope_per_day, 2) if slope_per_day is not None else None,
        "volatility": volatility,
        "regression": {"warn": reg_warn, "fail": reg_fail},
    }
|
||||
|
||||
|
||||
def _empty_trend() -> Dict:
|
||||
return {
|
||||
"delta_24h": None, "delta_7d": None,
|
||||
"slope_per_day": None, "volatility": None,
|
||||
"regression": {"warn": False, "fail": False},
|
||||
}
|
||||
|
||||
|
||||
def _find_baseline(series, cutoff_iso: str, ts_fn):
|
||||
"""Return the first element whose ts <= cutoff (series is newest-first)."""
|
||||
for s in series:
|
||||
if ts_fn(s) <= cutoff_iso:
|
||||
return s
|
||||
return None
|
||||
|
||||
|
||||
def _parse_ts(ts_str: str) -> datetime.datetime:
|
||||
ts_str = ts_str.rstrip("Z")
|
||||
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
|
||||
try:
|
||||
return datetime.datetime.strptime(ts_str, fmt)
|
||||
except ValueError:
|
||||
continue
|
||||
return datetime.datetime.utcnow()
|
||||
|
||||
|
||||
def _linear_slope(xs: List[float], ys: List[float]) -> float:
|
||||
"""Simple least-squares slope (score per hour)."""
|
||||
n = len(xs)
|
||||
if n < 2:
|
||||
return 0.0
|
||||
x_mean = sum(xs) / n
|
||||
y_mean = sum(ys) / n
|
||||
num = sum((xs[i] - x_mean) * (ys[i] - y_mean) for i in range(n))
|
||||
den = sum((xs[i] - x_mean) ** 2 for i in range(n))
|
||||
return num / den if den != 0 else 0.0
|
||||
|
||||
|
||||
def _daily_latest_scores(series, days: int, _ts_fn, _score_fn) -> List[float]:
|
||||
"""Collect the latest score for each calendar day over last `days` days."""
|
||||
now = datetime.datetime.utcnow()
|
||||
day_scores: Dict[str, int] = {}
|
||||
cutoff = (now - datetime.timedelta(days=days)).isoformat()
|
||||
for s in series:
|
||||
ts = _ts_fn(s)
|
||||
if ts < cutoff:
|
||||
break
|
||||
day_key = ts[:10] # YYYY-MM-DD
|
||||
if day_key not in day_scores: # series is newest-first, so first = latest
|
||||
day_scores[day_key] = _score_fn(s)
|
||||
return list(day_scores.values())
|
||||
|
||||
|
||||
def enrich_risk_report_with_trend(
    report: Dict,
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Mutates `report` in-place to add a `trend` key.
    Non-fatal: on any error, adds `trend: null`.

    Args:
        report: RiskReport dict (reads "service"; "env" defaults to "prod").
        history_store: RiskHistoryStore-like object providing get_series().
        policy: Risk policy dict; loaded via load_risk_policy() when None.

    Returns:
        The same `report` object, for chaining.
    """
    try:
        service = report.get("service", "")
        env = report.get("env", "prod")
        if policy is None:
            policy = load_risk_policy()

        # The fetch window doubles as the volatility window (default 7 days).
        trend_cfg = policy.get("trend", {})
        vol_hours = int(trend_cfg.get("volatility_window_hours", 168))
        series = history_store.get_series(service, env, hours=vol_hours, limit=500)
        report["trend"] = compute_trend(series, policy=policy)
    except Exception as e:
        # Trend is best-effort decoration; never fail the caller.
        logger.warning("enrich_risk_report_with_trend failed for %s: %s", report.get("service"), e)
        report["trend"] = None
    return report
|
||||
|
||||
|
||||
def enrich_risk_report_with_attribution(
    report: Dict,
    *,
    alert_store=None,
    incident_store=None,
    attr_policy: Optional[Dict] = None,
) -> Dict:
    """
    Mutates `report` in-place to add an `attribution` key.
    Non-fatal: on any error, adds `attribution: null`.
    LLM enrichment is applied if policy.llm_mode != 'off' and triggers met.

    Args:
        report: RiskReport dict (reads "service"; "env" defaults to "prod").
        alert_store / incident_store: Optional stores handed to signal fetching.
        attr_policy: Attribution policy dict; loaded via
            load_attribution_policy() when None.

    Returns:
        The same `report` object, for chaining.
    """
    try:
        # Imports kept local so any import failure is caught by this try
        # and degrades to attribution=None instead of breaking the module.
        from risk_attribution import (
            compute_attribution, fetch_signals_from_stores, load_attribution_policy,
        )
        from llm_enrichment import maybe_enrich_attribution

        if attr_policy is None:
            attr_policy = load_attribution_policy()

        service = report.get("service", "")
        env = report.get("env", "prod")

        # Fetch raw signals
        signals = fetch_signals_from_stores(
            service, env,
            window_hours=int((attr_policy.get("defaults") or {}).get("lookback_hours", 24)),
            alert_store=alert_store,
            incident_store=incident_store,
            policy=attr_policy,
        )

        attribution = compute_attribution(
            service, env,
            risk_report=report,
            **signals,
            policy=attr_policy,
        )

        # Optionally enrich with LLM (bounded, off by default)
        attribution["llm_enrichment"] = maybe_enrich_attribution(
            attribution, report, attr_policy
        )

        report["attribution"] = attribution
    except Exception as e:
        # Attribution is best-effort decoration; never fail the caller.
        logger.warning("enrich_risk_report_with_attribution failed for %s: %s",
                       report.get("service"), e)
        report["attribution"] = None
    return report
|
||||
|
||||
|
||||
# ─── Snapshot writer ──────────────────────────────────────────────────────────
|
||||
|
||||
def snapshot_all_services(
    env: str,
    compute_fn,  # Callable[[str, str], Dict] — returns RiskReport for (service, env)
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
    known_services: Optional[List[str]] = None,
) -> Dict:
    """
    Compute and persist a RiskSnapshot for every known service.

    `compute_fn(service, env)` must return a RiskReport dict.
    Returns {written, skipped, errors, services}.
    Non-fatal per service.

    Args:
        env: Environment to snapshot.
        compute_fn: Callable producing a RiskReport for (service, env).
        history_store: Store receiving snapshots via write_snapshot().
        policy: Risk policy dict; loaded via load_risk_policy() when None.
        known_services: Service names to snapshot; capped by
            policy["history"]["max_services_per_run"] (default 50).
    """
    if policy is None:
        policy = load_risk_policy()

    # Local import — presumably avoids a circular import at module load;
    # TODO confirm.
    from risk_history_store import RiskSnapshot

    # Cap per-run workload via policy.
    max_services = int(policy.get("history", {}).get("max_services_per_run", 50))
    services = (known_services or [])[:max_services]

    written = skipped = errors = 0
    snapped: List[str] = []

    for svc in services:
        try:
            report = compute_fn(svc, env)
            snap = RiskSnapshot(
                ts=datetime.datetime.utcnow().isoformat(),
                service=svc,
                env=env,
                score=int(report.get("score", 0)),
                band=report.get("band", "low"),
                components=report.get("components", {}),
                reasons=report.get("reasons", []),
            )
            history_store.write_snapshot([snap])
            written += 1
            snapped.append(svc)
        except Exception as e:
            # One failing service must not abort the whole sweep.
            logger.warning("snapshot_all_services: error for %s/%s: %s", svc, env, e)
            errors += 1

    # NOTE(review): `skipped` is initialized but never incremented in this
    # function — it is always reported as 0.
    return {
        "written": written,
        "skipped": skipped,
        "errors": errors,
        "services": snapped,
        "env": env,
        "ts": datetime.datetime.utcnow().isoformat(),
    }
|
||||
|
||||
|
||||
def compute_risk_dashboard(
    env: str = "prod",
    top_n: int = 10,
    *,
    service_reports: Optional[List[Dict]] = None,
    history_store=None,  # Optional[RiskHistoryStore] — if provided, enrich with trend
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Build risk dashboard from a list of pre-computed service reports.
    Sorts by score desc and returns summary.
    If history_store is provided, each report is enriched with trend data.

    Args:
        env: Environment label echoed into the result.
        top_n: Keep only the N highest-scoring reports.
        service_reports: Pre-computed RiskReport dicts (e.g. from
            compute_service_risk).
        history_store: Optional RiskHistoryStore for trend enrichment.
        policy: Risk policy dict; loaded via load_risk_policy() when None.

    Returns:
        Dashboard dict: band counts, critical P0 services, top regressions,
        improving services, and the (truncated) report list.
    """
    if policy is None:
        policy = load_risk_policy()

    # NOTE(review): truncation to top_n happens before any aggregation, so
    # band_counts, total_services and the regression lists below describe only
    # the top-N reports, not the full input — confirm this is intended.
    reports = sorted(
        service_reports or [],
        key=lambda r: -r.get("score", 0),
    )[:top_n]

    # Enrich with trend if history_store provided
    if history_store is not None:
        for r in reports:
            enrich_risk_report_with_trend(r, history_store, policy)

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1

    # P0 services in a high/critical band are surfaced separately.
    p0_services = set(policy.get("p0_services", []))
    critical_p0 = [r for r in reports if r["service"] in p0_services
                   and r["band"] in ("high", "critical")]

    # Top regressions (highest delta_24h, trend present)
    top_regressions = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_24h") is not None
         and r["trend"]["delta_24h"] > 0],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]

    # Improving services (most negative delta_7d)
    improving = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_7d") is not None
         and r["trend"]["delta_7d"] < 0],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]

    # Top regression summaries (with top-2 causes if attribution available)
    top_regression_summaries = []
    for r in top_regressions:
        entry: Dict = {
            "service": r["service"],
            "delta_24h": r["trend"]["delta_24h"],
        }
        attr = r.get("attribution")
        if attr and attr.get("causes"):
            entry["causes"] = attr["causes"][:2]
            entry["attribution_summary"] = attr.get("summary", "")
        top_regression_summaries.append(entry)

    now_iso = datetime.datetime.utcnow().isoformat()
    return {
        "env": env,
        "generated_at": now_iso,
        "history_updated_at": now_iso,
        "total_services": len(reports),  # NOTE(review): capped at top_n
        "band_counts": band_counts,
        "critical_p0_services": [r["service"] for r in critical_p0],
        "top_regressions": top_regression_summaries,
        "improving_services": [{"service": r["service"], "delta_7d": r["trend"]["delta_7d"]}
                               for r in improving],
        "services": reports,
    }
|
||||
409
services/router/risk_history_store.py
Normal file
409
services/router/risk_history_store.py
Normal file
@@ -0,0 +1,409 @@
|
||||
"""
|
||||
risk_history_store.py — Storage layer for Risk Score snapshots.
|
||||
|
||||
Provides:
|
||||
RiskSnapshot — dataclass for a single point-in-time risk record
|
||||
RiskHistoryStore — abstract base
|
||||
MemoryRiskHistoryStore — in-process (tests + fallback)
|
||||
NullRiskHistoryStore — no-op (disabled)
|
||||
PostgresRiskHistoryStore — Postgres primary (psycopg2 sync)
|
||||
AutoRiskHistoryStore — Postgres → Memory fallback
|
||||
|
||||
Factory: get_risk_history_store() → AutoRiskHistoryStore by default
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ─── Data model ───────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class RiskSnapshot:
    """A single point-in-time risk record for one (service, env) pair."""
    ts: str  # ISO-8601 UTC
    service: str  # service name
    env: str  # environment label, e.g. "prod"
    score: int  # numeric risk score
    band: str  # risk band label, e.g. "low"
    components: Dict = field(default_factory=dict)  # per-component breakdown
    reasons: List[str] = field(default_factory=list)  # human-readable reasons

    def to_dict(self) -> Dict:
        """Serialize to a plain (JSON-friendly) dict."""
        return asdict(self)

    @staticmethod
    def from_dict(d: Dict) -> "RiskSnapshot":
        """Rehydrate from a dict; optional keys fall back to safe defaults."""
        return RiskSnapshot(
            ts=d["ts"], service=d["service"], env=d.get("env", "prod"),
            score=int(d["score"]), band=d.get("band", "low"),
            components=d.get("components", {}),
            reasons=d.get("reasons", []),
        )
|
||||
|
||||
|
||||
# ─── Abstract base ────────────────────────────────────────────────────────────
|
||||
|
||||
class RiskHistoryStore(ABC):
    """Abstract storage interface for RiskSnapshot records."""

    @abstractmethod
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written."""

    @abstractmethod
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env."""

    @abstractmethod
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots in descending time order within last `hours` hours."""

    def get_delta(self, service: str, env: str, hours: int = 24) -> Optional[int]:
        """
        latest.score - closest-to-(now-hours) score.
        Returns None if no baseline is available.
        """
        # Fetch a window twice as wide so a baseline point at/before the
        # cutoff is likely to be present.
        series = self.get_series(service, env, hours=hours * 2, limit=500)
        if not series:
            return None
        latest = series[0]
        cutoff_ts = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        # Find snapshot closest to cutoff (first one before or at cutoff)
        baseline = None
        for snap in series:
            if snap.ts <= cutoff_ts:
                baseline = snap
                break
        if baseline is None:
            return None
        return latest.score - baseline.score

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Return latest snapshot for each service in env, sorted by score desc.

        Not abstract: concrete backends override this; the base raises.
        """
        raise NotImplementedError

    @abstractmethod
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete records older than retention_days; returns count deleted."""
|
||||
|
||||
|
||||
# ─── Memory backend (tests + fallback) ────────────────────────────────────────
|
||||
|
||||
class MemoryRiskHistoryStore(RiskHistoryStore):
    """In-process backend (tests + Postgres fallback).

    Snapshots are bucketed per (service, env) and kept newest-first.
    A single lock guards the map so the store may be shared across threads.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # key: (service, env) → list of RiskSnapshot sorted desc by ts
        self._data: Dict = defaultdict(list)

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        with self._lock:
            for snap in records:
                bucket = self._data[(snap.service, snap.env)]
                bucket.append(snap)
                # Re-sort after every insert to keep newest-first order.
                bucket.sort(key=lambda r: r.ts, reverse=True)
            return len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        with self._lock:
            bucket = self._data.get((service, env), [])
            return bucket[0] if bucket else None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        oldest_allowed = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            fresh = [
                s for s in self._data.get((service, env), [])
                if s.ts >= oldest_allowed
            ]
        return fresh[:limit]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        oldest_allowed = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            rows: List[Dict] = []
            for (svc, snap_env), snaps in self._data.items():
                if snap_env != env:
                    continue
                # Buckets are newest-first, so the first fresh snapshot is
                # that service's latest one.
                for snap in snaps:
                    if snap.ts >= oldest_allowed:
                        rows.append(snap.to_dict())
                        break
        return sorted(rows, key=lambda r: -r["score"])[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        oldest_allowed = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        removed = 0
        with self._lock:
            for key in list(self._data):
                kept = [s for s in self._data[key] if s.ts >= oldest_allowed]
                removed += len(self._data[key]) - len(kept)
                self._data[key] = kept
        return removed
|
||||
|
||||
|
||||
# ─── Null backend ──────────────────────────────────────────────────────────────
|
||||
|
||||
class NullRiskHistoryStore(RiskHistoryStore):
    """No-op: all writes discarded, all reads return empty.

    Used when history is explicitly disabled (RISK_HISTORY_BACKEND=null).
    """

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        # Nothing persisted; report zero writes.
        return 0

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        return None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        return []

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        # Fix: the base class raises NotImplementedError, which contradicts
        # "all reads return empty" for a disabled store — return no rows.
        return []

    def cleanup(self, retention_days: int = 90) -> int:
        return 0
|
||||
|
||||
|
||||
# ─── Postgres backend ──────────────────────────────────────────────────────────
|
||||
|
||||
class PostgresRiskHistoryStore(RiskHistoryStore):
    """
    Production Postgres backend (psycopg2 sync, per-thread connection).
    Schema created by ops/scripts/migrate_risk_history_postgres.py.
    """

    def __init__(self, dsn: str) -> None:
        # dsn: libpq connection string/URL; connections are created lazily.
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread

    def _conn(self):
        """Return this thread's connection, (re)connecting when absent/closed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True  # every statement commits immediately
            self._local.conn = conn
        return conn

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Upsert records into risk_history; returns how many succeeded."""
        if not records:
            return 0
        cur = self._conn().cursor()
        written = 0
        for rec in records:
            try:
                # Upsert keyed on (ts, service, env): re-running the same
                # snapshot overwrites rather than duplicating.
                cur.execute(
                    """INSERT INTO risk_history (ts, service, env, score, band, components, reasons)
                       VALUES (%s, %s, %s, %s, %s, %s, %s)
                       ON CONFLICT (ts, service, env) DO UPDATE
                       SET score=EXCLUDED.score, band=EXCLUDED.band,
                           components=EXCLUDED.components, reasons=EXCLUDED.reasons""",
                    (rec.ts, rec.service, rec.env, rec.score, rec.band,
                     json.dumps(rec.components), json.dumps(rec.reasons)),
                )
                written += 1
            except Exception as e:
                # Per-record failures are logged and skipped, never raised.
                logger.warning("risk_history write failed for %s/%s: %s", rec.service, rec.env, e)
        cur.close()
        return written

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
            "WHERE service=%s AND env=%s ORDER BY ts DESC LIMIT 1",
            (service, env),
        )
        row = cur.fetchone()
        cur.close()
        if not row:
            return None
        return self._row_to_snap(row)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots within the last `hours`, newest first, capped at `limit`."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        cur = self._conn().cursor()
        cur.execute(
            "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
            "WHERE service=%s AND env=%s AND ts >= %s ORDER BY ts DESC LIMIT %s",
            (service, env, cutoff, limit),
        )
        rows = cur.fetchall()
        cur.close()
        return [self._row_to_snap(r) for r in rows]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest snapshot per service in env within the window, score-desc."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        cur = self._conn().cursor()
        # Latest snapshot per service in env within window
        # (DISTINCT ON keeps the first row per service, i.e. the newest ts).
        cur.execute(
            """SELECT DISTINCT ON (service)
                   ts, service, env, score, band, components, reasons
               FROM risk_history
               WHERE env=%s AND ts >= %s
               ORDER BY service, ts DESC""",
            (env, cutoff),
        )
        rows = cur.fetchall()
        cur.close()
        snaps = [self._row_to_snap(r).to_dict() for r in rows]
        return sorted(snaps, key=lambda r: -r["score"])[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Delete rows older than retention_days; returns rows deleted."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        cur = self._conn().cursor()
        cur.execute("DELETE FROM risk_history WHERE ts < %s", (cutoff,))
        deleted = cur.rowcount
        cur.close()
        return deleted

    @staticmethod
    def _row_to_snap(row) -> RiskSnapshot:
        """Convert a DB row tuple into a RiskSnapshot, normalizing types."""
        ts, service, env, score, band, components, reasons = row
        if isinstance(ts, datetime.datetime):
            ts = ts.isoformat()
        # JSON columns may come back as text depending on the column type.
        if isinstance(components, str):
            components = json.loads(components)
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        return RiskSnapshot(
            ts=ts, service=service, env=env,
            score=int(score), band=band,
            components=components or {},
            reasons=reasons or [],
        )
|
||||
|
||||
|
||||
# ─── Auto backend ─────────────────────────────────────────────────────────────
|
||||
|
||||
class AutoRiskHistoryStore(RiskHistoryStore):
    """Postgres-primary store with a transparent in-memory fallback.

    Every read is attempted against Postgres first; on failure the in-process
    buffer answers instead. Writes always mirror into the memory buffer so
    fallback reads have data.
    """

    def __init__(self, pg_dsn: str) -> None:
        self._pg = PostgresRiskHistoryStore(pg_dsn)
        self._mem = MemoryRiskHistoryStore()
        self._pg_ok = True  # last-known Postgres health, used to dedupe logs

    def _try_pg(self, method: str, *args, **kwargs):
        """Call `method` on the Postgres store; return (ok, result)."""
        try:
            outcome = getattr(self._pg, method)(*args, **kwargs)
        except Exception as e:
            if self._pg_ok:
                # Log only on the healthy→broken transition, not every call.
                logger.warning("AutoRiskHistoryStore: Postgres unavailable (%s), using memory", e)
            self._pg_ok = False
            return False, None
        self._pg_ok = True
        return True, outcome

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        ok, written = self._try_pg("write_snapshot", records)
        # Always keep the in-memory buffer in sync for fallback reads.
        self._mem.write_snapshot(records)
        return written if ok else len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        ok, snap = self._try_pg("get_latest", service, env)
        return snap if ok else self._mem.get_latest(service, env)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        ok, series = self._try_pg("get_series", service, env, hours, limit)
        return series if ok else self._mem.get_series(service, env, hours, limit)

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        ok, rows = self._try_pg("dashboard_series", env, hours, top_n)
        return rows if ok else self._mem.dashboard_series(env, hours, top_n)

    def cleanup(self, retention_days: int = 90) -> int:
        ok, deleted = self._try_pg("cleanup", retention_days)
        self._mem.cleanup(retention_days)
        return deleted if ok else 0
|
||||
|
||||
|
||||
# ─── Singleton factory ────────────────────────────────────────────────────────
|
||||
|
||||
_store: Optional[RiskHistoryStore] = None
|
||||
_store_lock = threading.Lock()
|
||||
|
||||
|
||||
def get_risk_history_store() -> RiskHistoryStore:
    """Return the process-wide store, creating it lazily on first use.

    Uses double-checked locking so the common (already-created) path avoids
    taking the lock.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
|
||||
|
||||
|
||||
def set_risk_history_store(store: Optional[RiskHistoryStore]) -> None:
    """Override the process-wide store (or reset with None) — test hook."""
    global _store
    with _store_lock:
        _store = store
|
||||
|
||||
|
||||
def _create_store() -> RiskHistoryStore:
    """Build a store from environment config.

    RISK_HISTORY_BACKEND selects the backend ("memory" / "null" / "postgres" /
    "auto", default "auto"); the DSN comes from RISK_DATABASE_URL, then
    DATABASE_URL. Unknown backend values behave like "auto".
    """
    backend = os.getenv("RISK_HISTORY_BACKEND", "auto").lower()
    dsn = os.getenv("RISK_DATABASE_URL") or os.getenv("DATABASE_URL") or ""

    if backend == "memory":
        logger.info("RiskHistoryStore: in-memory")
        return MemoryRiskHistoryStore()

    if backend == "null":
        logger.info("RiskHistoryStore: null (disabled)")
        return NullRiskHistoryStore()

    if backend == "postgres":
        if not dsn:
            logger.warning("RISK_HISTORY_BACKEND=postgres but no DATABASE_URL; falling back to memory")
            return MemoryRiskHistoryStore()
        # Only a truncated DSN prefix is logged (it may contain credentials).
        logger.info("RiskHistoryStore: postgres dsn=%s…", dsn[:30])
        return PostgresRiskHistoryStore(dsn)

    # Default: auto — Postgres with memory fallback when a DSN is configured.
    if not dsn:
        logger.info("RiskHistoryStore: auto — no DATABASE_URL, using memory")
        return MemoryRiskHistoryStore()
    logger.info("RiskHistoryStore: auto (postgres→memory fallback) dsn=%s…", dsn[:30])
    return AutoRiskHistoryStore(pg_dsn=dsn)
|
||||
# ══════════════════════════════════════════════════════════════════════════════
# New file in this commit: services/router/signature_state_store.py (376 lines)
# ══════════════════════════════════════════════════════════════════════════════
|
||||
"""
|
||||
signature_state_store.py — Cooldown tracking per incident signature.
|
||||
|
||||
Prevents triage from running too frequently for the same failure type.
|
||||
A "signature" is the same one computed by alert_routing.compute_incident_signature.
|
||||
|
||||
Backends:
|
||||
- MemorySignatureStateStore (tests / single-process)
|
||||
- PostgresSignatureStateStore (production)
|
||||
- AutoSignatureStateStore (Postgres → Memory fallback)
|
||||
|
||||
Table: incident_signature_state
|
||||
signature text PK, last_triage_at timestamptz, last_alert_at timestamptz,
|
||||
triage_count_24h int, updated_at timestamptz
|
||||
|
||||
DDL: ops/scripts/migrate_alerts_postgres.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_COOLDOWN_MINUTES = 15
|
||||
|
||||
|
||||
def _now_dt() -> datetime.datetime:
|
||||
return datetime.datetime.utcnow()
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.datetime.utcnow().isoformat()
|
||||
|
||||
|
||||
# ─── Abstract ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class SignatureStateStore(ABC):
    """Abstract interface for per-signature triage cooldown tracking.

    Implementations persist, per incident signature, the last alert/triage
    timestamps, a 24h triage counter, and a rolling 60-minute occurrence
    bucket. See MemorySignatureStateStore / PostgresSignatureStateStore.
    """

    @abstractmethod
    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True if cooldown has passed (triage may proceed)."""

    @abstractmethod
    def mark_alert_seen(self, signature: str) -> None:
        """Record that an alert with this signature was observed.
        Also updates occurrences_60m rolling bucket."""

    @abstractmethod
    def mark_triage_run(self, signature: str) -> None:
        """Record that triage was executed for this signature."""

    @abstractmethod
    def get_state(self, signature: str) -> Optional[Dict]:
        """Return raw state dict or None."""

    @abstractmethod
    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return signatures seen in last window_minutes, ordered by occurrences_60m desc."""
|
||||
|
||||
|
||||
# ─── Memory backend ────────────────────────────────────────────────────────────
|
||||
|
||||
class MemorySignatureStateStore(SignatureStateStore):
    """Thread-safe in-memory backend for tests and single-process deployments.

    State rows are plain dicts keyed by signature. Timestamps are naive-UTC
    ISO-8601 strings produced by _now_iso()/_now_dt(), so lexicographic string
    comparison matches chronological order for these same-format values.
    """

    BUCKET_MINUTES = 60  # rolling window for occurrences_60m

    def __init__(self):
        self._lock = threading.Lock()  # guards all access to _states
        self._states: Dict[str, Dict] = {}  # signature -> state row dict

    def _update_bucket(self, state: Dict, now: str) -> None:
        """Update the 60-min rolling occurrence bucket in-place."""
        bucket_start = state.get("occurrences_60m_bucket_start") or ""
        cutoff = (_now_dt() - datetime.timedelta(minutes=self.BUCKET_MINUTES)).isoformat()
        if bucket_start < cutoff:
            # Bucket expired (or never started): restart the window at 1.
            state["occurrences_60m"] = 1
            state["occurrences_60m_bucket_start"] = now
        else:
            state["occurrences_60m"] = state.get("occurrences_60m", 0) + 1

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True when no triage ran for this signature within the cooldown."""
        with self._lock:
            state = self._states.get(signature)
            if state is None:
                return True  # never seen -> triage allowed
            last_triage = state.get("last_triage_at")
            if not last_triage:
                return True  # seen but never triaged
            cutoff = (_now_dt() - datetime.timedelta(minutes=cooldown_minutes)).isoformat()
            return last_triage < cutoff

    def mark_alert_seen(self, signature: str) -> None:
        """Record an alert occurrence and advance the rolling 60m bucket."""
        now = _now_iso()
        with self._lock:
            if signature not in self._states:
                # First sighting: create a fresh row with one occurrence.
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": None,
                    "last_alert_at": now,
                    "triage_count_24h": 0,
                    "occurrences_60m": 1,
                    "occurrences_60m_bucket_start": now,
                    "updated_at": now,
                }
            else:
                s = self._states[signature]
                s["last_alert_at"] = now
                s["updated_at"] = now
                self._update_bucket(s, now)

    def mark_triage_run(self, signature: str) -> None:
        """Record a triage execution; the 24h counter restarts when stale."""
        now = _now_iso()
        cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        with self._lock:
            if signature not in self._states:
                self._states[signature] = {
                    "signature": signature,
                    "last_triage_at": now,
                    "last_alert_at": now,
                    "triage_count_24h": 1,
                    "occurrences_60m": 0,
                    "occurrences_60m_bucket_start": now,
                    "updated_at": now,
                }
            else:
                s = self._states[signature]
                prev = s.get("last_triage_at") or ""
                if prev < cutoff_24h:
                    # Previous triage fell outside the 24h window: restart count.
                    s["triage_count_24h"] = 1
                else:
                    s["triage_count_24h"] = s.get("triage_count_24h", 0) + 1
                s["last_triage_at"] = now
                s["updated_at"] = now

    def get_state(self, signature: str) -> Optional[Dict]:
        """Return a shallow copy of the state row, or None when unseen."""
        with self._lock:
            s = self._states.get(signature)
            return dict(s) if s else None

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return signatures alerted within the window, busiest (occurrences_60m) first."""
        cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        with self._lock:
            active = [
                dict(s) for s in self._states.values()
                if (s.get("last_alert_at") or "") >= cutoff
            ]
            return sorted(active, key=lambda x: x.get("occurrences_60m", 0), reverse=True)[:limit]
|
||||
|
||||
|
||||
# ─── Postgres backend ──────────────────────────────────────────────────────────
|
||||
|
||||
class PostgresSignatureStateStore(SignatureStateStore):
    """Production backend persisting state in table incident_signature_state.

    Uses one autocommitting psycopg2 connection per thread (threading.local)
    and upserts via INSERT ... ON CONFLICT. NOTE(review): naive-UTC ISO strings
    are compared against timestamptz columns in SQL — correctness relies on the
    server's timezone handling of the implicit cast; confirm the DB runs in UTC.
    """

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()  # per-thread connection cache

    def _conn(self):
        """Return this thread's connection, (re)connecting if absent/closed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore  # deferred: optional dependency
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True  # every statement commits immediately
            self._local.conn = conn
        return conn

    def should_run_triage(
        self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES
    ) -> bool:
        """Return True when the last triage is older than the cooldown (or absent)."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT last_triage_at FROM incident_signature_state WHERE signature=%s",
            (signature,),
        )
        row = cur.fetchone()
        cur.close()
        if not row or row[0] is None:
            return True
        cutoff = _now_dt() - datetime.timedelta(minutes=cooldown_minutes)
        last = row[0]
        # Normalize aware timestamps from the driver to naive for comparison
        # against the naive-UTC cutoff.
        if hasattr(last, "tzinfo") and last.tzinfo:
            last = last.replace(tzinfo=None)
        return last < cutoff

    def mark_alert_seen(self, signature: str) -> None:
        """Upsert an alert sighting; the 60m bucket resets when its start is stale."""
        now = _now_iso()
        cutoff_60m = (_now_dt() - datetime.timedelta(minutes=60)).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            """INSERT INTO incident_signature_state
            (signature, last_alert_at, triage_count_24h, updated_at,
            occurrences_60m, occurrences_60m_bucket_start)
            VALUES (%s, %s, 0, %s, 1, %s)
            ON CONFLICT (signature) DO UPDATE
            SET last_alert_at=EXCLUDED.last_alert_at,
            updated_at=EXCLUDED.updated_at,
            occurrences_60m = CASE
            WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
            OR incident_signature_state.occurrences_60m_bucket_start < %s
            THEN 1
            ELSE incident_signature_state.occurrences_60m + 1
            END,
            occurrences_60m_bucket_start = CASE
            WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL
            OR incident_signature_state.occurrences_60m_bucket_start < %s
            THEN EXCLUDED.occurrences_60m_bucket_start
            ELSE incident_signature_state.occurrences_60m_bucket_start
            END""",
            (signature, now, now, now, cutoff_60m, cutoff_60m),
        )
        cur.close()

    def mark_triage_run(self, signature: str) -> None:
        """Upsert a triage execution; the 24h counter restarts when stale."""
        now = _now_iso()
        cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            """INSERT INTO incident_signature_state
            (signature, last_triage_at, last_alert_at, triage_count_24h, updated_at,
            occurrences_60m, occurrences_60m_bucket_start)
            VALUES (%s, %s, %s, 1, %s, 0, %s)
            ON CONFLICT (signature) DO UPDATE
            SET last_triage_at=EXCLUDED.last_triage_at,
            triage_count_24h = CASE
            WHEN incident_signature_state.last_triage_at IS NULL
            OR incident_signature_state.last_triage_at < %s
            THEN 1
            ELSE incident_signature_state.triage_count_24h + 1
            END,
            updated_at=EXCLUDED.updated_at""",
            (signature, now, now, now, now, cutoff_24h),
        )
        cur.close()

    def get_state(self, signature: str) -> Optional[Dict]:
        """Fetch one state row as a dict with ISO-string timestamps, or None."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state WHERE signature=%s",
            (signature,),
        )
        row = cur.fetchone()
        cur.close()
        if not row:
            return None
        sig, lta, laa, cnt, upd, occ60, occ_start = row
        # hasattr(isoformat) guards against NULLs / non-datetime driver values.
        return {
            "signature": sig,
            "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta,
            "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa,
            "triage_count_24h": cnt,
            "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd,
            "occurrences_60m": occ60 or 0,
            "occurrences_60m_bucket_start": (
                occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start
            ),
        }

    def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]:
        """Return rows alerted within the window, ordered by occurrences_60m desc."""
        cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, "
            "occurrences_60m, occurrences_60m_bucket_start "
            "FROM incident_signature_state "
            "WHERE last_alert_at >= %s "
            "ORDER BY occurrences_60m DESC NULLS LAST LIMIT %s",
            (cutoff, limit),
        )
        rows = []
        for row in cur.fetchall():
            sig, lta, laa, cnt, upd, occ60, occ_start = row
            rows.append({
                "signature": sig,
                "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta,
                "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa,
                "triage_count_24h": cnt,
                "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd,
                "occurrences_60m": occ60 or 0,
                "occurrences_60m_bucket_start": (
                    occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start
                ),
            })
        cur.close()
        return rows
|
||||
|
||||
|
||||
# ─── Auto backend ──────────────────────────────────────────────────────────────
|
||||
|
||||
class AutoSignatureStateStore(SignatureStateStore):
    """Postgres-first store that degrades to in-memory after a failure.

    On any Postgres error the store switches to the memory backend and only
    retries the primary after _RECOVERY_S seconds have elapsed.
    """

    _RECOVERY_S = 300  # seconds to stay on the fallback before retrying pg

    def __init__(self, pg_dsn: str):
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresSignatureStateStore] = None
        self._fallback = MemorySignatureStateStore()
        self._using_fallback = False
        self._since: float = 0.0
        self._lock = threading.Lock()

    def _get_primary(self) -> PostgresSignatureStateStore:
        """Lazily build the Postgres store (double-checked under the lock)."""
        if self._primary is not None:
            return self._primary
        with self._lock:
            if self._primary is None:
                self._primary = PostgresSignatureStateStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self):
        """Leave fallback mode once the recovery window has elapsed."""
        if not self._using_fallback:
            return
        if time.monotonic() - self._since >= self._RECOVERY_S:
            self._using_fallback = False

    def _delegate(self, method: str, *args, **kwargs):
        """Invoke *method* on the primary, demoting to memory on any error."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                target = getattr(self._get_primary(), method)
                return target(*args, **kwargs)
            except Exception as exc:
                logger.warning("AutoSignatureStateStore Postgres failed: %s", exc)
                self._using_fallback = True
                self._since = time.monotonic()
        return getattr(self._fallback, method)(*args, **kwargs)

    def should_run_triage(self, signature, cooldown_minutes=DEFAULT_COOLDOWN_MINUTES):
        return self._delegate("should_run_triage", signature, cooldown_minutes)

    def mark_alert_seen(self, signature):
        self._delegate("mark_alert_seen", signature)

    def mark_triage_run(self, signature):
        self._delegate("mark_triage_run", signature)

    def get_state(self, signature):
        return self._delegate("get_state", signature)

    def list_active_signatures(self, window_minutes=60, limit=100):
        return self._delegate("list_active_signatures", window_minutes, limit)
|
||||
|
||||
|
||||
# ─── Singleton ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Lazily-created process-wide singleton; see get_signature_state_store().
_sig_store: Optional[SignatureStateStore] = None
# Guards double-checked initialization and test-time replacement.
_sig_lock = threading.Lock()
|
||||
|
||||
|
||||
def get_signature_state_store() -> SignatureStateStore:
    """Return the process-wide SignatureStateStore, building it on first use."""
    global _sig_store
    if _sig_store is not None:
        return _sig_store
    with _sig_lock:
        # Double-checked: another thread may have created it while we waited.
        if _sig_store is None:
            _sig_store = _create_sig_store()
    return _sig_store
|
||||
|
||||
|
||||
def set_signature_state_store(store: Optional[SignatureStateStore]) -> None:
    """Inject a replacement store (tests) or clear the singleton with None."""
    global _sig_store
    with _sig_lock:
        _sig_store = store
|
||||
|
||||
|
||||
def _create_sig_store() -> SignatureStateStore:
    """Select a backend from ALERT_BACKEND (default "memory") and env DSNs.

    NOTE(review): here DATABASE_URL takes precedence over ALERT_DATABASE_URL,
    the opposite of risk_history_store's specific-first precedence — confirm
    this asymmetry is intended.
    """
    backend = os.getenv("ALERT_BACKEND", "memory").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL", "")
    if dsn:
        if backend == "postgres":
            return PostgresSignatureStateStore(dsn)
        if backend == "auto":
            return AutoSignatureStateStore(dsn)
    # No DSN, or an unrecognized/"memory" backend: in-process store.
    return MemorySignatureStateStore()
|
||||
# ══════════════════════════════════════════════════════════════════════════════
# New file in this commit: services/router/sofiia_auto_router.py (767 lines)
# ══════════════════════════════════════════════════════════════════════════════
|
||||
"""Sofiia Smart Auto-Router — Cursor-style model selection for Sofiia agent.
|
||||
|
||||
Classifies incoming prompt by task type and selects the best available model,
|
||||
balancing capability, speed, cost, and provider availability.
|
||||
|
||||
Full model catalog includes:
|
||||
- Cloud: Anthropic Claude, xAI Grok, DeepSeek, Mistral AI, GLM-5 (Z.AI)
|
||||
- Local Ollama (NODA2/MacBook): qwen3.5:35b-a3b, qwen3:14b, glm-4.7-flash:32k,
|
||||
deepseek-r1:70b, deepseek-coder:33b, gemma3, mistral-nemo:12b,
|
||||
starcoder2:3b, phi3, llava:13b
|
||||
|
||||
Task taxonomy (inspired by Cursor Auto mode):
|
||||
code_gen, code_review, code_debug, code_refactor,
|
||||
architecture, devops, security, analysis, quick_answer, creative, reasoning,
|
||||
math_code, vision, chatbot
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Task taxonomy ──────────────────────────────────────────────────────────────
|
||||
# Each pattern group uses multi-word or context-aware patterns to reduce false
|
||||
# positives. Single common words (system, design, check, list, graph, tree) are
|
||||
# avoided unless paired with a qualifier.
|
||||
|
||||
TASK_PATTERNS: List[Tuple[str, List[str], float]] = [
|
||||
# (task_type, patterns, base_weight) — weight scales final score
|
||||
("code_gen", [
|
||||
r"\bнапиши\s+(функці|код|клас|скрипт|модуль|endpoint|api)",
|
||||
r"\bреалізуй\b", r"\bcreate\s+(function|class|module|endpoint|api|component)",
|
||||
r"\bimplement\b", r"\bgenerate\s+code\b", r"\bзгенеруй\s+код\b",
|
||||
r"\bфункці[юя]\s+для\b", r"\bклас\s+для\b", r"\bнапиши\s+код\b",
|
||||
r"\bwrite\s+a?\s*(function|class|module|script|endpoint)\b",
|
||||
r"\bcontroller\b", r"\bendpoint\s+(для|for)\b",
|
||||
], 1.0),
|
||||
("code_debug", [
|
||||
r"\bвиправ\b", r"\bбаг\b", r"\bпомилк[аи]\b", r"\btraceback\b",
|
||||
r"\bexception\b", r"\bfailed\b", r"\bcrash(es|ed)?\b", r"\bне\s+працю",
|
||||
r"\bдебаг\b", r"\bdebug\b", r"\bfix\s+(the\s+)?(bug|error|issue|crash)\b",
|
||||
r"\bsyntax\s*error\b", r"\btype\s*error\b", r"\battribute\s*error\b",
|
||||
r"\bruntime\s*error\b", r"\bvalue\s*error\b",
|
||||
], 1.0),
|
||||
("code_review", [
|
||||
r"\breview\s+(the\s+)?(code|pr|pull\s+request|diff)\b",
|
||||
r"\bаудит\s+(код|сервіс|систем)\b", r"\baudit\s+(code|service)\b",
|
||||
r"\bперевір\w*\s+(код|якість)\b", r"\bcode\s+quality\b",
|
||||
r"\bcode\s+review\b", r"\brev'ю\b",
|
||||
], 1.0),
|
||||
("code_refactor", [
|
||||
r"\bрефактор\b", r"\brefactor\b",
|
||||
r"\bоптимізу[йї]\s+(код|функці|клас)\b", r"\boptimize\s+(the\s+)?(code|function|class)\b",
|
||||
r"\bclean\s+up\s+(the\s+)?code\b", r"\bpolish\s+(the\s+)?code\b",
|
||||
r"\bspeed\s+up\b", r"\bimprove\s+(the\s+)?code\b",
|
||||
], 1.0),
|
||||
("architecture", [
|
||||
r"\bархітектур\w+\b", r"\barchitecture\b",
|
||||
r"\bспроєктуй\b", r"\bsystem\s+design\b",
|
||||
r"\bmicroservice\s+(architect|design|pattern)\b",
|
||||
r"\bdatabase\s+design\b", r"\bapi\s+design\b",
|
||||
r"\bscalab(le|ility)\b", r"\bscaling\s+strateg\b",
|
||||
r"\bdesign\s+pattern\b", r"\bsystem\s+structure\b",
|
||||
], 1.0),
|
||||
("devops", [
|
||||
r"\bdeploy\b", r"\bdocker\s*(file|compose|-compose|ize)?\b",
|
||||
r"\bkubernetes\b", r"\bk8s\b", r"\bci[\s/]cd\b",
|
||||
r"\bpipeline\b", r"\bnginx\b", r"\bcaddy\b",
|
||||
r"\bнода\d?\b", r"\bnoda\d?\b", r"\bcontainer\s+(start|stop|restart|build|image)\b",
|
||||
r"\bдеплой\b", r"\bssh\s+(to|into|root|connect)\b",
|
||||
r"\bhelm\b", r"\bterraform\b", r"\binfrastructure\b",
|
||||
r"\bdocker\s+compose\s+up\b",
|
||||
], 1.0),
|
||||
("security", [
|
||||
r"\bvulnerability\b", r"\bCVE-\d+\b", r"\bsecurity\s+(audit|review|issue|scan)\b",
|
||||
r"\bauth(entication|orization)\b", r"\bencrypt(ion)?\b",
|
||||
r"\bRBAC\b", r"\bpermission\s+(model|system)\b",
|
||||
r"\bбезпек\w+\b", r"\bpentest\b", r"\b(sql|xss|csrf)\s*injection\b",
|
||||
r"\bthreat\s+model\b",
|
||||
], 1.0),
|
||||
("reasoning", [
|
||||
r"\bчому\s+\w+\b", r"\bwhy\s+(does|is|do|did|should|would)\b",
|
||||
r"\bpros\s+and\s+cons\b", r"\btrade[\s-]?off\b",
|
||||
r"\bпорівняй\b", r"\bcompare\s+\w+\s+(vs|and|with|to)\b",
|
||||
r"\bяк\s+краще\b", r"\bперевага\b", r"\bнедолік\b",
|
||||
r"\bdecision\s+(between|about)\b",
|
||||
r"\bversus\b", r"\b\w+\s+vs\s+\w+\b",
|
||||
], 1.0),
|
||||
("analysis", [
|
||||
r"\bпроаналізуй\b", r"\bаналіз\s+\w+\b",
|
||||
r"\banalyze\s+\w+\b", r"\binvestigate\b",
|
||||
r"\bexplain\s+(how|why|what)\b", r"\bsummariz(e|ation)\b",
|
||||
r"\bдослідж\b", r"\bпоясни\s+(як|чому|що)\b",
|
||||
r"\bhow\s+does\s+\w+\s+work\b",
|
||||
], 1.0),
|
||||
("creative", [
|
||||
r"\bнапиши\s+(текст|стат|пост|лист|опис)\b",
|
||||
r"\bwrite\s+a\s+(blog|article|post|email|description|letter)\b",
|
||||
r"\bdraft\s+(a\s+)?(doc|email|message|proposal)\b",
|
||||
r"\breadme\b", r"\bchangelog\b", r"\bdocumentation\b",
|
||||
], 1.0),
|
||||
("quick_answer", [
|
||||
r"\bщо\s+таке\b", r"\bwhat\s+is\s+(a|an|the)?\b",
|
||||
r"\bhow\s+to\s+\w+\b", r"\bdefinition\s+of\b",
|
||||
r"\bшвидко\b", r"\bсинтаксис\s+\w+\b",
|
||||
r"\bgive\s+me\s+an?\s+example\b", r"\bexample\s+of\b",
|
||||
], 0.9),
|
||||
("vision", [
|
||||
r"\bзображен\w+\b", r"\bфото\b", r"\bimage\s+(analysis|recognition|detect)\b",
|
||||
r"\bскріншот\b", r"\bscreenshot\b",
|
||||
r"\bвізуальн\w+\s+аналіз\b", r"\bвідео\s+(аналіз|розпізна)\b",
|
||||
], 1.0),
|
||||
("math_code", [
|
||||
r"\bалгоритм\s+\w+\b", r"\balgorithm\s+(for|to)\b",
|
||||
r"\bсортуван\w+\b", r"\bsort(ing)?\s+algorithm\b",
|
||||
r"\bdynamic\s+programming\b", r"\bgraph\s+(algorithm|traversal|search)\b",
|
||||
r"\bmatrix\s+(mult|inver|decomp)\b",
|
||||
r"\bcalculate\s+\w+\b", r"\bcompute\s+\w+\b",
|
||||
r"\bformula\s+(for|to)\b", r"\bДейкстр\b", r"\bDijkstra\b",
|
||||
], 1.0),
|
||||
# Chatbot / conversational — greetings, small talk, acknowledgements
|
||||
("chatbot", [
|
||||
r"^(привіт|вітаю|добрий|доброго|hi|hello|hey)\b",
|
||||
r"^(дякую|спасибі|thank|thanks)\b",
|
||||
r"^(ок|добре|зрозумів|зрозуміло|so?|ok|yes|no|ні|так)\s*[,!.]?\s*$",
|
||||
r"\bяк\s+(справи|діла|ся маєш)\b", r"\bhow\s+are\s+you\b",
|
||||
], 0.8),
|
||||
]
|
||||
|
||||
# Pre-compile patterns once for performance
_COMPILED_PATTERNS: Optional[List[Tuple[str, List[re.Pattern], float]]] = None


def _get_compiled_patterns() -> List[Tuple[str, List[re.Pattern], float]]:
    """Compile TASK_PATTERNS lazily and memoize the result for the process."""
    global _COMPILED_PATTERNS
    if _COMPILED_PATTERNS is not None:
        return _COMPILED_PATTERNS
    compiled: List[Tuple[str, List[re.Pattern], float]] = []
    for task_type, patterns, weight in TASK_PATTERNS:
        regexes = [re.compile(pat, re.IGNORECASE) for pat in patterns]
        compiled.append((task_type, regexes, weight))
    _COMPILED_PATTERNS = compiled
    return _COMPILED_PATTERNS
|
||||
|
||||
|
||||
# ── Model catalog ──────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class ModelSpec:
    """One routable model: identity, routing strengths, and cost/speed traits."""

    profile_name: str  # unique key referenced by TASK_MODEL_PRIORITY
    provider: str  # provider id, e.g. "anthropic", "ollama", "grok"
    model_id: str  # provider-side model identifier (Ollama "name:tag" for local)
    api_key_env: str = ""  # env var holding the API key (cloud models only)
    strengths: List[str] = field(default_factory=list)  # task types it handles well
    cost_tier: int = 1  # 0=free(local), 1=cheap, 2=mid, 3=expensive
    speed_tier: int = 1  # 1=fast, 2=medium, 3=slow
    context_k: int = 8  # context window in thousands
    local: bool = False  # True for Ollama-hosted models
    max_tokens: int = 4096  # generation cap for the model
    vram_gb: float = 0.0  # approximate VRAM footprint of local models
    description: str = ""  # human-readable catalog note

    @property
    def available(self) -> bool:
        """Whether the model is callable right now.

        Local models: present in the cached Ollama tag list.
        Cloud models: the configured API-key env var is non-empty.
        """
        if self.local:
            return _is_ollama_model_available(self.model_id)
        return bool(os.getenv(self.api_key_env, "").strip())

    @property
    def has_credits(self) -> bool:
        """True unless ProviderBudget currently marks this provider exhausted."""
        return ProviderBudget.is_available(self.provider)
|
||||
|
||||
|
||||
# ── Ollama model availability cache ───────────────────────────────────────────
|
||||
|
||||
_ollama_available_models: Optional[List[str]] = None
|
||||
_ollama_cache_ts: float = 0.0
|
||||
_OLLAMA_CACHE_TTL = 60.0
|
||||
|
||||
|
||||
def _is_ollama_model_available(model_id: str) -> bool:
    """Check *model_id* against the cached Ollama tag list (60 s TTL).

    Matches either the full "name:tag" or just the base name before the colon.
    Returns False when the tag list could not be fetched at all.
    """
    global _ollama_available_models, _ollama_cache_ts
    cache_stale = (
        _ollama_available_models is None
        or (time.time() - _ollama_cache_ts) > _OLLAMA_CACHE_TTL
    )
    if cache_stale:
        _refresh_ollama_models_sync()
    if _ollama_available_models is None:
        return False
    wanted = model_id.lower()
    wanted_base = wanted.split(":")[0]
    return any(
        candidate == wanted or candidate.split(":")[0] == wanted_base
        for candidate in (name.lower() for name in _ollama_available_models)
    )
|
||||
|
||||
|
||||
def _refresh_ollama_models_sync() -> None:
    """Blocking refresh of the Ollama model-name cache (best-effort, 2 s timeout).

    On failure the cache is set to an empty list so lookups fail closed; the
    cache timestamp is updated either way to avoid hammering a dead endpoint.
    """
    global _ollama_available_models, _ollama_cache_ts
    import urllib.request
    import json as _json
    ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
    try:
        with urllib.request.urlopen(f"{ollama_url}/api/tags", timeout=2) as resp:
            payload = _json.loads(resp.read())
        names = [entry["name"] for entry in payload.get("models", [])]
    except Exception:
        names = []
    _ollama_available_models = names
    _ollama_cache_ts = time.time()
|
||||
|
||||
|
||||
async def refresh_ollama_models_async() -> List[str]:
    """Async refresh of the Ollama model cache; returns the resulting list.

    On any failure (httpx missing, endpoint down) the previous cache is kept,
    defaulting to an empty list, and the stale value is returned.
    """
    global _ollama_available_models, _ollama_cache_ts
    try:
        import httpx  # deferred import: optional dependency
        base_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        async with httpx.AsyncClient(timeout=2.0) as client:
            resp = await client.get(f"{base_url}/api/tags")
            payload = resp.json()
        _ollama_available_models = [entry["name"] for entry in payload.get("models", [])]
        _ollama_cache_ts = time.time()
    except Exception:
        _ollama_available_models = _ollama_available_models or []
    return _ollama_available_models
|
||||
|
||||
|
||||
# ── Full model catalog ─────────────────────────────────────────────────────────
|
||||
|
||||
SOFIIA_MODEL_CATALOG: List[ModelSpec] = [
|
||||
|
||||
# ── Anthropic Claude ─────────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="cloud_claude_sonnet",
|
||||
provider="anthropic", model_id="claude-sonnet-4-5",
|
||||
api_key_env="ANTHROPIC_API_KEY",
|
||||
strengths=["code_gen", "code_debug", "code_refactor", "architecture", "security", "reasoning"],
|
||||
cost_tier=2, speed_tier=2, context_k=200, max_tokens=8192,
|
||||
description="Claude Sonnet 4.5 — найкращий для коду та архітектури",
|
||||
),
|
||||
ModelSpec(
|
||||
profile_name="cloud_claude_haiku",
|
||||
provider="anthropic", model_id="claude-haiku-3-5",
|
||||
api_key_env="ANTHROPIC_API_KEY",
|
||||
strengths=["quick_answer", "code_review", "creative", "analysis", "chatbot"],
|
||||
cost_tier=1, speed_tier=1, context_k=200, max_tokens=4096,
|
||||
description="Claude Haiku 3.5 — швидкий та дешевий",
|
||||
),
|
||||
|
||||
# ── xAI Grok ─────────────────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="cloud_grok",
|
||||
provider="grok", model_id="grok-4-1-fast-reasoning",
|
||||
api_key_env="GROK_API_KEY",
|
||||
strengths=["reasoning", "architecture", "analysis", "code_gen"],
|
||||
cost_tier=2, speed_tier=1, context_k=2000, max_tokens=8192,
|
||||
description="Grok 4.1 Fast — 2M контекст, кращий для reasoning",
|
||||
),
|
||||
|
||||
# ── DeepSeek API ─────────────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="cloud_deepseek",
|
||||
provider="deepseek", model_id="deepseek-chat",
|
||||
api_key_env="DEEPSEEK_API_KEY",
|
||||
strengths=["code_gen", "code_debug", "code_refactor", "devops", "quick_answer"],
|
||||
cost_tier=1, speed_tier=2, context_k=64, max_tokens=4096,
|
||||
description="DeepSeek Chat — дешевий і добре знає код/devops",
|
||||
),
|
||||
|
||||
# ── GLM-5 / Z.AI (API) ───────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="cloud_glm5",
|
||||
provider="glm", model_id="glm-4-plus",
|
||||
api_key_env="GLM5_API_KEY",
|
||||
strengths=["quick_answer", "creative", "analysis", "code_gen", "chatbot"],
|
||||
cost_tier=1, speed_tier=1, context_k=128, max_tokens=4096,
|
||||
description="GLM-4 Plus (Z.AI) — швидкий, дешевий, гарно знає українську/CJK",
|
||||
),
|
||||
ModelSpec(
|
||||
profile_name="cloud_glm5_flash",
|
||||
provider="glm", model_id="glm-4-flash",
|
||||
api_key_env="GLM5_API_KEY",
|
||||
strengths=["quick_answer", "creative", "chatbot"],
|
||||
cost_tier=0, speed_tier=1, context_k=128, max_tokens=2048,
|
||||
description="GLM-4 Flash (Z.AI) — безкоштовний, найшвидший",
|
||||
),
|
||||
|
||||
# ── Mistral AI (API) ─────────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="cloud_mistral",
|
||||
provider="mistral", model_id="mistral-large-latest",
|
||||
api_key_env="MISTRAL_API_KEY",
|
||||
strengths=["analysis", "creative", "reasoning", "architecture"],
|
||||
cost_tier=2, speed_tier=2, context_k=128, max_tokens=4096,
|
||||
description="Mistral Large — добрий для аналізу та creative",
|
||||
),
|
||||
|
||||
# ── Local: qwen3.5:35b-a3b (FLAGSHIP) ────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_qwen35_35b",
|
||||
provider="ollama", model_id="qwen3.5:35b-a3b",
|
||||
strengths=["code_gen", "code_debug", "code_refactor", "reasoning", "architecture",
|
||||
"analysis", "devops", "security", "chatbot"],
|
||||
cost_tier=0, speed_tier=2, context_k=32, max_tokens=4096,
|
||||
local=True, vram_gb=24.0,
|
||||
description="Qwen3.5 35B MoE (NODA2) — флагман локально, якість ≈ cloud",
|
||||
),
|
||||
|
||||
# ── Local: qwen3:14b ─────────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_qwen3_14b",
|
||||
provider="ollama", model_id="qwen3:14b",
|
||||
strengths=["code_gen", "code_debug", "quick_answer", "devops", "analysis", "chatbot"],
|
||||
cost_tier=0, speed_tier=2, context_k=32, max_tokens=2048,
|
||||
local=True, vram_gb=10.0,
|
||||
description="Qwen3 14B (NODA2) — швидкий локальний загальний",
|
||||
),
|
||||
|
||||
# ── Local: glm-4.7-flash:32k ─────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_glm47_32k",
|
||||
provider="ollama", model_id="glm-4.7-flash:32k",
|
||||
strengths=["quick_answer", "creative", "analysis", "code_review", "chatbot"],
|
||||
cost_tier=0, speed_tier=2, context_k=32, max_tokens=2048,
|
||||
local=True, vram_gb=20.0,
|
||||
description="GLM-4.7 Flash 32K (NODA2) — локальний GLM, великий контекст",
|
||||
),
|
||||
|
||||
# ── Local: deepseek-r1:70b ────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_deepseek_r1_70b",
|
||||
provider="ollama", model_id="deepseek-r1:70b",
|
||||
strengths=["reasoning", "math_code", "architecture", "analysis"],
|
||||
cost_tier=0, speed_tier=3, context_k=64, max_tokens=4096,
|
||||
local=True, vram_gb=48.0,
|
||||
description="DeepSeek-R1 70B (NODA2) — локальний reasoning як o1",
|
||||
),
|
||||
|
||||
# ── Local: deepseek-coder:33b ─────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_deepseek_coder_33b",
|
||||
provider="ollama", model_id="deepseek-coder:33b",
|
||||
strengths=["code_gen", "code_debug", "code_refactor", "math_code"],
|
||||
cost_tier=0, speed_tier=2, context_k=16, max_tokens=2048,
|
||||
local=True, vram_gb=20.0,
|
||||
description="DeepSeek Coder 33B (NODA2) — спеціаліст по коду",
|
||||
),
|
||||
|
||||
# ── Local: gemma3:latest ──────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_gemma3",
|
||||
provider="ollama", model_id="gemma3:latest",
|
||||
strengths=["quick_answer", "analysis", "creative", "chatbot"],
|
||||
cost_tier=0, speed_tier=2, context_k=8, max_tokens=2048,
|
||||
local=True, vram_gb=8.0,
|
||||
description="Gemma3 (NODA2) — Google's ефективна модель",
|
||||
),
|
||||
|
||||
# ── Local: mistral-nemo:12b ───────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_mistral_nemo",
|
||||
provider="ollama", model_id="mistral-nemo:12b",
|
||||
strengths=["creative", "quick_answer", "analysis", "chatbot"],
|
||||
cost_tier=0, speed_tier=2, context_k=128, max_tokens=2048,
|
||||
local=True, vram_gb=8.0,
|
||||
description="Mistral Nemo 12B (NODA2) — 128K контекст локально",
|
||||
),
|
||||
|
||||
# ── Local: starcoder2:3b ──────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_starcoder2",
|
||||
provider="ollama", model_id="starcoder2:3b",
|
||||
strengths=["code_gen", "code_review"],
|
||||
cost_tier=0, speed_tier=1, context_k=16, max_tokens=2048,
|
||||
local=True, vram_gb=2.0,
|
||||
description="StarCoder2 3B (NODA2) — мікро-модель для code completion",
|
||||
),
|
||||
|
||||
# ── Local: phi3:latest ────────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_phi3",
|
||||
provider="ollama", model_id="phi3:latest",
|
||||
strengths=["quick_answer", "analysis", "chatbot"],
|
||||
cost_tier=0, speed_tier=1, context_k=128, max_tokens=2048,
|
||||
local=True, vram_gb=4.0,
|
||||
description="Phi-3 (NODA2) — Microsoft мала ефективна модель",
|
||||
),
|
||||
|
||||
# ── Local: llava:13b (vision) ─────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_llava_13b",
|
||||
provider="ollama", model_id="llava:13b",
|
||||
strengths=["vision"],
|
||||
cost_tier=0, speed_tier=2, context_k=4, max_tokens=2048,
|
||||
local=True, vram_gb=10.0,
|
||||
description="LLaVA 13B (NODA2) — vision модель для зображень",
|
||||
),
|
||||
|
||||
# ── Local: gpt-oss:latest ─────────────────────────────────────────────────
|
||||
ModelSpec(
|
||||
profile_name="local_gpt_oss",
|
||||
provider="ollama", model_id="gpt-oss:latest",
|
||||
strengths=["code_gen", "quick_answer"],
|
||||
cost_tier=0, speed_tier=2, context_k=8, max_tokens=2048,
|
||||
local=True, vram_gb=8.0,
|
||||
description="GPT-OSS (NODA2) — відкрита OSS GPT-like модель",
|
||||
),
|
||||
]
|
||||
|
||||
# ── Task → preferred model matrix ─────────────────────────────────────────────
|
||||
|
||||
TASK_MODEL_PRIORITY: Dict[str, List[str]] = {
|
||||
# Principle: local-first for tasks where local quality is sufficient.
|
||||
# Cloud only when the task genuinely needs it (complex code, deep reasoning,
|
||||
# very long context, security audits).
|
||||
#
|
||||
# qwen3.5:35b-a3b is the flagship local — MoE with cloud-level quality.
|
||||
# It should be preferred over cloud APIs for most routine tasks.
|
||||
|
||||
"code_gen": [
|
||||
"local_qwen35_35b", "cloud_claude_sonnet", "local_deepseek_coder_33b",
|
||||
"cloud_deepseek", "local_qwen3_14b", "cloud_grok",
|
||||
],
|
||||
"code_debug": [
|
||||
"local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
|
||||
"cloud_deepseek", "local_qwen3_14b",
|
||||
],
|
||||
"code_review": [
|
||||
"local_qwen35_35b", "cloud_claude_haiku", "local_deepseek_coder_33b",
|
||||
"cloud_claude_sonnet", "cloud_deepseek",
|
||||
],
|
||||
"code_refactor": [
|
||||
"local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
|
||||
"cloud_deepseek", "local_qwen3_14b",
|
||||
],
|
||||
"math_code": [
|
||||
"local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
|
||||
"cloud_claude_sonnet", "local_deepseek_coder_33b",
|
||||
],
|
||||
"architecture": [
|
||||
"local_qwen35_35b", "cloud_grok", "cloud_claude_sonnet",
|
||||
"local_deepseek_r1_70b", "cloud_mistral",
|
||||
],
|
||||
"devops": [
|
||||
"local_qwen35_35b", "local_qwen3_14b", "cloud_deepseek",
|
||||
"cloud_claude_sonnet", "local_glm47_32k",
|
||||
],
|
||||
"security": [
|
||||
"cloud_claude_sonnet", "local_qwen35_35b", "cloud_grok", "cloud_mistral",
|
||||
],
|
||||
"reasoning": [
|
||||
"local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
|
||||
"cloud_claude_sonnet", "cloud_mistral",
|
||||
],
|
||||
"analysis": [
|
||||
"local_qwen35_35b", "local_glm47_32k", "cloud_grok",
|
||||
"cloud_claude_haiku", "local_mistral_nemo", "cloud_mistral",
|
||||
],
|
||||
"creative": [
|
||||
"local_qwen35_35b", "local_mistral_nemo", "cloud_claude_haiku",
|
||||
"local_glm47_32k", "cloud_mistral",
|
||||
],
|
||||
"quick_answer": [
|
||||
"local_qwen3_14b", "local_qwen35_35b", "local_phi3",
|
||||
"local_gemma3", "cloud_deepseek", "cloud_glm5_flash",
|
||||
],
|
||||
"chatbot": [
|
||||
"local_qwen3_14b", "local_qwen35_35b", "local_gemma3",
|
||||
"local_phi3", "local_mistral_nemo",
|
||||
],
|
||||
"vision": [
|
||||
"local_llava_13b",
|
||||
],
|
||||
"unknown": [
|
||||
"local_qwen35_35b", "local_qwen3_14b", "cloud_claude_sonnet",
|
||||
"cloud_grok", "cloud_deepseek",
|
||||
],
|
||||
}
|
||||
|
||||
# ── Budget integration ─────────────────────────────────────────────────────────
|
||||
|
||||
class ProviderBudget:
    """In-memory budget gate: marks providers exhausted until TTL expires.

    State is held in class attributes, so every caller in the process shares
    one view of which providers are currently out of budget.
    """

    # provider name -> UNIX timestamp (time.time()) of when it was marked
    _exhausted: Dict[str, float] = {}
    # seconds an exhaustion mark stays in effect before auto-expiring
    _exhausted_ttl: int = 3600

    @classmethod
    def mark_exhausted(cls, provider: str) -> None:
        """Record *provider* as out of budget starting now."""
        cls._exhausted[provider] = time.time()
        logger.warning("💸 Provider %s marked as budget-exhausted", provider)

    @classmethod
    def is_available(cls, provider: str) -> bool:
        """Return True unless *provider* carries an unexpired exhaustion mark.

        An expired mark is dropped as a side effect, so the dict never grows
        beyond the set of recently exhausted providers.
        """
        marked_at = cls._exhausted.get(provider)
        if marked_at is None:
            return True
        expired = (time.time() - marked_at) > cls._exhausted_ttl
        if expired:
            cls._exhausted.pop(provider, None)
        return expired

    @classmethod
    def reset(cls, provider: str) -> None:
        """Drop any exhaustion mark for *provider* (no-op if absent)."""
        cls._exhausted.pop(provider, None)
|
||||
|
||||
|
||||
# ── Task classification ────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class ClassificationResult:
    """Result of prompt classification (produced by classify_task_detailed)."""
    task_type: str  # best-matching task category, e.g. "code_review"
    confidence: float  # heuristic confidence in task_type, 0..1
    all_scores: Dict[str, float]  # top raw weighted scores per task type
    ambiguous: bool = False  # True when the runner-up scored close to the winner
    runner_up: Optional[str] = None  # second-best task type, set only when ambiguous
|
||||
|
||||
|
||||
def classify_task(prompt: str, context_len: int = 0) -> Tuple[str, float]:
    """Thin wrapper over classify_task_detailed().

    Returns only the (task_type, confidence) pair for callers that do not
    need ambiguity info or the full score table.
    """
    detailed = classify_task_detailed(prompt, context_len)
    return (detailed.task_type, detailed.confidence)
|
||||
|
||||
|
||||
def classify_task_detailed(prompt: str, context_len: int = 0) -> ClassificationResult:
    """Detailed classification with ambiguity detection and all scores."""
    if not prompt or not prompt.strip():
        # Nothing to classify: treat as small talk with middling confidence.
        return ClassificationResult("chatbot", 0.5, {}, ambiguous=False)

    stripped = prompt.strip()

    # Weighted hit-rate per task type: (matching patterns / total patterns) * weight.
    weighted: Dict[str, float] = {}
    for name, patterns, weight in _get_compiled_patterns():
        matched = sum(1 for pat in patterns if pat.search(stripped))
        if matched > 0:
            weighted[name] = (matched / len(patterns)) * weight

    if not weighted:
        return ClassificationResult("unknown", 0.3, {}, ambiguous=False)

    ranked = sorted(weighted.items(), key=lambda kv: kv[1], reverse=True)
    winner, winner_score = ranked[0]
    confidence = min(winner_score * 10, 1.0)

    # Very short prompts carry fewer signals, so scale confidence down.
    n_words = len(stripped.split())
    if n_words <= 3:
        confidence *= 0.6
    elif n_words <= 8:
        confidence *= 0.85

    # Ambiguous when the runner-up is within 30% of the winner's score.
    is_ambiguous = False
    second_place: Optional[str] = None
    if len(ranked) > 1:
        second_name, second_score = ranked[1]
        if second_score > 0 and second_score / winner_score > 0.7:
            is_ambiguous = True
            second_place = second_name

    # Long conversations: floor the confidence (model scoring elsewhere
    # handles the large-context preference).
    if context_len > 50:
        confidence = max(confidence, 0.5)

    return ClassificationResult(
        task_type=winner,
        confidence=round(confidence, 3),
        all_scores={name: round(score, 4) for name, score in ranked[:5]},
        ambiguous=is_ambiguous,
        runner_up=second_place,
    )
|
||||
|
||||
|
||||
def _prompt_complexity(prompt: str) -> str:
|
||||
"""Estimate prompt complexity: simple | medium | complex"""
|
||||
words = len(prompt.split())
|
||||
lines = prompt.count("\n")
|
||||
code_blocks = prompt.count("```")
|
||||
if words < 20 and lines < 3 and code_blocks == 0:
|
||||
return "simple"
|
||||
if words > 200 or code_blocks >= 2 or lines > 20:
|
||||
return "complex"
|
||||
return "medium"
|
||||
|
||||
|
||||
# ── Main selection function ────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class AutoRouteResult:
    """Outcome of select_model_auto(): chosen model plus routing rationale."""
    profile_name: str  # catalog profile key of the selected model
    model_id: str  # concrete model identifier used with the provider API
    provider: str  # provider name of the selected model
    task_type: str  # classified task type that drove the choice
    confidence: float  # classification confidence, 0..1
    complexity: str  # "simple" | "medium" | "complex"
    reason: str  # human-readable, " | "-joined selection rationale
    fallback_used: bool = False  # True when a model outside the top-2 priorities was used
    all_candidates: List[str] = field(default_factory=list)  # best-first scored profiles (top 5)
    ambiguous: bool = False  # classification had a close runner-up
    runner_up: Optional[str] = None  # second-best task type, if ambiguous
    all_scores: Dict[str, float] = field(default_factory=dict)  # top classification scores
|
||||
|
||||
|
||||
def select_model_auto(
    prompt: str,
    force_fast: bool = False,
    force_capable: bool = False,
    prefer_local: bool = False,
    prefer_cheap: bool = False,
    budget_aware: bool = True,
    context_messages_len: int = 0,
) -> AutoRouteResult:
    """
    Cursor-style auto model selection for Sofiia.

    Logic:
    1. Classify task type from prompt (with ambiguity detection)
    2. Estimate complexity (simple/medium/complex)
    3. Apply modifiers (force_fast, force_capable, prefer_local, prefer_cheap)
    4. Score candidates from priority list factoring availability, budget, speed, cost
    5. For long conversations, prefer large-context models

    Args:
        prompt: user prompt to route.
        force_fast: retargets non-coding tasks to "quick_answer" and adds a
            speed-tier penalty in scoring.
        force_capable: rewards large-context models in scoring.
        prefer_local: restricts candidates to local models (when any exist)
            and penalizes cloud models.
        prefer_cheap: rewards low cost_tier models; with `complexity ==
            "simple"` it also retargets chat-like tasks to "quick_answer".
        budget_aware: heavily penalizes models whose provider credits are gone.
        context_messages_len: number of messages in the conversation so far.

    Returns:
        AutoRouteResult with the chosen profile and a human-readable reason.
    """
    classification = classify_task_detailed(prompt, context_messages_len)
    task_type = classification.task_type
    confidence = classification.confidence
    complexity = _prompt_complexity(prompt)

    effective_task = task_type

    # Modifier overrides: these only redirect which priority list is used,
    # never the reported task_type. Coding tasks are exempt from force_fast.
    if force_fast and task_type not in ("code_gen", "code_debug", "math_code"):
        effective_task = "quick_answer"
    if (prefer_cheap or complexity == "simple") and task_type in ("quick_answer", "creative", "chatbot"):
        effective_task = "quick_answer"

    priority_list = TASK_MODEL_PRIORITY.get(effective_task, TASK_MODEL_PRIORITY["unknown"])
    catalog_map = {m.profile_name: m for m in SOFIIA_MODEL_CATALOG}

    # Only profiles that actually exist in the catalog are candidates.
    candidates = [p for p in priority_list if p in catalog_map]
    if prefer_local:
        # Restrict to local models only when that leaves at least one option.
        local_cands = [p for p in candidates if catalog_map[p].local]
        if local_cands:
            candidates = local_cands

    def _score(profile_name: str) -> float:
        # Penalty-based scoring: LOWER score is better (sorted ascending below).
        spec = catalog_map[profile_name]
        score = 0.0

        # Hard penalties: unavailable / out-of-credits models sink to the bottom.
        if not spec.available:
            score += 1000
        if budget_aware and not spec.has_credits:
            score += 500

        # Priority-list position is the strongest signal
        try:
            pos = priority_list.index(profile_name)
            score += pos * 20
        except ValueError:
            score += 200

        if prefer_local and not spec.local:
            score += 200
        if force_fast:
            score += spec.speed_tier * 15
        if prefer_cheap or prefer_local:
            score -= spec.cost_tier * 20
        else:
            # Mild default bias toward cheaper models.
            score += spec.cost_tier * 2

        if force_capable:
            # Reward context size (context_k is in thousands of tokens).
            score -= spec.context_k / 100

        # Complex prompts need room: penalize small-context models.
        if complexity == "complex" and spec.context_k < 32:
            score += 40

        # Long conversation bonus for large-context models
        if context_messages_len > 30 and spec.context_k >= 128:
            score -= 15
        elif context_messages_len > 50 and spec.context_k < 32:
            score += 25

        return score

    # Ascending sort: scored[0] is the best (lowest-penalty) candidate.
    scored = sorted([c for c in candidates if c in catalog_map], key=_score)

    if not scored:
        # Last-resort chain when the priority list yields no catalog entries.
        for fallback in ["local_qwen35_35b", "local_qwen3_14b", "local_phi3"]:
            if fallback in catalog_map:
                scored = [fallback]
                break

    best = scored[0] if scored else "local_qwen3_14b"
    spec = catalog_map.get(best)
    # "Fallback" means the winner was not one of the top-2 priority picks.
    fallback_used = best not in priority_list[:2]

    # Assemble the human-readable rationale shown in debug/UI output.
    reasons: List[str] = [f"task={task_type} ({confidence:.0%})", f"complexity={complexity}"]
    if classification.ambiguous:
        reasons.append(f"ambiguous (runner_up={classification.runner_up})")
    if force_fast:
        reasons.append("force_fast")
    if prefer_local:
        reasons.append("prefer_local")
    if prefer_cheap:
        reasons.append("prefer_cheap")
    if force_capable:
        reasons.append("force_capable")
    if context_messages_len > 30:
        reasons.append(f"long_conversation({context_messages_len})")
    if fallback_used:
        reasons.append("fallback (top unavailable)")

    return AutoRouteResult(
        profile_name=best,
        model_id=spec.model_id if spec else best,
        provider=spec.provider if spec else "unknown",
        task_type=task_type,
        confidence=confidence,
        complexity=complexity,
        reason=" | ".join(reasons),
        fallback_used=fallback_used,
        all_candidates=scored[:5],
        ambiguous=classification.ambiguous,
        runner_up=classification.runner_up,
        all_scores=classification.all_scores,
    )
|
||||
|
||||
|
||||
def explain_selection(result: AutoRouteResult) -> str:
    """Render a human-readable, markdown-ish summary of a routing decision."""
    parts = []
    parts.append(f"Auto-selected **{result.model_id}** ({result.provider})")
    parts.append(
        f"Task: `{result.task_type}` | Complexity: `{result.complexity}` | "
        f"Confidence: {result.confidence:.0%}"
    )
    parts.append(f"Reason: {result.reason}")
    if result.ambiguous:
        parts.append(f"Ambiguous: runner-up was `{result.runner_up}`")
    if result.all_scores:
        leading = list(result.all_scores.items())[:3]
        score_text = ", ".join(f"{name}={val:.3f}" for name, val in leading)
        parts.append("Scores: " + score_text)
    return "\n".join(parts)
|
||||
|
||||
|
||||
def get_full_catalog() -> List[Dict[str, Any]]:
    """Return full model catalog with availability status for dashboard."""
    def _as_row(spec) -> Dict[str, Any]:
        # One dashboard row per catalog entry; plain JSON-friendly types only.
        return {
            "profile_name": spec.profile_name,
            "provider": spec.provider,
            "model_id": spec.model_id,
            "description": spec.description,
            "strengths": spec.strengths,
            "cost_tier": spec.cost_tier,
            "speed_tier": spec.speed_tier,
            "context_k": spec.context_k,
            "local": spec.local,
            "vram_gb": spec.vram_gb,
            "available": spec.available,
            "has_credits": spec.has_credits,
        }

    return [_as_row(spec) for spec in SOFIIA_MODEL_CATALOG]
|
||||
473
services/router/tool_governance.py
Normal file
473
services/router/tool_governance.py
Normal file
@@ -0,0 +1,473 @@
|
||||
"""
|
||||
Tool Governance: RBAC enforcement, Safety Middleware, Audit.
|
||||
|
||||
Applies to ALL /v1/tools/* dispatch.
|
||||
|
||||
Components:
|
||||
1. RBAC Matrix enforcement – deny without entitlement
|
||||
2. Tool Safety Middleware – limits, redaction, allowlist, audit
|
||||
3. Audit events – structured per-call events (no payload, only metadata)
|
||||
|
||||
Usage (in tool_manager.py execute_tool):
|
||||
from tool_governance import ToolGovernance
|
||||
|
||||
governance = ToolGovernance()
|
||||
|
||||
# Pre-call
|
||||
check = governance.pre_call(tool_name, action, agent_id, user_id, workspace_id, input_text)
|
||||
if not check.allowed:
|
||||
return ToolResult(success=False, error=check.reason)
|
||||
|
||||
# Execute actual tool handler ...
|
||||
result = await _actual_handler(args)
|
||||
|
||||
# Post-call
|
||||
governance.post_call(check.call_ctx, result)
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import ipaddress
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Config Paths ─────────────────────────────────────────────────────────────
|
||||
_CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
|
||||
_RBAC_PATH = _CONFIG_DIR / "rbac_tools_matrix.yml"
|
||||
_LIMITS_PATH = _CONFIG_DIR / "tool_limits.yml"
|
||||
_ALLOWLIST_PATH = _CONFIG_DIR / "network_allowlist.yml"
|
||||
|
||||
|
||||
# ─── Data Classes ─────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class CallContext:
    """Per-call metadata captured by pre_call() and consumed by post_call()."""
    req_id: str  # short random id correlating pre/post audit events
    tool: str  # tool name being dispatched
    action: str  # action within the tool
    agent_id: str  # calling agent
    user_id: str  # end user on whose behalf the call runs
    workspace_id: str  # workspace scope
    ts_start: float  # time.monotonic() at pre_call, used for duration_ms
    input_hash: str  # sha256 prefix of the input text (payload itself is never stored)
    input_chars: int  # input length in characters
    limits_applied: Dict[str, Any] = field(default_factory=dict)  # effective limits at call time
|
||||
|
||||
|
||||
@dataclass
class PreCallResult:
    """Outcome of ToolGovernance.pre_call()."""
    allowed: bool  # False => caller must abort and surface `reason`
    reason: str = ""  # denial explanation; empty when allowed
    call_ctx: Optional[CallContext] = None  # hand this to post_call() after execution
|
||||
|
||||
|
||||
@dataclass
class AuditEvent:
    """Structured audit record for one tool call (metadata only, no payload)."""
    ts: str  # ISO-8601 UTC timestamp
    req_id: str  # correlates with CallContext.req_id
    tool: str
    action: str
    workspace_id: str
    user_id: str
    agent_id: str
    status: str  # "pass" | "deny" | "error"
    duration_ms: float  # wall time of execution; 0 for pre-call denials
    limits_applied: Dict[str, Any]  # effective limits at call time
    input_hash: str  # sha256 prefix of input; content itself is never logged
    input_chars: int
    output_size_bytes: int  # serialized result size; content itself is never logged
|
||||
|
||||
|
||||
# ─── YAML Loader (lazy, cached) ───────────────────────────────────────────────
|
||||
|
||||
# Parsed-YAML cache keyed by path string; cleared via _reload_yaml_cache().
_yaml_cache: Dict[str, Any] = {}


def _load_yaml(path: Path) -> dict:
    """Load and cache a YAML config file.

    Returns {} on any failure (missing file, missing PyYAML, parse error);
    failures are logged as warnings, never raised, and the empty result is
    cached so the failing file is not re-read on every call.
    """
    key = str(path)
    if key not in _yaml_cache:
        try:
            import yaml  # local import: PyYAML stays optional at module import time
            # Explicit encoding: config files are UTF-8 regardless of locale.
            with open(path, "r", encoding="utf-8") as f:
                _yaml_cache[key] = yaml.safe_load(f) or {}
        except Exception as e:
            logger.warning("Could not load %s: %s", path, e)
            _yaml_cache[key] = {}
    return _yaml_cache[key]
|
||||
|
||||
|
||||
def _reload_yaml_cache():
    """Force reload all yaml caches (for tests / hot-reload).

    After this, the next _load_yaml() call re-reads each config file from disk.
    """
    _yaml_cache.clear()
|
||||
|
||||
|
||||
# ─── Secret Redaction ─────────────────────────────────────────────────────────
|
||||
|
||||
# Regexes matching "<label><sep><value>" secret assignments. Group 1 is the
# label (kept in the output), group 2 the secret value (replaced). Both are
# case-insensitive and applied per line across the whole text.
_SECRET_PATTERNS = [
    # API keys / tokens
    re.compile(
        r'(?i)(api[_-]?key|token|secret|password|passwd|pwd|auth|bearer|jwt|'
        r'oauth|private[_-]?key|sk-|ghp_|xoxb-|AKIA|client_secret)'
        r'[\s=:]+[\'"`]?([a-zA-Z0-9_\-\.]{8,})[\'"`]?',
        re.MULTILINE,
    ),
    # Generic high-entropy strings after known labels
    re.compile(
        r'(?i)(credential|access[_-]?key|refresh[_-]?token|signing[_-]?key)'
        r'[\s=:]+[\'"`]?([a-zA-Z0-9/+]{20,}={0,2})[\'"`]?',
        re.MULTILINE,
    ),
]


def _mask_secret(m: "re.Match") -> str:
    """Replacement callback: keep the matched label (group 1), drop the value."""
    return f"{m.group(1)}=***REDACTED***"


def redact(text: str) -> str:
    """Mask secret values in text. Always enabled by default.

    Empty/None input is returned unchanged. Otherwise every pattern in
    _SECRET_PATTERNS is applied in order, rewriting each matched label/value
    pair to ``label=***REDACTED***`` (the original separator is normalized
    to ``=``).
    """
    if not text:
        return text
    # The callback is defined once at module level (it was previously
    # re-created on every loop iteration).
    for pat in _SECRET_PATTERNS:
        text = pat.sub(_mask_secret, text)
    return text
|
||||
|
||||
|
||||
# ─── Network Allowlist Check ──────────────────────────────────────────────────
|
||||
|
||||
_PRIVATE_RANGES = [
|
||||
ipaddress.ip_network("10.0.0.0/8"),
|
||||
ipaddress.ip_network("172.16.0.0/12"),
|
||||
ipaddress.ip_network("192.168.0.0/16"),
|
||||
ipaddress.ip_network("127.0.0.0/8"),
|
||||
ipaddress.ip_network("169.254.0.0/16"),
|
||||
ipaddress.ip_network("::1/128"),
|
||||
ipaddress.ip_network("fc00::/7"),
|
||||
]
|
||||
|
||||
|
||||
def _is_private_ip(host: str) -> bool:
|
||||
try:
|
||||
addr = ipaddress.ip_address(host)
|
||||
return any(addr in net for net in _PRIVATE_RANGES)
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def check_url_allowed(tool: str, url: str) -> Tuple[bool, str]:
    """
    Check if a URL is allowed for a given tool per network_allowlist.yml.
    Returns (allowed, reason).
    """
    import urllib.parse

    parts = urllib.parse.urlparse(url)
    host = parts.hostname or ""
    scheme = parts.scheme or "https"

    tool_rules = _load_yaml(_ALLOWLIST_PATH).get(tool, {})
    if not tool_rules:
        # Safe default: a tool with no allowlist entry may reach nothing.
        return False, f"No allowlist config for tool '{tool}'"

    # Scheme gate: https-only unless the config says otherwise.
    if scheme not in tool_rules.get("schemes", ["https"]):
        return False, f"Scheme '{scheme}' not allowed for tool '{tool}'"

    # Wildcard mode: any public host, optionally still blocking private IPs.
    if tool_rules.get("allow_any_public"):
        if tool_rules.get("block_private_ranges") and _is_private_ip(host):
            return False, f"Private IP blocked: {host}"
        return True, ""

    # Otherwise the host must be explicitly listed.
    if host in tool_rules.get("hosts", []):
        return True, ""
    return False, f"Host '{host}' not in allowlist for tool '{tool}'"
|
||||
|
||||
|
||||
# ─── RBAC Matrix ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _get_agent_role(agent_id: str) -> str:
|
||||
"""Resolve agent role (delegates to agent_tools_config)."""
|
||||
try:
|
||||
from agent_tools_config import get_agent_role
|
||||
return get_agent_role(agent_id)
|
||||
except Exception:
|
||||
return "agent_default"
|
||||
|
||||
|
||||
def _get_role_entitlements(role: str) -> List[str]:
    """Look up the entitlement list for *role* in the RBAC matrix.

    Unknown roles fall back to the "agent_default" role's entitlements.
    """
    table = _load_yaml(_RBAC_PATH).get("role_entitlements", {})
    fallback = table.get("agent_default", [])
    return table.get(role, fallback)
|
||||
|
||||
|
||||
def _get_required_entitlements(tool: str, action: str) -> List[str]:
    """Entitlements required for tool+action per the RBAC matrix.

    Falls back to the tool's "_default" action entry; returns [] when
    neither the tool nor the action is configured.
    """
    matrix = _load_yaml(_RBAC_PATH)
    actions = matrix.get("tools", {}).get(tool, {}).get("actions", {})

    # Exact action first; an empty/missing entry falls through to "_default".
    action_cfg = actions.get(action) or actions.get("_default", {})
    if not action_cfg:
        return []
    return action_cfg.get("entitlements", [])
|
||||
|
||||
|
||||
def check_rbac(agent_id: str, tool: str, action: str) -> Tuple[bool, str]:
    """
    Check RBAC: agent role → entitlements → required entitlements for tool+action.
    Returns (allowed, reason).
    """
    role = _get_agent_role(agent_id)
    granted = set(_get_role_entitlements(role))
    required = _get_required_entitlements(tool, action)

    # No entitlements required for this tool+action → always allowed.
    if not required:
        return True, ""

    missing = [ent for ent in required if ent not in granted]
    if missing:
        return False, f"Missing entitlements: {missing} (agent={agent_id}, role={role})"
    return True, ""
|
||||
|
||||
|
||||
# ─── Limits ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _get_limits(tool: str) -> Dict[str, Any]:
    """Effective limits for *tool*: per-tool overrides merged over defaults."""
    cfg = _load_yaml(_LIMITS_PATH)
    # Hard-coded fallbacks used only when the config has no "defaults" section.
    fallback_defaults = {
        "timeout_ms": 30000,
        "max_chars_in": 200000,
        "max_bytes_out": 524288,
        "rate_limit_rpm": 60,
        "concurrency": 5,
    }
    base = cfg.get("defaults", fallback_defaults)
    overrides = cfg.get("tools", {}).get(tool, {})
    return {**base, **overrides}
|
||||
|
||||
|
||||
def check_input_limits(tool: str, input_text: str) -> Tuple[bool, str, Dict]:
    """
    Enforce max_chars_in limit.
    Returns (ok, reason, limits_applied).
    """
    limits = _get_limits(tool)
    ceiling = limits.get("max_chars_in", 200000)
    size = len(input_text) if input_text else 0

    if size > ceiling:
        return False, f"Input too large: {size} chars (max {ceiling} for {tool})", limits
    return True, "", limits
|
||||
|
||||
|
||||
# ─── Audit ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _emit_audit(event: AuditEvent):
    """
    Emit structured audit event.
    1. Writes to logger (structured, no payload).
    2. Persists to AuditStore (JSONL/Postgres/Memory) for FinOps analysis.

    Persistence is non-fatal: errors are logged as warnings without
    interrupting tool execution.
    """
    import datetime

    fallback_ts = datetime.datetime.now(datetime.timezone.utc).isoformat()
    record = {
        "ts": event.ts or fallback_ts,
        "req_id": event.req_id,
        "tool": event.tool,
        "action": event.action,
        "workspace_id": event.workspace_id,
        "user_id": event.user_id,
        "agent_id": event.agent_id,
        "status": event.status,
        "duration_ms": round(event.duration_ms, 2),
        "limits_applied": event.limits_applied,
        "input_hash": event.input_hash,
        "input_chars": event.input_chars,
        "output_size_bytes": event.output_size_bytes,
    }
    logger.info(f"TOOL_AUDIT {json.dumps(record)}")

    # Persist to audit store (non-fatal).
    try:
        from audit_store import get_audit_store
        get_audit_store().write(event)
    except Exception as persist_err:
        logger.warning("audit_store.write failed (non-fatal): %s", persist_err)
|
||||
|
||||
|
||||
# ─── Main Governance Class ────────────────────────────────────────────────────
|
||||
|
||||
class ToolGovernance:
    """
    Single entry point for tool governance.

    Call pre_call() before executing any tool.
    Call post_call() after execution to emit audit event.

    Each check (RBAC, limits, redaction, allowlist, audit) can be disabled
    individually via the keyword-only constructor flags, e.g. for tests.
    """

    def __init__(self, *, enable_rbac: bool = True, enable_redaction: bool = True,
                 enable_limits: bool = True, enable_audit: bool = True,
                 enable_allowlist: bool = True):
        self.enable_rbac = enable_rbac
        self.enable_redaction = enable_redaction
        self.enable_limits = enable_limits
        self.enable_audit = enable_audit
        self.enable_allowlist = enable_allowlist

    def pre_call(
        self,
        tool: str,
        action: str,
        agent_id: str,
        user_id: str = "unknown",
        workspace_id: str = "unknown",
        input_text: str = "",
    ) -> PreCallResult:
        """
        Run all pre-call checks. Returns PreCallResult.
        If allowed=False, caller must return error immediately.

        On success the returned call_ctx carries the request id, input hash
        and effective limits, and must be handed to post_call().
        Every denial is itself audited (with duration_ms=0) when auditing
        is enabled.
        """
        req_id = str(uuid.uuid4())[:12]
        # Monotonic clock: post_call() uses this for duration_ms.
        ts_start = time.monotonic()

        # 1. RBAC check
        if self.enable_rbac:
            ok, reason = check_rbac(agent_id, tool, action)
            if not ok:
                if self.enable_audit:
                    _emit_audit(AuditEvent(
                        ts=_now_iso(), req_id=req_id, tool=tool, action=action,
                        workspace_id=workspace_id, user_id=user_id, agent_id=agent_id,
                        status="deny", duration_ms=0,
                        limits_applied={}, input_hash="", input_chars=0, output_size_bytes=0,
                    ))
                return PreCallResult(allowed=False, reason=f"RBAC denied: {reason}")

        # 2. Input limits
        limits_applied = {}
        if self.enable_limits and input_text:
            ok, reason, limits_applied = check_input_limits(tool, input_text)
            if not ok:
                if self.enable_audit:
                    _emit_audit(AuditEvent(
                        ts=_now_iso(), req_id=req_id, tool=tool, action=action,
                        workspace_id=workspace_id, user_id=user_id, agent_id=agent_id,
                        status="deny", duration_ms=0,
                        limits_applied=limits_applied,
                        input_hash="", input_chars=len(input_text), output_size_bytes=0,
                    ))
                return PreCallResult(allowed=False, reason=f"Limits exceeded: {reason}")
        elif not limits_applied:
            # NOTE(review): this branch also runs when enable_limits=False,
            # so limits are still fetched for the audit record — confirm
            # that is intended.
            limits_applied = _get_limits(tool)

        # Build call context (only a hash of the input is retained, never
        # the payload itself).
        input_hash = hashlib.sha256(input_text.encode()).hexdigest()[:16] if input_text else ""
        ctx = CallContext(
            req_id=req_id,
            tool=tool,
            action=action,
            agent_id=agent_id,
            user_id=user_id,
            workspace_id=workspace_id,
            ts_start=ts_start,
            input_hash=input_hash,
            input_chars=len(input_text) if input_text else 0,
            limits_applied=limits_applied,
        )
        return PreCallResult(allowed=True, call_ctx=ctx)

    def post_call(self, ctx: CallContext, result_value: Any, error: Optional[str] = None):
        """
        Emit audit event after tool execution.
        result_value: raw result data (used only for size calculation, not logged).

        Pass a failure description via the `error` keyword to record status
        "error"; any truthy value flips the status, so keep this keyword-only
        at call sites.
        """
        if not self.enable_audit or ctx is None:
            return

        duration_ms = (time.monotonic() - ctx.ts_start) * 1000
        status = "error" if error else "pass"

        # Calculate output size (bytes) without logging content
        try:
            out_bytes = len(json.dumps(result_value).encode()) if result_value is not None else 0
        except Exception:
            # Non-JSON-serializable results are recorded as size 0.
            out_bytes = 0

        _emit_audit(AuditEvent(
            ts=_now_iso(),
            req_id=ctx.req_id,
            tool=ctx.tool,
            action=ctx.action,
            workspace_id=ctx.workspace_id,
            user_id=ctx.user_id,
            agent_id=ctx.agent_id,
            status=status,
            duration_ms=duration_ms,
            limits_applied=ctx.limits_applied,
            input_hash=ctx.input_hash,
            input_chars=ctx.input_chars,
            output_size_bytes=out_bytes,
        ))

    def apply_redaction(self, text: str) -> str:
        """Apply secret redaction if enabled; return text unchanged otherwise."""
        if not self.enable_redaction:
            return text
        return redact(text)

    def check_url(self, tool: str, url: str) -> Tuple[bool, str]:
        """Check URL against allowlist if enabled; (True, "") when disabled."""
        if not self.enable_allowlist:
            return True, ""
        return check_url_allowed(tool, url)

    def get_timeout_ms(self, tool: str) -> int:
        """Get configured timeout for a tool (tool_limits.yml, default 30s)."""
        limits = _get_limits(tool)
        return limits.get("timeout_ms", 30000)
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _now_iso() -> str:
|
||||
import datetime
|
||||
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
||||
|
||||
|
||||
# ─── Module-level singleton ───────────────────────────────────────────────────
|
||||
|
||||
_governance: Optional[ToolGovernance] = None
|
||||
|
||||
|
||||
def get_governance() -> ToolGovernance:
    """Get the shared ToolGovernance singleton.

    Created lazily on first use with all checks enabled; use
    reset_governance() to swap in a differently-configured instance.
    """
    global _governance
    if _governance is None:
        _governance = ToolGovernance()
    return _governance
|
||||
|
||||
|
||||
def reset_governance(instance: Optional[ToolGovernance] = None):
    """Reset singleton (for testing).

    Pass an instance to inject a pre-configured governance (e.g. with some
    checks disabled); pass None to force lazy re-creation on the next
    get_governance() call.
    """
    global _governance
    _governance = instance
|
||||
Reference in New Issue
Block a user