Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
412 lines · 18 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
verify_sofiia_stack.py — Sofiia stack parity verifier (NODA1 / NODA2).
|
|
DAARION.city | deterministic PASS/FAIL/WARN, no LLM.
|
|
|
|
Checks (per node):
|
|
- Router /healthz (or /health)
|
|
- /v1/tools/execute dry-run: risk_engine_tool.service, architecture_pressure_tool.service, backlog_tool.dashboard
|
|
- BFF /api/status/full → reachable, router+memory reachable, alerts backend != memory
|
|
- BFF /api/health → service=sofiia-console
|
|
- Cron: jobs present (via status/full or local file)
|
|
- Optional: supervisor health if SUPERVISOR_URL set
|
|
|
|
Parity (--compare-with):
|
|
- Compare BFF version between two nodes (WARN if different, not FAIL)
|
|
- Compare router/memory reachable on both
|
|
|
|
Usage:
|
|
python3 ops/scripts/verify_sofiia_stack.py
|
|
python3 ops/scripts/verify_sofiia_stack.py --node NODA2 --bff-url http://localhost:8002
|
|
python3 ops/scripts/verify_sofiia_stack.py \\
|
|
--node NODA2 --bff-url http://noda2:8002 \\
|
|
--compare-with http://noda1:8002
|
|
|
|
Exit: 0 if all critical checks PASS, 1 otherwise.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
import urllib.error
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# Repo root: this file lives at <repo>/ops/scripts/, so climb three levels.
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
# Local source of truth for the governance crontab entries.
CRON_FILE = REPO_ROOT / "ops" / "cron" / "jobs.cron"
# Seconds allowed for one /v1/tools/execute dry-run round-trip.
TOOLS_TIMEOUT = 25

# Governance cron jobs that must be present both in jobs.cron and in the
# node's reported cron.jobs_present (see check_status_full / check_cron_entries).
CRON_JOBS_EXPECTED = [
    "hourly_risk_snapshot",
    "daily_risk_digest",
    "risk_history_cleanup",
    "weekly_platform_priority_digest",
    "weekly_backlog_generate",
    "daily_backlog_cleanup",
]
|
|
|
|
# ── HTTP helpers ──────────────────────────────────────────────────────────────
|
|
|
|
def _get(url: str, timeout: int = 8) -> tuple[int, dict]:
|
|
try:
|
|
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
|
return resp.getcode(), json.loads(resp.read().decode())
|
|
except urllib.error.HTTPError as e:
|
|
try:
|
|
body = json.loads(e.read().decode())
|
|
except Exception:
|
|
body = {}
|
|
return e.code, body
|
|
except Exception:
|
|
return 0, {}
|
|
|
|
|
|
def _post_json(url: str, body: dict, api_key: str = "", timeout: int = 30) -> tuple[int, dict]:
|
|
try:
|
|
data = json.dumps(body).encode()
|
|
req = urllib.request.Request(url, data=data, method="POST",
|
|
headers={"Content-Type": "application/json"})
|
|
if api_key:
|
|
req.add_header("Authorization", f"Bearer {api_key}")
|
|
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
return resp.getcode(), json.loads(resp.read().decode())
|
|
except urllib.error.HTTPError as e:
|
|
try:
|
|
body = json.loads(e.read().decode())
|
|
except Exception:
|
|
body = {}
|
|
return e.code, body
|
|
except Exception:
|
|
return 0, {}
|
|
|
|
|
|
# ── Individual checks ─────────────────────────────────────────────────────────
|
|
|
|
def check_router_health(base_url: str) -> dict:
    """CRITICAL: the router must answer 200 on /healthz (or legacy /health)."""
    base = base_url.rstrip("/")
    for probe in ("/healthz", "/health"):
        status, _ = _get(f"{base}{probe}", timeout=5)
        if status == 200:
            return {
                "name": "router_health",
                "pass": True,
                "level": "critical",
                "detail": f"GET {probe} 200",
            }
    # Neither endpoint responded 200 — the router is considered down.
    return {
        "name": "router_health",
        "pass": False,
        "level": "critical",
        "detail": "router unreachable (no 200 from /healthz or /health)",
    }
|
|
|
|
|
|
def check_tool(base_url: str, tool: str, action: str, params: dict, api_key: str) -> dict:
    """CRITICAL: a tool dry-run must *reach* the router.

    A 400/422 means the router answered but rejected the params (tool not
    loaded / schema error) — that still counts as PASS for reachability.
    """
    endpoint = f"{base_url.rstrip('/')}/v1/tools/execute"
    # params may deliberately override the fixed keys, so merge them last.
    payload = {"tool": tool, "action": action, "agent_id": "sofiia"}
    payload.update(params)
    status, reply = _post_json(endpoint, payload, api_key=api_key, timeout=TOOLS_TIMEOUT)
    reached = status in (200, 400, 422)
    got_data = status == 200 and (
        reply.get("status") == "succeeded" or reply.get("data") is not None
    )
    detail = f"HTTP {status} status={reply.get('status', '—')}"
    if got_data:
        detail += " [data returned]"
    return {
        "name": f"tool_{tool}_{action}",
        "pass": reached,
        "level": "critical",
        "detail": detail,
    }
|
|
|
|
|
|
def check_bff_health(bff_url: str) -> dict:
    """CRITICAL: the BFF must identify itself as sofiia-console."""
    status, info = _get(f"{bff_url.rstrip('/')}/api/health", timeout=6)
    healthy = status == 200 and info.get("service") == "sofiia-console"
    if healthy:
        return {
            "name": "bff_health",
            "pass": True,
            "level": "critical",
            "detail": f"version={info.get('version')} env={info.get('env')} uptime={info.get('uptime_s')}s",
            # version/build are consumed by the parity comparison.
            "version": info.get("version", ""),
            "build": info.get("build", ""),
        }
    return {
        "name": "bff_health",
        "pass": False,
        "level": "critical",
        "detail": f"HTTP {status} — expected service=sofiia-console, got: {str(info)[:120]}",
        "version": "",
        "build": "",
    }
|
|
|
|
|
|
def check_status_full(bff_url: str, env: str = "dev") -> dict:
    """CRITICAL: /api/status/full must show router+memory reachable + alerts backend.

    Router/memory outages and a memory-backed alerts store (in prod/staging)
    are hard failures; cron gaps and dev-mode memory alerts only warn.
    """
    status, payload = _get(f"{bff_url.rstrip('/')}/api/status/full", timeout=12)
    if status != 200:
        return {
            "name": "bff_status_full",
            "pass": False,
            "level": "critical",
            "detail": f"HTTP {status} — /api/status/full unreachable",
            "data": {},
        }

    strict = env in ("prod", "staging")
    router_up = (payload.get("router") or {}).get("reachable", False)
    memory_up = (payload.get("memory") or {}).get("reachable", False)
    ollama_up = (payload.get("ollama") or {}).get("reachable", False)
    alerts_be = (payload.get("backends") or {}).get("alerts", "unknown")
    cron = payload.get("cron") or {}
    cron_installed = cron.get("installed", False)
    cron_jobs = cron.get("jobs_present", [])

    issues: list[str] = []
    warns: list[str] = []
    if not router_up:
        issues.append("router.reachable=false")
    if not memory_up:
        issues.append("memory.reachable=false")
    # An in-memory alerts store loses data on restart — forbidden outside dev.
    if alerts_be == "memory":
        if strict:
            issues.append(f"alerts backend=memory (must be postgres in {env})")
        else:
            warns.append("alerts backend=memory (ok in dev, not prod)")
    if cron_installed is False and strict:
        warns.append("cron.installed=false")
    missing_jobs = [j for j in CRON_JOBS_EXPECTED if j not in cron_jobs]
    if missing_jobs and strict:
        warns.append(f"cron missing jobs: {missing_jobs}")

    detail_parts = [
        f"router={'ok' if router_up else 'FAIL'}",
        f"memory={'ok' if memory_up else 'FAIL'}",
        f"ollama={'ok' if ollama_up else 'offline'}",
        f"alerts_be={alerts_be}",
        f"cron={cron_installed}",
    ]
    if issues:
        detail_parts.append(f"issues={issues}")
    if warns:
        detail_parts.append(f"warns={warns}")

    return {
        "name": "bff_status_full",
        "pass": not issues,
        "level": "critical",
        "detail": " | ".join(detail_parts),
        "warns": warns,
        "data": {
            "router_ok": router_up,
            "memory_ok": memory_up,
            "ollama_ok": ollama_up,
            "alerts_backend": alerts_be,
            "cron_installed": cron_installed,
            "cron_jobs_present": cron_jobs,
        },
    }
|
|
|
|
|
|
def check_alerts_backend_not_memory(bff_url: str, env: str) -> dict:
    """CRITICAL in prod/staging: alerts must not use the in-memory store."""
    status, payload = _get(f"{bff_url.rstrip('/')}/api/status/full", timeout=10)
    if status != 200:
        # Can't inspect the backend; don't double-fail on an unreachable BFF.
        return {"name": "alerts_backend", "pass": True, "level": "warn",
                "detail": "skipped (status/full unreachable)"}
    backend = (payload.get("backends") or {}).get("alerts", "unknown")
    if backend == "memory" and env in ("prod", "staging"):
        return {"name": "alerts_backend", "pass": False, "level": "critical",
                "detail": f"alerts backend=memory in {env} — must be postgres"}
    return {"name": "alerts_backend", "pass": True, "level": "critical",
            "detail": f"alerts backend={backend}"}
|
|
|
|
|
|
def check_cron_entries() -> dict:
    """WARN: the local cron file should contain every governance entry."""
    name = "cron_local_file"
    if not CRON_FILE.exists():
        return {"name": name, "pass": False, "level": "warn",
                "detail": f"not found: {CRON_FILE.relative_to(REPO_ROOT)}"}
    # Plain substring match is enough: job names are unique tokens.
    content = CRON_FILE.read_text(encoding="utf-8")
    missing = [job for job in CRON_JOBS_EXPECTED if job not in content]
    if missing:
        return {"name": name, "pass": False, "level": "warn",
                "detail": f"missing entries: {missing}"}
    return {"name": name, "pass": True, "level": "warn",
            "detail": "all governance entries present"}
|
|
|
|
|
|
def check_supervisor(supervisor_url: str) -> dict:
    """Optional: probe supervisor /health when SUPERVISOR_URL is configured."""
    if not supervisor_url:
        # Not configured — informational skip, never fails the run.
        return {"name": "supervisor_health", "pass": True, "level": "info",
                "detail": "skipped (no SUPERVISOR_URL)"}
    status, _ = _get(f"{supervisor_url.rstrip('/')}/health", timeout=5)
    if status:
        detail = f"GET /health → {status}"
    else:
        detail = "unreachable"
    return {"name": "supervisor_health", "pass": status == 200,
            "level": "warn", "detail": detail}
|
|
|
|
|
|
# ── Parity comparison ─────────────────────────────────────────────────────────
|
|
|
|
def compare_nodes(bff_a: str, bff_b: str, node_a: str = "A", node_b: str = "B") -> list[dict]:
    """Compare two BFF nodes. Returns list of parity check results."""

    def _fetch(url: str, path: str, timeout: int) -> dict:
        # Status code is ignored here: an unreachable node simply yields {}.
        _, body = _get(f"{url.rstrip('/')}{path}", timeout=timeout)
        return body

    results: list[dict] = []

    # Version parity — a mismatch is only a WARN (rolling deploys happen).
    ver_a = _fetch(bff_a, "/api/health", 6).get("version", "?")
    ver_b = _fetch(bff_b, "/api/health", 6).get("version", "?")
    same_version = ver_a == ver_b
    version_detail = f"{node_a}={ver_a} {node_b}={ver_b}"
    if not same_version:
        version_detail += " [MISMATCH — consider deploying same version]"
    results.append({
        "name": f"parity_version_{node_a}_vs_{node_b}",
        "pass": same_version,
        "level": "warn",
        "detail": version_detail,
    })

    full_a = _fetch(bff_a, "/api/status/full", 10)
    full_b = _fetch(bff_b, "/api/status/full", 10)

    # Reachability: fail if either node is missing the service; router is
    # critical, memory only warns.
    for key in ("router", "memory"):
        up_a = (full_a.get(key) or {}).get("reachable", False)
        up_b = (full_b.get(key) or {}).get("reachable", False)
        results.append({
            "name": f"parity_{key}_reachable_{node_a}_vs_{node_b}",
            "pass": up_a and up_b,
            "level": "critical" if key == "router" else "warn",
            "detail": f"{node_a}.{key}={'ok' if up_a else 'FAIL'} {node_b}.{key}={'ok' if up_b else 'FAIL'}",
        })

    # Alerts backend should be the same kind of store on both nodes.
    backend_a = (full_a.get("backends") or {}).get("alerts", "?")
    backend_b = (full_b.get("backends") or {}).get("alerts", "?")
    backend_detail = f"{node_a}.alerts={backend_a} {node_b}.alerts={backend_b}"
    if backend_a != backend_b:
        backend_detail += " [backends differ]"
    results.append({
        "name": f"parity_alerts_backend_{node_a}_vs_{node_b}",
        "pass": backend_a == backend_b,
        "level": "warn",
        "detail": backend_detail,
    })

    return results
|
|
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
def main() -> int:
    """Run all checks for one node (plus optional parity) and report.

    Returns 0 when every critical-level check passes, 1 otherwise.
    WARN-level failures are printed but never change the exit code.
    """
    ap = argparse.ArgumentParser(description="Verify Sofiia stack (NODA1/NODA2)")
    ap.add_argument("--node", default="NODA2", help="Node label (for display)")
    ap.add_argument("--router-url", default=os.getenv("ROUTER_URL", "http://localhost:8000"),
                    help="Router URL for this node")
    ap.add_argument("--bff-url", default=os.getenv("BFF_URL", "http://localhost:8002"),
                    help="sofiia-console BFF URL for this node")
    ap.add_argument("--compare-with", default=os.getenv("COMPARE_WITH_BFF", ""),
                    help="Second BFF URL for parity comparison (optional)")
    ap.add_argument("--compare-node", default="NODA1",
                    help="Label for the comparison node (default: NODA1)")
    ap.add_argument("--supervisor-url", default=os.getenv("SUPERVISOR_URL", ""))
    ap.add_argument("--api-key", default=os.getenv("SUPERVISOR_API_KEY", ""))
    ap.add_argument("--env", default=os.getenv("ENV", "dev"),
                    help="Environment (dev|staging|prod) — affects alert backend strictness")
    ap.add_argument("--json", dest="json_out", action="store_true", help="JSON output only")
    args = ap.parse_args()

    api_key = args.api_key.strip()
    env = args.env.strip().lower()
    results: list[dict] = []

    # ── Router checks ──────────────────────────────────────────────────────────
    results.append(check_router_health(args.router_url))
    # Dry-run the three governance tools; 400/422 still counts as "reached".
    results.append(check_tool(args.router_url, "risk_engine_tool", "service",
                              {"env": "prod", "service": "gateway"}, api_key))
    results.append(check_tool(args.router_url, "architecture_pressure_tool", "service",
                              {"env": "prod", "service": "gateway"}, api_key))
    results.append(check_tool(args.router_url, "backlog_tool", "dashboard",
                              {"env": "prod"}, api_key))

    # ── BFF checks ─────────────────────────────────────────────────────────────
    # NOTE(review): check_alerts_backend_not_memory is not invoked here;
    # check_status_full already covers the alerts-backend rule.
    results.append(check_bff_health(args.bff_url))
    results.append(check_status_full(args.bff_url, env=env))

    # ── Cron (local file) ──────────────────────────────────────────────────────
    results.append(check_cron_entries())

    # ── Supervisor (optional) ──────────────────────────────────────────────────
    results.append(check_supervisor(args.supervisor_url))

    # ── Parity (optional) ─────────────────────────────────────────────────────
    parity_results: list[dict] = []
    if args.compare_with:
        parity_results = compare_nodes(
            args.bff_url, args.compare_with,
            node_a=args.node, node_b=args.compare_node,
        )
        results.extend(parity_results)

    # ── Evaluate ───────────────────────────────────────────────────────────────
    # Only critical-level failures flip the exit code; warn-level are advisory.
    critical_fail = [r for r in results if not r["pass"] and r.get("level") == "critical"]
    warn_fail = [r for r in results if not r["pass"] and r.get("level") in ("warn",)]
    all_pass = len(critical_fail) == 0

    # Collect all inline warns from status_full (the "warns" list some checks attach)
    inline_warns: list[str] = []
    for r in results:
        if isinstance(r.get("warns"), list):
            inline_warns.extend(r["warns"])

    summary = {
        "node": args.node,
        "env": env,
        "bff_url": args.bff_url,
        "router_url": args.router_url,
        "pass": all_pass,
        "critical_failures": [r["name"] for r in critical_fail],
        "warnings": [r["name"] for r in warn_fail] + inline_warns,
        "checks": results,
        "parity_checks": parity_results,
        # Human-readable next steps, derived from the failure details above.
        "recommendations": (
            [] if all_pass else
            ["Fix critical failures listed above."] +
            ([f"alerts_backend must be postgres (not memory) in {env}"]
             if any("alerts backend=memory" in r.get("detail", "") for r in critical_fail) else []) +
            (["Ensure cron jobs are deployed on this node"] if any("cron" in r["name"] for r in warn_fail) else [])
        ),
    }

    if args.json_out:
        print(json.dumps(summary, indent=2))
    else:
        print(f"\n{'='*60}")
        print(f" Sofiia Stack Verifier — {args.node} ({env.upper()})")
        print(f" BFF: {args.bff_url}")
        print(f" Router: {args.router_url}")
        if args.compare_with:
            print(f" Parity: comparing with {args.compare_node} @ {args.compare_with}")
        print(f"{'='*60}\n")

        # Node-local checks are everything that isn't a parity result.
        all_checks = [r for r in results if r not in parity_results]
        if parity_results:
            # Header only needed when a parity section follows.
            print("Node checks:")
        for r in all_checks:
            icon = "✓" if r["pass"] else ("⚠" if r.get("level") == "warn" else "✗")
            lvl = f"[{r.get('level','?').upper():<8}]"
            print(f" {icon} {lvl} {r['name']:<45} {r.get('detail','')}")
            if r.get("warns"):
                for w in r["warns"]:
                    print(f" ⚠ {w}")

        if parity_results:
            print("\nParity checks:")
            for r in parity_results:
                icon = "✓" if r["pass"] else ("⚠" if r.get("level") == "warn" else "✗")
                lvl = f"[{r.get('level','?').upper():<8}]"
                print(f" {icon} {lvl} {r['name']:<55} {r.get('detail','')}")

        print()
        if all_pass:
            print(f" OVERALL: ✓ PASS (warnings: {len(summary['warnings'])})")
        else:
            print(f" OVERALL: ✗ FAIL")
            print(f" Critical failures: {summary['critical_failures']}")
        if summary["warnings"]:
            print(f" Warnings: {summary['warnings']}")
        if summary["recommendations"]:
            print(f"\n Recommendations:")
            for rec in summary["recommendations"]:
                print(f" → {rec}")
        print()

    return 0 if all_pass else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate the verifier's exit code (0 = all critical checks PASS).
    sys.exit(main())
|