docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions

182
ops/scripts/run_governance_job.py Executable file
View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
run_governance_job.py — Universal Governance Job Runner.
DAARION.city | used by cron to trigger scheduled governance jobs.
Usage:
python3 ops/scripts/run_governance_job.py \\
--tool risk_history_tool \\
--action snapshot \\
--params-json '{"env":"prod"}'
python3 ops/scripts/run_governance_job.py \\
--tool backlog_tool --action cleanup --params-json '{"env":"prod"}' \\
--router-url http://localhost:8000 \\
--agent-id scheduler
Exit codes:
0 — success (HTTP 200, result.success=true)
1 — HTTP error or tool returned success=false
2 — usage / configuration error
Environment variables (read from .env if present):
ROUTER_URL — base URL of the router service (default: http://localhost:8000)
SCHEDULER_API_KEY — optional Bearer token for router auth
GOVERNANCE_ENV — default env param passed in tool arguments (default: prod)
"""
from __future__ import annotations
import argparse
import datetime
import json
import logging
import os
import sys
import urllib.error
import urllib.request
from pathlib import Path
# ── Try loading .env from repo root ──────────────────────────────────────────
def _load_dotenv(path: Path) -> None:
if not path.exists():
return
with open(path) as f:
for line in f:
line = line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, _, value = line.partition("=")
key = key.strip()
value = value.strip().strip('"').strip("'")
if key and key not in os.environ: # don't override existing env vars
os.environ[key] = value
# Repo root is three levels up from this file (ops/scripts/<file> -> repo root).
# .env is loaded first; .env.local second, but since _load_dotenv never
# overrides keys already set, .env values (and real env vars) take precedence.
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
_load_dotenv(_REPO_ROOT / ".env")
_load_dotenv(_REPO_ROOT / ".env.local")
# ── Logging ───────────────────────────────────────────────────────────────────
# Timestamped, tagged log lines so cron-captured output is easy to grep
# per job run.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [run_governance_job] %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("run_governance_job")
# ── HTTP helper ───────────────────────────────────────────────────────────────
def _post_json(url: str, payload: dict, api_key: str = "", timeout: int = 60) -> dict:
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    A ``Bearer`` Authorization header is attached when *api_key* is
    non-empty. Both HTTP error statuses and connection failures are
    normalised into ``RuntimeError`` (with the original exception chained)
    so the caller has a single failure type to handle.
    """
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers=headers,
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as exc:
        # Include a truncated response body to make server-side errors debuggable.
        snippet = exc.read().decode(errors="replace")[:500]
        raise RuntimeError(f"HTTP {exc.code} from {url}: {snippet}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Cannot reach {url}: {exc.reason}") from exc
# ── Main ──────────────────────────────────────────────────────────────────────
def main() -> int:
    """Parse CLI arguments, POST the tool/action request to the router,
    and log the outcome.

    Returns:
        0 — success (HTTP 200 and result.success truthy) or dry-run.
        1 — HTTP/connection failure, or the tool reported success=false.
        2 — configuration error (invalid --params-json).
    """
    parser = argparse.ArgumentParser(
        description="Trigger a governance tool action via the DAARION router API."
    )
    parser.add_argument("--tool", required=True, help="Tool name (e.g. risk_history_tool)")
    parser.add_argument("--action", required=True, help="Action (e.g. snapshot)")
    parser.add_argument(
        "--params-json",
        default="{}",
        help='JSON dict of extra parameters (e.g. \'{"env":"prod"}\')',
    )
    parser.add_argument(
        "--router-url",
        default=os.environ.get("ROUTER_URL", "http://localhost:8000"),
        help="Router base URL (default: $ROUTER_URL or http://localhost:8000)",
    )
    parser.add_argument(
        "--agent-id",
        default="scheduler",
        help='Agent identity for audit trail (default: scheduler)',
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=90,
        help="HTTP timeout in seconds (default: 90)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print the request payload without sending it",
    )
    args = parser.parse_args()
    # Parse extra params. They are splatted into the payload below, so they
    # must be a JSON *object*; anything else would raise an uncaught
    # TypeError at `**extra_params` — treat both cases as config errors.
    try:
        extra_params = json.loads(args.params_json)
    except json.JSONDecodeError as e:
        logger.error("Invalid --params-json: %s", e)
        return 2
    if not isinstance(extra_params, dict):
        logger.error(
            "--params-json must be a JSON object, got %s",
            type(extra_params).__name__,
        )
        return 2
    api_key = os.environ.get("SCHEDULER_API_KEY", "")
    endpoint = f"{args.router_url.rstrip('/')}/v1/tools/execute"
    payload = {
        "tool": args.tool,
        "action": args.action,
        "agent_id": args.agent_id,
        **extra_params,
    }
    # Timezone-aware UTC timestamp for the log line; datetime.utcnow() is
    # deprecated since Python 3.12 and returns a naive datetime.
    ts = datetime.datetime.now(datetime.timezone.utc).isoformat()
    logger.info("Job: %s.%s params=%s ts=%s", args.tool, args.action, extra_params, ts)
    if args.dry_run:
        print("[dry-run] Would POST to:", endpoint)
        print("[dry-run] Payload:", json.dumps(payload, indent=2))
        return 0
    try:
        result = _post_json(endpoint, payload, api_key=api_key, timeout=args.timeout)
    except RuntimeError as e:
        logger.error("Request failed: %s", e)
        return 1
    # Normalise result — router returns {"success": bool, "result": ..., "error": ...}
    success = result.get("success", True)  # assume success if key absent
    error = result.get("error")
    res_data = result.get("result", result)
    if success:
        # Pretty-print a compact summary of well-known result fields only,
        # so cron logs stay short even for large tool responses.
        summary = {}
        if isinstance(res_data, dict):
            for key in ("created", "updated", "skipped", "deleted", "snapshot_id",
                        "services", "total", "week", "band", "score"):
                if key in res_data:
                    summary[key] = res_data[key]
        logger.info(
            "%s.%s → OK %s",
            args.tool, args.action,
            json.dumps(summary) if summary else "(done)",
        )
        return 0
    else:
        logger.error("%s.%s → FAIL error=%s", args.tool, args.action, error)
        return 1
if __name__ == "__main__":
    # Propagate main()'s exit code (0/1/2) to the shell so cron can detect failures.
    sys.exit(main())