Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
297 lines
8.9 KiB
Python
297 lines
8.9 KiB
Python
"""
control_runner — M3.1 + M3.2 + M3.3

Thin async HTTP client that calls the sofiia-console internal runbook API
on behalf of the Matrix bridge control channel.

All functions are stateless; callers supply the pre-built AsyncClient.
"""
|
|
from __future__ import annotations

import logging
from typing import Optional
from urllib.parse import quote

import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Runbook path guards (fail-fast in the bridge, before calling the console)
|
|
_MAX_PATH_LEN = 256
|
|
_FORBIDDEN_SEGMENTS = {"..", "~"}
|
|
|
|
|
|
class RunnerError(Exception):
    """Signals that a call to the sofiia-console failed or returned an error."""
|
|
|
|
|
|
def validate_runbook_path(path: str) -> Optional[str]:
    """
    Check a runbook path before it is sent to the console.

    Backslashes are treated as path separators for every structural check,
    so Windows-style input is held to the same rules as POSIX-style input.

    Args:
        path: Runbook path to check; surrounding whitespace is ignored.

    Returns:
        None when the path is acceptable, otherwise a short error string.
        Rejected inputs: empty path, more than ``_MAX_PATH_LEN`` characters,
        a leading separator (absolute path), or any segment that appears in
        ``_FORBIDDEN_SEGMENTS``.
    """
    path = path.strip()
    if not path:
        return "runbook_path is required"
    if len(path) > _MAX_PATH_LEN:
        return f"runbook_path too long (max {_MAX_PATH_LEN} chars)"

    # Normalise separators *before* the absolute-path check. Doing it only
    # for the segment scan lets a leading backslash ("\\etc\\x") through,
    # because it becomes "/etc/x" after the check has already passed.
    normalized = path.replace("\\", "/")
    if normalized.startswith("/"):
        return "absolute paths are not allowed"

    for part in normalized.split("/"):
        if part in _FORBIDDEN_SEGMENTS:
            return f"forbidden path segment: {part!r}"
    return None
|
|
|
|
|
|
async def start_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    runbook_path: str,
    operator_id: str,
    node_id: str = "NODA1",
    timeout: float = 15.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs → {run_id, status, current_step, steps_total}

    Args:
        http_client: Caller-owned AsyncClient used to send the request.
        console_url: Console base URL; a trailing slash is tolerated.
        control_token: Sent as the ``X-Control-Token`` header.
        runbook_path: Sent as ``runbook_path`` in the JSON body. It is not
            validated here.
        operator_id: Sent as ``operator_id`` in the JSON body.
        node_id: Sent as ``node_id`` in the JSON body.
        timeout: Passed to httpx as the timeout for this request.

    Returns:
        The decoded JSON response body.

    Raises:
        RunnerError: on a transport failure, a non-2xx response, or a
            response body that cannot be decoded as JSON.
    """
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs"
    payload: dict = {
        "runbook_path": runbook_path,
        "operator_id": operator_id,
        "node_id": node_id,
    }

    try:
        resp = await http_client.post(
            url,
            json=payload,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    # The contract above is "non-2xx is an error": a creation endpoint may
    # legitimately answer 201/202, which a strict `!= 200` would reject.
    if not 200 <= resp.status_code < 300:
        detail = _extract_error_detail(resp)
        raise RunnerError(f"HTTP {resp.status_code}: {detail}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
|
|
|
|
|
def _extract_error_detail(resp: httpx.Response) -> str:
|
|
"""Extract a short error detail from an httpx response (safe: never raises)."""
|
|
try:
|
|
body = resp.json()
|
|
if isinstance(body, dict) and body.get("detail"):
|
|
return str(body["detail"])[:200]
|
|
except Exception:
|
|
pass
|
|
try:
|
|
return (resp.text or "")[:200]
|
|
except Exception:
|
|
return "<no detail>"
|
|
|
|
|
|
async def get_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 10.0,
) -> dict:
    """
    GET /api/runbooks/internal/runs/{run_id} → full run with steps.

    Args:
        http_client: Caller-owned AsyncClient used to send the request.
        console_url: Console base URL; a trailing slash is tolerated.
        control_token: Sent as the ``X-Control-Token`` header.
        run_id: Run identifier; percent-encoded into a single URL path
            segment.
        timeout: Passed to httpx as the timeout for this request.

    Returns:
        The decoded JSON response body.

    Raises:
        RunnerError: on a transport failure, a 404 (run not found), any
            other non-200 response, or a body that is not valid JSON.
    """
    # Encode run_id as exactly one path segment so characters such as "/",
    # "..", "?" or "#" cannot redirect this token-bearing request to a
    # different console endpoint.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}"

    try:
        resp = await http_client.get(
            url,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
|
|
|
|
|
# ── M3.2 ──────────────────────────────────────────────────────────────────────
|
|
|
|
async def next_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    operator_id: str = "",
    timeout: float = 30.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/next

    Returns either:
      {type:"manual", step_index, title, section, instructions_md, steps_total?}
      {type:"http_check"|"script", step_index, title, result, step_status, next_step, completed}

    Args:
        http_client: Caller-owned AsyncClient used to send the request.
        console_url: Console base URL; a trailing slash is tolerated.
        control_token: Sent as the ``X-Control-Token`` header.
        run_id: Run identifier; percent-encoded into a single URL path
            segment.
        operator_id: Included in the JSON body only when non-empty;
            otherwise an empty JSON object is sent.
        timeout: Passed to httpx as the timeout for this request.

    Raises:
        RunnerError: on a transport failure, a 404 (run not found / not
            active), any other non-200 response, or a body that is not
            valid JSON.
    """
    # Encode run_id as exactly one path segment so characters such as "/",
    # "..", "?" or "#" cannot redirect this token-bearing request to a
    # different console endpoint.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}/next"
    payload = {"operator_id": operator_id} if operator_id else {}

    try:
        resp = await http_client.post(
            url,
            json=payload,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        detail = _extract_error_detail(resp)
        raise RunnerError(f"run not found or not active: {detail}")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
|
|
|
|
|
async def complete_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    step_index: int,
    status: str,
    notes: str = "",
    operator_id: str = "",
    timeout: float = 15.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete

    Returns: {ok, run_id, step_index, status, next_step, steps_total, run_completed}

    Args:
        http_client: Caller-owned AsyncClient used to send the request.
        console_url: Console base URL; a trailing slash is tolerated.
        control_token: Sent as the ``X-Control-Token`` header.
        run_id: Run identifier; percent-encoded into a single URL path
            segment.
        step_index: Step index; percent-encoded into a single URL path
            segment.
        status: Always sent as ``status`` in the JSON body.
        notes: Sent as ``notes`` only when non-empty.
        operator_id: Sent as ``operator_id`` only when non-empty.
        timeout: Passed to httpx as the timeout for this request.

    Raises:
        RunnerError: on a transport failure, a 404 (run/step not found or
            wrong current step), any other non-200 response, or a body
            that is not valid JSON.
    """
    # Encode both path parameters as single segments so characters such as
    # "/", "..", "?" or "#" cannot redirect this token-bearing request to a
    # different console endpoint.
    run_segment = quote(str(run_id), safe="")
    step_segment = quote(str(step_index), safe="")
    url = (
        f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}"
        f"/steps/{step_segment}/complete"
    )
    payload: dict = {"status": status}
    if notes:
        payload["notes"] = notes
    if operator_id:
        payload["operator_id"] = operator_id

    try:
        resp = await http_client.post(
            url,
            json=payload,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        detail = _extract_error_detail(resp)
        raise RunnerError(f"step not found or not current: {detail}")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
|
|
|
|
|
# ── M3.3 ──────────────────────────────────────────────────────────────────────
|
|
|
|
async def generate_evidence(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/evidence

    Returns: {evidence_path, bytes, created_at, run_id}

    Args:
        http_client: Caller-owned AsyncClient used to send the request.
        console_url: Console base URL; a trailing slash is tolerated.
        control_token: Sent as the ``X-Control-Token`` header.
        run_id: Run identifier; percent-encoded into a single URL path
            segment.
        timeout: Passed to httpx as the timeout for this request.

    Raises:
        RunnerError: on a transport failure, a 404 (run not found), any
            other non-200 response, or a body that is not valid JSON.
    """
    # Encode run_id as exactly one path segment so characters such as "/",
    # "..", "?" or "#" cannot redirect this token-bearing request to a
    # different console endpoint.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}/evidence"

    # No request body is sent; the run is identified by the URL alone.
    try:
        resp = await http_client.post(
            url,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
|
|
|
|
|
async def generate_post_review(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/post_review

    Returns: {path, bytes, created_at, run_id}

    Args:
        http_client: Caller-owned AsyncClient used to send the request.
        console_url: Console base URL; a trailing slash is tolerated.
        control_token: Sent as the ``X-Control-Token`` header.
        run_id: Run identifier; percent-encoded into a single URL path
            segment.
        timeout: Passed to httpx as the timeout for this request.

    Raises:
        RunnerError: on a transport failure, a 404 (run not found), any
            other non-200 response, or a body that is not valid JSON.
    """
    # Encode run_id as exactly one path segment so characters such as "/",
    # "..", "?" or "#" cannot redirect this token-bearing request to a
    # different console endpoint.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}/post_review"

    # No request body is sent; the run is identified by the URL alone.
    try:
        resp = await http_client.post(
            url,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|