Files
microdao-daarion/services/matrix-bridge-dagi/app/control_runner.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

297 lines
8.9 KiB
Python

"""
control_runner — M3.1 + M3.2 + M3.3
Thin async HTTP client that calls the sofiia-console internal runbook API
on behalf of the Matrix bridge control channel.
All functions are stateless; callers supply the pre-built AsyncClient.
"""
from __future__ import annotations

import logging
from typing import Optional
from urllib.parse import quote

import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Runbook path guards (fail-fast in the bridge, before calling the console).
# Both constants are consumed by validate_runbook_path().
# Maximum accepted length of the whitespace-stripped path, in characters.
_MAX_PATH_LEN = 256
# Path segments rejected on exact match after splitting the path on "/"
# (backslashes are converted to "/" first).
_FORBIDDEN_SEGMENTS = {"..", "~"}
class RunnerError(Exception):
    """
    Failure of a call to the sofiia-console runbook API.

    The functions in this module raise it for transport-level errors,
    unexpected HTTP status codes and response bodies that are not valid JSON.
    """

    pass
def validate_runbook_path(path: str) -> Optional[str]:
    """
    Return None if valid, or an error string describing the problem.

    Checks, in order: non-empty, max length, no absolute paths, no traversal
    segments. Backslashes are treated as path separators for both the
    absolute-path check and the segment check, so ``\\etc\\passwd`` is
    rejected just like ``/etc/passwd``.

    Args:
        path: Runbook path to validate; surrounding whitespace is ignored.
            A non-string value (for example None) is reported as missing
            instead of raising.

    Returns:
        None when every check passes, otherwise a short error message.
    """
    # A validator that reports problems as strings should not blow up with
    # AttributeError when the caller has no path at all.
    if not isinstance(path, str):
        return "runbook_path is required"
    path = path.strip()
    if not path:
        return "runbook_path is required"
    if len(path) > _MAX_PATH_LEN:
        return f"runbook_path too long (max {_MAX_PATH_LEN} chars)"
    # Normalise separators BEFORE the absolute-path check; otherwise a path
    # that starts with a backslash slips past it even though the segment
    # loop below already treats "\" as a separator.
    normalized = path.replace("\\", "/")
    if normalized.startswith("/"):
        return "absolute paths are not allowed"
    for part in normalized.split("/"):
        if part in _FORBIDDEN_SEGMENTS:
            return f"forbidden path segment: {part!r}"
    return None
async def start_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    runbook_path: str,
    operator_id: str,
    node_id: str = "NODA1",
    timeout: float = 15.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs → {run_id, status, current_step, steps_total}

    Args:
        http_client: Caller-supplied AsyncClient used to send the request.
        console_url: Base URL of the console; a trailing "/" is ignored.
        control_token: Value sent in the X-Control-Token header.
        runbook_path: Sent as "runbook_path" in the JSON payload.
        operator_id: Sent as "operator_id" in the JSON payload.
        node_id: Sent as "node_id" in the JSON payload.
        timeout: Per-request timeout handed to httpx.

    Returns:
        The decoded JSON body of the successful response.

    Raises:
        RunnerError: if the request fails with httpx.RequestError, the
            response status is not 2xx, or the body is not valid JSON.
    """
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs"
    payload: dict = {
        "runbook_path": runbook_path,
        "operator_id": operator_id,
        "node_id": node_id,
    }
    try:
        resp = await http_client.post(
            url,
            json=payload,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    # Accept the whole 2xx range, as documented: a resource-creating POST may
    # answer 201 Created, and reporting that as a failure would invite the
    # operator to start the same run twice.
    if not 200 <= resp.status_code < 300:
        detail = _extract_error_detail(resp)
        raise RunnerError(f"HTTP {resp.status_code}: {detail}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
def _extract_error_detail(resp: httpx.Response) -> str:
    """
    Build a short (at most 200 characters) error description from a response.

    Preference order: the truthy "detail" field of a JSON-object body, then
    the raw response text, then the literal "<no detail>" if even the text
    cannot be read. Never raises.
    """
    try:
        parsed = resp.json()
        detail = parsed.get("detail") if isinstance(parsed, dict) else None
        if detail:
            return str(detail)[:200]
    except Exception:
        # Body is not usable JSON; deliberately fall back to the raw text.
        pass
    try:
        raw_text = resp.text or ""
        return raw_text[:200]
    except Exception:
        return "<no detail>"
async def get_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 10.0,
) -> dict:
    """
    GET /api/runbooks/internal/runs/{run_id} → full run with steps.

    Args:
        http_client: Caller-supplied AsyncClient used to send the request.
        console_url: Base URL of the console; a trailing "/" is ignored.
        control_token: Value sent in the X-Control-Token header.
        run_id: Identifier of the run. It is percent-encoded as a single
            path segment so characters such as "/", "?" or "#" cannot change
            which endpoint is called.
        timeout: Per-request timeout handed to httpx.

    Returns:
        The decoded JSON body of the 200 response.

    Raises:
        RunnerError: if run_id is empty, the request fails with
            httpx.RequestError, the console answers 404 or any other non-200
            status, or the body is not valid JSON.
    """
    # Fail fast: an empty id would silently address the collection URL.
    if not run_id:
        raise RunnerError("run_id is required")
    # safe="" makes quote() escape "/" as well, keeping the id one segment.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}"
    try:
        resp = await http_client.get(
            url,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
# ── M3.2 ──────────────────────────────────────────────────────────────────────
async def next_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    operator_id: str = "",
    timeout: float = 30.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/next

    Returns either:
      {type:"manual", step_index, title, section, instructions_md, steps_total?}
      {type:"http_check"|"script", step_index, title, result, step_status, next_step, completed}

    Args:
        http_client: Caller-supplied AsyncClient used to send the request.
        console_url: Base URL of the console; a trailing "/" is ignored.
        control_token: Value sent in the X-Control-Token header.
        run_id: Identifier of the run. It is percent-encoded as a single
            path segment so characters such as "/", "?" or "#" cannot change
            which endpoint is called.
        operator_id: Included in the JSON payload only when non-empty.
        timeout: Per-request timeout handed to httpx.

    Raises:
        RunnerError: if run_id is empty, the request fails with
            httpx.RequestError, the console answers 404 (run not found / not
            active) or any other non-200 status, or the body is not valid
            JSON.
    """
    # Fail fast: an empty id would produce a malformed ".../runs//next" URL.
    if not run_id:
        raise RunnerError("run_id is required")
    # safe="" makes quote() escape "/" as well, keeping the id one segment.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}/next"
    payload = {"operator_id": operator_id} if operator_id else {}
    try:
        resp = await http_client.post(
            url,
            json=payload,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if resp.status_code == 404:
        detail = _extract_error_detail(resp)
        raise RunnerError(f"run not found or not active: {detail}")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
async def complete_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    step_index: int,
    status: str,
    notes: str = "",
    operator_id: str = "",
    timeout: float = 15.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete

    Returns: {ok, run_id, step_index, status, next_step, steps_total, run_completed}

    Args:
        http_client: Caller-supplied AsyncClient used to send the request.
        console_url: Base URL of the console; a trailing "/" is ignored.
        control_token: Value sent in the X-Control-Token header.
        run_id: Identifier of the run. It is percent-encoded as a single
            path segment so characters such as "/", "?" or "#" cannot change
            which endpoint is called.
        step_index: Index of the step, placed in the URL path.
        status: Sent as "status" in the JSON payload.
        notes: Included in the JSON payload only when non-empty.
        operator_id: Included in the JSON payload only when non-empty.
        timeout: Per-request timeout handed to httpx.

    Raises:
        RunnerError: if run_id is empty, the request fails with
            httpx.RequestError, the console answers 404 (run/step not found
            or wrong current step) or any other non-200 status, or the body
            is not valid JSON.
    """
    # Fail fast: an empty id would produce a malformed ".../runs//steps/..." URL.
    if not run_id:
        raise RunnerError("run_id is required")
    # safe="" makes quote() escape "/" as well, keeping the id one segment.
    run_segment = quote(str(run_id), safe="")
    url = (
        f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}"
        f"/steps/{step_index}/complete"
    )
    payload: dict = {"status": status}
    if notes:
        payload["notes"] = notes
    if operator_id:
        payload["operator_id"] = operator_id
    try:
        resp = await http_client.post(
            url,
            json=payload,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if resp.status_code == 404:
        detail = _extract_error_detail(resp)
        raise RunnerError(f"step not found or not current: {detail}")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
# ── M3.3 ──────────────────────────────────────────────────────────────────────
async def generate_evidence(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/evidence

    Returns: {evidence_path, bytes, created_at, run_id}

    Args:
        http_client: Caller-supplied AsyncClient used to send the request.
        console_url: Base URL of the console; a trailing "/" is ignored.
        control_token: Value sent in the X-Control-Token header.
        run_id: Identifier of the run. It is percent-encoded as a single
            path segment so characters such as "/", "?" or "#" cannot change
            which endpoint is called.
        timeout: Per-request timeout handed to httpx.

    Raises:
        RunnerError: if run_id is empty, the request fails with
            httpx.RequestError, the console answers 404 (run not found) or
            any other non-200 status, or the body is not valid JSON.
    """
    # Fail fast: an empty id would produce a malformed ".../runs//evidence" URL.
    if not run_id:
        raise RunnerError("run_id is required")
    # safe="" makes quote() escape "/" as well, keeping the id one segment.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}/evidence"
    try:
        resp = await http_client.post(
            url,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
async def generate_post_review(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs/{run_id}/post_review

    Returns: {path, bytes, created_at, run_id}

    Args:
        http_client: Caller-supplied AsyncClient used to send the request.
        console_url: Base URL of the console; a trailing "/" is ignored.
        control_token: Value sent in the X-Control-Token header.
        run_id: Identifier of the run. It is percent-encoded as a single
            path segment so characters such as "/", "?" or "#" cannot change
            which endpoint is called.
        timeout: Per-request timeout handed to httpx.

    Raises:
        RunnerError: if run_id is empty, the request fails with
            httpx.RequestError, the console answers 404 or any other non-200
            status, or the body is not valid JSON.
    """
    # Fail fast: an empty id would produce a malformed ".../runs//post_review" URL.
    if not run_id:
        raise RunnerError("run_id is required")
    # safe="" makes quote() escape "/" as well, keeping the id one segment.
    run_segment = quote(str(run_id), safe="")
    url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_segment}/post_review"
    try:
        resp = await http_client.post(
            url,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc