Files
microdao-daarion/services/sofiia-console/app/runbook_runner.py
Apple 0603184524 feat(sofiia-console): add safe script executor for allowlisted runbook steps
- adds safe_executor.py: REPO_ROOT confinement, strict script allowlist,
  env key allowlist (STRICT/SOFIIA_URL/BFF_A/BFF_B/NODE_ID/AGENT_ID),
  stdin=DEVNULL, 8KB output cap, timeout clamp (max 300s), non-root warn
- integrates script action_type into runbook_runner: next_step handles
  http_check and script branches; running_as_root -> step_status=warn
- extends runbook_parser: rehearsal-v1 now includes 3 built-in script steps
  (preflight, idempotency smoke, generate evidence) after http_checks
- adds tests/test_sofiia_safe_executor.py: 12 tests covering path traversal,
  absolute path, non-allowlist, env drop, timeout, exit_code, mocked subprocess

Made-with: Cursor
2026-03-03 04:57:22 -08:00

357 lines
12 KiB
Python

"""
Runbook runner — create run, next_step (execute http_check/script or return manual), complete_step, abort.
PR3: adds script action_type via SafeExecutor (allowlisted, no shell=True).
"""
from __future__ import annotations
import json
import logging
import time
import uuid
from typing import Any, Dict, List, Optional
import httpx
from . import db as _db
from . import docs_store as _docs_store
from .audit import audit_log, AuditEvent
from .runbook_parser import RunbookStep, parse_runbook
from . import safe_executor as _safe_exec
logger = logging.getLogger(__name__)
_ALLOWED_HTTP_PATHS = {"/api/health", "/metrics", "/api/audit"}
def _now_ts() -> float:
return time.time()
async def create_run(
runbook_path: str,
operator_id: str = "",
node_id: Optional[str] = None,
sofiia_url: Optional[str] = None,
data_json: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""Parse runbook, insert run + steps, audit. Returns run_id, status, current_step, steps_total."""
raw = await _docs_store.get_raw(runbook_path)
if not raw:
raise ValueError(f"Runbook not found: {runbook_path}")
base_url = sofiia_url or "http://127.0.0.1:8002"
steps = parse_runbook(runbook_path, raw, sofiia_url=base_url)
if not steps:
raise ValueError("Runbook produced no steps")
run_id = str(uuid.uuid4())
now = _now_ts()
conn = await _db.get_db()
await conn.execute(
"""INSERT INTO runbook_runs(run_id, runbook_path, status, current_step, created_at, started_at,
finished_at, operator_id, node_id, sofiia_url, data_json, evidence_path)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
(
run_id,
runbook_path,
"running",
0,
now,
now,
None,
operator_id or None,
node_id,
base_url,
json.dumps(data_json or {}, separators=(",", ":")) if data_json else None,
None,
),
)
for s in steps:
await conn.execute(
"""INSERT INTO runbook_steps(run_id, step_index, title, section, action_type, action_json,
status, result_json, started_at, finished_at) VALUES (?,?,?,?,?,?,?,?,?,?)""",
(
run_id,
s.step_index,
s.title,
s.section,
s.action_type,
json.dumps(s.action_json, separators=(",", ":")),
s.status,
None,
None,
None,
),
)
await conn.commit()
await audit_log(
AuditEvent(
event="runbook.run.created",
operator_id=operator_id or "unknown",
operator_id_missing=not operator_id,
node_id=node_id,
status="ok",
data={"run_id": run_id, "runbook_path": runbook_path, "steps_total": len(steps)},
)
)
return {
"run_id": run_id,
"status": "running",
"current_step": 0,
"steps_total": len(steps),
}
async def get_run(run_id: str) -> Optional[Dict[str, Any]]:
"""Return run row + steps (light: no large action_json/result_json)."""
conn = await _db.get_db()
async with conn.execute(
"SELECT run_id, runbook_path, status, current_step, created_at, started_at, finished_at,"
" operator_id, node_id, sofiia_url, evidence_path FROM runbook_runs WHERE run_id = ?",
(run_id,),
) as cur:
row = await cur.fetchone()
if not row:
return None
run = {
"run_id": row[0],
"runbook_path": row[1],
"status": row[2],
"current_step": row[3],
"created_at": row[4],
"started_at": row[5],
"finished_at": row[6],
"operator_id": row[7],
"node_id": row[8],
"sofiia_url": row[9],
"evidence_path": row[10],
}
async with conn.execute(
"SELECT step_index, title, section, action_type, status, started_at, finished_at "
"FROM runbook_steps WHERE run_id = ? ORDER BY step_index",
(run_id,),
) as cur:
step_rows = await cur.fetchall()
run["steps"] = [
{
"step_index": r[0],
"title": r[1],
"section": r[2],
"action_type": r[3],
"status": r[4],
"started_at": r[5],
"finished_at": r[6],
}
for r in (step_rows or [])
]
return run
async def _execute_http_check(base_url: str, action: Dict[str, Any]) -> Dict[str, Any]:
"""Allowlisted GET only; returns {status, status_code, ok, error}."""
path = (action.get("url_path") or "").strip()
if not path.startswith("/"):
path = "/" + path
if path not in _ALLOWED_HTTP_PATHS:
return {"ok": False, "error": "path not allowlisted", "status_code": None}
url = (base_url or "http://127.0.0.1:8002").rstrip("/") + path
expect_statuses = action.get("expect", {}).get("status") or [200]
try:
async with httpx.AsyncClient(timeout=15.0) as client:
r = await client.get(url, headers=action.get("headers") or {})
ok = r.status_code in expect_statuses
return {
"ok": ok,
"status_code": r.status_code,
"expected": expect_statuses,
"status": "ok" if ok else "fail",
}
except Exception as e:
logger.warning("http_check %s failed: %s", url, e)
return {"ok": False, "error": str(e)[:200], "status_code": None, "status": "fail"}
async def next_step(run_id: str, operator_id: str = "") -> Optional[Dict[str, Any]]:
"""
Get current step; if http_check execute and advance; if manual return instructions.
Returns either {type: "http_check", step_index, result, ...} or {type: "manual", step_index, title, instructions_md}.
"""
conn = await _db.get_db()
async with conn.execute(
"SELECT runbook_path, status, current_step, sofiia_url FROM runbook_runs WHERE run_id = ?",
(run_id,),
) as cur:
row = await cur.fetchone()
if not row or row[1] not in ("running", "paused"):
return None
runbook_path, status, current_step, sofiia_url = row[0], row[1], row[2], row[3]
async with conn.execute(
"SELECT step_index, title, section, action_type, action_json, status FROM runbook_steps "
"WHERE run_id = ? AND step_index = ?",
(run_id, current_step),
) as cur:
step_row = await cur.fetchone()
if not step_row:
return None
step_index, title, section, action_type, action_json_str, step_status = step_row
action_json = json.loads(action_json_str) if action_json_str else {}
started_at = _now_ts()
await conn.execute(
"UPDATE runbook_steps SET status = ?, started_at = ? WHERE run_id = ? AND step_index = ?",
("running", started_at, run_id, step_index),
)
await conn.commit()
await audit_log(
AuditEvent(
event="runbook.step.started",
operator_id=operator_id or "unknown",
node_id=None,
status="ok",
data={"run_id": run_id, "step_index": step_index, "action_type": action_type, "title": title},
)
)
if action_type in ("http_check", "script"):
if action_type == "http_check":
result = await _execute_http_check(sofiia_url or "http://127.0.0.1:8002", action_json)
auto_ok = result.get("ok", False)
else:
# script via SafeExecutor
script = action_json.get("script", "")
env = action_json.get("env") or {}
timeout_s = int(action_json.get("timeout_s", 120))
result = await _safe_exec.run_script(script, env=env, timeout_s=timeout_s)
auto_ok = result.get("ok", False)
finished_at = _now_ts()
duration_ms = int((finished_at - started_at) * 1000)
# Non-root warning elevates to "warn" status (not "fail") if script exited 0
if result.get("warning") == "running_as_root" and auto_ok:
step_status = "warn"
else:
step_status = "ok" if auto_ok else "fail"
await conn.execute(
"UPDATE runbook_steps SET status = ?, result_json = ?, finished_at = ? WHERE run_id = ? AND step_index = ?",
(step_status, json.dumps(result, separators=(",", ":")), finished_at, run_id, step_index),
)
next_current = current_step + 1
async with conn.execute("SELECT COUNT(*) FROM runbook_steps WHERE run_id = ?", (run_id,)) as cur:
total = (await cur.fetchone())[0]
if next_current >= total:
await conn.execute(
"UPDATE runbook_runs SET current_step = ?, status = ?, finished_at = ? WHERE run_id = ?",
(next_current, "completed", finished_at, run_id),
)
else:
await conn.execute("UPDATE runbook_runs SET current_step = ? WHERE run_id = ?", (next_current, run_id))
await conn.commit()
await audit_log(
AuditEvent(
event="runbook.step.completed",
operator_id=operator_id or "unknown",
node_id=None,
status=step_status,
duration_ms=duration_ms,
data={
"run_id": run_id,
"step_index": step_index,
"action_type": action_type,
"result_ok": auto_ok,
"exit_code": result.get("exit_code"),
"timeout": result.get("timeout"),
},
)
)
return {
"type": action_type,
"step_index": step_index,
"title": title,
"result": result,
"step_status": step_status,
"next_step": next_current,
"completed": next_current >= total,
}
# manual
instructions = action_json.get("instructions_md") or title
return {
"type": "manual",
"step_index": step_index,
"title": title,
"section": section,
"instructions_md": instructions,
}
async def complete_step(
run_id: str,
step_index: int,
status: str = "ok",
notes: str = "",
data: Optional[Dict[str, Any]] = None,
operator_id: str = "",
) -> bool:
"""Record manual step completion, advance current_step."""
if status not in ("ok", "warn", "fail", "skipped"):
status = "ok"
conn = await _db.get_db()
async with conn.execute(
"SELECT current_step FROM runbook_runs WHERE run_id = ? AND runbook_runs.status IN ('running','paused')",
(run_id,),
) as cur:
row = await cur.fetchone()
if not row or row[0] != step_index:
return False
finished_at = _now_ts()
result_json = json.dumps({"status": status, "notes": notes[:500], **(data or {})}, separators=(",", ":"))
await conn.execute(
"UPDATE runbook_steps SET status = ?, result_json = ?, finished_at = ? WHERE run_id = ? AND step_index = ?",
(status, result_json, finished_at, run_id, step_index),
)
next_current = step_index + 1
async with conn.execute("SELECT COUNT(*) FROM runbook_steps WHERE run_id = ?", (run_id,)) as cur:
total = (await cur.fetchone())[0]
if next_current >= total:
await conn.execute(
"UPDATE runbook_runs SET current_step = ?, status = ?, finished_at = ? WHERE run_id = ?",
(next_current, "completed", finished_at, run_id),
)
else:
await conn.execute("UPDATE runbook_runs SET current_step = ? WHERE run_id = ?", (next_current, run_id))
await conn.commit()
await audit_log(
AuditEvent(
event="runbook.step.completed",
operator_id=operator_id or "unknown",
status=status,
data={"run_id": run_id, "step_index": step_index, "action_type": "manual"},
)
)
return True
async def abort_run(run_id: str, operator_id: str = "") -> bool:
"""Set status=aborted, audit."""
conn = await _db.get_db()
async with conn.execute("SELECT 1 FROM runbook_runs WHERE run_id = ?", (run_id,)) as cur:
if not await cur.fetchone():
return False
now = _now_ts()
await conn.execute(
"UPDATE runbook_runs SET status = ?, finished_at = ? WHERE run_id = ?",
("aborted", now, run_id),
)
await conn.commit()
await audit_log(
AuditEvent(
event="runbook.run.aborted",
operator_id=operator_id or "unknown",
status="ok",
data={"run_id": run_id},
)
)
return True