feat(sofiia-console): add safe script executor for allowlisted runbook steps

- adds safe_executor.py: REPO_ROOT confinement, strict script allowlist, env key allowlist (STRICT/SOFIIA_URL/BFF_A/BFF_B/NODE_ID/AGENT_ID), stdin=DEVNULL, 8KB output cap, timeout clamp (max 300s), warning when running as root
- integrates the script action_type into runbook_runner: next_step handles the http_check and script branches; running_as_root -> step_status=warn
- extends runbook_parser: rehearsal-v1 now includes 3 built-in script steps (preflight, idempotency smoke, generate evidence) after the http_checks
- adds tests/test_sofiia_safe_executor.py: 12 tests covering path traversal, absolute path, non-allowlist, env drop, timeout, exit_code, mocked subprocess

Made-with: Cursor
2026-03-03 04:57:22 -08:00
parent ad8bddf595
commit 0603184524
4 changed files with 481 additions and 12 deletions
--- a/services/sofiia-console/app/runbook_parser.py
+++ b/services/sofiia-console/app/runbook_parser.py
@@ -86,23 +86,65 @@ def _parse_sections(markdown: str) -> List[tuple]:
    return sections


+def _rehearsal_script_steps(offset: int) -> List[RunbookStep]:
+    """Build the three built-in script steps of rehearsal v1 (PR3).
+
+    The steps are numbered consecutively starting at ``offset`` so that they
+    can be appended after the http_check steps.
+    """
+    # One row per step, in execution order:
+    # (title, section, script, env, timeout_s)
+    specs = (
+        (
+            "Preflight check (STRICT=1)",
+            "0–5 min — Preflight",
+            "ops/preflight_sofiia_console.sh",
+            {"STRICT": "1"},
+            120,
+        ),
+        (
+            "Redis idempotency smoke test",
+            "10–15 min — Smoke",
+            "ops/redis_idempotency_smoke.sh",
+            {},
+            60,
+        ),
+        (
+            "Generate release evidence",
+            "25–30 min — Evidence",
+            "ops/generate_release_evidence.sh",
+            {},
+            60,
+        ),
+    )
+    return [
+        RunbookStep(
+            step_index=offset + position,
+            title=title,
+            section=section,
+            action_type="script",
+            action_json={"script": script, "env": env, "timeout_s": timeout_s},
+        )
+        for position, (title, section, script, env, timeout_s) in enumerate(specs)
+    ]
+
+
 def parse_runbook(
    runbook_path: str,
    markdown: str,
    sofiia_url: str = "http://127.0.0.1:8002",
 ) -> List[RunbookStep]:
    """
-    Parse markdown into steps. For rehearsal-v1 prepend 3 http_check steps;
-    rest are manual (one per H2 section with instructions_md).
+    Parse markdown into steps.
+    For rehearsal-v1: prepend 3 http_check + 3 script steps; rest are manual.
    """
    path_lower = runbook_path.lower()
    steps: List[RunbookStep] = []
    offset = 0

    if "rehearsal" in path_lower and "30min" in path_lower:
-        builtin = _rehearsal_http_check_steps(sofiia_url)
-        steps.extend(builtin)
-        offset = len(builtin)
+        http_steps = _rehearsal_http_check_steps(sofiia_url)
+        steps.extend(http_steps)
+        offset = len(http_steps)
+        script_steps = _rehearsal_script_steps(offset)
+        steps.extend(script_steps)
+        offset += len(script_steps)

    sections = _parse_sections(markdown)
    for i, (title, content) in enumerate(sections):
--- a/services/sofiia-console/app/runbook_runner.py
+++ b/services/sofiia-console/app/runbook_runner.py
@@ -1,6 +1,6 @@
 """
-Runbook runner — create run, next_step (execute http_check or return manual), complete_step, abort.
-PR2: guided execution, allowlisted HTTP only; audit integration.
+Runbook runner — create run, next_step (execute http_check/script or return manual), complete_step, abort.
+PR3: adds script action_type via SafeExecutor (allowlisted, no shell=True).
 """
 from __future__ import annotations

@@ -16,6 +16,7 @@ from . import db as _db
 from . import docs_store as _docs_store
 from .audit import audit_log, AuditEvent
 from .runbook_parser import RunbookStep, parse_runbook
+from . import safe_executor as _safe_exec

 logger = logging.getLogger(__name__)

@@ -212,11 +213,26 @@ async def next_step(run_id: str, operator_id: str = "") -> Optional[Dict[str, An
        )
    )

-    if action_type == "http_check":
-        result = await _execute_http_check(sofiia_url or "http://127.0.0.1:8002", action_json)
+    if action_type in ("http_check", "script"):
+        if action_type == "http_check":
+            result = await _execute_http_check(sofiia_url or "http://127.0.0.1:8002", action_json)
+            auto_ok = result.get("ok", False)
+        else:
+            # script via SafeExecutor
+            script = action_json.get("script", "")
+            env = action_json.get("env") or {}
+            timeout_s = int(action_json.get("timeout_s", 120))
+            result = await _safe_exec.run_script(script, env=env, timeout_s=timeout_s)
+            auto_ok = result.get("ok", False)
+
        finished_at = _now_ts()
        duration_ms = int((finished_at - started_at) * 1000)
-        step_status = "ok" if result.get("ok") else "fail"
+        # Non-root warning elevates to "warn" status (not "fail") if script exited 0
+        if result.get("warning") == "running_as_root" and auto_ok:
+            step_status = "warn"
+        else:
+            step_status = "ok" if auto_ok else "fail"
+
        await conn.execute(
            "UPDATE runbook_steps SET status = ?, result_json = ?, finished_at = ? WHERE run_id = ? AND step_index = ?",
            (step_status, json.dumps(result, separators=(",", ":")), finished_at, run_id, step_index),
@@ -243,15 +259,18 @@ async def next_step(run_id: str, operator_id: str = "") -> Optional[Dict[str, An
                    "run_id": run_id,
                    "step_index": step_index,
                    "action_type": action_type,
-                    "result_ok": result.get("ok"),
+                    "result_ok": auto_ok,
+                    "exit_code": result.get("exit_code"),
+                    "timeout": result.get("timeout"),
                },
            )
        )
        return {
-            "type": "http_check",
+            "type": action_type,
            "step_index": step_index,
            "title": title,
            "result": result,
+            "step_status": step_status,
            "next_step": next_current,
            "completed": next_current >= total,
        }
--- a/services/sofiia-console/app/safe_executor.py
+++ b/services/sofiia-console/app/safe_executor.py
@@ -0,0 +1,208 @@
+"""
+SafeExecutor — PR3.
+
+Execute allowlisted shell scripts via asyncio.create_subprocess_exec (not shell=True).
+Security: path confinement to REPO_ROOT (realpath), strict env allowlist,
+          stdin=DEVNULL, output cap, timeout clamp, warning when running as root.
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+def _get_repo_root() -> Path:
+    """Return the repository root directory.
+
+    A non-blank ``SOFIIA_REPO_ROOT`` environment variable takes precedence.
+    Otherwise the root is four directory levels above this file
+    (app/ -> sofiia-console/ -> services/ -> repo).
+    """
+    override = os.getenv("SOFIIA_REPO_ROOT", "").strip()
+    if not override:
+        root = Path(__file__).resolve()
+        # Walk up: file -> app/ -> sofiia-console/ -> services/ -> repo.
+        for _ in range(4):
+            root = root.parent
+        return root
+    return Path(override).resolve()
+
+
+# Resolved once at import time; a later change to SOFIIA_REPO_ROOT is not picked up.
+_REPO_ROOT = _get_repo_root()
+
+# Allowlisted scripts (relative to REPO_ROOT)
+# _validate_script_path() requires an exact match against one of these entries.
+_SCRIPT_ALLOWLIST: frozenset = frozenset({
+    "ops/preflight_sofiia_console.sh",
+    "ops/redis_idempotency_smoke.sh",
+    "ops/generate_release_evidence.sh",
+})
+
+# Env keys allowed to be passed from action_json
+# Any other key is dropped by _filter_env() and reported back to the caller.
+_ENV_KEY_ALLOWLIST: frozenset = frozenset({
+    "STRICT",
+    "SOFIIA_URL",
+    "BFF_A",
+    "BFF_B",
+    "NODE_ID",
+    "AGENT_ID",
+})
+
+# Upper bound, in seconds, that run_script() clamps the requested timeout to.
+_TIMEOUT_MAX_S: int = 300
+# Captured output longer than this many bytes is truncated by _cap_bytes().
+_OUTPUT_CAP_BYTES: int = 8 * 1024  # 8 KB
+
+
+# ── Public API ────────────────────────────────────────────────────────────────
+
+class ScriptNotAllowedError(ValueError):
+    """Raised when a requested script path fails validation."""
+
+
+def _validate_script_path(script: str) -> Path:
+    """Check ``script`` against the allowlist and REPO_ROOT confinement.
+
+    Args:
+        script: Script path relative to REPO_ROOT.
+
+    Returns:
+        The resolved absolute path of the script.
+
+    Raises:
+        ScriptNotAllowedError: If the path is empty, starts with ``/``,
+            contains a ``..`` component, is not an exact allowlist entry, or
+            resolves to a location outside REPO_ROOT.
+    """
+    candidate = script.strip() if script else ""
+    if not candidate:
+        raise ScriptNotAllowedError("Empty script path")
+
+    # Cheap textual rejections first, before anything is resolved on disk.
+    if candidate.startswith("/"):
+        raise ScriptNotAllowedError(f"Absolute paths not allowed: {candidate!r}")
+    if any(part == ".." for part in Path(candidate).parts):
+        raise ScriptNotAllowedError(f"Path traversal not allowed: {candidate!r}")
+
+    # Allowlist entries use forward slashes, so compare in that form.
+    rel_path = candidate.replace("\\", "/")
+    if rel_path not in _SCRIPT_ALLOWLIST:
+        raise ScriptNotAllowedError(f"Script not in allowlist: {rel_path!r}")
+
+    # resolve() follows symlinks; an allowlisted name whose target lies
+    # outside REPO_ROOT is refused here.
+    target = (_REPO_ROOT / rel_path).resolve()
+    try:
+        target.relative_to(_REPO_ROOT)
+    except ValueError:
+        raise ScriptNotAllowedError(f"Script escaped REPO_ROOT: {target}")
+
+    return target
+
+
+def _filter_env(raw_env: Optional[Dict[str, Any]]) -> tuple[Dict[str, str], List[str]]:
+    """Split ``raw_env`` into allowlisted entries and rejected key names.
+
+    Args:
+        raw_env: Requested environment overrides; ``None`` or empty is allowed.
+
+    Returns:
+        ``(kept, rejected)``: ``kept`` maps each allowlisted key to its value
+        converted with ``str()``; ``rejected`` lists the remaining keys in
+        their original order.
+    """
+    source = raw_env or {}
+    kept: Dict[str, str] = {
+        key: str(value) for key, value in source.items() if key in _ENV_KEY_ALLOWLIST
+    }
+    rejected: List[str] = [key for key in source if key not in _ENV_KEY_ALLOWLIST]
+    return kept, rejected
+
+
+def _cap_bytes(data: bytes, cap: int = _OUTPUT_CAP_BYTES) -> str:
+    """Decode captured output as UTF-8, keeping only a tail when it is too long.
+
+    Args:
+        data: Raw bytes captured from the process.
+        cap: Size limit in bytes; input longer than this is truncated.
+
+    Returns:
+        The decoded text. When ``data`` exceeds ``cap`` bytes the result is
+        the marker line ``[...truncated...]`` followed by the last
+        ``cap // 2`` bytes of ``data``, decoded. Undecodable bytes are
+        replaced with U+FFFD.
+    """
+    if not data:
+        return ""
+    if len(data) <= cap:
+        return data.decode("utf-8", errors="replace")
+
+    # Slice the raw bytes before decoding so the kept tail is bounded in
+    # bytes (the unit of ``cap``) and oversized output is never decoded in
+    # full. A multi-byte character cut at the slice boundary decodes to
+    # U+FFFD.
+    keep = max(cap // 2, 0)
+    # ``data[-0:]`` would return everything, so an empty tail must be explicit.
+    tail = data[-keep:] if keep else b""
+    return "[...truncated...]\n" + tail.decode("utf-8", errors="replace")
+
+
+def _kill_process(proc: Any, script: str) -> None:
+    """Best-effort kill of ``proc``; a process that already exited is not an error."""
+    try:
+        proc.kill()
+    except ProcessLookupError:
+        pass  # already exited between the timeout/cancel and the kill
+    except Exception as kill_err:
+        logger.warning("safe_executor: kill failed for %s: %s", script, kill_err)
+
+
+async def run_script(
+    script: str,
+    env: Optional[Dict[str, Any]] = None,
+    timeout_s: int = 120,
+) -> Dict[str, Any]:
+    """Execute an allowlisted script with /bin/bash and report the outcome.
+
+    The script runs with cwd=REPO_ROOT, stdin=DEVNULL and an environment made
+    of a small inherited set plus the allowlisted keys of ``env``. A rejected
+    script, an invalid timeout, or a failure to run the process is reported in
+    the returned dict rather than raised. Cancellation of the calling task is
+    propagated after the subprocess has been killed.
+
+    Args:
+        script: Script path relative to REPO_ROOT; must be allowlisted.
+        env: Requested environment overrides; non-allowlisted keys are dropped.
+        timeout_s: Requested timeout in seconds, clamped to 1.._TIMEOUT_MAX_S.
+
+    Returns:
+        A dict with ``ok``, ``exit_code``, ``stdout_tail``, ``stderr_tail``,
+        ``duration_ms`` and ``timeout``. Runs that reach completion or time
+        out also carry ``dropped_env_keys`` (list or None). Failures before
+        or while running the process carry ``error``. ``warning`` is set to
+        ``"running_as_root"`` when the current uid is 0.
+    """
+    try:
+        resolved = _validate_script_path(script)
+    except ScriptNotAllowedError as e:
+        return {"ok": False, "exit_code": None, "error": str(e),
+                "stdout_tail": "", "stderr_tail": "", "duration_ms": 0, "timeout": False}
+
+    # Build env: inherit minimal set, add allowed overrides
+    base_env: Dict[str, str] = {
+        k: v for k, v in os.environ.items()
+        if k in {"PATH", "HOME", "LANG", "LC_ALL", "USER", "LOGNAME", "SOFIIA_DATA_DIR", "SOFIIA_REDIS_URL"}
+    }
+    filtered_env, dropped_keys = _filter_env(env)
+    if dropped_keys:
+        logger.warning("safe_executor: dropped non-allowlisted env keys: %s", dropped_keys)
+    base_env.update(filtered_env)
+
+    # Clamp timeout. A value that cannot be read as an integer is reported as
+    # a failed run instead of escaping as an exception.
+    try:
+        requested_timeout = int(timeout_s)
+    except (TypeError, ValueError, OverflowError):
+        logger.error("safe_executor: invalid timeout_s for %s: %r", script, timeout_s)
+        return {"ok": False, "exit_code": None, "error": f"Invalid timeout_s: {timeout_s!r}",
+                "stdout_tail": "", "stderr_tail": "", "duration_ms": 0, "timeout": False}
+    effective_timeout = max(1, min(requested_timeout, _TIMEOUT_MAX_S))
+
+    # Root check: the run is still performed, but flagged in the result.
+    warning: Optional[str] = None
+    try:
+        if os.getuid() == 0:
+            warning = "running_as_root"
+            logger.warning("safe_executor: running as root, step status will be warn")
+    except AttributeError:
+        pass  # Windows / no getuid
+
+    started = time.monotonic()
+    timed_out = False
+    exit_code: Optional[int] = None
+    stdout_bytes = b""
+    stderr_bytes = b""
+
+    try:
+        # Argument-list exec (no shell): the script path is passed to bash as
+        # a single argument and is never interpreted as shell syntax.
+        proc = await asyncio.create_subprocess_exec(
+            "/bin/bash",
+            str(resolved),
+            cwd=str(_REPO_ROOT),
+            env=base_env,
+            stdin=asyncio.subprocess.DEVNULL,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        try:
+            stdout_bytes, stderr_bytes = await asyncio.wait_for(
+                proc.communicate(), timeout=float(effective_timeout)
+            )
+            exit_code = proc.returncode
+        except asyncio.TimeoutError:
+            timed_out = True
+            _kill_process(proc, script)
+            try:
+                # Reap the killed process, but never hang on it.
+                await asyncio.wait_for(proc.wait(), timeout=5.0)
+            except Exception as wait_err:
+                logger.warning(
+                    "safe_executor: %s not reaped after kill: %r", script, wait_err
+                )
+            exit_code = None
+        except asyncio.CancelledError:
+            # The caller was cancelled mid-run: do not leave the script
+            # running in the background.
+            _kill_process(proc, script)
+            raise
+    except Exception as e:
+        logger.error("safe_executor: failed to start %s: %s", script, e)
+        duration_ms = int((time.monotonic() - started) * 1000)
+        return {
+            "ok": False,
+            "exit_code": None,
+            "error": str(e)[:200],
+            "stdout_tail": "",
+            "stderr_tail": "",
+            "duration_ms": duration_ms,
+            "timeout": False,
+        }
+
+    duration_ms = int((time.monotonic() - started) * 1000)
+    ok = (not timed_out) and exit_code == 0
+    result: Dict[str, Any] = {
+        "ok": ok,
+        "exit_code": exit_code,
+        "stdout_tail": _cap_bytes(stdout_bytes),
+        "stderr_tail": _cap_bytes(stderr_bytes),
+        "duration_ms": duration_ms,
+        "timeout": timed_out,
+        "dropped_env_keys": dropped_keys or None,
+    }
+    if warning:
+        result["warning"] = warning
+    return result