Files
microdao-daarion/services/sofiia-supervisor/app/graphs/release_check_graph.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

250 lines
9.0 KiB
Python

"""
Graph 1: release_check_graph
Uses the DAARION job_orchestrator_tool to start a release_check task
via the gateway, then polls until completion.
Node sequence:
start_job → poll_job (loop) → finalize → END
State keys:
    job_id (str): Job ID returned by start_task
    job_status (str): "running" | "succeeded" | "failed" | "cancelled"
    poll_count (int): Guard against infinite polling
    result (dict | None): Final release_check report
    error (str | None): Error message if failed
"""
from __future__ import annotations
import asyncio
import logging
import time
from typing import Any, Dict, Optional, TypedDict
from langgraph.graph import StateGraph, END
from ..config import settings
from ..gateway_client import GatewayClient
logger = logging.getLogger(__name__)
MAX_POLL_ITERATIONS = int(settings.JOB_MAX_WAIT_SEC / settings.JOB_POLL_INTERVAL_SEC) + 5
# ─── State ────────────────────────────────────────────────────────────────────
class ReleaseCheckState(TypedDict, total=False):
    """State threaded through the release_check graph.

    total=False: every key is optional; each node returns a merged copy of
    the state with its own updates applied.
    """
    # Context (injected before graph.invoke)
    run_id: str                       # run identifier, forwarded to the gateway as graph_run_id
    agent_id: str                     # falls back to settings.DEFAULT_AGENT_ID in the nodes
    workspace_id: str                 # falls back to settings.DEFAULT_WORKSPACE_ID in the nodes
    user_id: str
    input: Dict[str, Any]             # raw graph input; keys consumed in start_job_node
    # Intermediate
    job_id: Optional[str]             # from start_task response ("job_id" or "id")
    job_status: Optional[str]         # "running" | "succeeded" | "failed" | "cancelled"
    poll_count: int                   # guard against infinite polling (capped by MAX_POLL_ITERATIONS)
    # Output
    result: Optional[Dict[str, Any]]  # final release_check report
    error: Optional[str]              # error message if failed
    graph_status: str                 # "succeeded" | "failed"
# ─── Node implementations ────────────────────────────────────────────────────
async def start_job_node(state: ReleaseCheckState) -> ReleaseCheckState:
    """
    Call job_orchestrator_tool action=start_task with task_id=release_check.

    Expects response: {"job_id": "...", "status": "queued|running"}.
    Some gateways execute the task synchronously and return a terminal
    status inline; that case is resolved here without a poll round-trip.
    (Previously a synchronous "failed" without a "result" key, or a
    "cancelled" status, fell through to polling an already-finished job.)
    """
    run_id = state.get("run_id", "")
    inp = state.get("input", {})
    # Build release_check inputs from graph input, applying defaults.
    task_inputs = {
        "service_name": inp.get("service_name", "unknown"),
        "diff": inp.get("diff_text", ""),
        "fail_fast": inp.get("fail_fast", True),
        "run_smoke": inp.get("run_smoke", False),
        "run_drift": inp.get("run_drift", True),
        "run_deps": inp.get("run_deps", True),
        "deps_targets": inp.get("deps_targets", ["python", "node"]),
        "deps_vuln_mode": inp.get("deps_vuln_mode", "offline_cache"),
        "deps_fail_on": inp.get("deps_fail_on", ["CRITICAL", "HIGH"]),
        "drift_categories": inp.get("drift_categories", ["services", "openapi", "nats", "tools"]),
        "risk_profile": inp.get("risk_profile", "default"),
    }
    # Optional OpenAPI snapshots are forwarded only when provided.
    if inp.get("openapi_base"):
        task_inputs["openapi_base"] = inp["openapi_base"]
    if inp.get("openapi_head"):
        task_inputs["openapi_head"] = inp["openapi_head"]
    overall_timeout = inp.get("timeouts", {}).get("overall_sec", 180)
    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="job_orchestrator_tool",
            action="start_task",
            params={"task_id": "release_check", "inputs": task_inputs, "timeout_sec": overall_timeout},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="start_job",
        )
    if not result.success:
        logger.error("release_check: start_job failed run=%s err=%s", run_id, result.error_message)
        return {
            **state,
            "job_id": None,
            "poll_count": 0,
            "graph_status": "failed",
            "error": f"start_task failed: {result.error_message}",
        }
    data = result.data or {}
    job_id = data.get("job_id") or data.get("id")
    job_status = data.get("status", "running")
    logger.info("release_check: job started run=%s job_id=%s status=%s", run_id, job_id, job_status)
    # Synchronous success with an inline report: finish immediately.
    if job_status == "succeeded" and "result" in data:
        return {
            **state,
            "job_id": job_id,
            "job_status": job_status,
            "poll_count": 0,
            "result": data.get("result"),
            "graph_status": "succeeded",
            "error": None,
        }
    # Synchronous failure/cancellation: no point polling a finished job.
    if job_status in ("failed", "cancelled"):
        return {
            **state,
            "job_id": job_id,
            "job_status": job_status,
            "poll_count": 0,
            "result": data.get("result"),
            "graph_status": "failed",
            # Fall back to a readable message so finalize never reports None.
            "error": data.get("error") or f"Job {job_status}",
        }
    # Queued/running: hand off to poll_job.
    return {**state, "job_id": job_id, "job_status": job_status, "poll_count": 0}
async def poll_job_node(state: ReleaseCheckState) -> ReleaseCheckState:
    """
    Check the running job once via job_orchestrator_tool action=get_job.

    The conditional edge re-enters this node while the job is still running;
    a terminal status sets graph_status so routing proceeds to finalize.
    """
    run_id = state.get("run_id", "")
    job_id = state.get("job_id")
    attempts = state.get("poll_count", 0) + 1

    # Guard: nothing to poll — treat the run as failed.
    if not job_id:
        return {**state, "poll_count": attempts, "job_status": "failed",
                "error": "No job_id to poll", "graph_status": "failed"}

    # Guard: give up after the configured number of iterations.
    if attempts > MAX_POLL_ITERATIONS:
        logger.warning("release_check: polling timeout run=%s job=%s", run_id, job_id)
        return {**state, "poll_count": attempts, "job_status": "failed",
                "error": "Job polling timeout", "graph_status": "failed"}

    # Wait one interval before hitting the gateway again.
    await asyncio.sleep(settings.JOB_POLL_INTERVAL_SEC)
    async with GatewayClient() as gw:
        outcome = await gw.call_tool(
            tool="job_orchestrator_tool",
            action="get_job",
            params={"job_id": job_id},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="poll_job",
        )

    # Transient gateway error: keep the loop going and retry next iteration.
    if not outcome.success:
        logger.warning("release_check: poll error run=%s err=%s", run_id, outcome.error_message)
        return {**state, "poll_count": attempts}

    payload = outcome.data or {}
    status = payload.get("status", "running")
    logger.info("release_check: poll run=%s job=%s status=%s count=%d",
                run_id, job_id, status, attempts)

    new_state = {**state, "job_id": job_id, "job_status": status, "poll_count": attempts}
    if status == "succeeded":
        new_state["result"] = payload.get("result") or payload.get("output")
        new_state["graph_status"] = "succeeded"
    elif status in ("failed", "cancelled"):
        new_state["error"] = payload.get("error") or f"Job {status}"
        new_state["graph_status"] = "failed"
    return new_state
async def finalize_node(state: ReleaseCheckState) -> ReleaseCheckState:
    """
    Ensure `result` has the expected release_check report structure.

    When no report was produced (dispatch failure, polling timeout, or a
    succeeded job with an empty payload), synthesize a failing report from
    the recorded error. Uses `or` fallbacks rather than dict-get defaults
    because state may contain an explicit ``error=None`` (set on the
    synchronous-success path of start_job_node), which would otherwise
    leak None into `summary` and `recommendations`.
    """
    result = state.get("result")
    if not result:
        result = {
            "pass": False,
            "gates": [],
            "recommendations": [state.get("error") or "Unknown error"],
            "summary": state.get("error") or "Release check failed",
            "elapsed_ms": 0,
        }
    return {**state, "result": result}
# ─── Conditional routing ──────────────────────────────────────────────────────
def _should_continue_polling(state: ReleaseCheckState) -> str:
"""Route: back to poll_job if still running, else go to finalize."""
job_status = state.get("job_status", "running")
graph_status = state.get("graph_status", "")
if graph_status in ("succeeded", "failed"):
return "finalize"
if job_status in ("succeeded", "failed", "cancelled"):
return "finalize"
return "poll_job"
def _after_start(state: ReleaseCheckState) -> str:
"""Route after start_job: go directly to finalize if already done, else poll."""
if state.get("graph_status") in ("succeeded", "failed"):
return "finalize"
return "poll_job"
# ─── Graph builder ────────────────────────────────────────────────────────────
def build_release_check_graph():
    """
    Build and compile the release_check LangGraph.

    Topology:
        start_job --(already done)--> finalize --> END
        start_job --(running)--> poll_job --(loop while running)--> finalize --> END
    """
    graph = StateGraph(ReleaseCheckState)
    # Register the three nodes, then wire routing.
    for node_name, node_fn in (
        ("start_job", start_job_node),
        ("poll_job", poll_job_node),
        ("finalize", finalize_node),
    ):
        graph.add_node(node_name, node_fn)
    graph.set_entry_point("start_job")
    graph.add_conditional_edges(
        "start_job", _after_start, {"finalize": "finalize", "poll_job": "poll_job"}
    )
    graph.add_conditional_edges(
        "poll_job", _should_continue_polling, {"poll_job": "poll_job", "finalize": "finalize"}
    )
    graph.add_edge("finalize", END)
    return graph.compile()