"""
release_check Internal Runner

Orchestrates all release gates by calling tool handlers sequentially (no shell).

Gates:
 1. pr_reviewer_tool – blocking_only (blocking)
 2. config_linter_tool – strict=true (blocking)
 3. contract_tool – diff_openapi (fail_on_breaking)
 4. threatmodel_tool – analyze_diff (risk_profile)
 5. [optional] job_orchestrator_tool – smoke_gateway
 6. [optional] job_orchestrator_tool – drift_check_node1

Output:
    { "pass": true|false, "gates": [...], "recommendations": [...], "summary": "..." }
"""
import asyncio
import hashlib
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# ─── Gate Policy ──────────────────────────────────────────────────────────────

# Module-level cache of the parsed YAML policy; None means "not loaded yet".
_gate_policy_cache: Optional[Dict] = None

# Policy file lives under <repo root>/config/; REPO_ROOT env var overrides the
# default of three directories above this file.
_GATE_POLICY_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config",
    "release_gate_policy.yml",
)


def load_gate_policy(profile: str = "dev") -> Dict:
    """
    Load gate policy for the given profile (dev/staging/prod).
    Returns dict of {gate_name: {mode, fail_on, ...}}.
    Falls back to defaults (warn) if config missing or profile unknown.

    The YAML is read once and cached module-wide; call _reload_gate_policy()
    to force a re-read.
    """
    global _gate_policy_cache
    if _gate_policy_cache is None:
        try:
            import yaml
            with open(_GATE_POLICY_PATH, "r") as f:
                # Empty file parses to None → normalise to {}.
                _gate_policy_cache = yaml.safe_load(f) or {}
        except Exception as e:
            # Missing/unreadable policy is non-fatal: everything falls back to "warn".
            logger.warning("release_gate_policy.yml not loaded: %s", e)
            _gate_policy_cache = {}
    cfg = _gate_policy_cache
    profiles = cfg.get("profiles") or {}
    defaults = cfg.get("defaults") or {}
    default_mode = defaults.get("mode", "warn")
    # Unknown profile falls back to "dev", then to an empty profile.
    profile_cfg = profiles.get(profile) or profiles.get("dev") or {}
    gates_cfg = profile_cfg.get("gates") or {}
    # Normalise: ensure every gate has at minimum {mode: default_mode}
    result: Dict[str, Dict] = {}
    for gate_name, gate_cfg in gates_cfg.items():
        # A gate entry may be a full dict or a bare mode string.
        result[gate_name] = dict(gate_cfg) if isinstance(gate_cfg, dict) else {"mode": gate_cfg}

    def _get(name: str) -> Dict:
        # Lookup helper with per-gate fallback to the profile default mode.
        return result.get(name, {"mode": default_mode})

    # NOTE(review): callers access this as a plain dict, and attribute access
    # `gate_policy.get` resolves to dict.get — NOT to the "get" key below — so
    # the _get fallback is effectively unreachable via `gate_policy.get(...)`.
    # Callers would need `gate_policy["get"](name)` to use it. Confirm intent.
    return {
        "_profile": profile,
        "_default_mode": default_mode,
        "get": _get,
        **result,
    }


def _reload_gate_policy() -> None:
    """Drop the cached policy so the next load_gate_policy() re-reads the YAML."""
    global _gate_policy_cache
    _gate_policy_cache = None


# ─── Gate Result ──────────────────────────────────────────────────────────────

def _gate(name: str, status: str, details: Optional[Dict] = None, **extra) -> Dict:
    """Build a single gate result dict.

    Args:
        name: gate identifier (e.g. "pr_review").
        status: "pass" | "fail" | "error" | "skipped".
        details: optional nested payload, attached only when truthy.
        **extra: arbitrary extra keys merged into the result.
    """
    g = {"name": name, "status": status}
    g.update(extra)
    if details:
        g["details"] = details
    return g


# ─── Individual Gate Runners ──────────────────────────────────────────────────

async def _run_dependency_scan(
    tool_manager,
    agent_id: str,
    targets: Optional[List[str]] = None,
    vuln_mode: str = "offline_cache",
    fail_on: Optional[List[str]] = None,
    timeout_sec: float = 40.0,
) -> Tuple[bool, Dict]:
    """Gate 3: Dependency & supply-chain vulnerability scan.

    Returns (ok, gate_dict); ok is False on blocking vulnerabilities,
    tool failure, or unexpected exception.
    """
    args = {
        "action": "scan",
        "targets": targets or ["python", "node"],
        "vuln_mode": vuln_mode,
        "fail_on": fail_on or ["CRITICAL", "HIGH"],
        "timeout_sec": timeout_sec,
    }
    try:
        result = await tool_manager.execute_tool(
            "dependency_scanner_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("dependency_scan", "fail", error=result.error)
        data = result.result or {}
        scan_pass = data.get("pass", True)
        stats = data.get("stats", {})
        by_sev = stats.get("by_severity", {})
        # Only surface the top 5 vulnerabilities in the gate summary.
        top_vulns = (data.get("vulnerabilities") or [])[:5]
        status = "pass" if scan_pass else "fail"
        return scan_pass, _gate(
            "dependency_scan",
            status,
            critical=by_sev.get("CRITICAL", 0),
            high=by_sev.get("HIGH", 0),
            medium=by_sev.get("MEDIUM", 0),
            total=stats.get("vulns_total", 0),
            deps_total=stats.get("deps_total", 0),
            top_vulns=top_vulns,
            summary=data.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Dependency scan gate error")
        return False, _gate("dependency_scan", "error", error=str(e))


async def _run_pr_review(tool_manager, diff_text: str, agent_id: str) -> Tuple[bool, Dict]:
    """Gate 1: PR review in blocking_only mode.

    Skips (passing) when no diff text is provided. Fails when the reviewer
    reports any blocking findings.
    """
    if not diff_text or not diff_text.strip():
        return True, _gate("pr_review", "skipped", reason="no diff_text provided")
    args = {
        "mode": "blocking_only",
        "diff": {
            "text": diff_text,
            "max_chars": 400000,
            "max_files": 200,
        },
        "options": {
            "mask_evidence": True,
        },
    }
    try:
        result = await tool_manager.execute_tool(
            "pr_reviewer_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("pr_review", "fail", error=result.error, blocking_count=None, details_ref=None)
        data = result.result or {}
        # Prefer the explicit count; fall back to counting the issue list.
        blocking = data.get("blocking_count", 0) or len(data.get("blocking_issues", []))
        status = "fail" if blocking > 0 else "pass"
        return blocking == 0, _gate(
            "pr_review",
            status,
            blocking_count=blocking,
            summary=data.get("summary", ""),
            score=data.get("score"),
        )
    except Exception as e:
        logger.exception("PR review gate error")
        return False, _gate("pr_review", "error", error=str(e))


async def _run_config_lint(tool_manager, diff_text: str, agent_id: str) -> Tuple[bool, Dict]:
    """Gate 2: Config linter with strict=true.

    Skips (passing) when no diff text is provided; fails on any blocking finding.
    """
    if not diff_text or not diff_text.strip():
        return True, _gate("config_lint", "skipped", reason="no diff_text provided")
    args = {
        "source": "diff_text",
        "diff_text": diff_text,
        "options": {
            "strict": True,
            "mask_evidence": True,
            "include_recommendations": True,
        },
    }
    try:
        result = await tool_manager.execute_tool(
            "config_linter_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("config_lint", "fail", error=result.error)
        data = result.result or {}
        blocking_count = data.get("blocking_count", 0)
        status = "fail" if blocking_count > 0 else "pass"
        return blocking_count == 0, _gate(
            "config_lint",
            status,
            blocking_count=blocking_count,
            total_findings=data.get("total_findings", 0),
            summary=data.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Config lint gate error")
        return False, _gate("config_lint", "error", error=str(e))


async def _run_contract_diff(
    tool_manager,
    openapi_base: Optional[str],
    openapi_head: Optional[str],
    agent_id: str,
) -> Tuple[bool, Dict]:
    """Gate 4: OpenAPI contract diff.

    Skips (passing) unless both base and head spec texts are supplied;
    fails on any breaking change.
    """
    if not openapi_base or not openapi_head:
        return True, _gate("contract_diff", "skipped", reason="openapi_base or openapi_head not provided")
    args = {
        "action": "diff_openapi",
        "base_spec": {"text": openapi_base},
        "head_spec": {"text": openapi_head},
        "options": {
            "fail_on_breaking": True,
            "mask_evidence": True,
        },
    }
    try:
        result = await tool_manager.execute_tool(
            "contract_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("contract_diff", "fail", error=result.error)
        data = result.result or {}
        # Prefer the explicit count; fall back to counting the change list.
        breaking = data.get("breaking_count", 0) or len(data.get("breaking_changes", []))
        status = "fail" if breaking > 0 else "pass"
        return breaking == 0, _gate(
            "contract_diff",
            status,
            breaking_count=breaking,
            summary=data.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Contract diff gate error")
        return False, _gate("contract_diff", "error", error=str(e))


async def _run_threat_model(
    tool_manager,
    diff_text: str,
    service_name: str,
    risk_profile: str,
    agent_id: str,
) -> Tuple[bool, Dict]:
    """Gate 5: Threat model analysis.

    Fails when the analysis reports unmitigated high-risk threats.
    """
    args = {
        "action": "analyze_diff",
        "diff_text": diff_text or "",
        "service_name": service_name,
        "risk_profile": risk_profile,
    }
    try:
        result = await tool_manager.execute_tool(
            "threatmodel_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("threat_model", "fail", error=result.error)
        data = result.result or {}
        # High risk without mitigation = blocking
        unmitigated_high = data.get("unmitigated_high_count", 0)
        status = "fail" if unmitigated_high > 0 else "pass"
        return unmitigated_high == 0, _gate(
            "threat_model",
            status,
            unmitigated_high=unmitigated_high,
            risk_profile=risk_profile,
            summary=data.get("summary", ""),
            recommendations=data.get("recommendations", []),
        )
    except Exception as e:
        logger.exception("Threat model gate error")
        return False, _gate("threat_model", "error", error=str(e))


async def _run_smoke(tool_manager, agent_id: str) -> Tuple[bool, Dict]:
    """Optional gate: smoke test via job orchestrator.

    Only queues the smoke_gateway task; a successful enqueue counts as a
    pass (the job result is not awaited here).
    """
    args = {
        "action": "start_task",
        "agent_id": agent_id,
        "params": {
            "task_id": "smoke_gateway",
            "dry_run": False,
        },
    }
    try:
        result = await tool_manager.execute_tool(
            "job_orchestrator_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("smoke", "fail", error=result.error)
        data = result.result or {}
        job_id = data.get("id", "")
        # In production: poll job status. Here we treat queued as optimistic pass.
        return True, _gate("smoke", "pass", job_id=job_id, note="job queued, check job status for final result")
    except Exception as e:
        logger.exception("Smoke gate error")
        return False, _gate("smoke", "error", error=str(e))


async def _run_drift(
    tool_manager,
    agent_id: str,
    categories: Optional[List[str]] = None,
    timeout_sec: float = 25.0,
) -> Tuple[bool, Dict]:
    """
    Optional gate: Drift analysis via drift_analyzer_tool.
    pass=false when drift finds errors (warnings don't block release).
    """
    args = {
        "action": "analyze",
        "categories": categories,  # None = all categories
        "timeout_sec": timeout_sec,
    }
    try:
        result = await tool_manager.execute_tool(
            "drift_analyzer_tool", args, agent_id=agent_id
        )
        if not result.success:
            return False, _gate("drift", "fail", error=result.error)
        data = result.result or {}
        drift_pass = data.get("pass", True)
        stats = data.get("stats", {})
        errors = stats.get("errors", 0)
        warnings = stats.get("warnings", 0)
        skipped = stats.get("skipped", [])
        status = "pass" if drift_pass else "fail"
        top_findings = (data.get("findings") or [])[:5]  # top 5 for gate summary
        return drift_pass, _gate(
            "drift",
            status,
            errors=errors,
            warnings=warnings,
            skipped=skipped,
            top_findings=top_findings,
            summary=data.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Drift gate error")
        return False, _gate("drift", "error", error=str(e))


# ─── Main Runner ──────────────────────────────────────────────────────────────

async def run_release_check(tool_manager, inputs: Dict, agent_id: str) -> Dict:
    """
    Execute all release gates and return aggregated verdict.

    Args:
        tool_manager: ToolManager instance (with execute_tool method)
        inputs: dict from task_registry inputs_schema
        agent_id: executing agent

    Returns:
        {
            "pass": bool,
            "gates": [...],
            "recommendations": [...],
            "summary": str,
        }
    """
    diff_text = inputs.get("diff_text", "")
    service_name = inputs.get("service_name", "unknown")
    openapi_base = inputs.get("openapi_base")
    openapi_head = inputs.get("openapi_head")
    risk_profile = inputs.get("risk_profile", "default")
    fail_fast = inputs.get("fail_fast", False)
    run_smoke = inputs.get("run_smoke", False)
    run_drift = inputs.get("run_drift", False)
    gate_profile = inputs.get("gate_profile", "dev")
    gate_policy = load_gate_policy(gate_profile)
    gates = []
    recommendations = []
    overall_pass = True
    ts_start = time.monotonic()

    # ── Gate 1: PR Review ──────────────────────────────────────────────────
    ok, gate = await _run_pr_review(tool_manager, diff_text, agent_id)
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Fix blocking PR review findings before release.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 2: Config Lint ────────────────────────────────────────────────
    ok, gate = await _run_config_lint(tool_manager, diff_text, agent_id)
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Remove secrets/unsafe config before release.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 3: Dependency Scan ────────────────────────────────────────────
    run_deps = inputs.get("run_deps", True)
    if run_deps:
        ok, gate = await _run_dependency_scan(
            tool_manager,
            agent_id=agent_id,
            targets=inputs.get("deps_targets"),
            vuln_mode=inputs.get("deps_vuln_mode", "offline_cache"),
            fail_on=inputs.get("deps_fail_on") or ["CRITICAL", "HIGH"],
            timeout_sec=float(inputs.get("deps_timeout_sec", 40.0)),
        )
        gates.append(gate)
        if not ok:
            overall_pass = False
            # Prefer the scanner's own per-vulnerability recommendations.
            top = gate.get("top_vulns", [])
            top_recs = [v.get("recommendation", "") for v in top if v.get("recommendation")]
            if top_recs:
                recommendations.extend(top_recs[:3])
            else:
                recommendations.append(
                    f"Dependency scan found {gate.get('critical',0)} CRITICAL / "
                    f"{gate.get('high',0)} HIGH vulnerabilities. Upgrade before release."
                )
            if fail_fast:
                return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 4 (renumbered): Contract Diff ─────────────────────────────────
    ok, gate = await _run_contract_diff(
        tool_manager, openapi_base, openapi_head, agent_id
    )
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Fix breaking OpenAPI changes or bump major version.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 5 (renumbered): Threat Model ──────────────────────────────────
    ok, gate = await _run_threat_model(
        tool_manager, diff_text, service_name, risk_profile, agent_id
    )
    gates.append(gate)
    if not ok:
        overall_pass = False
        # Collect threat model recommendations
        threat_recs = gate.get("recommendations", [])
        recommendations.extend(threat_recs if threat_recs else ["Address unmitigated high-risk threats before release."])
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Optional gate: Smoke ───────────────────────────────────────────────
    if run_smoke:
        ok, gate = await _run_smoke(tool_manager, agent_id)
        gates.append(gate)
        if not ok:
            overall_pass = False
            recommendations.append("Smoke tests failed. Investigate gateway health.")

    # ── Optional gate: Drift ───────────────────────────────────────────────
    if run_drift:
        drift_categories = inputs.get("drift_categories")  # optional subset
        drift_timeout = float(inputs.get("drift_timeout_sec", 25.0))
        ok, gate = await _run_drift(tool_manager, agent_id, categories=drift_categories, timeout_sec=drift_timeout)
        gates.append(gate)
        if not ok:
            overall_pass = False
            top = gate.get("top_findings", [])
            err_titles = [f.get("title", "") for f in top if f.get("severity") == "error"]
            if err_titles:
                recommendations.append(
                    f"Drift errors found: {'; '.join(err_titles[:3])}. Fix before release."
                )
            else:
                recommendations.append("Drift analysis found errors. Reconcile before release.")

    # ── SLO Watch (policy-driven: off/warn/strict) ─────────────────────────
    run_slo_watch = inputs.get("run_slo_watch", True)
    # NOTE(review): gate_policy is a plain dict, so `gate_policy.get` is always
    # the callable dict.get method — the `else` branches of these conditionals
    # are dead, and missing gates yield None (not the _get default). Confirm.
    _sw_policy = gate_policy.get("slo_watch") if callable(gate_policy.get) else gate_policy.get("slo_watch", {})
    _sw_mode = (_sw_policy or {}).get("mode", "warn")
    if run_slo_watch and _sw_mode != "off":
        sw_window = int(inputs.get("slo_watch_window_minutes", 60))
        ok_sw, gate = await _run_slo_watch(
            tool_manager,
            agent_id,
            service_name=service_name,
            # NOTE(review): reads the followup_watch env key — looks like a
            # copy-paste from the follow-up watch block; confirm intended key.
            env=inputs.get("followup_watch_env", "prod"),
            window_minutes=sw_window,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _sw_mode == "strict" and not gate.get("skipped"):
            violations = gate.get("violations", [])
            if violations:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Follow-up Watch (policy-driven: off/warn/strict) ───────────────────
    run_followup_watch = inputs.get("run_followup_watch", True)
    _fw_policy = gate_policy.get("followup_watch") if callable(gate_policy.get) else gate_policy.get("followup_watch", {})
    _fw_mode = (_fw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    if run_followup_watch and _fw_mode != "off":
        fw_window = int(inputs.get("followup_watch_window_days", 30))
        fw_env = inputs.get("followup_watch_env", "any")
        ok_fw, gate = await _run_followup_watch(
            tool_manager,
            agent_id,
            service_name=service_name,
            env=fw_env,
            window_days=fw_window,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _fw_mode == "strict" and not gate.get("skipped"):
            # Block only on incidents whose severity matches the policy fail_on list,
            # or when any follow-up is overdue.
            fail_on_sev = (_fw_policy or {}).get("fail_on", ["P0", "P1"])
            blocking_incidents = [
                i for i in (gate.get("open_incidents") or [])
                if i.get("severity") in fail_on_sev
            ]
            has_overdue = len(gate.get("overdue_followups") or []) > 0
            if blocking_incidents or has_overdue:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Risk Watch (policy-driven: off/warn/strict) ────────────────────────
    run_risk_watch = inputs.get("run_risk_watch", True)
    _risk_policy = gate_policy.get("risk_watch") if callable(gate_policy.get) else gate_policy.get("risk_watch", {})
    _risk_mode = (
        inputs.get("risk_watch_mode")
        or (_risk_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_risk_watch and _risk_mode != "off":
        risk_env = inputs.get("risk_watch_env", "prod")
        risk_warn_at = inputs.get("risk_watch_warn_at")
        risk_fail_at = inputs.get("risk_watch_fail_at")
        ok_risk, gate = await _run_risk_watch(
            tool_manager,
            agent_id,
            service_name=service_name,
            env=risk_env,
            warn_at=risk_warn_at,
            fail_at=risk_fail_at,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _risk_mode == "strict" and not gate.get("skipped"):
            # Block when the computed risk score reaches the effective fail threshold.
            effective_fail_at = gate.get("effective_fail_at", 80)
            score = gate.get("score", 0)
            if score >= effective_fail_at:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Risk Delta Watch (policy-driven: off/warn/strict) ──────────────────
    run_risk_delta_watch = inputs.get("run_risk_delta_watch", True)
    _rdw_policy = gate_policy.get("risk_delta_watch") if callable(gate_policy.get) else gate_policy.get("risk_delta_watch", {})
    _rdw_mode = (
        inputs.get("risk_delta_watch_mode")
        or (_rdw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_risk_delta_watch and _rdw_mode != "off":
        rdw_env = inputs.get("risk_delta_env", "prod")
        rdw_hours = int(inputs.get("risk_delta_hours", 24))
        rdw_warn = inputs.get("risk_delta_warn")
        rdw_fail = inputs.get("risk_delta_fail")
        ok_rdw, gate = await _run_risk_delta_watch(
            tool_manager,
            agent_id,
            service_name=service_name,
            env=rdw_env,
            delta_hours=rdw_hours,
            warn_delta=rdw_warn,
            fail_delta=rdw_fail,
            policy=None,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _rdw_mode == "strict" and not gate.get("skipped"):
            # Only block for p0_services when p0_services_strict=True (loaded inside helper)
            if gate.get("should_fail"):
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Platform Review Required (policy-driven: off/warn/strict) ──────────
    run_platform_review = inputs.get("run_platform_review_required", True)
    _prv_policy = gate_policy.get("platform_review_required") if callable(gate_policy.get) else gate_policy.get("platform_review_required", {})
    _prv_mode = (
        inputs.get("platform_review_mode")
        or (_prv_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_platform_review and _prv_mode != "off":
        ok_prv, gate = await _run_platform_review_required(
            tool_manager,
            agent_id,
            service_name=service_name,
            env=inputs.get("platform_review_env", "prod"),
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _prv_mode == "strict" and not gate.get("skipped"):
            if gate.get("should_fail"):
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Recurrence Watch (policy-driven: off/warn/strict) ──────────────────
    run_recurrence_watch = inputs.get("run_recurrence_watch", True)
    _rw_policy = gate_policy.get("recurrence_watch") if callable(gate_policy.get) else gate_policy.get("recurrence_watch", {})
    _rw_mode = (
        inputs.get("recurrence_watch_mode")
        or (_rw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_recurrence_watch and _rw_mode != "off":
        rw_windows = inputs.get("recurrence_watch_windows_days", [7, 30])
        rw_service = inputs.get("recurrence_watch_service", service_name)
        ok_rw, gate = await _run_recurrence_watch(
            tool_manager,
            agent_id,
            service_name=rw_service,
            windows_days=rw_windows,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _rw_mode == "strict" and not gate.get("skipped"):
            # Block only when high recurrence was seen AND its max severity is
            # in the policy's severity_in list.
            fail_on_sev = (_rw_policy or {}).get("fail_on", {}).get("severity_in", ["P0", "P1"])
            fail_on_high = (_rw_policy or {}).get("fail_on", {}).get("high_recurrence", True)
            if fail_on_high and gate.get("has_high_recurrence"):
                max_sev = gate.get("max_severity_seen", "P3")
                if max_sev in fail_on_sev:
                    overall_pass = False
                    if fail_fast:
                        return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Privacy Watch (policy-driven: off/warn/strict) ─────────────────────
    run_privacy_watch = inputs.get("run_privacy_watch", True)
    _pw_policy = gate_policy.get("privacy_watch") if callable(gate_policy.get) else gate_policy.get("privacy_watch", {})
    _pw_mode = (_pw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    if run_privacy_watch and _pw_mode != "off":
        privacy_mode = inputs.get("privacy_watch_mode", "fast")
        privacy_audit_h = int(inputs.get("privacy_audit_window_hours", 24))
        ok_pw, gate = await _run_privacy_watch(
            tool_manager,
            agent_id,
            mode=privacy_mode,
            audit_window_hours=privacy_audit_h,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Apply strict mode: block release if findings match fail_on
        if _pw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = (_pw_policy or {}).get("fail_on", ["error"])
            all_findings = gate.get("top_findings") or []
            blocking = [f for f in all_findings if f.get("severity") in fail_on_sev]
            if blocking:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Cost Watch (always warn even in strict profiles) ───────────────────
    run_cost_watch = inputs.get("run_cost_watch", True)
    _cw_policy = gate_policy.get("cost_watch") if callable(gate_policy.get) else gate_policy.get("cost_watch", {})
    _cw_mode = (_cw_policy or {}).get("mode", "warn")
    if run_cost_watch and _cw_mode != "off":
        cost_window_h = int(inputs.get("cost_watch_window_hours", 24))
        cost_ratio = float(inputs.get("cost_spike_ratio_threshold", 3.0))
        cost_min_calls = int(inputs.get("cost_min_calls_threshold", 50))
        _, gate = await _run_cost_watch(
            tool_manager,
            agent_id,
            window_hours=cost_window_h,
            ratio_threshold=cost_ratio,
            min_calls=cost_min_calls,
        )
        gates.append(gate)
        # cost_watch is never strict (cost_always_warn in policy) — recommendations only
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)

    return _build_report(overall_pass, gates, recommendations, ts_start)


async def _run_slo_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    window_minutes: int = 60,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: detects SLO breaches before deploying.
    strict mode blocks on any violation.

    Always returns (True, gate); strict blocking is applied by the caller
    based on gate["violations"]. Tool failures and exceptions degrade to
    a skipped pass (never block).
    """
    try:
        args = {
            "action": "slo_snapshot",
            "service": service_name,
            "env": env,
            "window_minutes": window_minutes,
        }
        result = await tool_manager.execute_tool(
            "observability_tool", args, agent_id=agent_id
        )
        if not result.success:
            return True, _gate("slo_watch", "pass", note=f"slo_watch skipped: {result.error}", skipped=True)
        data = result.result or {}
        violations = data.get("violations", [])
        metrics = data.get("metrics", {})
        thresholds = data.get("thresholds", {})
        recs = []
        if violations and not data.get("skipped"):
            viol_desc = ", ".join(violations)
            recs.append(
                f"SLO violation ({viol_desc}) detected for '{service_name}' — "
                f"consider postponing deployment until service recovers"
            )
        note = (
            f"Violations: {', '.join(violations)}"
            if violations
            else "No SLO violations detected"
        )
        return True, _gate(
            "slo_watch",
            "pass",
            violations=violations,
            metrics=metrics,
            thresholds=thresholds,
            note=note,
            skipped=data.get("skipped", False),
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("slo_watch gate error: %s", e)
        return True, _gate("slo_watch", "pass", note=f"slo_watch skipped (error): {e}", skipped=True)


async def _run_followup_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "any",
    window_days: int = 30,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: checks for open P0/P1 incidents and overdue follow-ups.
    Returns pass=True in warn mode; strict mode may block based on GatePolicy.

    Always returns (True, gate); the caller applies strict blocking from
    gate["open_incidents"] / gate["overdue_followups"]. Errors degrade to a
    skipped pass.
    """
    try:
        args = {
            "action": "incident_followups_summary",
            "service": service_name,
            "env": env,
            "window_days": window_days,
        }
        result = await tool_manager.execute_tool(
            "oncall_tool", args, agent_id=agent_id
        )
        if not result.success:
            return True, _gate("followup_watch", "pass", note=f"followup_watch skipped: {result.error}", skipped=True)
        data = result.result or {}
        stats = data.get("stats", {})
        open_incs = data.get("open_incidents", [])
        overdue = data.get("overdue_followups", [])
        recs = []
        if open_incs:
            sev_list = ", ".join(f"{i['severity']} {i['id']}" for i in open_incs[:3])
            recs.append(f"Open critical incidents: {sev_list}")
        if overdue:
            ov_list = ", ".join(f"{o['priority']} '{o['title'][:40]}' (due {o['due_date'][:10]})" for o in overdue[:3])
            recs.append(f"Overdue follow-ups: {ov_list}")
        note = (
            f"{stats.get('open_incidents', 0)} open P0/P1, "
            f"{stats.get('overdue', 0)} overdue follow-ups, "
            f"{stats.get('total_open_followups', 0)} total open"
        )
        return True, _gate(
            "followup_watch",
            "pass",
            open_incidents=open_incs[:5],
            overdue_followups=overdue[:5],
            stats=stats,
            note=note,
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("followup_watch gate error: %s", e)
        return True, _gate("followup_watch", "pass", note=f"followup_watch skipped (error): {e}", skipped=True)


async def _run_risk_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    warn_at: Optional[int] = None,
    fail_at: Optional[int] = None,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: computes RiskReport for the target service and
    evaluates against configurable warn_at/fail_at thresholds.
    Non-fatal: any error causes skip (never blocks release).
    """
    try:
        args: Dict = {
            "action": "service",
            "env": env,
        }
        if service_name:
            args["service"] = service_name
        else:
            # No service → skip gracefully
            return True, _gate("risk_watch", "pass", note="risk_watch skipped: no service_name provided", skipped=True)
        result = await tool_manager.execute_tool(
            "risk_engine_tool", args, agent_id=agent_id
        )
        if not result.success:
            return True, _gate("risk_watch", "pass", note=f"risk_watch skipped: {result.error}", skipped=True)
        data = result.result or {}
        score = int(data.get("score", 0))
        band = data.get("band", "low")
        reasons = data.get("reasons", [])
        engine_recs = data.get("recommendations", [])
        # Effective thresholds: input overrides > policy service override > policy defaults
        thresholds = data.get("thresholds", {})
        effective_warn = int(warn_at) if warn_at is not None else int(thresholds.get("warn_at", 50))
        effective_fail = int(fail_at) if fail_at is not None else int(thresholds.get("fail_at", 80))
        gate_recs = []
        if score >= effective_warn:
            gate_recs.append(
                f"Service '{service_name}' risk score {score} ({band}): "
                + "; ".join(reasons[:3])
            )
            gate_recs.extend(engine_recs[:2])
        note = (
            f"score={score} band={band} warn_at={effective_warn} fail_at={effective_fail} | "
            + ("; ".join(reasons[:3]) if reasons else "no signals")
        )
        return True, _gate(
            "risk_watch",
            "pass",
            score=score,
            band=band,
            reasons=reasons[:5],
            effective_warn_at=effective_warn,
            effective_fail_at=effective_fail,
            components=data.get("components", {}),
            skipped=False,
            note=note,
            recommendations=gate_recs,
        )
    except Exception as e:
        logger.warning("risk_watch gate error: %s", e)
        return True, _gate("risk_watch", "pass", note=f"risk_watch skipped (error): {e}", skipped=True)


async def _run_risk_delta_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    delta_hours: int = 24,
    warn_delta: Optional[int] = None,
    fail_delta: Optional[int] = None,
    policy: Optional[Dict] = None,
) -> Tuple[bool, Dict]:
    """
    Gate: checks how much the risk score rose since `delta_hours` ago.
    Non-fatal: missing history → skipped (never blocks).
    Sets gate["should_fail"] = True if score delta >= fail_delta AND service
    is p0 in strict mode.
    """
    try:
        if not service_name:
            return True, _gate("risk_delta_watch", "pass", note="risk_delta_watch skipped: no service_name", skipped=True)
        # Load policy locally
        if policy is None:
            try:
                from risk_engine import load_risk_policy
                policy = load_risk_policy()
            except Exception:
                policy = {}
        p0_services = set(policy.get("p0_services", []))
        rdw_cfg = policy.get("release_gate", {}).get("risk_delta_watch", {})
        # Explicit arguments override the policy defaults.
        effective_warn = int(warn_delta) if warn_delta is not None else int(rdw_cfg.get("default_warn_delta_24h", 10))
        effective_fail = int(fail_delta) if fail_delta is not None else int(rdw_cfg.get("default_fail_delta_24h", 20))
        p0_strict = bool(rdw_cfg.get("p0_services_strict", True))
        # Compute current risk score
        risk_result = await tool_manager.execute_tool(
            "risk_engine_tool",
            {"action": "service", "service": service_name, "env": env, "include_trend": False},
            agent_id=agent_id,
        )
        if not risk_result.success:
            return True, _gate("risk_delta_watch", "pass", note=f"risk_delta_watch skipped: {risk_result.error}", skipped=True)
        current_score = int((risk_result.result or {}).get("score", 0))
        current_band = (risk_result.result or {}).get("band", "low")
        # Get delta from history
        delta: Optional[int] = None
        no_history = False  # NOTE(review): assigned but never read — candidate for removal
        try:
            from risk_history_store import get_risk_history_store
            hstore = get_risk_history_store()
            delta = hstore.get_delta(service_name, env, hours=delta_hours)
        except Exception as he:
            logger.warning("risk_delta_watch: history unavailable: %s", he)
        if delta is None:
            # No baseline to compare against → skip rather than block.
            return True, _gate(
                "risk_delta_watch",
                "pass",
                note="No history baseline; run hourly_risk_snapshot first.",
                skipped=True,
                recommendations=["No risk history baseline available. Run hourly_risk_snapshot to establish baseline."],
            )
        # Regression flags from trend policy
        reg_warn = delta >= effective_warn
        reg_fail = delta >= effective_fail
        recs: List[str] = []
        if reg_warn:
            recs.append(
                f"Risk score for '{service_name}' rose +{delta} pts in {delta_hours}h "
                f"(current: {current_score}, band: {current_band}). "
                f"Review recent deployments and open incidents."
            )
        if reg_fail:
            recs.append(
                f"Risk regression FAIL for '{service_name}': +{delta} pts >= fail threshold {effective_fail}. "
                f"Block or roll back recent changes."
            )
        # should_fail only when: service is p0, strict enabled, delta >= fail
        is_p0 = service_name in p0_services
        should_fail = reg_fail and is_p0 and p0_strict
        note = (
            f"delta_{delta_hours}h={delta} current_score={current_score} band={current_band} "
            f"warn_at={effective_warn} fail_at={effective_fail} is_p0={is_p0}"
        )
        return True, _gate(
            "risk_delta_watch",
            "pass",
            delta=delta,
            delta_hours=delta_hours,
            current_score=current_score,
            current_band=current_band,
            effective_warn_delta=effective_warn,
            effective_fail_delta=effective_fail,
            regression_warn=reg_warn,
            regression_fail=reg_fail,
            is_p0=is_p0,
            should_fail=should_fail,
            skipped=False,
            note=note,
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("risk_delta_watch gate error: %s", e)
        return True, _gate("risk_delta_watch", "pass", note=f"risk_delta_watch skipped (error): {e}", skipped=True)


async def _run_platform_review_required(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
) -> Tuple[bool, Dict]:
    """
    Gate: Computes Architecture Pressure for the service.
    In warn mode: always pass=True, adds recommendations.
    In strict mode: sets should_fail=True if pressure >= fail_at.
    Non-fatal: any error causes skip (never blocks release).
    """
    try:
        if not service_name:
            return True, _gate("platform_review_required", "pass", note="platform_review_required skipped: no service_name", skipped=True)
        # Load architecture pressure policy for thresholds
        try:
            from architecture_pressure import load_pressure_policy
            pressure_policy = load_pressure_policy()
        except Exception:
            pressure_policy = {}
        gate_cfg = pressure_policy.get("release_gate", {}).get(
            "platform_review_required", {}
        )
        warn_at = int(gate_cfg.get("warn_at", 60))
        fail_at = int(gate_cfg.get("fail_at", 85))
        # Compute pressure via tool_manager
        result = await tool_manager.execute_tool(
            "architecture_pressure_tool",
            {"action": "service", "service": service_name, "env": env},
            agent_id=agent_id,
        )
        if not result.success:
            return True, _gate("platform_review_required", "pass", note=f"platform_review_required skipped: {result.error}", skipped=True)
        data = result.result or {}
        score = int(data.get("score", 0))
        band = data.get("band", "low")
        signals = data.get("signals_summary", [])
        requires_review = bool(data.get("requires_arch_review", False))
        gate_recs = []
        should_fail = False
        if score >= warn_at:
            gate_recs.append(
                f"Service '{service_name}' architecture pressure={score} ({band}): "
                + ("; ".join(signals[:2]) if signals else "structural strain detected")
            )
        if score >= fail_at:
            gate_recs.append(
                f"Architecture review required for '{service_name}' before release. "
                f"Pressure score {score} exceeds fail threshold {fail_at}."
            )
            should_fail = True
        if requires_review:
            gate_recs.append(
                f"Architecture review has been flagged for '{service_name}'. "
                f"Check ops/reports/platform/ for latest digest."
            )
        note = (
            f"pressure_score={score} band={band} warn_at={warn_at} fail_at={fail_at} | "
            + ("; ".join(signals[:2]) if signals else "no pressure signals")
        )
        return True, _gate(
            "platform_review_required",
            "pass",
            score=score,
            band=band,
            signals_summary=signals[:4],
            requires_arch_review=requires_review,
            warn_at=warn_at,
            fail_at=fail_at,
            should_fail=should_fail,
            skipped=False,
            note=note,
            recommendations=gate_recs,
        )
    except Exception as e:
        logger.warning("platform_review_required gate error: %s", e)
        return True, _gate("platform_review_required", "pass", note=f"platform_review_required skipped (error): {e}", skipped=True)


async def _run_recurrence_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    windows_days: Optional[List[int]] = None,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: checks incident recurrence for the target service.
    - warn mode: always pass=True, adds recommendations.
    - strict mode: pass=False if high_recurrence + severity in fail_on list.
    Non-fatal: any error skips the gate.
    """
    if windows_days is None:
        windows_days = [7, 30]
    try:
        # Prefer focused service query; fall back to all if no service specified
        args: Dict = {
            "action": "recurrence",
            # The tool takes a single window; use the widest one requested.
            "window_days": max(windows_days) if windows_days else 7,
        }
        if service_name:
            args["service"] = service_name
        result = await tool_manager.execute_tool(
            "incident_intelligence_tool", args, agent_id=agent_id
        )
        if not result.success:
            return True, _gate("recurrence_watch", "pass", note=f"recurrence_watch skipped: {result.error}", skipped=True)
        data = result.result or {}
        high_sigs = data.get("high_recurrence", {}).get("signatures", [])
        high_kinds = data.get("high_recurrence", {}).get("kinds", [])
        warn_sigs = data.get("warn_recurrence", {}).get("signatures", [])
        warn_kinds = data.get("warn_recurrence", {}).get("kinds", [])
        has_high = bool(high_sigs or high_kinds)
        has_warn = bool(warn_sigs or warn_kinds)
        max_sev = data.get("max_severity_seen", "P3")
        total = data.get("total_incidents", 0)
        recs = []
        if has_high:
            bucket_descs = (
                [f"sig:{s['signature'][:8]} ({s['count']}x)" for s in high_sigs[:3]]
                + [f"kind:{k['kind']} ({k['count']}x)" for k in high_kinds[:3]]
            )
            recs.append(
                f"High recurrence for '{service_name or 'all'}': "
                + ", ".join(bucket_descs)
                + " — review root cause before deploying"
            )
        elif has_warn:
            warn_descs = (
                [f"sig:{s['signature'][:8]} ({s['count']}x)" for s in warn_sigs[:2]]
                + [f"kind:{k['kind']} ({k['count']}x)" for k in warn_kinds[:2]]
            )
            recs.append(
                f"Warn-level recurrence for '{service_name or 'all'}': "
                + ", ".join(warn_descs)
            )
        note = (
            f"high={len(high_sigs)} sigs / {len(high_kinds)} kinds; "
            f"warn={len(warn_sigs)}/{len(warn_kinds)}; "
            f"total_incidents={total}; max_sev={max_sev}"
        )
        return True, _gate(
            "recurrence_watch",
            "pass",
            has_high_recurrence=has_high,
            has_warn_recurrence=has_warn,
            high_signatures=[s["signature"][:8] for s in high_sigs[:5]],
            high_kinds=[k["kind"] for k in high_kinds[:5]],
            max_severity_seen=max_sev,
            total_incidents=total,
            note=note,
            skipped=False,
recommendations=recs, ) except Exception as e: logger.warning("recurrence_watch gate error: %s", e) return True, _gate("recurrence_watch", "pass", note=f"recurrence_watch skipped (error): {e}", skipped=True) async def _run_privacy_watch( tool_manager, agent_id: str, mode: str = "fast", audit_window_hours: int = 24, ) -> Tuple[bool, Dict]: """ Warning-only gate: scans repo (fast mode) and recent audit stream for privacy risks. Always returns pass=True. Adds recommendations for errors/warnings found. """ try: # scan_repo (fast) repo_args = {"action": "scan_repo", "mode": mode, "max_files": 200, "paths_include": ["services/", "config/", "ops/"]} repo_result = await tool_manager.execute_tool( "data_governance_tool", repo_args, agent_id=agent_id ) repo_data = repo_result.result or {} if repo_result.success else {} # scan_audit (optional, non-fatal) audit_data: Dict = {} try: audit_args = {"action": "scan_audit", "time_window_hours": audit_window_hours} audit_result = await tool_manager.execute_tool( "data_governance_tool", audit_args, agent_id=agent_id ) if audit_result.success: audit_data = audit_result.result or {} except Exception: pass # Merge findings all_findings = (repo_data.get("findings") or []) + (audit_data.get("findings") or []) all_recs = list(dict.fromkeys( (repo_data.get("recommendations") or []) + (audit_data.get("recommendations") or []) )) errors = sum(1 for f in all_findings if f.get("severity") == "error") warnings = sum(1 for f in all_findings if f.get("severity") == "warning") infos = sum(1 for f in all_findings if f.get("severity") == "info") total = errors + warnings + infos note = ( f"{total} finding(s): {errors} error(s), {warnings} warning(s)" if total else "No privacy findings" ) return True, _gate( "privacy_watch", "pass", errors=errors, warnings=warnings, infos=infos, top_findings=[ {"id": f.get("id"), "title": f.get("title"), "severity": f.get("severity")} for f in all_findings[:5] ], note=note, recommendations=all_recs, ) except Exception 
as e: logger.warning("privacy_watch gate error: %s", e) return True, _gate("privacy_watch", "pass", note=f"privacy_watch skipped (error): {e}", skipped=True) async def _run_cost_watch( tool_manager, agent_id: str, window_hours: int = 24, ratio_threshold: float = 3.0, min_calls: int = 50, ) -> Tuple[bool, Dict]: """ Warning-only gate: detects cost/resource anomalies via cost_analyzer_tool. Always returns pass=True (does not block release). Appends recommendations for high-ratio spikes on priority tools. """ try: args = { "action": "anomalies", "window_minutes": int(window_hours * 60 / 4), # last 25% of window "baseline_hours": window_hours, "ratio_threshold": ratio_threshold, "min_calls": min_calls, } result = await tool_manager.execute_tool( "cost_analyzer_tool", args, agent_id=agent_id ) if not result.success: return True, _gate("cost_watch", "pass", note=f"cost_analyzer unavailable: {result.error}", skipped=True) data = result.result or {} anomalies = data.get("anomalies", []) anon_count = len(anomalies) recs = [] cfg_weights: Dict = {} try: import yaml import os weights_path = os.path.join( os.getenv("REPO_ROOT", str(__file__).rsplit("/services", 1)[0]), "config", "cost_weights.yml", ) with open(weights_path) as f: cfg_weights = yaml.safe_load(f) or {} except Exception: pass priority_tools = set((cfg_weights.get("anomaly") or {}).get("priority_tools") or [ "comfy_generate_video", "comfy_generate_image", "pr_reviewer_tool", "job_orchestrator_tool", "observability_tool", ]) for a in anomalies: if a.get("tool") in priority_tools: recs.append(a.get("recommendation", f"Cost spike on {a.get('tool')} (ratio={a.get('ratio')})")) return True, _gate( "cost_watch", "pass", anomalies_count=anon_count, anomalies_preview=[ {"tool": a.get("tool"), "type": a.get("type"), "ratio": a.get("ratio")} for a in anomalies[:5] ], note=(f"{anon_count} anomaly(ies) detected" if anon_count else "No anomalies detected"), recommendations=recs, ) except Exception as e: 
logger.warning("cost_watch gate error: %s", e) return True, _gate("cost_watch", "pass", note=f"cost_watch skipped (error): {e}", skipped=True) def _build_report( overall_pass: bool, gates: List[Dict], recommendations: List[str], ts_start: float, ) -> Dict: elapsed_ms = round((time.monotonic() - ts_start) * 1000, 1) failed_gates = [g["name"] for g in gates if g.get("status") == "fail"] error_gates = [g["name"] for g in gates if g.get("status") == "error"] passed_gates = [g["name"] for g in gates if g.get("status") == "pass"] skipped_gates = [g["name"] for g in gates if g.get("status") == "skipped"] if overall_pass: summary = f"✅ RELEASE CHECK PASSED in {elapsed_ms}ms. Gates: {passed_gates}." else: summary = ( f"❌ RELEASE CHECK FAILED in {elapsed_ms}ms. " f"Failed: {failed_gates}. Errors: {error_gates}." ) if skipped_gates: summary += f" Skipped: {skipped_gates}." return { "pass": overall_pass, "gates": gates, "recommendations": list(dict.fromkeys(recommendations)), # dedupe preserving order "summary": summary, "elapsed_ms": elapsed_ms, }