Files
microdao-daarion/services/router/release_check_runner.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

1364 lines
52 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
release_check Internal Runner
Orchestrates all release gates by calling tool handlers sequentially (no shell).
Gates:
1. pr_reviewer_tool blocking_only (blocking)
2. config_linter_tool strict=true (blocking)
3. contract_tool diff_openapi (fail_on_breaking)
4. threatmodel_tool analyze_diff (risk_profile)
5. [optional] job_orchestrator_tool smoke_gateway
6. [optional] job_orchestrator_tool drift_check_node1
Output:
{
"pass": true|false,
"gates": [...],
"recommendations": [...],
"summary": "..."
}
"""
import asyncio
import hashlib
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Gate Policy ──────────────────────────────────────────────────────────────

# Process-wide cache of the parsed release_gate_policy.yml.
# None means "not loaded yet"; loaded once lazily by load_gate_policy().
_gate_policy_cache: Optional[Dict] = None

# Location of the policy file: <repo_root>/config/release_gate_policy.yml.
# REPO_ROOT env var overrides the default (three levels above this file).
_GATE_POLICY_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "release_gate_policy.yml",
)
def load_gate_policy(profile: str = "dev") -> Dict:
    """
    Load the release-gate policy for a profile (dev/staging/prod).

    The YAML file is read once and cached at module level. The returned dict
    maps gate names to their config ({mode, fail_on, ...}) and additionally
    carries three meta entries: "_profile", "_default_mode", and "get" — a
    callable returning a gate's config or {"mode": <default>} when absent.
    If the file is missing or the profile unknown, falls back to warn-mode
    defaults.
    """
    global _gate_policy_cache
    if _gate_policy_cache is None:
        try:
            import yaml
            with open(_GATE_POLICY_PATH, "r") as fh:
                _gate_policy_cache = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("release_gate_policy.yml not loaded: %s", exc)
            _gate_policy_cache = {}

    raw = _gate_policy_cache
    defaults = raw.get("defaults") or {}
    default_mode = defaults.get("mode", "warn")
    profiles = raw.get("profiles") or {}
    # Unknown profile falls back to "dev", then to an empty config.
    selected = profiles.get(profile) or profiles.get("dev") or {}
    gate_specs = selected.get("gates") or {}

    # Normalise: a bare scalar entry becomes {"mode": <scalar>}.
    normalised: Dict[str, Dict] = {
        name: (dict(spec) if isinstance(spec, dict) else {"mode": spec})
        for name, spec in gate_specs.items()
    }

    def _get(name: str) -> Dict:
        return normalised.get(name, {"mode": default_mode})

    return {
        "_profile": profile,
        "_default_mode": default_mode,
        "get": _get,
        **normalised,
    }
def _reload_gate_policy() -> None:
    """Invalidate the cached gate policy so the next load_gate_policy() re-reads the YAML file."""
    global _gate_policy_cache
    _gate_policy_cache = None
# ─── Gate Result ──────────────────────────────────────────────────────────────
def _gate(name: str, status: str, details: Dict = None, **extra) -> Dict:
"""Build a single gate result dict."""
g = {"name": name, "status": status}
g.update(extra)
if details:
g["details"] = details
return g
# ─── Individual Gate Runners ─────────────────────────────────────────────────
async def _run_dependency_scan(
    tool_manager,
    agent_id: str,
    targets: Optional[List[str]] = None,
    vuln_mode: str = "offline_cache",
    fail_on: Optional[List[str]] = None,
    timeout_sec: float = 40.0,
) -> Tuple[bool, Dict]:
    """Gate 3: Dependency & supply-chain vulnerability scan."""
    payload = {
        "action": "scan",
        "targets": targets or ["python", "node"],
        "vuln_mode": vuln_mode,
        "fail_on": fail_on or ["CRITICAL", "HIGH"],
        "timeout_sec": timeout_sec,
    }
    try:
        outcome = await tool_manager.execute_tool(
            "dependency_scanner_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("dependency_scan", "fail", error=outcome.error)
        body = outcome.result or {}
        scan_ok = body.get("pass", True)
        counters = body.get("stats", {})
        severity_counts = counters.get("by_severity", {})
        return scan_ok, _gate(
            "dependency_scan",
            "pass" if scan_ok else "fail",
            critical=severity_counts.get("CRITICAL", 0),
            high=severity_counts.get("HIGH", 0),
            medium=severity_counts.get("MEDIUM", 0),
            total=counters.get("vulns_total", 0),
            deps_total=counters.get("deps_total", 0),
            # Only the first five findings travel in the gate summary.
            top_vulns=(body.get("vulnerabilities") or [])[:5],
            summary=body.get("summary", ""),
        )
    except Exception as exc:
        logger.exception("Dependency scan gate error")
        return False, _gate("dependency_scan", "error", error=str(exc))
async def _run_pr_review(tool_manager, diff_text: str, agent_id: str) -> Tuple[bool, Dict]:
    """Gate 1: PR review in blocking_only mode."""
    # No diff means nothing to review — skip rather than fail.
    if not (diff_text and diff_text.strip()):
        return True, _gate("pr_review", "skipped", reason="no diff_text provided")
    payload = {
        "mode": "blocking_only",
        "diff": {
            "text": diff_text,
            "max_chars": 400000,
            "max_files": 200,
        },
        "options": {
            "mask_evidence": True,
        },
    }
    try:
        outcome = await tool_manager.execute_tool(
            "pr_reviewer_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate(
                "pr_review", "fail",
                error=outcome.error,
                blocking_count=None,
                details_ref=None,
            )
        body = outcome.result or {}
        # Fall back to counting the issue list when no explicit count is given.
        blocking_total = body.get("blocking_count", 0) or len(body.get("blocking_issues", []))
        passed = blocking_total == 0
        return passed, _gate(
            "pr_review",
            "pass" if passed else "fail",
            blocking_count=blocking_total,
            summary=body.get("summary", ""),
            score=body.get("score"),
        )
    except Exception as exc:
        logger.exception("PR review gate error")
        return False, _gate("pr_review", "error", error=str(exc))
async def _run_config_lint(tool_manager, diff_text: str, agent_id: str) -> Tuple[bool, Dict]:
    """Gate 2: Config linter with strict=true."""
    # Nothing to lint without a diff — skip rather than fail.
    if not (diff_text and diff_text.strip()):
        return True, _gate("config_lint", "skipped", reason="no diff_text provided")
    payload = {
        "source": "diff_text",
        "diff_text": diff_text,
        "options": {
            "strict": True,
            "mask_evidence": True,
            "include_recommendations": True,
        },
    }
    try:
        outcome = await tool_manager.execute_tool(
            "config_linter_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("config_lint", "fail", error=outcome.error)
        body = outcome.result or {}
        blockers = body.get("blocking_count", 0)
        clean = blockers == 0
        return clean, _gate(
            "config_lint",
            "pass" if clean else "fail",
            blocking_count=blockers,
            total_findings=body.get("total_findings", 0),
            summary=body.get("summary", ""),
        )
    except Exception as exc:
        logger.exception("Config lint gate error")
        return False, _gate("config_lint", "error", error=str(exc))
async def _run_contract_diff(
    tool_manager,
    openapi_base: Optional[str],
    openapi_head: Optional[str],
    agent_id: str,
) -> Tuple[bool, Dict]:
    """Gate 4: OpenAPI contract diff."""
    # Both spec texts are required to compute a diff — otherwise skip.
    if not (openapi_base and openapi_head):
        return True, _gate("contract_diff", "skipped",
                           reason="openapi_base or openapi_head not provided")
    payload = {
        "action": "diff_openapi",
        "base_spec": {"text": openapi_base},
        "head_spec": {"text": openapi_head},
        "options": {
            "fail_on_breaking": True,
            "mask_evidence": True,
        },
    }
    try:
        outcome = await tool_manager.execute_tool(
            "contract_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("contract_diff", "fail", error=outcome.error)
        body = outcome.result or {}
        # Fall back to counting the change list when no explicit count is given.
        breaking_total = body.get("breaking_count", 0) or len(body.get("breaking_changes", []))
        compatible = breaking_total == 0
        return compatible, _gate(
            "contract_diff",
            "pass" if compatible else "fail",
            breaking_count=breaking_total,
            summary=body.get("summary", ""),
        )
    except Exception as exc:
        logger.exception("Contract diff gate error")
        return False, _gate("contract_diff", "error", error=str(exc))
async def _run_threat_model(
    tool_manager,
    diff_text: str,
    service_name: str,
    risk_profile: str,
    agent_id: str,
) -> Tuple[bool, Dict]:
    """Gate 5: Threat model analysis of the diff for the given risk profile."""
    payload = {
        "action": "analyze_diff",
        "diff_text": diff_text or "",
        "service_name": service_name,
        "risk_profile": risk_profile,
    }
    try:
        outcome = await tool_manager.execute_tool(
            "threatmodel_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("threat_model", "fail", error=outcome.error)
        body = outcome.result or {}
        # Any high-severity threat without a mitigation blocks the release.
        open_high = body.get("unmitigated_high_count", 0)
        mitigated = open_high == 0
        return mitigated, _gate(
            "threat_model",
            "pass" if mitigated else "fail",
            unmitigated_high=open_high,
            risk_profile=risk_profile,
            summary=body.get("summary", ""),
            recommendations=body.get("recommendations", []),
        )
    except Exception as exc:
        logger.exception("Threat model gate error")
        return False, _gate("threat_model", "error", error=str(exc))
async def _run_smoke(tool_manager, agent_id: str) -> Tuple[bool, Dict]:
    """Optional gate: queue a smoke test via the job orchestrator."""
    payload = {
        "action": "start_task",
        "agent_id": agent_id,
        "params": {
            "task_id": "smoke_gateway",
            "dry_run": False,
        },
    }
    try:
        outcome = await tool_manager.execute_tool(
            "job_orchestrator_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("smoke", "fail", error=outcome.error)
        queued_id = (outcome.result or {}).get("id", "")
        # The job is only queued here — treated as an optimistic pass;
        # the final verdict comes from polling job status in production.
        return True, _gate("smoke", "pass", job_id=queued_id,
                           note="job queued, check job status for final result")
    except Exception as exc:
        logger.exception("Smoke gate error")
        return False, _gate("smoke", "error", error=str(exc))
async def _run_drift(
    tool_manager,
    agent_id: str,
    categories: Optional[List[str]] = None,
    timeout_sec: float = 25.0,
) -> Tuple[bool, Dict]:
    """
    Optional gate: drift analysis via drift_analyzer_tool.
    Fails only when the analyzer reports errors; warnings never block a release.
    """
    payload = {
        "action": "analyze",
        "categories": categories,  # None means analyze every category
        "timeout_sec": timeout_sec,
    }
    try:
        outcome = await tool_manager.execute_tool(
            "drift_analyzer_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("drift", "fail", error=outcome.error)
        body = outcome.result or {}
        drift_ok = body.get("pass", True)
        counters = body.get("stats", {})
        return drift_ok, _gate(
            "drift",
            "pass" if drift_ok else "fail",
            errors=counters.get("errors", 0),
            warnings=counters.get("warnings", 0),
            skipped=counters.get("skipped", []),
            # Only the first five findings travel in the gate summary.
            top_findings=(body.get("findings") or [])[:5],
            summary=body.get("summary", ""),
        )
    except Exception as exc:
        logger.exception("Drift gate error")
        return False, _gate("drift", "error", error=str(exc))
# ─── Main Runner ──────────────────────────────────────────────────────────────
async def run_release_check(tool_manager, inputs: Dict, agent_id: str) -> Dict:
    """
    Execute all release gates and return aggregated verdict.

    Blocking gates run first (PR review, config lint, dependency scan,
    contract diff, threat model), then the optional smoke/drift gates, then
    the policy-driven "watch" gates whose off/warn/strict mode comes from
    release_gate_policy.yml, selected by inputs["gate_profile"].

    Args:
        tool_manager: ToolManager instance (with execute_tool method)
        inputs: dict from task_registry inputs_schema
        agent_id: executing agent
    Returns:
        {
            "pass": bool,
            "gates": [...],
            "recommendations": [...],
            "summary": str,
        }
    """
    diff_text = inputs.get("diff_text", "")
    service_name = inputs.get("service_name", "unknown")
    openapi_base = inputs.get("openapi_base")
    openapi_head = inputs.get("openapi_head")
    risk_profile = inputs.get("risk_profile", "default")
    fail_fast = inputs.get("fail_fast", False)  # stop at the first failing gate
    run_smoke = inputs.get("run_smoke", False)
    run_drift = inputs.get("run_drift", False)
    gate_profile = inputs.get("gate_profile", "dev")
    gate_policy = load_gate_policy(gate_profile)

    gates: List[Dict] = []
    recommendations: List[str] = []
    overall_pass = True
    ts_start = time.monotonic()

    # ── Gate 1: PR Review ──────────────────────────────────────────────────
    ok, gate = await _run_pr_review(tool_manager, diff_text, agent_id)
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Fix blocking PR review findings before release.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 2: Config Lint ────────────────────────────────────────────────
    ok, gate = await _run_config_lint(tool_manager, diff_text, agent_id)
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Remove secrets/unsafe config before release.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 3: Dependency Scan ────────────────────────────────────────────
    run_deps = inputs.get("run_deps", True)
    if run_deps:
        ok, gate = await _run_dependency_scan(
            tool_manager,
            agent_id=agent_id,
            targets=inputs.get("deps_targets"),
            vuln_mode=inputs.get("deps_vuln_mode", "offline_cache"),
            fail_on=inputs.get("deps_fail_on") or ["CRITICAL", "HIGH"],
            timeout_sec=float(inputs.get("deps_timeout_sec", 40.0)),
        )
        gates.append(gate)
        if not ok:
            overall_pass = False
            # Prefer the scanner's own per-vulnerability recommendations.
            top = gate.get("top_vulns", [])
            top_recs = [v.get("recommendation", "") for v in top if v.get("recommendation")]
            if top_recs:
                recommendations.extend(top_recs[:3])
            else:
                recommendations.append(
                    f"Dependency scan found {gate.get('critical',0)} CRITICAL / "
                    f"{gate.get('high',0)} HIGH vulnerabilities. Upgrade before release."
                )
            if fail_fast:
                return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 4: Contract Diff ──────────────────────────────────────────────
    ok, gate = await _run_contract_diff(
        tool_manager, openapi_base, openapi_head, agent_id
    )
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Fix breaking OpenAPI changes or bump major version.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 5: Threat Model ───────────────────────────────────────────────
    ok, gate = await _run_threat_model(
        tool_manager, diff_text, service_name, risk_profile, agent_id
    )
    gates.append(gate)
    if not ok:
        overall_pass = False
        # Prefer the threat model's own recommendations when it provides any.
        threat_recs = gate.get("recommendations", [])
        recommendations.extend(threat_recs if threat_recs else
                               ["Address unmitigated high-risk threats before release."])
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 6 (optional): Smoke ───────────────────────────────────────────
    if run_smoke:
        ok, gate = await _run_smoke(tool_manager, agent_id)
        gates.append(gate)
        if not ok:
            overall_pass = False
            recommendations.append("Smoke tests failed. Investigate gateway health.")

    # ── Gate 7 (optional): Drift ───────────────────────────────────────────
    if run_drift:
        drift_categories = inputs.get("drift_categories")  # optional subset
        drift_timeout = float(inputs.get("drift_timeout_sec", 25.0))
        ok, gate = await _run_drift(tool_manager, agent_id,
                                    categories=drift_categories,
                                    timeout_sec=drift_timeout)
        gates.append(gate)
        if not ok:
            overall_pass = False
            top = gate.get("top_findings", [])
            err_titles = [f.get("title", "") for f in top if f.get("severity") == "error"]
            if err_titles:
                recommendations.append(
                    f"Drift errors found: {'; '.join(err_titles[:3])}. Fix before release."
                )
            else:
                recommendations.append("Drift analysis found errors. Reconcile before release.")

    # ── SLO Watch (policy-driven: off/warn/strict) ───────────────────────────
    # NOTE(review): gate_policy is a plain dict, so gate_policy.get (the dict
    # method) is always callable and the first branch of each `callable(...)`
    # lookup below always runs; a missing key yields None, which the `or {}`
    # fallback handles. The else-branches appear to be dead code — confirm.
    run_slo_watch = inputs.get("run_slo_watch", True)
    _sw_policy = gate_policy.get("slo_watch") if callable(gate_policy.get) else gate_policy.get("slo_watch", {})
    _sw_mode = (_sw_policy or {}).get("mode", "warn")
    if run_slo_watch and _sw_mode != "off":
        sw_window = int(inputs.get("slo_watch_window_minutes", 60))
        # NOTE(review): env comes from "followup_watch_env", not an SLO-specific
        # input key — looks like copy-paste from the follow-up watch; confirm.
        ok_sw, gate = await _run_slo_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=inputs.get("followup_watch_env", "prod"),
            window_minutes=sw_window,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Strict mode: any recorded violation blocks the release.
        if _sw_mode == "strict" and not gate.get("skipped"):
            violations = gate.get("violations", [])
            if violations:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Follow-up Watch (policy-driven: off/warn/strict) ─────────────────────
    run_followup_watch = inputs.get("run_followup_watch", True)
    _fw_policy = gate_policy.get("followup_watch") if callable(gate_policy.get) else gate_policy.get("followup_watch", {})
    _fw_mode = (_fw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    if run_followup_watch and _fw_mode != "off":
        fw_window = int(inputs.get("followup_watch_window_days", 30))
        fw_env = inputs.get("followup_watch_env", "any")
        ok_fw, gate = await _run_followup_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=fw_env,
            window_days=fw_window,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Strict mode: open incidents at fail_on severities, or any overdue
        # follow-up, block the release.
        if _fw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = (_fw_policy or {}).get("fail_on", ["P0", "P1"])
            blocking_incidents = [
                i for i in (gate.get("open_incidents") or [])
                if i.get("severity") in fail_on_sev
            ]
            has_overdue = len(gate.get("overdue_followups") or []) > 0
            if blocking_incidents or has_overdue:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Risk Watch (policy-driven: off/warn/strict) ───────────────────────────
    run_risk_watch = inputs.get("run_risk_watch", True)
    _risk_policy = gate_policy.get("risk_watch") if callable(gate_policy.get) else gate_policy.get("risk_watch", {})
    # Input override wins over the policy-configured mode.
    _risk_mode = (
        inputs.get("risk_watch_mode")
        or (_risk_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_risk_watch and _risk_mode != "off":
        risk_env = inputs.get("risk_watch_env", "prod")
        risk_warn_at = inputs.get("risk_watch_warn_at")
        risk_fail_at = inputs.get("risk_watch_fail_at")
        ok_risk, gate = await _run_risk_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=risk_env,
            warn_at=risk_warn_at,
            fail_at=risk_fail_at,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Strict mode: block once the score reaches the effective fail threshold.
        if _risk_mode == "strict" and not gate.get("skipped"):
            effective_fail_at = gate.get("effective_fail_at", 80)
            score = gate.get("score", 0)
            if score >= effective_fail_at:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Risk Delta Watch (policy-driven: off/warn/strict) ─────────────────────
    run_risk_delta_watch = inputs.get("run_risk_delta_watch", True)
    _rdw_policy = gate_policy.get("risk_delta_watch") if callable(gate_policy.get) else gate_policy.get("risk_delta_watch", {})
    _rdw_mode = (
        inputs.get("risk_delta_watch_mode")
        or (_rdw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_risk_delta_watch and _rdw_mode != "off":
        rdw_env = inputs.get("risk_delta_env", "prod")
        rdw_hours = int(inputs.get("risk_delta_hours", 24))
        rdw_warn = inputs.get("risk_delta_warn")
        rdw_fail = inputs.get("risk_delta_fail")
        ok_rdw, gate = await _run_risk_delta_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=rdw_env,
            delta_hours=rdw_hours,
            warn_delta=rdw_warn,
            fail_delta=rdw_fail,
            policy=None,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        if _rdw_mode == "strict" and not gate.get("skipped"):
            # Only block for p0_services when p0_services_strict=True (loaded inside helper)
            if gate.get("should_fail"):
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Platform Review Required (policy-driven: off/warn/strict) ───────────
    run_platform_review = inputs.get("run_platform_review_required", True)
    _prv_policy = gate_policy.get("platform_review_required") if callable(gate_policy.get) else gate_policy.get("platform_review_required", {})
    _prv_mode = (
        inputs.get("platform_review_mode")
        or (_prv_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_platform_review and _prv_mode != "off":
        ok_prv, gate = await _run_platform_review_required(
            tool_manager, agent_id,
            service_name=service_name,
            env=inputs.get("platform_review_env", "prod"),
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Strict mode: the helper decides blocking via gate["should_fail"].
        if _prv_mode == "strict" and not gate.get("skipped"):
            if gate.get("should_fail"):
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Recurrence Watch (policy-driven: off/warn/strict) ─────────────────────
    run_recurrence_watch = inputs.get("run_recurrence_watch", True)
    _rw_policy = gate_policy.get("recurrence_watch") if callable(gate_policy.get) else gate_policy.get("recurrence_watch", {})
    _rw_mode = (
        inputs.get("recurrence_watch_mode")
        or (_rw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_recurrence_watch and _rw_mode != "off":
        rw_windows = inputs.get("recurrence_watch_windows_days", [7, 30])
        rw_service = inputs.get("recurrence_watch_service", service_name)
        ok_rw, gate = await _run_recurrence_watch(
            tool_manager, agent_id,
            service_name=rw_service,
            windows_days=rw_windows,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Strict mode: block only when high recurrence was seen AND its worst
        # severity is in the policy's severity_in list.
        if _rw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = (_rw_policy or {}).get("fail_on", {}).get("severity_in", ["P0", "P1"])
            fail_on_high = (_rw_policy or {}).get("fail_on", {}).get("high_recurrence", True)
            if fail_on_high and gate.get("has_high_recurrence"):
                max_sev = gate.get("max_severity_seen", "P3")
                if max_sev in fail_on_sev:
                    overall_pass = False
                    if fail_fast:
                        return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Privacy Watch (policy-driven: off/warn/strict) ────────────────────────
    run_privacy_watch = inputs.get("run_privacy_watch", True)
    _pw_policy = gate_policy.get("privacy_watch") if callable(gate_policy.get) else gate_policy.get("privacy_watch", {})
    _pw_mode = (_pw_policy or {}).get("mode", gate_policy.get("_default_mode", "warn"))
    if run_privacy_watch and _pw_mode != "off":
        # NOTE(review): "privacy_watch_mode" here is the scan depth ("fast"),
        # distinct from the gate's off/warn/strict _pw_mode above — confirm
        # the overlapping input name is intentional.
        privacy_mode = inputs.get("privacy_watch_mode", "fast")
        privacy_audit_h = int(inputs.get("privacy_audit_window_hours", 24))
        ok_pw, gate = await _run_privacy_watch(
            tool_manager, agent_id,
            mode=privacy_mode,
            audit_window_hours=privacy_audit_h,
        )
        gates.append(gate)
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)
        # Apply strict mode: block release if findings match fail_on
        if _pw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = (_pw_policy or {}).get("fail_on", ["error"])
            all_findings = gate.get("top_findings") or []
            blocking = [f for f in all_findings if f.get("severity") in fail_on_sev]
            if blocking:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Cost Watch (always warn even in strict profiles) ──────────────────────
    run_cost_watch = inputs.get("run_cost_watch", True)
    _cw_policy = gate_policy.get("cost_watch") if callable(gate_policy.get) else gate_policy.get("cost_watch", {})
    _cw_mode = (_cw_policy or {}).get("mode", "warn")
    if run_cost_watch and _cw_mode != "off":
        cost_window_h = int(inputs.get("cost_watch_window_hours", 24))
        cost_ratio = float(inputs.get("cost_spike_ratio_threshold", 3.0))
        cost_min_calls = int(inputs.get("cost_min_calls_threshold", 50))
        _, gate = await _run_cost_watch(
            tool_manager, agent_id,
            window_hours=cost_window_h,
            ratio_threshold=cost_ratio,
            min_calls=cost_min_calls,
        )
        gates.append(gate)
        # cost_watch is never strict (cost_always_warn in policy) — recommendations only
        for rec in gate.get("recommendations", []):
            recommendations.append(rec)

    return _build_report(overall_pass, gates, recommendations, ts_start)
async def _run_slo_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    window_minutes: int = 60,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: detects SLO breaches before deploying.

    Always returns pass=True; the caller applies "strict" policy mode to the
    recorded violations. Any tool failure or exception downgrades to a
    skipped gate rather than blocking the release.

    Args:
        tool_manager: ToolManager with an async execute_tool method.
        agent_id: executing agent.
        service_name: service whose SLOs are checked.
        env: deployment environment to query.
        window_minutes: measurement window for the SLO snapshot.

    Returns:
        (True, gate_dict) with violations/metrics/thresholds and
        human-readable recommendations.
    """
    try:
        args = {
            "action": "slo_snapshot",
            "service": service_name,
            "env": env,
            "window_minutes": window_minutes,
        }
        result = await tool_manager.execute_tool(
            "observability_tool", args, agent_id=agent_id
        )
        if not result.success:
            return True, _gate("slo_watch", "pass",
                               note=f"slo_watch skipped: {result.error}", skipped=True)
        data = result.result or {}
        violations = data.get("violations", [])
        metrics = data.get("metrics", {})
        thresholds = data.get("thresholds", {})
        recs = []
        if violations and not data.get("skipped"):
            viol_desc = ", ".join(violations)
            # FIX: the two f-string fragments were implicitly concatenated with
            # no separator, yielding "...'svc'consider postponing..." — join
            # the clauses with "; ".
            recs.append(
                f"SLO violation ({viol_desc}) detected for '{service_name}'; "
                f"consider postponing deployment until service recovers"
            )
        note = (
            f"Violations: {', '.join(violations)}" if violations
            else "No SLO violations detected"
        )
        return True, _gate(
            "slo_watch", "pass",
            violations=violations,
            metrics=metrics,
            thresholds=thresholds,
            note=note,
            skipped=data.get("skipped", False),
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("slo_watch gate error: %s", e)
        return True, _gate("slo_watch", "pass",
                           note=f"slo_watch skipped (error): {e}", skipped=True)
async def _run_followup_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "any",
    window_days: int = 30,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: surfaces open P0/P1 incidents and overdue follow-ups.
    Always returns pass=True; strict-mode blocking is decided by the caller
    from the gate payload. Errors downgrade to a skipped gate.
    """
    try:
        payload = {
            "action": "incident_followups_summary",
            "service": service_name,
            "env": env,
            "window_days": window_days,
        }
        outcome = await tool_manager.execute_tool(
            "oncall_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return True, _gate("followup_watch", "pass",
                               note=f"followup_watch skipped: {outcome.error}", skipped=True)
        body = outcome.result or {}
        counters = body.get("stats", {})
        incidents = body.get("open_incidents", [])
        late_items = body.get("overdue_followups", [])
        advice = []
        if incidents:
            sev_list = ", ".join(f"{i['severity']} {i['id']}" for i in incidents[:3])
            advice.append(f"Open critical incidents: {sev_list}")
        if late_items:
            ov_list = ", ".join(f"{o['priority']} '{o['title'][:40]}' (due {o['due_date'][:10]})"
                                for o in late_items[:3])
            advice.append(f"Overdue follow-ups: {ov_list}")
        note = (
            f"{counters.get('open_incidents', 0)} open P0/P1, "
            f"{counters.get('overdue', 0)} overdue follow-ups, "
            f"{counters.get('total_open_followups', 0)} total open"
        )
        return True, _gate(
            "followup_watch", "pass",
            open_incidents=incidents[:5],
            overdue_followups=late_items[:5],
            stats=counters,
            note=note,
            recommendations=advice,
        )
    except Exception as exc:
        logger.warning("followup_watch gate error: %s", exc)
        return True, _gate("followup_watch", "pass",
                           note=f"followup_watch skipped (error): {exc}", skipped=True)
async def _run_risk_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    warn_at: Optional[int] = None,
    fail_at: Optional[int] = None,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: pulls a RiskReport for the target service and compares
    its score against warn_at/fail_at thresholds (explicit arguments override
    engine-provided values). Non-fatal: any error or missing service name
    results in a skipped gate — this helper never blocks a release itself.
    """
    try:
        if not service_name:
            # Nothing to score — skip gracefully.
            return True, _gate("risk_watch", "pass",
                               note="risk_watch skipped: no service_name provided",
                               skipped=True)
        payload: Dict = {
            "action": "service",
            "env": env,
            "service": service_name,
        }
        outcome = await tool_manager.execute_tool(
            "risk_engine_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return True, _gate("risk_watch", "pass",
                               note=f"risk_watch skipped: {outcome.error}", skipped=True)
        body = outcome.result or {}
        score = int(body.get("score", 0))
        band = body.get("band", "low")
        reasons = body.get("reasons", [])
        engine_recs = body.get("recommendations", [])
        # Effective thresholds: explicit arguments win over engine-provided ones.
        limits = body.get("thresholds", {})
        effective_warn = int(warn_at) if warn_at is not None else int(limits.get("warn_at", 50))
        effective_fail = int(fail_at) if fail_at is not None else int(limits.get("fail_at", 80))
        advice = []
        if score >= effective_warn:
            advice.append(
                f"Service '{service_name}' risk score {score} ({band}): "
                + "; ".join(reasons[:3])
            )
            advice.extend(engine_recs[:2])
        note = (
            f"score={score} band={band} warn_at={effective_warn} fail_at={effective_fail} | "
            + ("; ".join(reasons[:3]) if reasons else "no signals")
        )
        return True, _gate(
            "risk_watch", "pass",
            score=score,
            band=band,
            reasons=reasons[:5],
            effective_warn_at=effective_warn,
            effective_fail_at=effective_fail,
            components=body.get("components", {}),
            skipped=False,
            note=note,
            recommendations=advice,
        )
    except Exception as exc:
        logger.warning("risk_watch gate error: %s", exc)
        return True, _gate("risk_watch", "pass",
                           note=f"risk_watch skipped (error): {exc}", skipped=True)
async def _run_risk_delta_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    delta_hours: int = 24,
    warn_delta: Optional[int] = None,
    fail_delta: Optional[int] = None,
    policy: Optional[Dict] = None,
) -> Tuple[bool, Dict]:
    """
    Gate: checks how much the risk score rose since `delta_hours` ago.

    Non-fatal: missing history, tool failure, or any exception → skipped
    gate (never blocks by itself). Sets gate["should_fail"] = True only when
    the delta reaches fail_delta AND the service is a p0 service AND the
    policy's p0_services_strict flag is on; the caller's strict mode turns
    that flag into a release block.

    Args:
        tool_manager: ToolManager with an async execute_tool method.
        agent_id: executing agent.
        service_name: service whose risk delta is measured (required).
        env: deployment environment to query.
        delta_hours: lookback window for the score delta.
        warn_delta / fail_delta: explicit threshold overrides; otherwise the
            risk policy's default_warn_delta_24h / default_fail_delta_24h apply.
        policy: pre-loaded risk policy; loaded via risk_engine when None.
    """
    try:
        if not service_name:
            return True, _gate("risk_delta_watch", "pass",
                               note="risk_delta_watch skipped: no service_name", skipped=True)
        # Load policy locally (best-effort; empty policy → built-in defaults).
        if policy is None:
            try:
                from risk_engine import load_risk_policy
                policy = load_risk_policy()
            except Exception:
                policy = {}
        p0_services = set(policy.get("p0_services", []))
        rdw_cfg = policy.get("release_gate", {}).get("risk_delta_watch", {})
        # Thresholds: explicit arguments win over policy defaults.
        effective_warn = int(warn_delta) if warn_delta is not None else int(rdw_cfg.get("default_warn_delta_24h", 10))
        effective_fail = int(fail_delta) if fail_delta is not None else int(rdw_cfg.get("default_fail_delta_24h", 20))
        p0_strict = bool(rdw_cfg.get("p0_services_strict", True))
        # Compute current risk score
        risk_result = await tool_manager.execute_tool(
            "risk_engine_tool",
            {"action": "service", "service": service_name, "env": env,
             "include_trend": False},
            agent_id=agent_id,
        )
        if not risk_result.success:
            return True, _gate("risk_delta_watch", "pass",
                               note=f"risk_delta_watch skipped: {risk_result.error}", skipped=True)
        current_score = int((risk_result.result or {}).get("score", 0))
        current_band = (risk_result.result or {}).get("band", "low")
        # Get delta from history
        delta: Optional[int] = None
        no_history = False  # NOTE(review): assigned but never read — dead flag
        try:
            from risk_history_store import get_risk_history_store
            hstore = get_risk_history_store()
            delta = hstore.get_delta(service_name, env, hours=delta_hours)
        except Exception as he:
            logger.warning("risk_delta_watch: history unavailable: %s", he)
        if delta is None:
            # No baseline recorded yet — skip rather than guess.
            return True, _gate(
                "risk_delta_watch", "pass",
                note="No history baseline; run hourly_risk_snapshot first.",
                skipped=True,
                recommendations=["No risk history baseline available. Run hourly_risk_snapshot to establish baseline."],
            )
        # Regression flags from trend policy
        reg_warn = delta >= effective_warn
        reg_fail = delta >= effective_fail
        recs: List[str] = []
        if reg_warn:
            recs.append(
                f"Risk score for '{service_name}' rose +{delta} pts in {delta_hours}h "
                f"(current: {current_score}, band: {current_band}). "
                f"Review recent deployments and open incidents."
            )
        if reg_fail:
            recs.append(
                f"Risk regression FAIL for '{service_name}': +{delta} pts >= fail threshold {effective_fail}. "
                f"Block or roll back recent changes."
            )
        # should_fail only when: service is p0, strict enabled, delta >= fail
        is_p0 = service_name in p0_services
        should_fail = reg_fail and is_p0 and p0_strict
        note = (
            f"delta_{delta_hours}h={delta} current_score={current_score} band={current_band} "
            f"warn_at={effective_warn} fail_at={effective_fail} is_p0={is_p0}"
        )
        return True, _gate(
            "risk_delta_watch", "pass",
            delta=delta,
            delta_hours=delta_hours,
            current_score=current_score,
            current_band=current_band,
            effective_warn_delta=effective_warn,
            effective_fail_delta=effective_fail,
            regression_warn=reg_warn,
            regression_fail=reg_fail,
            is_p0=is_p0,
            should_fail=should_fail,
            skipped=False,
            note=note,
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("risk_delta_watch gate error: %s", e)
        return True, _gate("risk_delta_watch", "pass",
                           note=f"risk_delta_watch skipped (error): {e}", skipped=True)
async def _run_platform_review_required(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
) -> Tuple[bool, Dict]:
    """
    Gate: computes the Architecture Pressure score for the target service.

    Warn mode: always pass=True, only adds advisory recommendations.
    Strict mode: flags should_fail=True once pressure reaches fail_at.
    Non-fatal: any error degrades to a skipped gate so a release is never
    blocked by gate infrastructure itself.
    """
    try:
        if not service_name:
            return True, _gate(
                "platform_review_required", "pass",
                note="platform_review_required skipped: no service_name",
                skipped=True,
            )
        # Thresholds come from the architecture-pressure policy file; use
        # built-in defaults when the policy cannot be loaded.
        try:
            from architecture_pressure import load_pressure_policy
            policy_doc = load_pressure_policy()
        except Exception:
            policy_doc = {}
        thresholds = policy_doc.get("release_gate", {}).get(
            "platform_review_required", {}
        )
        warn_threshold = int(thresholds.get("warn_at", 60))
        fail_threshold = int(thresholds.get("fail_at", 85))
        # Ask the pressure tool for the current score.
        tool_out = await tool_manager.execute_tool(
            "architecture_pressure_tool",
            {"action": "service", "service": service_name, "env": env},
            agent_id=agent_id,
        )
        if not tool_out.success:
            return True, _gate(
                "platform_review_required", "pass",
                note=f"platform_review_required skipped: {tool_out.error}",
                skipped=True,
            )
        payload = tool_out.result or {}
        pressure = int(payload.get("score", 0))
        pressure_band = payload.get("band", "low")
        pressure_signals = payload.get("signals_summary", [])
        review_flagged = bool(payload.get("requires_arch_review", False))
        # Pre-compute the signal text used by both the warn rec and the note.
        if pressure_signals:
            strain_text = "; ".join(pressure_signals[:2])
            note_tail = "; ".join(pressure_signals[:2])
        else:
            strain_text = "structural strain detected"
            note_tail = "no pressure signals"
        must_fail = pressure >= fail_threshold
        advice: List[str] = []
        if pressure >= warn_threshold:
            advice.append(
                f"Service '{service_name}' architecture pressure={pressure} ({pressure_band}): "
                + strain_text
            )
        if must_fail:
            advice.append(
                f"Architecture review required for '{service_name}' before release. "
                f"Pressure score {pressure} exceeds fail threshold {fail_threshold}."
            )
        if review_flagged:
            advice.append(
                f"Architecture review has been flagged for '{service_name}'. "
                f"Check ops/reports/platform/ for latest digest."
            )
        gate_note = (
            f"pressure_score={pressure} band={pressure_band} "
            f"warn_at={warn_threshold} fail_at={fail_threshold} | " + note_tail
        )
        return True, _gate(
            "platform_review_required", "pass",
            score=pressure,
            band=pressure_band,
            signals_summary=pressure_signals[:4],
            requires_arch_review=review_flagged,
            warn_at=warn_threshold,
            fail_at=fail_threshold,
            should_fail=must_fail,
            skipped=False,
            note=gate_note,
            recommendations=advice,
        )
    except Exception as e:
        logger.warning("platform_review_required gate error: %s", e)
        return True, _gate(
            "platform_review_required", "pass",
            note=f"platform_review_required skipped (error): {e}",
            skipped=True,
        )
async def _run_recurrence_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    windows_days: Optional[List[int]] = None,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: checks incident recurrence for the target service.

    - warn mode: always pass=True, adds recommendations.
    - strict mode: pass=False if high_recurrence + severity in fail_on list.
    Non-fatal: any error skips the gate (never blocks the release).

    Args:
        tool_manager: Tool executor exposing async execute_tool(name, args, agent_id=...).
        agent_id: Agent identity forwarded to tool execution.
        service_name: Target service; empty string queries recurrence across all services.
        windows_days: Candidate lookback windows in days; the widest one is
            used for the query. Defaults to [7, 30].
            (Annotation fixed: was `List[int] = None`, which contradicts the
            None default.)

    Returns:
        (True, gate_dict) — the bool is always True here; recurrence severity
        is reported inside the gate payload and recommendations.
    """
    if windows_days is None:
        windows_days = [7, 30]
    try:
        # Prefer focused service query; fall back to all if no service specified
        args: Dict = {
            "action": "recurrence",
            "window_days": max(windows_days) if windows_days else 7,
        }
        if service_name:
            args["service"] = service_name
        result = await tool_manager.execute_tool(
            "incident_intelligence_tool", args, agent_id=agent_id
        )
        if not result.success:
            # Incident intelligence unavailable — degrade to a skipped gate.
            return True, _gate("recurrence_watch", "pass",
                note=f"recurrence_watch skipped: {result.error}", skipped=True)
        data = result.result or {}
        high_sigs = data.get("high_recurrence", {}).get("signatures", [])
        high_kinds = data.get("high_recurrence", {}).get("kinds", [])
        warn_sigs = data.get("warn_recurrence", {}).get("signatures", [])
        warn_kinds = data.get("warn_recurrence", {}).get("kinds", [])
        has_high = bool(high_sigs or high_kinds)
        has_warn = bool(warn_sigs or warn_kinds)
        max_sev = data.get("max_severity_seen", "P3")
        total = data.get("total_incidents", 0)
        recs: List[str] = []
        if has_high:
            # High recurrence dominates: show top signature/kind buckets only.
            bucket_descs = (
                [f"sig:{s['signature'][:8]} ({s['count']}x)" for s in high_sigs[:3]]
                + [f"kind:{k['kind']} ({k['count']}x)" for k in high_kinds[:3]]
            )
            recs.append(
                f"High recurrence for '{service_name or 'all'}': "
                + ", ".join(bucket_descs)
                + " — review root cause before deploying"
            )
        elif has_warn:
            warn_descs = (
                [f"sig:{s['signature'][:8]} ({s['count']}x)" for s in warn_sigs[:2]]
                + [f"kind:{k['kind']} ({k['count']}x)" for k in warn_kinds[:2]]
            )
            recs.append(
                f"Warn-level recurrence for '{service_name or 'all'}': "
                + ", ".join(warn_descs)
            )
        note = (
            f"high={len(high_sigs)} sigs / {len(high_kinds)} kinds; "
            f"warn={len(warn_sigs)}/{len(warn_kinds)}; "
            f"total_incidents={total}; max_sev={max_sev}"
        )
        return True, _gate(
            "recurrence_watch", "pass",
            has_high_recurrence=has_high,
            has_warn_recurrence=has_warn,
            high_signatures=[s["signature"][:8] for s in high_sigs[:5]],
            high_kinds=[k["kind"] for k in high_kinds[:5]],
            max_severity_seen=max_sev,
            total_incidents=total,
            note=note,
            skipped=False,
            recommendations=recs,
        )
    except Exception as e:
        # Gate must never raise into the runner.
        logger.warning("recurrence_watch gate error: %s", e)
        return True, _gate("recurrence_watch", "pass",
            note=f"recurrence_watch skipped (error): {e}", skipped=True)
async def _run_privacy_watch(
    tool_manager,
    agent_id: str,
    mode: str = "fast",
    audit_window_hours: int = 24,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: scans the repo (fast mode) and the recent audit stream
    for privacy risks via data_governance_tool.

    Always returns pass=True; findings surface as severity counts and
    recommendations, never as a blocked release.
    """
    try:
        # Primary signal: repo scan.
        repo_call = await tool_manager.execute_tool(
            "data_governance_tool",
            {"action": "scan_repo", "mode": mode, "max_files": 200,
             "paths_include": ["services/", "config/", "ops/"]},
            agent_id=agent_id,
        )
        repo_payload = repo_call.result or {} if repo_call.success else {}
        # Secondary signal: audit-stream scan (best-effort, non-fatal).
        audit_payload: Dict = {}
        try:
            audit_call = await tool_manager.execute_tool(
                "data_governance_tool",
                {"action": "scan_audit", "time_window_hours": audit_window_hours},
                agent_id=agent_id,
            )
            if audit_call.success:
                audit_payload = audit_call.result or {}
        except Exception:
            pass
        # Merge findings from both scans.
        findings = (repo_payload.get("findings") or []) + (audit_payload.get("findings") or [])
        # Dedupe recommendations while preserving first-seen order.
        merged_recs: List[str] = []
        seen = set()
        for rec in (repo_payload.get("recommendations") or []) + (audit_payload.get("recommendations") or []):
            if rec not in seen:
                seen.add(rec)
                merged_recs.append(rec)
        # Single pass over findings instead of three sum() sweeps.
        severity_counts = {"error": 0, "warning": 0, "info": 0}
        for finding in findings:
            sev = finding.get("severity")
            if sev in severity_counts:
                severity_counts[sev] += 1
        n_err = severity_counts["error"]
        n_warn = severity_counts["warning"]
        n_info = severity_counts["info"]
        n_total = n_err + n_warn + n_info
        gate_note = (
            f"{n_total} finding(s): {n_err} error(s), {n_warn} warning(s)"
            if n_total else "No privacy findings"
        )
        return True, _gate(
            "privacy_watch", "pass",
            errors=n_err,
            warnings=n_warn,
            infos=n_info,
            top_findings=[
                {"id": f.get("id"), "title": f.get("title"), "severity": f.get("severity")}
                for f in findings[:5]
            ],
            note=gate_note,
            recommendations=merged_recs,
        )
    except Exception as e:
        logger.warning("privacy_watch gate error: %s", e)
        return True, _gate("privacy_watch", "pass", note=f"privacy_watch skipped (error): {e}", skipped=True)
async def _run_cost_watch(
    tool_manager,
    agent_id: str,
    window_hours: int = 24,
    ratio_threshold: float = 3.0,
    min_calls: int = 50,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: surfaces cost/resource anomalies from cost_analyzer_tool.

    Always returns pass=True (never blocks the release). Spikes affecting
    priority tools — configured in config/cost_weights.yml, with a built-in
    fallback list — are promoted into recommendations.
    """
    try:
        analyzer_call = await tool_manager.execute_tool(
            "cost_analyzer_tool",
            {
                "action": "anomalies",
                # Compare the most recent quarter of the window ...
                "window_minutes": int(window_hours * 60 / 4),
                # ... against the full window as baseline.
                "baseline_hours": window_hours,
                "ratio_threshold": ratio_threshold,
                "min_calls": min_calls,
            },
            agent_id=agent_id,
        )
        if not analyzer_call.success:
            return True, _gate("cost_watch", "pass", note=f"cost_analyzer unavailable: {analyzer_call.error}", skipped=True)
        payload = analyzer_call.result or {}
        found_anomalies = payload.get("anomalies", [])
        num_anomalies = len(found_anomalies)
        # Load priority-tool configuration (best-effort; falls back below).
        weights_cfg: Dict = {}
        try:
            import yaml
            import os
            weights_path = os.path.join(
                os.getenv("REPO_ROOT", str(__file__).rsplit("/services", 1)[0]),
                "config", "cost_weights.yml",
            )
            with open(weights_path) as f:
                weights_cfg = yaml.safe_load(f) or {}
        except Exception:
            pass
        priority = set((weights_cfg.get("anomaly") or {}).get("priority_tools") or [
            "comfy_generate_video", "comfy_generate_image", "pr_reviewer_tool",
            "job_orchestrator_tool", "observability_tool",
        ])
        # Only anomalies on priority tools become actionable recommendations.
        advisories = [
            a.get("recommendation", f"Cost spike on {a.get('tool')} (ratio={a.get('ratio')})")
            for a in found_anomalies
            if a.get("tool") in priority
        ]
        return True, _gate(
            "cost_watch", "pass",
            anomalies_count=num_anomalies,
            anomalies_preview=[
                {"tool": a.get("tool"), "type": a.get("type"), "ratio": a.get("ratio")}
                for a in found_anomalies[:5]
            ],
            note=(f"{num_anomalies} anomaly(ies) detected" if num_anomalies else "No anomalies detected"),
            recommendations=advisories,
        )
    except Exception as e:
        logger.warning("cost_watch gate error: %s", e)
        return True, _gate("cost_watch", "pass", note=f"cost_watch skipped (error): {e}", skipped=True)
def _build_report(
overall_pass: bool,
gates: List[Dict],
recommendations: List[str],
ts_start: float,
) -> Dict:
elapsed_ms = round((time.monotonic() - ts_start) * 1000, 1)
failed_gates = [g["name"] for g in gates if g.get("status") == "fail"]
error_gates = [g["name"] for g in gates if g.get("status") == "error"]
passed_gates = [g["name"] for g in gates if g.get("status") == "pass"]
skipped_gates = [g["name"] for g in gates if g.get("status") == "skipped"]
if overall_pass:
summary = f"✅ RELEASE CHECK PASSED in {elapsed_ms}ms. Gates: {passed_gates}."
else:
summary = (
f"❌ RELEASE CHECK FAILED in {elapsed_ms}ms. "
f"Failed: {failed_gates}. Errors: {error_gates}."
)
if skipped_gates:
summary += f" Skipped: {skipped_gates}."
return {
"pass": overall_pass,
"gates": gates,
"recommendations": list(dict.fromkeys(recommendations)), # dedupe preserving order
"summary": summary,
"elapsed_ms": elapsed_ms,
}