New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
1364 lines · 52 KiB · Python
"""
release_check Internal Runner

Orchestrates all release gates by calling tool handlers sequentially (no shell).

Gates:
  1. pr_reviewer_tool – blocking_only (blocking)
  2. config_linter_tool – strict=true (blocking)
  3. contract_tool – diff_openapi (fail_on_breaking)
  4. threatmodel_tool – analyze_diff (risk_profile)
  5. [optional] job_orchestrator_tool – smoke_gateway
  6. [optional] job_orchestrator_tool – drift_check_node1

Output:
  {
    "pass": true|false,
    "gates": [...],
    "recommendations": [...],
    "summary": "..."
  }
"""
|
||
|
||
import asyncio
import hashlib
import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

# ─── Gate Policy ──────────────────────────────────────────────────────────────

# Lazily-populated cache of the parsed release_gate_policy.yml contents.
# None means "not loaded yet": load_gate_policy() fills it on first use and
# _reload_gate_policy() resets it to force a re-read.
_gate_policy_cache: Optional[Dict] = None

# Location of the gate policy file: <REPO_ROOT>/config/release_gate_policy.yml.
# REPO_ROOT defaults to three directories above this module — assumes this
# file sits three levels under the repo root; TODO confirm against layout.
_GATE_POLICY_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "release_gate_policy.yml",
)
|
||
|
||
|
||
def load_gate_policy(profile: str = "dev") -> Dict:
    """
    Load gate policy for the given profile (dev/staging/prod).

    Returns a dict of {gate_name: {mode, fail_on, ...}} plus metadata keys:
    "_profile" (the profile used), "_default_mode" (fallback mode for gates
    absent from the profile), and "get" (a lookup helper kept for backward
    compatibility — note that attribute access ``policy.get`` on the returned
    dict resolves to ``dict.get``, not to this entry; use ``policy["get"]``).

    Falls back to defaults (warn) if config missing or profile unknown.
    The parsed YAML is cached module-wide; call _reload_gate_policy() to
    force a re-read.
    """
    global _gate_policy_cache
    if _gate_policy_cache is None:
        try:
            import yaml
            with open(_GATE_POLICY_PATH, "r") as f:
                loaded = yaml.safe_load(f)
            # Robustness fix: a YAML document whose top level is a list or
            # scalar would break every .get() lookup below — coerce to {}.
            _gate_policy_cache = loaded if isinstance(loaded, dict) else {}
        except Exception as e:
            logger.warning("release_gate_policy.yml not loaded: %s", e)
            _gate_policy_cache = {}

    cfg = _gate_policy_cache
    profiles = cfg.get("profiles") or {}
    defaults = cfg.get("defaults") or {}
    default_mode = defaults.get("mode", "warn")

    # Unknown profile falls back to "dev"; a missing "dev" yields no gates.
    profile_cfg = profiles.get(profile) or profiles.get("dev") or {}
    gates_cfg = profile_cfg.get("gates") or {}

    # Normalise: ensure every gate maps to a dict; a bare scalar value is
    # interpreted as that gate's mode.
    result: Dict[str, Dict] = {}
    for gate_name, gate_cfg in gates_cfg.items():
        result[gate_name] = dict(gate_cfg) if isinstance(gate_cfg, dict) else {"mode": gate_cfg}

    def _get(name: str) -> Dict:
        """Per-gate config lookup, defaulting to {"mode": default_mode}."""
        return result.get(name, {"mode": default_mode})

    return {
        "_profile": profile,
        "_default_mode": default_mode,
        "get": _get,
        **result,
    }
|
||
|
||
|
||
def _reload_gate_policy() -> None:
|
||
global _gate_policy_cache
|
||
_gate_policy_cache = None
|
||
|
||
|
||
# ─── Gate Result ──────────────────────────────────────────────────────────────
|
||
|
||
def _gate(name: str, status: str, details: Dict = None, **extra) -> Dict:
|
||
"""Build a single gate result dict."""
|
||
g = {"name": name, "status": status}
|
||
g.update(extra)
|
||
if details:
|
||
g["details"] = details
|
||
return g
|
||
|
||
|
||
# ─── Individual Gate Runners ─────────────────────────────────────────────────
|
||
|
||
async def _run_dependency_scan(
    tool_manager,
    agent_id: str,
    targets: Optional[List[str]] = None,
    vuln_mode: str = "offline_cache",
    fail_on: Optional[List[str]] = None,
    timeout_sec: float = 40.0,
) -> Tuple[bool, Dict]:
    """Gate 3: Dependency & supply-chain vulnerability scan.

    Delegates to dependency_scanner_tool and folds the severity breakdown
    plus the first five vulnerabilities into a gate result.
    """
    payload = {
        "action": "scan",
        "targets": targets or ["python", "node"],
        "vuln_mode": vuln_mode,
        "fail_on": fail_on or ["CRITICAL", "HIGH"],
        "timeout_sec": timeout_sec,
    }
    try:
        outcome = await tool_manager.execute_tool(
            "dependency_scanner_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("dependency_scan", "fail", error=outcome.error)

        body = outcome.result or {}
        scan_ok = body.get("pass", True)
        totals = body.get("stats", {})
        severity = totals.get("by_severity", {})
        return scan_ok, _gate(
            "dependency_scan",
            "pass" if scan_ok else "fail",
            critical=severity.get("CRITICAL", 0),
            high=severity.get("HIGH", 0),
            medium=severity.get("MEDIUM", 0),
            total=totals.get("vulns_total", 0),
            deps_total=totals.get("deps_total", 0),
            top_vulns=(body.get("vulnerabilities") or [])[:5],
            summary=body.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Dependency scan gate error")
        return False, _gate("dependency_scan", "error", error=str(e))
|
||
|
||
|
||
async def _run_pr_review(tool_manager, diff_text: str, agent_id: str) -> Tuple[bool, Dict]:
    """Gate 1: PR review in blocking_only mode.

    Skips (passes) when no diff text is supplied; fails when the reviewer
    reports any blocking issue or the tool call itself fails.
    """
    if not (diff_text and diff_text.strip()):
        return True, _gate("pr_review", "skipped", reason="no diff_text provided")

    payload = {
        "mode": "blocking_only",
        "diff": {"text": diff_text, "max_chars": 400000, "max_files": 200},
        "options": {"mask_evidence": True},
    }
    try:
        outcome = await tool_manager.execute_tool(
            "pr_reviewer_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate(
                "pr_review", "fail",
                error=outcome.error,
                blocking_count=None,
                details_ref=None,
            )

        body = outcome.result or {}
        # Prefer the explicit count; fall back to counting the issue list.
        n_blocking = body.get("blocking_count", 0) or len(body.get("blocking_issues", []))
        ok = n_blocking == 0
        return ok, _gate(
            "pr_review",
            "pass" if ok else "fail",
            blocking_count=n_blocking,
            summary=body.get("summary", ""),
            score=body.get("score"),
        )
    except Exception as e:
        logger.exception("PR review gate error")
        return False, _gate("pr_review", "error", error=str(e))
|
||
|
||
|
||
async def _run_config_lint(tool_manager, diff_text: str, agent_id: str) -> Tuple[bool, Dict]:
    """Gate 2: Config linter with strict=true.

    Skips (passes) when no diff text is supplied; fails on any blocking
    lint finding or on tool failure.
    """
    if not (diff_text and diff_text.strip()):
        return True, _gate("config_lint", "skipped", reason="no diff_text provided")

    payload = {
        "source": "diff_text",
        "diff_text": diff_text,
        "options": {
            "strict": True,
            "mask_evidence": True,
            "include_recommendations": True,
        },
    }
    try:
        outcome = await tool_manager.execute_tool(
            "config_linter_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("config_lint", "fail", error=outcome.error)

        body = outcome.result or {}
        n_blocking = body.get("blocking_count", 0)
        ok = n_blocking == 0
        return ok, _gate(
            "config_lint",
            "pass" if ok else "fail",
            blocking_count=n_blocking,
            total_findings=body.get("total_findings", 0),
            summary=body.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Config lint gate error")
        return False, _gate("config_lint", "error", error=str(e))
|
||
|
||
|
||
async def _run_contract_diff(
    tool_manager,
    openapi_base: Optional[str],
    openapi_head: Optional[str],
    agent_id: str,
) -> Tuple[bool, Dict]:
    """Gate 4: OpenAPI contract diff.

    Skips (passes) unless both base and head specs are supplied; fails when
    the diff reports any breaking change.
    """
    if not (openapi_base and openapi_head):
        return True, _gate("contract_diff", "skipped",
                           reason="openapi_base or openapi_head not provided")

    payload = {
        "action": "diff_openapi",
        "base_spec": {"text": openapi_base},
        "head_spec": {"text": openapi_head},
        "options": {"fail_on_breaking": True, "mask_evidence": True},
    }
    try:
        outcome = await tool_manager.execute_tool(
            "contract_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("contract_diff", "fail", error=outcome.error)

        body = outcome.result or {}
        # Prefer the explicit count; fall back to counting the change list.
        n_breaking = body.get("breaking_count", 0) or len(body.get("breaking_changes", []))
        ok = n_breaking == 0
        return ok, _gate(
            "contract_diff",
            "pass" if ok else "fail",
            breaking_count=n_breaking,
            summary=body.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Contract diff gate error")
        return False, _gate("contract_diff", "error", error=str(e))
|
||
|
||
|
||
async def _run_threat_model(
    tool_manager,
    diff_text: str,
    service_name: str,
    risk_profile: str,
    agent_id: str,
) -> Tuple[bool, Dict]:
    """Gate 5: Threat model analysis.

    Blocks only when the analysis reports high-risk threats that carry no
    recorded mitigation.
    """
    payload = {
        "action": "analyze_diff",
        "diff_text": diff_text or "",
        "service_name": service_name,
        "risk_profile": risk_profile,
    }
    try:
        outcome = await tool_manager.execute_tool(
            "threatmodel_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("threat_model", "fail", error=outcome.error)

        body = outcome.result or {}
        # High risk without mitigation = blocking
        n_unmitigated = body.get("unmitigated_high_count", 0)
        ok = n_unmitigated == 0
        return ok, _gate(
            "threat_model",
            "pass" if ok else "fail",
            unmitigated_high=n_unmitigated,
            risk_profile=risk_profile,
            summary=body.get("summary", ""),
            recommendations=body.get("recommendations", []),
        )
    except Exception as e:
        logger.exception("Threat model gate error")
        return False, _gate("threat_model", "error", error=str(e))
|
||
|
||
|
||
async def _run_smoke(tool_manager, agent_id: str) -> Tuple[bool, Dict]:
    """Optional gate: queue the smoke_gateway task via the job orchestrator.

    Optimistic: a successfully queued job counts as a pass; callers must
    poll job status for the real outcome.
    """
    payload = {
        "action": "start_task",
        "agent_id": agent_id,
        "params": {"task_id": "smoke_gateway", "dry_run": False},
    }
    try:
        outcome = await tool_manager.execute_tool(
            "job_orchestrator_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("smoke", "fail", error=outcome.error)

        body = outcome.result or {}
        return True, _gate("smoke", "pass",
                           job_id=body.get("id", ""),
                           note="job queued, check job status for final result")
    except Exception as e:
        logger.exception("Smoke gate error")
        return False, _gate("smoke", "error", error=str(e))
|
||
|
||
|
||
async def _run_drift(
    tool_manager,
    agent_id: str,
    categories: Optional[List[str]] = None,
    timeout_sec: float = 25.0,
) -> Tuple[bool, Dict]:
    """
    Gate 6 (optional): Drift analysis via drift_analyzer_tool.
    pass=false when drift finds errors (warnings don't block release).
    """
    payload = {
        "action": "analyze",
        "categories": categories,  # None = all categories
        "timeout_sec": timeout_sec,
    }
    try:
        outcome = await tool_manager.execute_tool(
            "drift_analyzer_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return False, _gate("drift", "fail", error=outcome.error)

        body = outcome.result or {}
        drift_ok = body.get("pass", True)
        totals = body.get("stats", {})
        return drift_ok, _gate(
            "drift",
            "pass" if drift_ok else "fail",
            errors=totals.get("errors", 0),
            warnings=totals.get("warnings", 0),
            skipped=totals.get("skipped", []),
            top_findings=(body.get("findings") or [])[:5],  # top 5 for gate summary
            summary=body.get("summary", ""),
        )
    except Exception as e:
        logger.exception("Drift gate error")
        return False, _gate("drift", "error", error=str(e))
|
||
|
||
|
||
# ─── Main Runner ──────────────────────────────────────────────────────────────
|
||
|
||
async def run_release_check(tool_manager, inputs: Dict, agent_id: str) -> Dict:
    """
    Execute all release gates and return aggregated verdict.

    The blocking gates (pr_review, config_lint, dependency_scan,
    contract_diff, threat_model) flip the overall verdict to fail; the
    policy-driven watch gates (slo/followup/risk/risk-delta/platform-review/
    recurrence/privacy) block only in "strict" mode per
    release_gate_policy.yml, and cost_watch never blocks.

    Args:
        tool_manager: ToolManager instance (with execute_tool method)
        inputs: dict from task_registry inputs_schema
        agent_id: executing agent

    Returns:
        {
            "pass": bool,
            "gates": [...],
            "recommendations": [...],
            "summary": str,
        }
    """
    diff_text = inputs.get("diff_text", "")
    service_name = inputs.get("service_name", "unknown")
    openapi_base = inputs.get("openapi_base")
    openapi_head = inputs.get("openapi_head")
    risk_profile = inputs.get("risk_profile", "default")
    fail_fast = inputs.get("fail_fast", False)
    run_smoke = inputs.get("run_smoke", False)
    run_drift = inputs.get("run_drift", False)
    gate_profile = inputs.get("gate_profile", "dev")
    gate_policy = load_gate_policy(gate_profile)

    gates = []
    recommendations = []
    overall_pass = True

    ts_start = time.monotonic()

    # ── Gate 1: PR Review ──────────────────────────────────────────────────
    ok, gate = await _run_pr_review(tool_manager, diff_text, agent_id)
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Fix blocking PR review findings before release.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 2: Config Lint ────────────────────────────────────────────────
    ok, gate = await _run_config_lint(tool_manager, diff_text, agent_id)
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Remove secrets/unsafe config before release.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 3: Dependency Scan ────────────────────────────────────────────
    run_deps = inputs.get("run_deps", True)
    if run_deps:
        ok, gate = await _run_dependency_scan(
            tool_manager,
            agent_id=agent_id,
            targets=inputs.get("deps_targets"),
            vuln_mode=inputs.get("deps_vuln_mode", "offline_cache"),
            fail_on=inputs.get("deps_fail_on") or ["CRITICAL", "HIGH"],
            timeout_sec=float(inputs.get("deps_timeout_sec", 40.0)),
        )
        gates.append(gate)
        if not ok:
            overall_pass = False
            # Prefer the scanner's own per-vulnerability recommendations.
            top = gate.get("top_vulns", [])
            top_recs = [v.get("recommendation", "") for v in top if v.get("recommendation")]
            if top_recs:
                recommendations.extend(top_recs[:3])
            else:
                recommendations.append(
                    f"Dependency scan found {gate.get('critical',0)} CRITICAL / "
                    f"{gate.get('high',0)} HIGH vulnerabilities. Upgrade before release."
                )
            if fail_fast:
                return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 4: Contract Diff ─────────────────────────────────────────────
    ok, gate = await _run_contract_diff(
        tool_manager, openapi_base, openapi_head, agent_id
    )
    gates.append(gate)
    if not ok:
        overall_pass = False
        recommendations.append("Fix breaking OpenAPI changes or bump major version.")
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate 5: Threat Model ──────────────────────────────────────────────
    ok, gate = await _run_threat_model(
        tool_manager, diff_text, service_name, risk_profile, agent_id
    )
    gates.append(gate)
    if not ok:
        overall_pass = False
        # Collect threat model recommendations
        threat_recs = gate.get("recommendations", [])
        recommendations.extend(threat_recs if threat_recs else
                               ["Address unmitigated high-risk threats before release."])
        if fail_fast:
            return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Gate (optional): Smoke ─────────────────────────────────────────────
    if run_smoke:
        ok, gate = await _run_smoke(tool_manager, agent_id)
        gates.append(gate)
        if not ok:
            overall_pass = False
            recommendations.append("Smoke tests failed. Investigate gateway health.")

    # ── Gate (optional): Drift ─────────────────────────────────────────────
    if run_drift:
        drift_categories = inputs.get("drift_categories")  # optional subset
        drift_timeout = float(inputs.get("drift_timeout_sec", 25.0))
        ok, gate = await _run_drift(tool_manager, agent_id,
                                    categories=drift_categories,
                                    timeout_sec=drift_timeout)
        gates.append(gate)
        if not ok:
            overall_pass = False
            top = gate.get("top_findings", [])
            err_titles = [f.get("title", "") for f in top if f.get("severity") == "error"]
            if err_titles:
                recommendations.append(
                    f"Drift errors found: {'; '.join(err_titles[:3])}. Fix before release."
                )
            else:
                recommendations.append("Drift analysis found errors. Reconcile before release.")

    # NOTE: the old per-gate policy lookups guarded on `callable(gate_policy.get)`,
    # which is always True for a dict (bound method) — the else-branches were
    # dead code. Simplified to plain lookups with a {} fallback.

    # ── SLO Watch (policy-driven: off/warn/strict) ───────────────────────────
    run_slo_watch = inputs.get("run_slo_watch", True)
    _sw_policy = gate_policy.get("slo_watch") or {}
    _sw_mode = _sw_policy.get("mode", "warn")  # hard "warn" default, not _default_mode
    if run_slo_watch and _sw_mode != "off":
        sw_window = int(inputs.get("slo_watch_window_minutes", 60))
        # NOTE(review): env is read from "followup_watch_env" — looks like a
        # copy-paste from the follow-up gate; confirm intended input key.
        _, gate = await _run_slo_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=inputs.get("followup_watch_env", "prod"),
            window_minutes=sw_window,
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))
        if _sw_mode == "strict" and not gate.get("skipped"):
            violations = gate.get("violations", [])
            if violations:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Follow-up Watch (policy-driven: off/warn/strict) ─────────────────────
    run_followup_watch = inputs.get("run_followup_watch", True)
    _fw_policy = gate_policy.get("followup_watch") or {}
    _fw_mode = _fw_policy.get("mode", gate_policy.get("_default_mode", "warn"))
    if run_followup_watch and _fw_mode != "off":
        fw_window = int(inputs.get("followup_watch_window_days", 30))
        fw_env = inputs.get("followup_watch_env", "any")
        _, gate = await _run_followup_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=fw_env,
            window_days=fw_window,
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))

        if _fw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = _fw_policy.get("fail_on", ["P0", "P1"])
            blocking_incidents = [
                i for i in (gate.get("open_incidents") or [])
                if i.get("severity") in fail_on_sev
            ]
            has_overdue = len(gate.get("overdue_followups") or []) > 0
            if blocking_incidents or has_overdue:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Risk Watch (policy-driven: off/warn/strict) ───────────────────────────
    run_risk_watch = inputs.get("run_risk_watch", True)
    _risk_policy = gate_policy.get("risk_watch") or {}
    # Input override takes precedence over policy mode.
    _risk_mode = (
        inputs.get("risk_watch_mode")
        or _risk_policy.get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_risk_watch and _risk_mode != "off":
        risk_env = inputs.get("risk_watch_env", "prod")
        risk_warn_at = inputs.get("risk_watch_warn_at")
        risk_fail_at = inputs.get("risk_watch_fail_at")
        _, gate = await _run_risk_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=risk_env,
            warn_at=risk_warn_at,
            fail_at=risk_fail_at,
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))

        if _risk_mode == "strict" and not gate.get("skipped"):
            effective_fail_at = gate.get("effective_fail_at", 80)
            score = gate.get("score", 0)
            if score >= effective_fail_at:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Risk Delta Watch (policy-driven: off/warn/strict) ─────────────────────
    run_risk_delta_watch = inputs.get("run_risk_delta_watch", True)
    _rdw_policy = gate_policy.get("risk_delta_watch") or {}
    _rdw_mode = (
        inputs.get("risk_delta_watch_mode")
        or _rdw_policy.get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_risk_delta_watch and _rdw_mode != "off":
        rdw_env = inputs.get("risk_delta_env", "prod")
        rdw_hours = int(inputs.get("risk_delta_hours", 24))
        rdw_warn = inputs.get("risk_delta_warn")
        rdw_fail = inputs.get("risk_delta_fail")
        _, gate = await _run_risk_delta_watch(
            tool_manager, agent_id,
            service_name=service_name,
            env=rdw_env,
            delta_hours=rdw_hours,
            warn_delta=rdw_warn,
            fail_delta=rdw_fail,
            policy=None,
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))

        if _rdw_mode == "strict" and not gate.get("skipped"):
            # Only block for p0_services when p0_services_strict=True (loaded inside helper)
            if gate.get("should_fail"):
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Platform Review Required (policy-driven: off/warn/strict) ───────────
    run_platform_review = inputs.get("run_platform_review_required", True)
    _prv_policy = gate_policy.get("platform_review_required") or {}
    _prv_mode = (
        inputs.get("platform_review_mode")
        or _prv_policy.get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_platform_review and _prv_mode != "off":
        _, gate = await _run_platform_review_required(
            tool_manager, agent_id,
            service_name=service_name,
            env=inputs.get("platform_review_env", "prod"),
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))

        if _prv_mode == "strict" and not gate.get("skipped"):
            if gate.get("should_fail"):
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Recurrence Watch (policy-driven: off/warn/strict) ─────────────────────
    run_recurrence_watch = inputs.get("run_recurrence_watch", True)
    _rw_policy = gate_policy.get("recurrence_watch") or {}
    _rw_mode = (
        inputs.get("recurrence_watch_mode")
        or _rw_policy.get("mode", gate_policy.get("_default_mode", "warn"))
    )
    if run_recurrence_watch and _rw_mode != "off":
        rw_windows = inputs.get("recurrence_watch_windows_days", [7, 30])
        rw_service = inputs.get("recurrence_watch_service", service_name)
        _, gate = await _run_recurrence_watch(
            tool_manager, agent_id,
            service_name=rw_service,
            windows_days=rw_windows,
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))

        if _rw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = _rw_policy.get("fail_on", {}).get("severity_in", ["P0", "P1"])
            fail_on_high = _rw_policy.get("fail_on", {}).get("high_recurrence", True)
            if fail_on_high and gate.get("has_high_recurrence"):
                max_sev = gate.get("max_severity_seen", "P3")
                if max_sev in fail_on_sev:
                    overall_pass = False
                    if fail_fast:
                        return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Privacy Watch (policy-driven: off/warn/strict) ────────────────────────
    run_privacy_watch = inputs.get("run_privacy_watch", True)
    _pw_policy = gate_policy.get("privacy_watch") or {}
    _pw_mode = _pw_policy.get("mode", gate_policy.get("_default_mode", "warn"))
    if run_privacy_watch and _pw_mode != "off":
        privacy_mode = inputs.get("privacy_watch_mode", "fast")
        privacy_audit_h = int(inputs.get("privacy_audit_window_hours", 24))
        _, gate = await _run_privacy_watch(
            tool_manager, agent_id,
            mode=privacy_mode,
            audit_window_hours=privacy_audit_h,
        )
        gates.append(gate)
        recommendations.extend(gate.get("recommendations", []))

        # Apply strict mode: block release if findings match fail_on
        if _pw_mode == "strict" and not gate.get("skipped"):
            fail_on_sev = _pw_policy.get("fail_on", ["error"])
            all_findings = gate.get("top_findings") or []
            blocking = [f for f in all_findings if f.get("severity") in fail_on_sev]
            if blocking:
                overall_pass = False
                if fail_fast:
                    return _build_report(overall_pass, gates, recommendations, ts_start)

    # ── Cost Watch (always warn even in strict profiles) ──────────────────────
    run_cost_watch = inputs.get("run_cost_watch", True)
    _cw_policy = gate_policy.get("cost_watch") or {}
    _cw_mode = _cw_policy.get("mode", "warn")
    if run_cost_watch and _cw_mode != "off":
        cost_window_h = int(inputs.get("cost_watch_window_hours", 24))
        cost_ratio = float(inputs.get("cost_spike_ratio_threshold", 3.0))
        cost_min_calls = int(inputs.get("cost_min_calls_threshold", 50))
        _, gate = await _run_cost_watch(
            tool_manager, agent_id,
            window_hours=cost_window_h,
            ratio_threshold=cost_ratio,
            min_calls=cost_min_calls,
        )
        gates.append(gate)
        # cost_watch is never strict (cost_always_warn in policy) — recommendations only
        recommendations.extend(gate.get("recommendations", []))

    return _build_report(overall_pass, gates, recommendations, ts_start)
|
||
|
||
|
||
async def _run_slo_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    window_minutes: int = 60,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: detects SLO breaches before deploying.

    Always returns pass=True; strict-mode blocking on violations is the
    caller's responsibility. Any tool failure or exception degrades to a
    skipped pass.
    """
    try:
        payload = {
            "action": "slo_snapshot",
            "service": service_name,
            "env": env,
            "window_minutes": window_minutes,
        }
        outcome = await tool_manager.execute_tool(
            "observability_tool", payload, agent_id=agent_id
        )
        if not outcome.success:
            return True, _gate("slo_watch", "pass",
                               note=f"slo_watch skipped: {outcome.error}", skipped=True)

        body = outcome.result or {}
        violations = body.get("violations", [])

        recs = []
        if violations and not body.get("skipped"):
            viol_desc = ", ".join(violations)
            recs.append(
                f"SLO violation ({viol_desc}) detected for '{service_name}' — "
                f"consider postponing deployment until service recovers"
            )

        if violations:
            note = f"Violations: {', '.join(violations)}"
        else:
            note = "No SLO violations detected"

        return True, _gate(
            "slo_watch", "pass",
            violations=violations,
            metrics=body.get("metrics", {}),
            thresholds=body.get("thresholds", {}),
            note=note,
            skipped=body.get("skipped", False),
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("slo_watch gate error: %s", e)
        return True, _gate("slo_watch", "pass",
                           note=f"slo_watch skipped (error): {e}", skipped=True)
|
||
|
||
|
||
async def _run_followup_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "any",
    window_days: int = 30,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: checks for open P0/P1 incidents and overdue follow-ups.

    Returns pass=True in warn mode; strict mode may block based on GatePolicy
    (applied by the caller). Any tool failure degrades to a skipped pass.
    """
    try:
        args = {
            "action": "incident_followups_summary",
            "service": service_name,
            "env": env,
            "window_days": window_days,
        }
        result = await tool_manager.execute_tool(
            "oncall_tool", args, agent_id=agent_id
        )
        if not result.success:
            return True, _gate("followup_watch", "pass",
                               note=f"followup_watch skipped: {result.error}", skipped=True)

        data = result.result or {}
        stats = data.get("stats", {})
        open_incs = data.get("open_incidents", [])
        overdue = data.get("overdue_followups", [])

        recs = []
        if open_incs:
            # Robustness fix: use .get() with fallbacks. The old direct
            # indexing (i['severity']) meant one malformed record raised a
            # KeyError that the blanket handler below turned into a
            # "skipped" pass — hiding the very incidents being reported.
            sev_list = ", ".join(
                f"{i.get('severity', '?')} {i.get('id', '?')}" for i in open_incs[:3]
            )
            recs.append(f"Open critical incidents: {sev_list}")
        if overdue:
            ov_list = ", ".join(
                f"{o.get('priority', '?')} '{(o.get('title') or '')[:40]}' "
                f"(due {(o.get('due_date') or '')[:10]})"
                for o in overdue[:3]
            )
            recs.append(f"Overdue follow-ups: {ov_list}")

        note = (
            f"{stats.get('open_incidents', 0)} open P0/P1, "
            f"{stats.get('overdue', 0)} overdue follow-ups, "
            f"{stats.get('total_open_followups', 0)} total open"
        )

        return True, _gate(
            "followup_watch", "pass",
            open_incidents=open_incs[:5],
            overdue_followups=overdue[:5],
            stats=stats,
            note=note,
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("followup_watch gate error: %s", e)
        return True, _gate("followup_watch", "pass",
                           note=f"followup_watch skipped (error): {e}", skipped=True)
|
||
|
||
|
||
async def _run_risk_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    warn_at: Optional[int] = None,
    fail_at: Optional[int] = None,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: computes RiskReport for the target service and
    evaluates against configurable warn_at/fail_at thresholds.
    Non-fatal: any error causes skip (never blocks release).
    """
    try:
        # Without a target service there is nothing to score — skip gracefully.
        if not service_name:
            return True, _gate("risk_watch", "pass",
                               note="risk_watch skipped: no service_name provided",
                               skipped=True)

        result = await tool_manager.execute_tool(
            "risk_engine_tool",
            {"action": "service", "env": env, "service": service_name},
            agent_id=agent_id,
        )
        if not result.success:
            return True, _gate("risk_watch", "pass",
                               note=f"risk_watch skipped: {result.error}", skipped=True)

        report = result.result or {}
        score = int(report.get("score", 0))
        band = report.get("band", "low")
        reasons = report.get("reasons", [])
        engine_recs = report.get("recommendations", [])

        # Effective thresholds: explicit inputs win over engine/policy values.
        thresholds = report.get("thresholds", {})
        effective_warn = int(thresholds.get("warn_at", 50)) if warn_at is None else int(warn_at)
        effective_fail = int(thresholds.get("fail_at", 80)) if fail_at is None else int(fail_at)

        gate_recs = []
        if score >= effective_warn:
            gate_recs.append(
                f"Service '{service_name}' risk score {score} ({band}): "
                + "; ".join(reasons[:3])
            )
            # Surface a couple of the engine's own remediation hints.
            gate_recs.extend(engine_recs[:2])

        reason_summary = "; ".join(reasons[:3]) if reasons else "no signals"
        note = (
            f"score={score} band={band} warn_at={effective_warn} fail_at={effective_fail} | "
            + reason_summary
        )

        return True, _gate(
            "risk_watch", "pass",
            score=score,
            band=band,
            reasons=reasons[:5],
            effective_warn_at=effective_warn,
            effective_fail_at=effective_fail,
            components=report.get("components", {}),
            skipped=False,
            note=note,
            recommendations=gate_recs,
        )

    except Exception as e:
        logger.warning("risk_watch gate error: %s", e)
        return True, _gate("risk_watch", "pass",
                           note=f"risk_watch skipped (error): {e}", skipped=True)
|
||
|
||
async def _run_risk_delta_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
    delta_hours: int = 24,
    warn_delta: Optional[int] = None,
    fail_delta: Optional[int] = None,
    policy: Optional[Dict] = None,
) -> Tuple[bool, Dict]:
    """
    Gate: checks how much the risk score rose since `delta_hours` ago.
    Non-fatal: missing history → skipped (never blocks).
    Sets gate["should_fail"] = True if score delta >= fail_delta AND service is p0 in strict mode.

    Args:
        tool_manager: tool execution facade exposing ``execute_tool``.
        agent_id: agent identity passed through for tool authorization.
        service_name: target service; empty string skips the gate.
        env: deployment environment used for scoring and history lookup.
        delta_hours: lookback window for the baseline score.
        warn_delta / fail_delta: explicit thresholds; None falls back to policy.
        policy: pre-loaded risk policy dict; None loads it via risk_engine.

    Returns:
        (True, gate_dict) — gate itself never returns False; strict blocking is
        signaled via the ``should_fail`` field for the caller to enforce.

    Fix: removed unused local ``no_history`` (assigned but never read).
    """
    try:
        if not service_name:
            return True, _gate("risk_delta_watch", "pass",
                               note="risk_delta_watch skipped: no service_name", skipped=True)

        # Load policy locally (best-effort; missing policy → empty defaults).
        if policy is None:
            try:
                from risk_engine import load_risk_policy
                policy = load_risk_policy()
            except Exception:
                policy = {}

        p0_services = set(policy.get("p0_services", []))
        rdw_cfg = policy.get("release_gate", {}).get("risk_delta_watch", {})
        # Explicit arguments override policy defaults.
        effective_warn = int(warn_delta) if warn_delta is not None else int(rdw_cfg.get("default_warn_delta_24h", 10))
        effective_fail = int(fail_delta) if fail_delta is not None else int(rdw_cfg.get("default_fail_delta_24h", 20))
        p0_strict = bool(rdw_cfg.get("p0_services_strict", True))

        # Compute current risk score (trend omitted; we compute delta ourselves).
        risk_result = await tool_manager.execute_tool(
            "risk_engine_tool",
            {"action": "service", "service": service_name, "env": env,
             "include_trend": False},
            agent_id=agent_id,
        )
        if not risk_result.success:
            return True, _gate("risk_delta_watch", "pass",
                               note=f"risk_delta_watch skipped: {risk_result.error}", skipped=True)

        current_score = int((risk_result.result or {}).get("score", 0))
        current_band = (risk_result.result or {}).get("band", "low")

        # Get delta from history; unavailable history leaves delta as None.
        delta: Optional[int] = None
        try:
            from risk_history_store import get_risk_history_store
            hstore = get_risk_history_store()
            delta = hstore.get_delta(service_name, env, hours=delta_hours)
        except Exception as he:
            logger.warning("risk_delta_watch: history unavailable: %s", he)

        # No baseline → can't compute a regression; skip with guidance.
        if delta is None:
            return True, _gate(
                "risk_delta_watch", "pass",
                note="No history baseline; run hourly_risk_snapshot first.",
                skipped=True,
                recommendations=["No risk history baseline available. Run hourly_risk_snapshot to establish baseline."],
            )

        # Regression flags from trend policy
        reg_warn = delta >= effective_warn
        reg_fail = delta >= effective_fail

        recs: List[str] = []
        if reg_warn:
            recs.append(
                f"Risk score for '{service_name}' rose +{delta} pts in {delta_hours}h "
                f"(current: {current_score}, band: {current_band}). "
                f"Review recent deployments and open incidents."
            )
        if reg_fail:
            recs.append(
                f"Risk regression FAIL for '{service_name}': +{delta} pts >= fail threshold {effective_fail}. "
                f"Block or roll back recent changes."
            )

        # should_fail only when: service is p0, strict enabled, delta >= fail
        is_p0 = service_name in p0_services
        should_fail = reg_fail and is_p0 and p0_strict

        note = (
            f"delta_{delta_hours}h={delta} current_score={current_score} band={current_band} "
            f"warn_at={effective_warn} fail_at={effective_fail} is_p0={is_p0}"
        )

        return True, _gate(
            "risk_delta_watch", "pass",
            delta=delta,
            delta_hours=delta_hours,
            current_score=current_score,
            current_band=current_band,
            effective_warn_delta=effective_warn,
            effective_fail_delta=effective_fail,
            regression_warn=reg_warn,
            regression_fail=reg_fail,
            is_p0=is_p0,
            should_fail=should_fail,
            skipped=False,
            note=note,
            recommendations=recs,
        )

    except Exception as e:
        logger.warning("risk_delta_watch gate error: %s", e)
        return True, _gate("risk_delta_watch", "pass",
                           note=f"risk_delta_watch skipped (error): {e}", skipped=True)
|
||
|
||
async def _run_platform_review_required(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    env: str = "prod",
) -> Tuple[bool, Dict]:
    """
    Gate: Computes Architecture Pressure for the service.
    In warn mode: always pass=True, adds recommendations.
    In strict mode: sets should_fail=True if pressure >= fail_at.
    Non-fatal: any error causes skip (never blocks release).

    Args:
        tool_manager: tool execution facade exposing ``execute_tool``.
        agent_id: agent identity passed through for tool authorization.
        service_name: target service; empty string skips the gate.
        env: deployment environment passed to the pressure tool.

    Returns:
        (True, gate_dict) — this function itself never returns False; strict
        blocking is signaled via the ``should_fail`` field in the gate payload.
    """
    try:
        # No target service → nothing to assess; skip rather than fail.
        if not service_name:
            return True, _gate("platform_review_required", "pass",
                               note="platform_review_required skipped: no service_name",
                               skipped=True)

        # Load architecture pressure policy for thresholds
        # (best-effort: import or load failure falls back to built-in defaults).
        try:
            from architecture_pressure import load_pressure_policy
            pressure_policy = load_pressure_policy()
        except Exception:
            pressure_policy = {}

        gate_cfg = pressure_policy.get("release_gate", {}).get(
            "platform_review_required", {}
        )
        # Default thresholds when the policy does not define them.
        warn_at = int(gate_cfg.get("warn_at", 60))
        fail_at = int(gate_cfg.get("fail_at", 85))

        # Compute pressure via tool_manager
        result = await tool_manager.execute_tool(
            "architecture_pressure_tool",
            {"action": "service", "service": service_name, "env": env},
            agent_id=agent_id,
        )
        # Tool failure → skip (gate is non-fatal by design).
        if not result.success:
            return True, _gate("platform_review_required", "pass",
                               note=f"platform_review_required skipped: {result.error}",
                               skipped=True)

        data = result.result or {}
        score = int(data.get("score", 0))
        band = data.get("band", "low")
        signals = data.get("signals_summary", [])
        requires_review = bool(data.get("requires_arch_review", False))

        gate_recs = []
        should_fail = False

        if score >= warn_at:
            gate_recs.append(
                f"Service '{service_name}' architecture pressure={score} ({band}): "
                + ("; ".join(signals[:2]) if signals else "structural strain detected")
            )
            # NOTE(review): the fail_at check only runs inside the warn branch.
            # With the defaults (warn_at=60 < fail_at=85) this is equivalent to
            # an independent check, but a policy with fail_at < warn_at would
            # behave differently — confirm this nesting is intended.
            if score >= fail_at:
                gate_recs.append(
                    f"Architecture review required for '{service_name}' before release. "
                    f"Pressure score {score} exceeds fail threshold {fail_at}."
                )
                should_fail = True

        # The pressure engine may have flagged a review independently of score.
        if requires_review:
            gate_recs.append(
                f"Architecture review has been flagged for '{service_name}'. "
                f"Check ops/reports/platform/ for latest digest."
            )

        note = (
            f"pressure_score={score} band={band} warn_at={warn_at} fail_at={fail_at} | "
            + ("; ".join(signals[:2]) if signals else "no pressure signals")
        )

        return True, _gate(
            "platform_review_required", "pass",
            score=score,
            band=band,
            signals_summary=signals[:4],
            requires_arch_review=requires_review,
            warn_at=warn_at,
            fail_at=fail_at,
            should_fail=should_fail,
            skipped=False,
            note=note,
            recommendations=gate_recs,
        )

    except Exception as e:
        logger.warning("platform_review_required gate error: %s", e)
        return True, _gate("platform_review_required", "pass",
                           note=f"platform_review_required skipped (error): {e}",
                           skipped=True)
|
||
|
||
async def _run_recurrence_watch(
    tool_manager,
    agent_id: str,
    service_name: str = "",
    windows_days: Optional[List[int]] = None,
) -> Tuple[bool, Dict]:
    """
    Policy-driven gate: checks incident recurrence for the target service.
    - warn mode: always pass=True, adds recommendations.
    - strict mode: pass=False if high_recurrence + severity in fail_on list.
    Non-fatal: any error skips the gate.

    Args:
        tool_manager: tool execution facade exposing ``execute_tool``.
        agent_id: agent identity passed through for tool authorization.
        service_name: target service; empty string queries all services.
        windows_days: recurrence windows (days); the widest one is sent to the
            tool. Defaults to [7, 30].
            (Fix: annotation was ``List[int] = None``, contradicting the None
            default; corrected to ``Optional[List[int]]``.)

    Returns:
        (True, gate_dict) with recurrence flags and recommendations.
    """
    if windows_days is None:
        windows_days = [7, 30]
    try:
        # Prefer focused service query; fall back to all if no service specified
        args: Dict = {
            "action": "recurrence",
            # Widest requested window covers all narrower ones.
            "window_days": max(windows_days) if windows_days else 7,
        }
        if service_name:
            args["service"] = service_name

        result = await tool_manager.execute_tool(
            "incident_intelligence_tool", args, agent_id=agent_id
        )
        # Tool failure → skip (gate is non-fatal by design).
        if not result.success:
            return True, _gate("recurrence_watch", "pass",
                               note=f"recurrence_watch skipped: {result.error}", skipped=True)

        data = result.result or {}
        # "high" buckets trigger stronger wording than "warn" buckets.
        high_sigs = data.get("high_recurrence", {}).get("signatures", [])
        high_kinds = data.get("high_recurrence", {}).get("kinds", [])
        warn_sigs = data.get("warn_recurrence", {}).get("signatures", [])
        warn_kinds = data.get("warn_recurrence", {}).get("kinds", [])
        has_high = bool(high_sigs or high_kinds)
        has_warn = bool(warn_sigs or warn_kinds)
        max_sev = data.get("max_severity_seen", "P3")
        total = data.get("total_incidents", 0)

        recs = []
        if has_high:
            bucket_descs = (
                [f"sig:{s['signature'][:8]} ({s['count']}x)" for s in high_sigs[:3]]
                + [f"kind:{k['kind']} ({k['count']}x)" for k in high_kinds[:3]]
            )
            recs.append(
                f"High recurrence for '{service_name or 'all'}': "
                + ", ".join(bucket_descs)
                + " — review root cause before deploying"
            )
        elif has_warn:
            # Warn-level only mentioned when no high-level recurrence exists.
            warn_descs = (
                [f"sig:{s['signature'][:8]} ({s['count']}x)" for s in warn_sigs[:2]]
                + [f"kind:{k['kind']} ({k['count']}x)" for k in warn_kinds[:2]]
            )
            recs.append(
                f"Warn-level recurrence for '{service_name or 'all'}': "
                + ", ".join(warn_descs)
            )

        note = (
            f"high={len(high_sigs)} sigs / {len(high_kinds)} kinds; "
            f"warn={len(warn_sigs)}/{len(warn_kinds)}; "
            f"total_incidents={total}; max_sev={max_sev}"
        )

        return True, _gate(
            "recurrence_watch", "pass",
            has_high_recurrence=has_high,
            has_warn_recurrence=has_warn,
            high_signatures=[s["signature"][:8] for s in high_sigs[:5]],
            high_kinds=[k["kind"] for k in high_kinds[:5]],
            max_severity_seen=max_sev,
            total_incidents=total,
            note=note,
            skipped=False,
            recommendations=recs,
        )

    except Exception as e:
        logger.warning("recurrence_watch gate error: %s", e)
        return True, _gate("recurrence_watch", "pass",
                           note=f"recurrence_watch skipped (error): {e}", skipped=True)
|
||
|
||
async def _run_privacy_watch(
    tool_manager,
    agent_id: str,
    mode: str = "fast",
    audit_window_hours: int = 24,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: scans repo (fast mode) and recent audit stream for privacy risks.
    Always returns pass=True. Adds recommendations for errors/warnings found.
    """
    try:
        # Repo scan — bounded: fast mode, capped file count, curated paths.
        repo_result = await tool_manager.execute_tool(
            "data_governance_tool",
            {"action": "scan_repo", "mode": mode, "max_files": 200,
             "paths_include": ["services/", "config/", "ops/"]},
            agent_id=agent_id,
        )
        repo_data = repo_result.result or {} if repo_result.success else {}

        # Audit-stream scan is best-effort; failures never abort the gate.
        audit_data: Dict = {}
        try:
            audit_result = await tool_manager.execute_tool(
                "data_governance_tool",
                {"action": "scan_audit", "time_window_hours": audit_window_hours},
                agent_id=agent_id,
            )
            if audit_result.success:
                audit_data = audit_result.result or {}
        except Exception:
            pass

        # Combine findings; de-duplicate recommendations preserving order.
        combined = (repo_data.get("findings") or []) + (audit_data.get("findings") or [])
        merged_recs = list(dict.fromkeys(
            (repo_data.get("recommendations") or []) + (audit_data.get("recommendations") or [])
        ))

        # Tally only the three recognized severities.
        counts = {"error": 0, "warning": 0, "info": 0}
        for finding in combined:
            sev = finding.get("severity")
            if sev in counts:
                counts[sev] += 1
        errors, warnings, infos = counts["error"], counts["warning"], counts["info"]
        total = errors + warnings + infos

        if total:
            note = f"{total} finding(s): {errors} error(s), {warnings} warning(s)"
        else:
            note = "No privacy findings"

        return True, _gate(
            "privacy_watch", "pass",
            errors=errors,
            warnings=warnings,
            infos=infos,
            top_findings=[
                {"id": f.get("id"), "title": f.get("title"), "severity": f.get("severity")}
                for f in combined[:5]
            ],
            note=note,
            recommendations=merged_recs,
        )

    except Exception as e:
        logger.warning("privacy_watch gate error: %s", e)
        return True, _gate("privacy_watch", "pass", note=f"privacy_watch skipped (error): {e}", skipped=True)
|
||
|
||
async def _run_cost_watch(
    tool_manager,
    agent_id: str,
    window_hours: int = 24,
    ratio_threshold: float = 3.0,
    min_calls: int = 50,
) -> Tuple[bool, Dict]:
    """
    Warning-only gate: detects cost/resource anomalies via cost_analyzer_tool.
    Always returns pass=True (does not block release).
    Appends recommendations for high-ratio spikes on priority tools.
    """
    try:
        result = await tool_manager.execute_tool(
            "cost_analyzer_tool",
            {
                "action": "anomalies",
                "window_minutes": int(window_hours * 60 / 4),  # last 25% of window
                "baseline_hours": window_hours,
                "ratio_threshold": ratio_threshold,
                "min_calls": min_calls,
            },
            agent_id=agent_id,
        )
        if not result.success:
            return True, _gate("cost_watch", "pass", note=f"cost_analyzer unavailable: {result.error}", skipped=True)

        payload = result.result or {}
        anomalies = payload.get("anomalies", [])

        # Best-effort load of the priority-tool list from config/cost_weights.yml.
        weights_cfg: Dict = {}
        try:
            import yaml
            import os
            repo_root = os.getenv("REPO_ROOT", str(__file__).rsplit("/services", 1)[0])
            with open(os.path.join(repo_root, "config", "cost_weights.yml")) as fh:
                weights_cfg = yaml.safe_load(fh) or {}
        except Exception:
            pass

        fallback_priority = [
            "comfy_generate_video", "comfy_generate_image", "pr_reviewer_tool",
            "job_orchestrator_tool", "observability_tool",
        ]
        priority_tools = set(
            (weights_cfg.get("anomaly") or {}).get("priority_tools") or fallback_priority
        )

        # Only anomalies on priority tools become release recommendations.
        recs = [
            a.get("recommendation", f"Cost spike on {a.get('tool')} (ratio={a.get('ratio')})")
            for a in anomalies
            if a.get("tool") in priority_tools
        ]

        return True, _gate(
            "cost_watch", "pass",
            anomalies_count=len(anomalies),
            anomalies_preview=[
                {"tool": a.get("tool"), "type": a.get("type"), "ratio": a.get("ratio")}
                for a in anomalies[:5]
            ],
            note=(f"{len(anomalies)} anomaly(ies) detected" if anomalies else "No anomalies detected"),
            recommendations=recs,
        )
    except Exception as e:
        logger.warning("cost_watch gate error: %s", e)
        return True, _gate("cost_watch", "pass", note=f"cost_watch skipped (error): {e}", skipped=True)
|
||
|
||
def _build_report(
|
||
overall_pass: bool,
|
||
gates: List[Dict],
|
||
recommendations: List[str],
|
||
ts_start: float,
|
||
) -> Dict:
|
||
elapsed_ms = round((time.monotonic() - ts_start) * 1000, 1)
|
||
failed_gates = [g["name"] for g in gates if g.get("status") == "fail"]
|
||
error_gates = [g["name"] for g in gates if g.get("status") == "error"]
|
||
passed_gates = [g["name"] for g in gates if g.get("status") == "pass"]
|
||
skipped_gates = [g["name"] for g in gates if g.get("status") == "skipped"]
|
||
|
||
if overall_pass:
|
||
summary = f"✅ RELEASE CHECK PASSED in {elapsed_ms}ms. Gates: {passed_gates}."
|
||
else:
|
||
summary = (
|
||
f"❌ RELEASE CHECK FAILED in {elapsed_ms}ms. "
|
||
f"Failed: {failed_gates}. Errors: {error_gates}."
|
||
)
|
||
if skipped_gates:
|
||
summary += f" Skipped: {skipped_gates}."
|
||
|
||
return {
|
||
"pass": overall_pass,
|
||
"gates": gates,
|
||
"recommendations": list(dict.fromkeys(recommendations)), # dedupe preserving order
|
||
"summary": summary,
|
||
"elapsed_ms": elapsed_ms,
|
||
}
|