Files
microdao-daarion/ops/task_registry.yml
Apple 67225a39fa docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00

741 lines
24 KiB
YAML

# Job Orchestrator Task Registry
# Defines allowlisted operational tasks that can be executed via job_orchestrator_tool
# Only tasks defined here can be run - no arbitrary command execution
tasks:
# === Smoke Tests ===
# Gateway smoke test: script task referencing ops/smoke_helion_stack.sh.
# Accepts no inputs (empty properties, additionalProperties: false).
- id: "smoke_gateway"
  title: "Smoke test gateway"
  description: "Run smoke tests against the gateway service"
  tags: ["smoke", "ops"]
  service: "gateway"
  runner: "script"
  command_ref: "ops/smoke_helion_stack.sh"
  timeout_sec: 300
  inputs_schema:
    type: "object"
    properties: {}
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.smoke"
  dry_run_behavior: "show_help"
# Stack-wide smoke test: script task referencing ops/canary_all.sh.
# The optional "service" input narrows the run to one service.
- id: "smoke_all"
  title: "Smoke test all services"
  description: "Run smoke tests against all services in the stack"
  tags: ["smoke", "ops"]
  runner: "script"
  command_ref: "ops/canary_all.sh"
  timeout_sec: 600
  inputs_schema:
    type: "object"
    properties:
      service:
        type: "string"
        description: "Optional specific service to test"
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.smoke"
  dry_run_behavior: "validation_only"
# === Drift Checks ===
# Infrastructure drift check: script task referencing ops/status.sh.
# "mode" selects a quick or full check.
- id: "drift_check_node1"
  title: "Drift check NODE1"
  description: "Check infrastructure drift on production node"
  tags: ["drift", "ops"]
  service: "infrastructure"
  runner: "script"
  command_ref: "ops/status.sh"
  timeout_sec: 300
  inputs_schema:
    type: "object"
    properties:
      mode:
        type: "string"
        enum: ["quick", "full"]
        default: "quick"
    # NOTE(review): "mode" is both required and given a default; confirm
    # whether the runner applies defaults before validating required keys.
    # NOTE(review): additionalProperties is not set here, unlike most other
    # script tasks in this registry; confirm that is intentional.
    required: ["mode"]
  permissions:
    entitlements_required:
    - "tools.jobs.run.drift"
  dry_run_behavior: "validation_only"
# === Backup Validation ===
# Backup validation: script task with optional backup_path and
# check_integrity inputs.
- id: "backup_validate"
  title: "Validate backup integrity"
  description: "Verify backup files are present and valid"
  tags: ["backup", "ops"]
  service: "storage"
  runner: "script"
  # NOTE(review): the script name does not obviously relate to backup
  # validation; confirm this is the intended command_ref.
  command_ref: "ops/check_daarwizz_awareness.sh"
  timeout_sec: 600
  inputs_schema:
    type: "object"
    properties:
      backup_path:
        type: "string"
        description: "Path to backup directory"
      check_integrity:
        type: "boolean"
        default: true
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.backup"
  dry_run_behavior: "list_files"
# === Contract Checks ===
# Router contract check: script task referencing
# ops/canary_router_contract.sh; "strict" is an optional boolean input.
- id: "contract_check_router"
  title: "Contract check router"
  description: "Verify OpenAPI contract compatibility for router"
  tags: ["migrate", "ops"]
  service: "router"
  runner: "script"
  command_ref: "ops/canary_router_contract.sh"
  timeout_sec: 300
  inputs_schema:
    type: "object"
    properties:
      strict:
        type: "boolean"
        default: false
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.migrate"
  dry_run_behavior: "validation_only"
# === Delivery Priority Check ===
# Gateway delivery-priority check: script task referencing
# ops/canary_gateway_delivery_priority.sh. Accepts no inputs.
- id: "delivery_priority_check"
  title: "Delivery priority check"
  description: "Verify message delivery priority configuration"
  tags: ["ops"]
  service: "gateway"
  runner: "script"
  command_ref: "ops/canary_gateway_delivery_priority.sh"
  timeout_sec: 180
  inputs_schema:
    type: "object"
    properties: {}
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.ops"
  dry_run_behavior: "show_help"
# === Monitor ===
# Monitoring notification check: script task referencing
# ops/monitor_notify_sofiia.sh. Accepts no inputs.
- id: "monitor_notification"
  title: "Monitor notification check"
  description: "Check if monitoring notifications are working"
  tags: ["ops"]
  service: "monitoring"
  runner: "script"
  command_ref: "ops/monitor_notify_sofiia.sh"
  timeout_sec: 120
  inputs_schema:
    type: "object"
    properties: {}
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.ops"
  dry_run_behavior: "show_help"
# === Release Gate (internal runner: invokes tool endpoints sequentially) ===
# Release gate: internal-runner task (command_ref is null).
# Only "service_name" is required. Each run_* boolean toggles one gate and
# the inputs that follow it tune that gate.
- id: "release_check"
  title: "Release Gate Check"
  description: >
    Orchestrates all release gates: PR review, config lint, contract diff,
    threat model, optional smoke/drift. Returns one structured pass/fail verdict.
  tags: ["release", "gate", "ops"]
  runner: "internal"  # NOT a shell script; uses release_check_runner.py
  command_ref: null  # No shell command — internal Python runner
  timeout_sec: 600  # 10 min max for all gates
  inputs_schema:
    type: "object"
    properties:
      # --- Release inputs ---
      diff_text:
        type: "string"
        description: "Unified diff text (optional if repo_path provided)"
      service_name:
        type: "string"
        description: "Name of the service being released"
      openapi_base:
        type: "string"
        description: "Base OpenAPI spec (text or repo path)"
      openapi_head:
        type: "string"
        description: "Head OpenAPI spec (text or repo path)"
      risk_profile:
        type: "string"
        enum: ["default", "agentic_tools", "public_api"]
        default: "default"
        description: "Threat model risk profile"
      fail_fast:
        type: "boolean"
        default: false
        description: "Stop at first failing gate"
      run_smoke:
        type: "boolean"
        default: false
        description: "Run smoke tests after static gates pass"
      # --- Dependency scan gate ---
      run_deps:
        type: "boolean"
        default: true
        description: "Run dependency vulnerability scan (gate 3)"
      deps_targets:
        type: "array"
        items: {type: "string", enum: ["python", "node"]}
        description: "Ecosystems to scan (default: python + node)"
      deps_vuln_mode:
        type: "string"
        enum: ["online", "offline_cache"]
        default: "offline_cache"
        description: "OSV query mode: online or offline_cache"
      deps_fail_on:
        type: "array"
        items: {type: "string", enum: ["CRITICAL", "HIGH", "MEDIUM", "LOW"]}
        description: "Severity levels that block release (default: CRITICAL, HIGH)"
      deps_timeout_sec:
        type: "number"
        default: 40
        description: "Timeout for dependency scan in seconds"
      gate_profile:
        type: "string"
        enum: ["dev", "staging", "prod"]
        default: "dev"
        description: "Gate strictness profile (dev=warn-first, staging/prod=strict privacy)"
      # --- SLO watch gate ---
      run_slo_watch:
        type: "boolean"
        default: true
        description: "Run SLO watch gate (warns/blocks if service has active SLO violations)"
      slo_watch_window_minutes:
        type: "integer"
        default: 60
        description: "SLO evaluation window in minutes"
      # --- Follow-up watch gate ---
      run_followup_watch:
        type: "boolean"
        default: true
        description: "Run follow-up watch gate (checks open P0/P1 incidents and overdue follow-ups)"
      followup_watch_window_days:
        type: "integer"
        default: 30
        description: "Window for follow-up/incident scan in days"
      followup_watch_env:
        type: "string"
        enum: ["prod", "staging", "any"]
        default: "any"
        description: "Filter incidents by environment"
      # --- Privacy watch gate ---
      run_privacy_watch:
        type: "boolean"
        default: true
        description: "Run privacy/data-governance warning gate (always pass=true, adds recommendations)"
      privacy_watch_mode:
        type: "string"
        enum: ["fast", "full"]
        default: "fast"
        description: "Scan mode: fast=.py/.yml/.json only, full=all extensions"
      privacy_audit_window_hours:
        type: "integer"
        default: 24
        description: "Time window for audit stream scan in hours"
      # --- Cost watch gate ---
      run_cost_watch:
        type: "boolean"
        default: true
        description: "Run cost_watch warning gate (always pass=true, adds recommendations)"
      cost_watch_window_hours:
        type: "integer"
        default: 24
        description: "Window for anomaly detection in hours (default 24)"
      cost_spike_ratio_threshold:
        type: "number"
        default: 3.0
        description: "Cost spike ratio to flag as warning (default 3.0x baseline)"
      cost_min_calls_threshold:
        type: "integer"
        default: 50
        description: "Min calls in window to qualify as anomaly (default 50)"
      # --- Risk watch gate ---
      run_risk_watch:
        type: "boolean"
        default: true
        description: "Run risk_watch gate: warn/block if service risk score exceeds threshold"
      risk_watch_env:
        type: "string"
        enum: ["prod", "staging"]
        default: "prod"
        description: "Environment for risk score evaluation"
      risk_watch_warn_at:
        type: "integer"
        description: "Override warn threshold (default from risk_policy.yml)"
      risk_watch_fail_at:
        type: "integer"
        description: "Override fail threshold (default from risk_policy.yml per-service override)"
      # --- Risk delta watch gate ---
      run_risk_delta_watch:
        type: "boolean"
        default: true
        description: "Run risk_delta_watch gate: block staging for p0_services if score rose >= fail_delta in 24h"
      risk_delta_env:
        type: "string"
        enum: ["prod", "staging"]
        default: "prod"
        description: "Environment for risk delta evaluation"
      risk_delta_hours:
        type: "integer"
        default: 24
        description: "Baseline window in hours (default 24h)"
      risk_delta_warn:
        type: "integer"
        description: "Override delta warn threshold (default from risk_policy.yml)"
      risk_delta_fail:
        type: "integer"
        description: "Override delta fail threshold (default from risk_policy.yml)"
      # --- Drift gate ---
      run_drift:
        type: "boolean"
        default: false
        description: "Run drift check after static gates pass"
    required: ["service_name"]
  permissions:
    entitlements_required:
    - "tools.pr_review.gate"
    - "tools.contract.gate"
    - "tools.config_lint.gate"
    - "tools.threatmodel.gate"
    - "tools.deps.gate"
    - "tools.cost.read"
    - "tools.data_gov.read"
    - "tools.risk.read"
    - "tools.risk.write"
  dry_run_behavior: "validation_only"
# === Audit Retention & Compaction ===
# Audit retention: script task referencing ops/scripts/audit_cleanup.py.
# "dry_run" is required and defaults to true (report only), so callers must
# pass dry_run: false explicitly for the job to change anything.
- id: "audit_cleanup"
  title: "Audit JSONL Cleanup"
  description: "Delete or gzip-archive audit JSONL files older than retention_days. Enforces data governance policy."
  tags: ["ops", "retention", "audit"]
  service: "infrastructure"
  runner: "script"
  command_ref: "ops/scripts/audit_cleanup.py"
  timeout_sec: 300
  inputs_schema:
    type: "object"
    properties:
      retention_days:
        type: "integer"
        minimum: 1
        maximum: 365
        default: 30
        description: "Delete/archive files older than this many days (from data_governance_policy.yml default)"
      dry_run:
        type: "boolean"
        default: true
        description: "If true: report only, no changes"
      archive_gzip:
        type: "boolean"
        default: false
        description: "Compress to .jsonl.gz before deleting"
      audit_dir:
        type: "string"
        default: "ops/audit"
        description: "Path to audit JSONL directory (relative to repo root)"
    required: ["retention_days", "dry_run"]
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.ops"
  dry_run_behavior: "report_only"
# Audit compaction: script task referencing ops/scripts/audit_compact.py.
# "window_days" (1-30) and "dry_run" are required; dry_run defaults to true
# (count lines only, do not write).
- id: "audit_compact"
  title: "Audit JSONL Compaction"
  description: "Merge last N days of audit JSONL into a single compressed artifact for forensics or fast analysis."
  tags: ["ops", "retention", "audit"]
  service: "infrastructure"
  runner: "script"
  command_ref: "ops/scripts/audit_compact.py"
  timeout_sec: 180
  inputs_schema:
    type: "object"
    properties:
      window_days:
        type: "integer"
        minimum: 1
        maximum: 30
        default: 7
        description: "Compact files from last N days"
      output_path:
        type: "string"
        description: "Output directory for compact file (default: ops/audit/compact)"
      dry_run:
        type: "boolean"
        default: true
        description: "If true: count lines only, do not write"
      audit_dir:
        type: "string"
        default: "ops/audit"
    required: ["window_days", "dry_run"]
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.jobs.run.ops"
  dry_run_behavior: "report_only"
# === Scheduled Operational Jobs (daily/weekly) ===
#
# Schedule guidance (add to your cron / systemd timer):
# Daily 03:30: audit_cleanup
# Daily 09:00: daily_cost_digest
# Daily 09:10: daily_privacy_digest
# Weekly Mon 02:00: weekly_drift_full
# Weekly Mon 08:00: weekly_incident_digest
#
# Example cron (NODE1, as ops user):
# 30 3 * * * /usr/local/bin/job_runner.sh audit_cleanup '{"retention_days":30,"dry_run":false}'
# 0 9 * * * /usr/local/bin/job_runner.sh daily_cost_digest '{}'
# 10 9 * * * /usr/local/bin/job_runner.sh daily_privacy_digest '{}'
# 0 2 * * 1 /usr/local/bin/job_runner.sh weekly_drift_full '{}'
# 0 8 * * 1 /usr/local/bin/job_runner.sh weekly_incident_digest '{}'
# Daily cost digest: internal-runner task. All inputs are optional and
# undeclared inputs are rejected (additionalProperties: false).
- id: "daily_cost_digest"
  title: "Daily Cost & FinOps Digest"
  description: "Runs cost_analyzer_tool.digest for last 24h (backend=auto) and saves markdown + JSON artifacts."
  tags: ["ops", "finops", "scheduled", "daily"]
  service: "infrastructure"
  runner: "internal"
  timeout_sec: 60
  inputs_schema:
    type: "object"
    properties:
      window_hours:
        type: "integer"
        default: 24
        description: "Analysis window in hours"
      baseline_hours:
        type: "integer"
        default: 168
        description: "Baseline window for anomaly comparison (7d)"
      top_n:
        type: "integer"
        default: 10
        description: "Top-N tools/agents to include"
      backend:
        type: "string"
        enum: ["auto", "jsonl", "postgres"]
        default: "auto"
        description: "Audit data source"
      output_dir:
        type: "string"
        default: "ops/reports/cost"
        description: "Directory to write YYYY-MM-DD.json and .md artifacts"
    required: []
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.cost.read"
    - "tools.jobs.run.ops"
# Daily privacy/audit digest: internal-runner task. All inputs are optional
# and undeclared inputs are rejected (additionalProperties: false).
- id: "daily_privacy_digest"
  title: "Daily Privacy & Audit Digest"
  description: "Runs data_governance_tool.digest_audit for last 24h (backend=auto) and saves markdown + JSON artifacts."
  tags: ["ops", "privacy", "scheduled", "daily"]
  service: "infrastructure"
  runner: "internal"
  timeout_sec: 60
  inputs_schema:
    type: "object"
    properties:
      window_hours:
        type: "integer"
        default: 24
        description: "Audit scan window in hours"
      max_findings:
        type: "integer"
        default: 20
        description: "Max findings to include in digest"
      backend:
        type: "string"
        enum: ["auto", "jsonl", "postgres"]
        default: "auto"
        description: "Audit data source"
      output_dir:
        type: "string"
        default: "ops/reports/privacy"
        description: "Directory to write YYYY-MM-DD.json and .md artifacts"
    required: []
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.data_gov.read"
    - "tools.jobs.run.ops"
# Weekly full drift analysis: internal-runner task. By default every drift
# category is analyzed with the "dev" severity profile.
- id: "weekly_drift_full"
  title: "Weekly Full Drift Analysis"
  description: "Runs drift_analyzer_tool with all categories and saves JSON artifact to ops/reports/drift/."
  tags: ["ops", "drift", "scheduled", "weekly"]
  service: "infrastructure"
  runner: "internal"
  timeout_sec: 120
  inputs_schema:
    type: "object"
    properties:
      drift_categories:
        type: "array"
        items:
          type: "string"
          enum: ["services", "openapi", "nats", "tools"]
        default: ["services", "openapi", "nats", "tools"]
        description: "Categories to analyze"
      drift_profile:
        type: "string"
        enum: ["dev", "release_gate"]
        default: "dev"
        description: "Severity profile for drift analysis"
      output_dir:
        type: "string"
        default: "ops/reports/drift"
        description: "Directory for week-YYYY-WW.json artifact"
    required: []
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.drift.read"
    - "tools.jobs.run.ops"
# === Weekly Incident Intelligence Digest (every Monday 08:00) ===
# Weekly incident digest: internal-runner task scheduled for Mondays at
# 08:00 UTC, limited to one concurrent run.
- id: "weekly_incident_digest"
  title: "Weekly Incident Intelligence Digest"
  description: "Generates weekly incident digest: correlation stats, recurrence tables (7d/30d), and recommendations. Saves md+json to ops/reports/incidents/weekly/."
  tags: ["incidents", "intelligence", "scheduled", "weekly"]
  runner: "internal"
  schedule: "0 8 * * 1"  # Monday 08:00 UTC
  timeout_sec: 120
  concurrency: 1
  on_failure: "log_and_continue"
  inputs_schema:
    type: "object"
    properties:
      save_artifacts:
        type: "boolean"
        default: true
        # NOTE(review): the description mentions output_dir, but no
        # output_dir input is declared and additionalProperties is false.
        description: "Write md+json artifacts to output_dir"
      workspace_id:
        type: "string"
        default: "default"
      agent_id:
        type: "string"
        default: "sofiia"
    required: []
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.oncall.incident_write"
    - "tools.jobs.run.ops"
  output_artifacts:
  - pattern: "ops/reports/incidents/weekly/YYYY-WW.json"
  - pattern: "ops/reports/incidents/weekly/YYYY-WW.md"
# === Alert Triage Loop (scheduled, every 5 min, 0 LLM tokens) ===
# Alert triage loop: script task referencing
# ops/scripts/alert_triage_loop.py, scheduled every 5 minutes and limited to
# one concurrent run. The 240 s timeout is shorter than the 5-minute
# schedule interval.
- id: "alert_triage_loop"
  title: "Alert Triage Loop"
  description: "Poll unacked alerts and create/update incidents deterministically. 0 LLM tokens in steady state (llm_mode=off)."
  tags: ["alerts", "incidents", "scheduled"]
  runner: "script"
  command_ref: "ops/scripts/alert_triage_loop.py"
  schedule: "*/5 * * * *"
  timeout_sec: 240
  concurrency: 1
  on_failure: "log_and_continue"
  inputs_schema:
    type: "object"
    properties:
      policy_profile:
        type: "string"
        default: "default"
        description: "Routing policy profile"
      dry_run:
        type: "boolean"
        default: false
        description: "Simulate without writes"
      workspace_id:
        type: "string"
        default: "default"
      agent_id:
        type: "string"
        default: "sofiia"
    required: []
    additionalProperties: false
  permissions:
    entitlements_required:
    - "tools.alerts.read"
    - "tools.alerts.ack"
    - "tools.oncall.incident_write"
# === Deploy (requires explicit entitlement) ===
# Canary deploy: script task gated by the tools.jobs.run.deploy entitlement.
# "service" and "version" are required; "percentage" is 1-100, default 10.
- id: "deploy_canary"
  title: "Deploy canary"
  description: "Deploy canary version of services"
  tags: ["deploy"]
  service: "infrastructure"
  runner: "script"
  # NOTE(review): this is the same command_ref as smoke_all
  # (ops/canary_all.sh); confirm the script actually performs the deploy.
  command_ref: "ops/canary_all.sh"
  timeout_sec: 600
  inputs_schema:
    type: "object"
    properties:
      service:
        type: "string"
        description: "Service to deploy"
      version:
        type: "string"
        description: "Version tag to deploy"
      percentage:
        type: "integer"
        minimum: 1
        maximum: 100
        default: 10
    # NOTE(review): additionalProperties is not set here, unlike most other
    # script tasks in this registry; confirm that is intentional.
    required: ["service", "version"]
  permissions:
    entitlements_required:
    - "tools.jobs.run.deploy"
  dry_run_behavior: "show_plan"
# === Risk History & Digest ===
# Hourly risk snapshot: internal-runner task scheduled at minute 0 of every
# hour; "env" selects prod (default) or staging.
- id: "hourly_risk_snapshot"
  title: "Hourly Risk Snapshot"
  description: "Compute and persist risk scores for all known services into risk_history store."
  tags: ["risk", "ops", "scheduled"]
  service: "infrastructure"
  runner: "internal"
  schedule: "0 * * * *"  # every hour
  timeout_sec: 120
  inputs_schema:
    type: "object"
    properties:
      env:
        type: "string"
        enum: ["prod", "staging"]
        default: "prod"
        description: "Environment to snapshot"
  permissions:
    entitlements_required:
    - "tools.risk.write"
  dry_run_behavior: "report_only"
# Daily risk digest: internal-runner task scheduled daily at 09:00 UTC;
# "date" optionally overrides the digest date.
- id: "daily_risk_digest"
  title: "Daily Risk Digest"
  description: "Generate daily risk digest (md+json) in ops/reports/risk/. Runs at policy.digest.daily_hour_utc (default 09:00 UTC)."
  tags: ["risk", "ops", "digest", "scheduled"]
  service: "infrastructure"
  runner: "internal"
  schedule: "0 9 * * *"  # daily at 09:00 UTC
  timeout_sec: 60
  inputs_schema:
    type: "object"
    properties:
      env:
        type: "string"
        enum: ["prod", "staging"]
        default: "prod"
      date:
        type: "string"
        description: "Override date (YYYY-MM-DD). Default: today UTC."
  permissions:
    entitlements_required:
    - "tools.risk.write"
  dry_run_behavior: "report_only"
# Risk history retention: internal-runner task scheduled daily at 03:20 UTC;
# "retention_days" is bounded to 7-365 with a default of 90.
- id: "risk_history_cleanup"
  title: "Risk History Cleanup"
  description: "Delete risk_history records older than retention_days (default 90d)."
  tags: ["risk", "ops", "retention", "scheduled"]
  service: "infrastructure"
  runner: "internal"
  schedule: "20 3 * * *"  # daily at 03:20 UTC
  timeout_sec: 60
  inputs_schema:
    type: "object"
    properties:
      retention_days:
        type: "integer"
        minimum: 7
        maximum: 365
        default: 90
        description: "Retention period in days"
  permissions:
    entitlements_required:
    - "tools.risk.write"
  dry_run_behavior: "report_only"
# Weekly platform priority digest: internal-runner task scheduled for
# Mondays at 06:00 UTC; "auto_followup" defaults to true.
- id: "weekly_platform_priority_digest"
  title: "Weekly Platform Priority Digest"
  description: "Generate Architecture Pressure digest for all services. Outputs ops/reports/platform/YYYY-WW.md + .json. Auto-creates architecture-review followups for services with pressure >= require_arch_review_at."
  tags: ["pressure", "architecture", "digest", "scheduled"]
  service: "infrastructure"
  runner: "internal"
  schedule: "0 6 * * 1"  # every Monday at 06:00 UTC
  timeout_sec: 120
  inputs_schema:
    type: "object"
    properties:
      env:
        type: "string"
        enum: ["prod", "staging", "dev"]
        default: "prod"
      auto_followup:
        type: "boolean"
        default: true
        description: "Auto-create architecture-review followups"
      top_n:
        type: "integer"
        default: 10
  permissions:
    entitlements_required:
    - "tools.pressure.write"
  dry_run_behavior: "report_only"
# Weekly backlog generation: internal-runner task scheduled for Mondays at
# 06:20 UTC, 20 minutes after weekly_platform_priority_digest.
- id: "weekly_backlog_generate"
  title: "Weekly Backlog Auto-Generation"
  description: "Auto-generate Engineering Backlog items from latest weekly Platform Priority Digest. Runs after weekly_platform_priority_digest (06:00 UTC Monday)."
  tags: ["backlog", "platform", "scheduled"]
  service: "infrastructure"
  runner: "internal"
  schedule: "20 6 * * 1"  # every Monday at 06:20 UTC (20 min after digest)
  timeout_sec: 120
  inputs_schema:
    type: "object"
    properties:
      env:
        type: "string"
        enum: ["prod", "staging", "dev"]
        default: "prod"
      week_str:
        type: "string"
        description: "Override ISO week (YYYY-WNN). Default: current week."
  permissions:
    entitlements_required:
    - "tools.backlog.admin"
  dry_run_behavior: "report_only"
# Backlog retention: internal-runner task scheduled daily at 03:40 UTC;
# "retention_days" is bounded to 7-730 with a default of 180.
- id: "daily_backlog_cleanup"
  title: "Daily Backlog Cleanup"
  description: "Remove done/canceled backlog items older than retention_days (default 180d)."
  tags: ["backlog", "ops", "retention", "scheduled"]
  service: "infrastructure"
  runner: "internal"
  schedule: "40 3 * * *"  # daily at 03:40 UTC
  timeout_sec: 60
  inputs_schema:
    type: "object"
    properties:
      retention_days:
        type: "integer"
        minimum: 7
        maximum: 730
        default: 180
  permissions:
    entitlements_required:
    - "tools.backlog.admin"
  dry_run_behavior: "report_only"