# Job Orchestrator Task Registry
# Defines allowlisted operational tasks that can be executed via job_orchestrator_tool
# Only tasks defined here can be run - no arbitrary command execution
#
# Entry layout (keys as they appear on the tasks below):
#   id, title, description, tags   identity and human-readable metadata
#   service                        service the task targets (absent on some tasks)
#   runner                         "script" (paired with command_ref) or "internal"
#   command_ref                    script path for script runners; null/absent for internal
#   timeout_sec                    time limit in seconds
#   inputs_schema                  JSON-Schema-style description of the accepted inputs
#   permissions                    entitlements_required: entitlements listed for the task
#   dry_run_behavior               named dry-run mode (not set on every task)
#   schedule, concurrency,         optional keys that appear on some scheduled tasks
#   on_failure, output_artifacts
#
# NOTE(review): drift_check_node1, release_check, deploy_canary and the
# risk / pressure / backlog jobs omit "additionalProperties: false" while every
# other schema sets it. Confirm whether undeclared inputs should be accepted.

tasks:
  # === Smoke Tests ===
  - id: "smoke_gateway"
    title: "Smoke test gateway"
    description: "Run smoke tests against the gateway service"
    tags: ["smoke", "ops"]
    service: "gateway"
    runner: "script"
    command_ref: "ops/smoke_helion_stack.sh"
    timeout_sec: 300
    inputs_schema:
      type: "object"
      properties: {}
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.smoke"
    dry_run_behavior: "show_help"

  - id: "smoke_all"
    title: "Smoke test all services"
    description: "Run smoke tests against all services in the stack"
    tags: ["smoke", "ops"]
    runner: "script"
    command_ref: "ops/canary_all.sh"
    timeout_sec: 600
    inputs_schema:
      type: "object"
      properties:
        service:
          type: "string"
          description: "Optional specific service to test"
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.smoke"
    dry_run_behavior: "validation_only"

  # === Drift Checks ===
  - id: "drift_check_node1"
    title: "Drift check NODE1"
    description: "Check infrastructure drift on production node"
    tags: ["drift", "ops"]
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/status.sh"
    timeout_sec: 300
    inputs_schema:
      type: "object"
      properties:
        mode:
          type: "string"
          enum: ["quick", "full"]
          default: "quick"
      # NOTE(review): mode is required and also has a default - confirm whether
      # callers must always pass it.
      required: ["mode"]
    permissions:
      entitlements_required:
        - "tools.jobs.run.drift"
    dry_run_behavior: "validation_only"

  # === Backup Validation ===
  - id: "backup_validate"
    title: "Validate backup integrity"
    description: "Verify backup files are present and valid"
    tags: ["backup", "ops"]
    service: "storage"
    runner: "script"
    # NOTE(review): the script name does not mention backups - confirm this is
    # the intended target.
    command_ref: "ops/check_daarwizz_awareness.sh"
    timeout_sec: 600
    inputs_schema:
      type: "object"
      properties:
        backup_path:
          type: "string"
          description: "Path to backup directory"
        check_integrity:
          type: "boolean"
          default: true
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.backup"
    dry_run_behavior: "list_files"

  # === Contract Checks ===
  - id: "contract_check_router"
    title: "Contract check router"
    description: "Verify OpenAPI contract compatibility for router"
    tags: ["migrate", "ops"]
    service: "router"
    runner: "script"
    command_ref: "ops/canary_router_contract.sh"
    timeout_sec: 300
    inputs_schema:
      type: "object"
      properties:
        strict:
          type: "boolean"
          default: false
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.migrate"
    dry_run_behavior: "validation_only"

  # === Delivery Priority Check ===
  - id: "delivery_priority_check"
    title: "Delivery priority check"
    description: "Verify message delivery priority configuration"
    tags: ["ops"]
    service: "gateway"
    runner: "script"
    command_ref: "ops/canary_gateway_delivery_priority.sh"
    timeout_sec: 180
    inputs_schema:
      type: "object"
      properties: {}
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "show_help"

  # === Monitor ===
  - id: "monitor_notification"
    title: "Monitor notification check"
    description: "Check if monitoring notifications are working"
    tags: ["ops"]
    service: "monitoring"
    runner: "script"
    command_ref: "ops/monitor_notify_sofiia.sh"
    timeout_sec: 120
    inputs_schema:
      type: "object"
      properties: {}
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "show_help"

  # === Release Gate (internal runner: invokes tool endpoints sequentially) ===
  - id: "release_check"
    title: "Release Gate Check"
    description: >
      Orchestrates all release gates: PR review, config lint, contract diff,
      threat model, optional smoke/drift. Returns one structured pass/fail verdict.
    tags: ["release", "gate", "ops"]
    runner: "internal"  # NOT a shell script; uses release_check_runner.py
    command_ref: null  # No shell command — internal Python runner
    timeout_sec: 600  # 10 min max for all gates
    inputs_schema:
      type: "object"
      properties:
        # --- Change under review ---
        # NOTE(review): the diff_text description mentions repo_path, which is
        # not declared in this schema - confirm.
        diff_text:
          type: "string"
          description: "Unified diff text (optional if repo_path provided)"
        service_name:
          type: "string"
          description: "Name of the service being released"
        openapi_base:
          type: "string"
          description: "Base OpenAPI spec (text or repo path)"
        openapi_head:
          type: "string"
          description: "Head OpenAPI spec (text or repo path)"
        risk_profile:
          type: "string"
          enum: ["default", "agentic_tools", "public_api"]
          default: "default"
          description: "Threat model risk profile"
        fail_fast:
          type: "boolean"
          default: false
          description: "Stop at first failing gate"
        run_smoke:
          type: "boolean"
          default: false
          description: "Run smoke tests after static gates pass"
        # --- Dependency scan gate ---
        run_deps:
          type: "boolean"
          default: true
          description: "Run dependency vulnerability scan (gate 3)"
        deps_targets:
          type: "array"
          items:
            type: "string"
            enum: ["python", "node"]
          description: "Ecosystems to scan (default: python + node)"
        deps_vuln_mode:
          type: "string"
          enum: ["online", "offline_cache"]
          default: "offline_cache"
          description: "OSV query mode: online or offline_cache"
        deps_fail_on:
          type: "array"
          items:
            type: "string"
            enum: ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
          description: "Severity levels that block release (default: CRITICAL, HIGH)"
        deps_timeout_sec:
          type: "number"
          default: 40
          description: "Timeout for dependency scan in seconds"
        gate_profile:
          type: "string"
          enum: ["dev", "staging", "prod"]
          default: "dev"
          description: "Gate strictness profile (dev=warn-first, staging/prod=strict privacy)"
        # --- SLO watch gate ---
        run_slo_watch:
          type: "boolean"
          default: true
          description: "Run SLO watch gate (warns/blocks if service has active SLO violations)"
        slo_watch_window_minutes:
          type: "integer"
          default: 60
          description: "SLO evaluation window in minutes"
        # --- Follow-up watch gate ---
        run_followup_watch:
          type: "boolean"
          default: true
          description: "Run follow-up watch gate (checks open P0/P1 incidents and overdue follow-ups)"
        followup_watch_window_days:
          type: "integer"
          default: 30
          description: "Window for follow-up/incident scan in days"
        followup_watch_env:
          type: "string"
          enum: ["prod", "staging", "any"]
          default: "any"
          description: "Filter incidents by environment"
        # --- Privacy watch gate ---
        run_privacy_watch:
          type: "boolean"
          default: true
          description: "Run privacy/data-governance warning gate (always pass=true, adds recommendations)"
        privacy_watch_mode:
          type: "string"
          enum: ["fast", "full"]
          default: "fast"
          description: "Scan mode: fast=.py/.yml/.json only, full=all extensions"
        privacy_audit_window_hours:
          type: "integer"
          default: 24
          description: "Time window for audit stream scan in hours"
        # --- Cost watch gate ---
        run_cost_watch:
          type: "boolean"
          default: true
          description: "Run cost_watch warning gate (always pass=true, adds recommendations)"
        cost_watch_window_hours:
          type: "integer"
          default: 24
          description: "Window for anomaly detection in hours (default 24)"
        cost_spike_ratio_threshold:
          type: "number"
          default: 3.0
          description: "Cost spike ratio to flag as warning (default 3.0x baseline)"
        cost_min_calls_threshold:
          type: "integer"
          default: 50
          description: "Min calls in window to qualify as anomaly (default 50)"
        # --- Risk watch gate ---
        run_risk_watch:
          type: "boolean"
          default: true
          description: "Run risk_watch gate: warn/block if service risk score exceeds threshold"
        risk_watch_env:
          type: "string"
          enum: ["prod", "staging"]
          default: "prod"
          description: "Environment for risk score evaluation"
        risk_watch_warn_at:
          type: "integer"
          description: "Override warn threshold (default from risk_policy.yml)"
        risk_watch_fail_at:
          type: "integer"
          description: "Override fail threshold (default from risk_policy.yml per-service override)"
        # --- Risk delta watch gate ---
        run_risk_delta_watch:
          type: "boolean"
          default: true
          description: "Run risk_delta_watch gate: block staging for p0_services if score rose >= fail_delta in 24h"
        risk_delta_env:
          type: "string"
          enum: ["prod", "staging"]
          default: "prod"
          description: "Environment for risk delta evaluation"
        risk_delta_hours:
          type: "integer"
          default: 24
          description: "Baseline window in hours (default 24h)"
        risk_delta_warn:
          type: "integer"
          description: "Override delta warn threshold (default from risk_policy.yml)"
        risk_delta_fail:
          type: "integer"
          description: "Override delta fail threshold (default from risk_policy.yml)"
        run_drift:
          type: "boolean"
          default: false
          description: "Run drift check after static gates pass"
      required: ["service_name"]
    permissions:
      entitlements_required:
        - "tools.pr_review.gate"
        - "tools.contract.gate"
        - "tools.config_lint.gate"
        - "tools.threatmodel.gate"
        - "tools.deps.gate"
        - "tools.cost.read"
        - "tools.data_gov.read"
        - "tools.risk.read"
        - "tools.risk.write"
    dry_run_behavior: "validation_only"

  # === Audit Retention & Compaction ===
  - id: "audit_cleanup"
    title: "Audit JSONL Cleanup"
    description: "Delete or gzip-archive audit JSONL files older than retention_days. Enforces data governance policy."
    tags: ["ops", "retention", "audit"]
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/scripts/audit_cleanup.py"
    timeout_sec: 300
    inputs_schema:
      type: "object"
      properties:
        retention_days:
          type: "integer"
          minimum: 1
          maximum: 365
          default: 30
          description: "Delete/archive files older than this many days (from data_governance_policy.yml default)"
        dry_run:
          type: "boolean"
          default: true
          description: "If true: report only, no changes"
        archive_gzip:
          type: "boolean"
          default: false
          description: "Compress to .jsonl.gz before deleting"
        audit_dir:
          type: "string"
          default: "ops/audit"
          description: "Path to audit JSONL directory (relative to repo root)"
      required: ["retention_days", "dry_run"]
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "report_only"

  - id: "audit_compact"
    title: "Audit JSONL Compaction"
    description: "Merge last N days of audit JSONL into a single compressed artifact for forensics or fast analysis."
    tags: ["ops", "retention", "audit"]
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/scripts/audit_compact.py"
    timeout_sec: 180
    inputs_schema:
      type: "object"
      properties:
        window_days:
          type: "integer"
          minimum: 1
          maximum: 30
          default: 7
          description: "Compact files from last N days"
        output_path:
          type: "string"
          description: "Output directory for compact file (default: ops/audit/compact)"
        dry_run:
          type: "boolean"
          default: true
          description: "If true: count lines only, do not write"
        audit_dir:
          type: "string"
          default: "ops/audit"
      required: ["window_days", "dry_run"]
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "report_only"

  # === Scheduled Operational Jobs (daily/weekly) ===
  #
  # Schedule guidance (add to your cron / systemd timer):
  #   Daily 03:30:        audit_cleanup
  #   Daily 09:00:        daily_cost_digest
  #   Daily 09:10:        daily_privacy_digest
  #   Weekly Mon 02:00:   weekly_drift_full
  #   Weekly Mon 08:00:   weekly_incident_digest
  #
  # Example cron (NODE1, as ops user):
  #   audit_cleanup's schema requires both retention_days and dry_run, so the
  #   example passes both; dry_run=false is what lets the run make changes.
  #   30 3 * * *   /usr/local/bin/job_runner.sh audit_cleanup '{"retention_days":30,"dry_run":false}'
  #   0 9 * * *    /usr/local/bin/job_runner.sh daily_cost_digest '{}'
  #   10 9 * * *   /usr/local/bin/job_runner.sh daily_privacy_digest '{}'
  #   0 2 * * 1    /usr/local/bin/job_runner.sh weekly_drift_full '{}'
  #   0 8 * * 1    /usr/local/bin/job_runner.sh weekly_incident_digest '{}'

  - id: "daily_cost_digest"
    title: "Daily Cost & FinOps Digest"
    description: "Runs cost_analyzer_tool.digest for last 24h (backend=auto) and saves markdown + JSON artifacts."
    tags: ["ops", "finops", "scheduled", "daily"]
    service: "infrastructure"
    runner: "internal"
    timeout_sec: 60
    inputs_schema:
      type: "object"
      properties:
        window_hours:
          type: "integer"
          default: 24
          description: "Analysis window in hours"
        baseline_hours:
          type: "integer"
          default: 168
          description: "Baseline window for anomaly comparison (7d)"
        top_n:
          type: "integer"
          default: 10
          description: "Top-N tools/agents to include"
        backend:
          type: "string"
          enum: ["auto", "jsonl", "postgres"]
          default: "auto"
          description: "Audit data source"
        output_dir:
          type: "string"
          default: "ops/reports/cost"
          description: "Directory to write YYYY-MM-DD.json and .md artifacts"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.cost.read"
        - "tools.jobs.run.ops"

  - id: "daily_privacy_digest"
    title: "Daily Privacy & Audit Digest"
    description: "Runs data_governance_tool.digest_audit for last 24h (backend=auto) and saves markdown + JSON artifacts."
    tags: ["ops", "privacy", "scheduled", "daily"]
    service: "infrastructure"
    runner: "internal"
    timeout_sec: 60
    inputs_schema:
      type: "object"
      properties:
        window_hours:
          type: "integer"
          default: 24
          description: "Audit scan window in hours"
        max_findings:
          type: "integer"
          default: 20
          description: "Max findings to include in digest"
        backend:
          type: "string"
          enum: ["auto", "jsonl", "postgres"]
          default: "auto"
          description: "Audit data source"
        output_dir:
          type: "string"
          default: "ops/reports/privacy"
          description: "Directory to write YYYY-MM-DD.json and .md artifacts"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.data_gov.read"
        - "tools.jobs.run.ops"

  - id: "weekly_drift_full"
    title: "Weekly Full Drift Analysis"
    description: "Runs drift_analyzer_tool with all categories and saves JSON artifact to ops/reports/drift/."
    tags: ["ops", "drift", "scheduled", "weekly"]
    service: "infrastructure"
    runner: "internal"
    timeout_sec: 120
    inputs_schema:
      type: "object"
      properties:
        drift_categories:
          type: "array"
          items:
            type: "string"
            enum: ["services", "openapi", "nats", "tools"]
          default: ["services", "openapi", "nats", "tools"]
          description: "Categories to analyze"
        drift_profile:
          type: "string"
          enum: ["dev", "release_gate"]
          default: "dev"
          description: "Severity profile for drift analysis"
        output_dir:
          type: "string"
          default: "ops/reports/drift"
          description: "Directory for week-YYYY-WW.json artifact"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.drift.read"
        - "tools.jobs.run.ops"

  # === Weekly Incident Intelligence Digest (every Monday 08:00) ===
  - id: "weekly_incident_digest"
    title: "Weekly Incident Intelligence Digest"
    description: "Generates weekly incident digest: correlation stats, recurrence tables (7d/30d), and recommendations. Saves md+json to ops/reports/incidents/weekly/."
    tags: ["incidents", "intelligence", "scheduled", "weekly"]
    runner: "internal"
    schedule: "0 8 * * 1"  # Monday 08:00 UTC
    timeout_sec: 120
    concurrency: 1
    on_failure: "log_and_continue"
    inputs_schema:
      type: "object"
      properties:
        save_artifacts:
          type: "boolean"
          default: true
          description: "Write md+json artifacts to output_dir"
        workspace_id:
          type: "string"
          default: "default"
        agent_id:
          type: "string"
          default: "sofiia"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.oncall.incident_write"
        - "tools.jobs.run.ops"
    output_artifacts:
      - pattern: "ops/reports/incidents/weekly/YYYY-WW.json"
      - pattern: "ops/reports/incidents/weekly/YYYY-WW.md"

  # === Alert Triage Loop (scheduled, every 5 min, 0 LLM tokens) ===
  - id: "alert_triage_loop"
    title: "Alert Triage Loop"
    description: "Poll unacked alerts and create/update incidents deterministically. 0 LLM tokens in steady state (llm_mode=off)."
    tags: ["alerts", "incidents", "scheduled"]
    runner: "script"
    command_ref: "ops/scripts/alert_triage_loop.py"
    schedule: "*/5 * * * *"
    timeout_sec: 240
    concurrency: 1
    on_failure: "log_and_continue"
    inputs_schema:
      type: "object"
      properties:
        policy_profile:
          type: "string"
          default: "default"
          description: "Routing policy profile"
        dry_run:
          type: "boolean"
          default: false
          description: "Simulate without writes"
        workspace_id:
          type: "string"
          default: "default"
        agent_id:
          type: "string"
          default: "sofiia"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.alerts.read"
        - "tools.alerts.ack"
        - "tools.oncall.incident_write"

  # === Deploy (requires explicit entitlement) ===
  - id: "deploy_canary"
    title: "Deploy canary"
    description: "Deploy canary version of services"
    tags: ["deploy"]
    service: "infrastructure"
    runner: "script"
    # NOTE(review): same command_ref as smoke_all (ops/canary_all.sh) - confirm
    # the deploy task is meant to share the smoke script.
    command_ref: "ops/canary_all.sh"
    timeout_sec: 600
    inputs_schema:
      type: "object"
      properties:
        service:
          type: "string"
          description: "Service to deploy"
        version:
          type: "string"
          description: "Version tag to deploy"
        percentage:
          type: "integer"
          minimum: 1
          maximum: 100
          default: 10
      required: ["service", "version"]
    permissions:
      entitlements_required:
        - "tools.jobs.run.deploy"
    dry_run_behavior: "show_plan"

  # === Risk History & Digest ===
  - id: "hourly_risk_snapshot"
    title: "Hourly Risk Snapshot"
    description: "Compute and persist risk scores for all known services into risk_history store."
    tags: ["risk", "ops", "scheduled"]
    service: "infrastructure"
    runner: "internal"
    schedule: "0 * * * *"  # every hour
    timeout_sec: 120
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum: ["prod", "staging"]
          default: "prod"
          description: "Environment to snapshot"
    permissions:
      entitlements_required:
        - "tools.risk.write"
    dry_run_behavior: "report_only"

  - id: "daily_risk_digest"
    title: "Daily Risk Digest"
    description: "Generate daily risk digest (md+json) in ops/reports/risk/. Runs at policy.digest.daily_hour_utc (default 09:00 UTC)."
    tags: ["risk", "ops", "digest", "scheduled"]
    service: "infrastructure"
    runner: "internal"
    schedule: "0 9 * * *"  # daily at 09:00 UTC
    timeout_sec: 60
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum: ["prod", "staging"]
          default: "prod"
        date:
          type: "string"
          description: "Override date (YYYY-MM-DD). Default: today UTC."
    permissions:
      entitlements_required:
        - "tools.risk.write"
    dry_run_behavior: "report_only"

  - id: "risk_history_cleanup"
    title: "Risk History Cleanup"
    description: "Delete risk_history records older than retention_days (default 90d)."
    tags: ["risk", "ops", "retention", "scheduled"]
    service: "infrastructure"
    runner: "internal"
    schedule: "20 3 * * *"  # daily at 03:20 UTC
    timeout_sec: 60
    inputs_schema:
      type: "object"
      properties:
        retention_days:
          type: "integer"
          minimum: 7
          maximum: 365
          default: 90
          description: "Retention period in days"
    permissions:
      entitlements_required:
        - "tools.risk.write"
    dry_run_behavior: "report_only"

  - id: "weekly_platform_priority_digest"
    title: "Weekly Platform Priority Digest"
    description: "Generate Architecture Pressure digest for all services. Outputs ops/reports/platform/YYYY-WW.md + .json. Auto-creates architecture-review followups for services with pressure >= require_arch_review_at."
    tags: ["pressure", "architecture", "digest", "scheduled"]
    service: "infrastructure"
    runner: "internal"
    schedule: "0 6 * * 1"  # every Monday at 06:00 UTC
    timeout_sec: 120
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum: ["prod", "staging", "dev"]
          default: "prod"
        auto_followup:
          type: "boolean"
          default: true
          description: "Auto-create architecture-review followups"
        top_n:
          type: "integer"
          default: 10
    permissions:
      entitlements_required:
        - "tools.pressure.write"
    dry_run_behavior: "report_only"

  - id: "weekly_backlog_generate"
    title: "Weekly Backlog Auto-Generation"
    description: "Auto-generate Engineering Backlog items from latest weekly Platform Priority Digest. Runs after weekly_platform_priority_digest (06:00 UTC Monday)."
    tags: ["backlog", "platform", "scheduled"]
    service: "infrastructure"
    runner: "internal"
    schedule: "20 6 * * 1"  # every Monday at 06:20 UTC (20 min after digest)
    timeout_sec: 120
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum: ["prod", "staging", "dev"]
          default: "prod"
        week_str:
          type: "string"
          description: "Override ISO week (YYYY-WNN). Default: current week."
    permissions:
      entitlements_required:
        - "tools.backlog.admin"
    dry_run_behavior: "report_only"

  - id: "daily_backlog_cleanup"
    title: "Daily Backlog Cleanup"
    description: "Remove done/canceled backlog items older than retention_days (default 180d)."
    tags: ["backlog", "ops", "retention", "scheduled"]
    service: "infrastructure"
    runner: "internal"
    schedule: "40 3 * * *"  # daily at 03:40 UTC
    timeout_sec: 60
    inputs_schema:
      type: "object"
      properties:
        retention_days:
          type: "integer"
          minimum: 7
          maximum: 730
          default: 180
    permissions:
      entitlements_required:
        - "tools.backlog.admin"
    dry_run_behavior: "report_only"