# microdao-daarion/ops/task_registry.yml

# Job Orchestrator Task Registry
# Defines allowlisted operational tasks that can be executed via job_orchestrator_tool
# Only tasks defined here can be run - no arbitrary command execution

tasks:
  # === Smoke Tests ===
  # Gateway smoke test: runs ops/smoke_helion_stack.sh and takes no inputs.
  - id: "smoke_gateway"
    title: "Smoke test gateway"
    description: "Run smoke tests against the gateway service"
    tags:
      - "smoke"
      - "ops"
    service: "gateway"
    runner: "script"
    command_ref: "ops/smoke_helion_stack.sh"
    timeout_sec: 300  # seconds
    # Empty object only: no properties declared, additional ones disallowed.
    inputs_schema:
      type: "object"
      properties: {}
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.smoke"
    dry_run_behavior: "show_help"

  # Stack-wide smoke test via ops/canary_all.sh.
  # NOTE(review): deploy_canary points at the same command_ref
  # (ops/canary_all.sh) — confirm a smoke task is meant to invoke that script.
  - id: "smoke_all"
    title: "Smoke test all services"
    description: "Run smoke tests against all services in the stack"
    tags:
      - "smoke"
      - "ops"
    runner: "script"
    command_ref: "ops/canary_all.sh"
    timeout_sec: 600  # seconds
    inputs_schema:
      type: "object"
      properties:
        # Not listed under `required`, so callers may omit it.
        service:
          type: "string"
          description: "Optional specific service to test"
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.smoke"
    dry_run_behavior: "validation_only"

  # === Drift Checks ===
  # Drift check for NODE1: runs ops/status.sh in "quick" or "full" mode.
  - id: "drift_check_node1"
    title: "Drift check NODE1"
    description: "Check infrastructure drift on production node"
    tags: ["drift", "ops"]
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/status.sh"
    timeout_sec: 300  # seconds
    inputs_schema:
      type: "object"
      properties:
        # NOTE(review): `mode` is both required and defaulted. Under plain
        # JSON Schema validation the default is never applied to a required
        # key — confirm whether callers must always pass it.
        mode:
          type: "string"
          enum: ["quick", "full"]
          default: "quick"
      required: ["mode"]
      # Reject undeclared inputs, as most other script-runner tasks in this
      # registry do, so nothing outside the allowlisted schema reaches the script.
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.drift"
    dry_run_behavior: "validation_only"

  # === Backup Validation ===
  # Backup validation task with an optional path and integrity toggle.
  # NOTE(review): the script name (check_daarwizz_awareness.sh) does not
  # obviously relate to backup validation — confirm command_ref is correct.
  - id: "backup_validate"
    title: "Validate backup integrity"
    description: "Verify backup files are present and valid"
    tags:
      - "backup"
      - "ops"
    service: "storage"
    runner: "script"
    command_ref: "ops/check_daarwizz_awareness.sh"
    timeout_sec: 600  # seconds
    # Both inputs are optional (no `required` list); extras are disallowed.
    inputs_schema:
      type: "object"
      properties:
        backup_path:
          type: "string"
          description: "Path to backup directory"
        check_integrity:
          type: "boolean"
          default: true
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.backup"
    dry_run_behavior: "list_files"

  # === Contract Checks ===
  # Router contract check: runs ops/canary_router_contract.sh.
  - id: "contract_check_router"
    title: "Contract check router"
    description: "Verify OpenAPI contract compatibility for router"
    tags:
      - "migrate"
      - "ops"
    service: "router"
    runner: "script"
    command_ref: "ops/canary_router_contract.sh"
    timeout_sec: 300  # seconds
    inputs_schema:
      type: "object"
      properties:
        # Optional flag; declared default is false.
        strict:
          type: "boolean"
          default: false
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.migrate"
    dry_run_behavior: "validation_only"

  # === Delivery Priority Check ===
  # Gateway delivery-priority check: runs
  # ops/canary_gateway_delivery_priority.sh and takes no inputs.
  - id: "delivery_priority_check"
    title: "Delivery priority check"
    description: "Verify message delivery priority configuration"
    tags:
      - "ops"
    service: "gateway"
    runner: "script"
    command_ref: "ops/canary_gateway_delivery_priority.sh"
    timeout_sec: 180  # seconds
    # Empty object only: no properties declared, additional ones disallowed.
    inputs_schema:
      type: "object"
      properties: {}
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "show_help"

  # === Monitor ===
  # Monitoring notification check: runs ops/monitor_notify_sofiia.sh and
  # takes no inputs.
  - id: "monitor_notification"
    title: "Monitor notification check"
    description: "Check if monitoring notifications are working"
    tags:
      - "ops"
    service: "monitoring"
    runner: "script"
    command_ref: "ops/monitor_notify_sofiia.sh"
    timeout_sec: 120  # seconds
    # Empty object only: no properties declared, additional ones disallowed.
    inputs_schema:
      type: "object"
      properties: {}
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "show_help"

  # === Release Gate (internal runner: invokes tool endpoints sequentially) ===
  # Release gate orchestration. Uses the internal runner (no shell command);
  # only `service_name` is required, every other input toggles or tunes a gate.
  - id: "release_check"
    title: "Release Gate Check"
    description: >
      Orchestrates all release gates: PR review, config lint, contract diff,
      threat model, optional smoke/drift. Returns one structured pass/fail verdict.
    tags: ["release", "gate", "ops"]
    runner: "internal"           # NOT a shell script; uses release_check_runner.py
    command_ref: null            # No shell command — internal Python runner
    timeout_sec: 600             # 10 min max for all gates
    # No `additionalProperties: false` here: undeclared inputs are accepted.
    inputs_schema:
      type: "object"
      properties:
        # --- Change under review ---
        diff_text:
          type: "string"
          description: "Unified diff text (optional if repo_path provided)"
        # Declared because diff_text's description refers to it.
        # NOTE(review): confirm release_check_runner.py reads this key.
        repo_path:
          type: "string"
          description: "Repository path (alternative to diff_text)"
        service_name:
          type: "string"
          description: "Name of the service being released"
        openapi_base:
          type: "string"
          description: "Base OpenAPI spec (text or repo path)"
        openapi_head:
          type: "string"
          description: "Head OpenAPI spec (text or repo path)"
        risk_profile:
          type: "string"
          enum: ["default", "agentic_tools", "public_api"]
          default: "default"
          description: "Threat model risk profile"
        fail_fast:
          type: "boolean"
          default: false
          description: "Stop at first failing gate"
        run_smoke:
          type: "boolean"
          default: false
          description: "Run smoke tests after static gates pass"
        # --- Dependency scan gate ---
        run_deps:
          type: "boolean"
          default: true
          description: "Run dependency vulnerability scan (gate 3)"
        deps_targets:
          type: "array"
          items:
            type: "string"
            enum: ["python", "node"]
          description: "Ecosystems to scan (default: python + node)"
        deps_vuln_mode:
          type: "string"
          enum: ["online", "offline_cache"]
          default: "offline_cache"
          description: "OSV query mode: online or offline_cache"
        deps_fail_on:
          type: "array"
          items:
            type: "string"
            enum: ["CRITICAL", "HIGH", "MEDIUM", "LOW"]
          description: "Severity levels that block release (default: CRITICAL, HIGH)"
        deps_timeout_sec:
          type: "number"
          default: 40
          description: "Timeout for dependency scan in seconds"
        gate_profile:
          type: "string"
          enum: ["dev", "staging", "prod"]
          default: "dev"
          description: "Gate strictness profile (dev=warn-first, staging/prod=strict privacy)"
        # --- SLO watch gate ---
        run_slo_watch:
          type: "boolean"
          default: true
          description: "Run SLO watch gate (warns/blocks if service has active SLO violations)"
        slo_watch_window_minutes:
          type: "integer"
          default: 60
          description: "SLO evaluation window in minutes"
        # --- Follow-up watch gate ---
        run_followup_watch:
          type: "boolean"
          default: true
          description: "Run follow-up watch gate (checks open P0/P1 incidents and overdue follow-ups)"
        followup_watch_window_days:
          type: "integer"
          default: 30
          description: "Window for follow-up/incident scan in days"
        followup_watch_env:
          type: "string"
          enum: ["prod", "staging", "any"]
          default: "any"
          description: "Filter incidents by environment"
        # --- Privacy watch gate ---
        run_privacy_watch:
          type: "boolean"
          default: true
          description: "Run privacy/data-governance warning gate (always pass=true, adds recommendations)"
        privacy_watch_mode:
          type: "string"
          enum: ["fast", "full"]
          default: "fast"
          description: "Scan mode: fast=.py/.yml/.json only, full=all extensions"
        privacy_audit_window_hours:
          type: "integer"
          default: 24
          description: "Time window for audit stream scan in hours"
        # --- Cost watch gate ---
        run_cost_watch:
          type: "boolean"
          default: true
          description: "Run cost_watch warning gate (always pass=true, adds recommendations)"
        cost_watch_window_hours:
          type: "integer"
          default: 24
          description: "Window for anomaly detection in hours (default 24)"
        cost_spike_ratio_threshold:
          type: "number"
          default: 3.0
          description: "Cost spike ratio to flag as warning (default 3.0x baseline)"
        cost_min_calls_threshold:
          type: "integer"
          default: 50
          description: "Min calls in window to qualify as anomaly (default 50)"
        # --- Risk watch gate ---
        run_risk_watch:
          type: "boolean"
          default: true
          description: "Run risk_watch gate: warn/block if service risk score exceeds threshold"
        risk_watch_env:
          type: "string"
          enum: ["prod", "staging"]
          default: "prod"
          description: "Environment for risk score evaluation"
        risk_watch_warn_at:
          type: "integer"
          description: "Override warn threshold (default from risk_policy.yml)"
        risk_watch_fail_at:
          type: "integer"
          description: "Override fail threshold (default from risk_policy.yml per-service override)"
        # --- Risk delta watch gate ---
        run_risk_delta_watch:
          type: "boolean"
          default: true
          description: "Run risk_delta_watch gate: block staging for p0_services if score rose >= fail_delta in 24h"
        risk_delta_env:
          type: "string"
          enum: ["prod", "staging"]
          default: "prod"
          description: "Environment for risk delta evaluation"
        risk_delta_hours:
          type: "integer"
          default: 24
          description: "Baseline window in hours (default 24h)"
        risk_delta_warn:
          type: "integer"
          description: "Override delta warn threshold (default from risk_policy.yml)"
        risk_delta_fail:
          type: "integer"
          description: "Override delta fail threshold (default from risk_policy.yml)"
        # --- Optional drift gate ---
        run_drift:
          type: "boolean"
          default: false
          description: "Run drift check after static gates pass"
      required: ["service_name"]
    permissions:
      entitlements_required:
        - "tools.pr_review.gate"
        - "tools.contract.gate"
        - "tools.config_lint.gate"
        - "tools.threatmodel.gate"
        - "tools.deps.gate"
        - "tools.cost.read"
        - "tools.data_gov.read"
        - "tools.risk.read"
        - "tools.risk.write"
    dry_run_behavior: "validation_only"

  # === Audit Retention & Compaction ===

  # Audit JSONL retention job: runs ops/scripts/audit_cleanup.py.
  # `retention_days` and `dry_run` must both be passed explicitly.
  - id: "audit_cleanup"
    title: "Audit JSONL Cleanup"
    description: >-
      Delete or gzip-archive audit JSONL files older than retention_days.
      Enforces data governance policy.
    tags:
      - "ops"
      - "retention"
      - "audit"
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/scripts/audit_cleanup.py"
    timeout_sec: 300  # seconds
    inputs_schema:
      type: "object"
      properties:
        # Bounded to 1..365 days.
        retention_days:
          type: "integer"
          minimum: 1
          maximum: 365
          default: 30
          description: >-
            Delete/archive files older than this many days
            (from data_governance_policy.yml default)
        dry_run:
          type: "boolean"
          default: true
          description: "If true: report only, no changes"
        archive_gzip:
          type: "boolean"
          default: false
          description: "Compress to .jsonl.gz before deleting"
        audit_dir:
          type: "string"
          default: "ops/audit"
          description: "Path to audit JSONL directory (relative to repo root)"
      required:
        - "retention_days"
        - "dry_run"
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "report_only"

  # Audit JSONL compaction job: runs ops/scripts/audit_compact.py.
  # `window_days` and `dry_run` must both be passed explicitly.
  - id: "audit_compact"
    title: "Audit JSONL Compaction"
    description: >-
      Merge last N days of audit JSONL into a single compressed artifact
      for forensics or fast analysis.
    tags:
      - "ops"
      - "retention"
      - "audit"
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/scripts/audit_compact.py"
    timeout_sec: 180  # seconds
    inputs_schema:
      type: "object"
      properties:
        # Bounded to 1..30 days.
        window_days:
          type: "integer"
          minimum: 1
          maximum: 30
          default: 7
          description: "Compact files from last N days"
        # No `default` key is declared; the default is stated only in the text.
        output_path:
          type: "string"
          description: "Output directory for compact file (default: ops/audit/compact)"
        dry_run:
          type: "boolean"
          default: true
          description: "If true: count lines only, do not write"
        audit_dir:
          type: "string"
          default: "ops/audit"
      required:
        - "window_days"
        - "dry_run"
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.ops"
    dry_run_behavior: "report_only"

  # === Scheduled Operational Jobs (daily/weekly) ===
  #
  # Schedule guidance (add to your cron / systemd timer):
  #   Daily 03:30:  audit_cleanup
  #   Daily 09:00:  daily_cost_digest
  #   Daily 09:10:  daily_privacy_digest
  #   Weekly Mon 02:00: weekly_drift_full
  #   Weekly Mon 08:00: weekly_incident_digest
  #
  # Example cron (NODE1, as ops user):
  #   30 3 * * *   /usr/local/bin/job_runner.sh audit_cleanup '{"retention_days":30,"dry_run":false}'
  #   0  9 * * *   /usr/local/bin/job_runner.sh daily_cost_digest '{}'
  #   10 9 * * *   /usr/local/bin/job_runner.sh daily_privacy_digest '{}'
  #   0  2 * * 1   /usr/local/bin/job_runner.sh weekly_drift_full '{}'
  #   0  8 * * 1   /usr/local/bin/job_runner.sh weekly_incident_digest '{}'

  # Daily cost digest (internal runner). All inputs are optional; this entry
  # declares no `schedule` key.
  - id: "daily_cost_digest"
    title: "Daily Cost & FinOps Digest"
    description: >-
      Runs cost_analyzer_tool.digest for last 24h (backend=auto) and saves
      markdown + JSON artifacts.
    tags:
      - "ops"
      - "finops"
      - "scheduled"
      - "daily"
    service: "infrastructure"
    runner: "internal"
    timeout_sec: 60  # seconds
    inputs_schema:
      type: "object"
      properties:
        window_hours:
          type: "integer"
          default: 24
          description: "Analysis window in hours"
        # 168 hours = 7 days.
        baseline_hours:
          type: "integer"
          default: 168
          description: "Baseline window for anomaly comparison (7d)"
        top_n:
          type: "integer"
          default: 10
          description: "Top-N tools/agents to include"
        backend:
          type: "string"
          enum:
            - "auto"
            - "jsonl"
            - "postgres"
          default: "auto"
          description: "Audit data source"
        output_dir:
          type: "string"
          default: "ops/reports/cost"
          description: "Directory to write YYYY-MM-DD.json and .md artifacts"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.cost.read"
        - "tools.jobs.run.ops"

  # Daily privacy/audit digest (internal runner). All inputs are optional;
  # this entry declares no `schedule` key.
  - id: "daily_privacy_digest"
    title: "Daily Privacy & Audit Digest"
    description: >-
      Runs data_governance_tool.digest_audit for last 24h (backend=auto) and
      saves markdown + JSON artifacts.
    tags:
      - "ops"
      - "privacy"
      - "scheduled"
      - "daily"
    service: "infrastructure"
    runner: "internal"
    timeout_sec: 60  # seconds
    inputs_schema:
      type: "object"
      properties:
        window_hours:
          type: "integer"
          default: 24
          description: "Audit scan window in hours"
        max_findings:
          type: "integer"
          default: 20
          description: "Max findings to include in digest"
        backend:
          type: "string"
          enum:
            - "auto"
            - "jsonl"
            - "postgres"
          default: "auto"
          description: "Audit data source"
        output_dir:
          type: "string"
          default: "ops/reports/privacy"
          description: "Directory to write YYYY-MM-DD.json and .md artifacts"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.data_gov.read"
        - "tools.jobs.run.ops"

  # Weekly full drift analysis (internal runner). All inputs are optional;
  # this entry declares no `schedule` key.
  - id: "weekly_drift_full"
    title: "Weekly Full Drift Analysis"
    description: >-
      Runs drift_analyzer_tool with all categories and saves JSON artifact
      to ops/reports/drift/.
    tags:
      - "ops"
      - "drift"
      - "scheduled"
      - "weekly"
    service: "infrastructure"
    runner: "internal"
    timeout_sec: 120  # seconds
    inputs_schema:
      type: "object"
      properties:
        # Default lists every category the enum allows.
        drift_categories:
          type: "array"
          items:
            type: "string"
            enum:
              - "services"
              - "openapi"
              - "nats"
              - "tools"
          default:
            - "services"
            - "openapi"
            - "nats"
            - "tools"
          description: "Categories to analyze"
        drift_profile:
          type: "string"
          enum:
            - "dev"
            - "release_gate"
          default: "dev"
          description: "Severity profile for drift analysis"
        output_dir:
          type: "string"
          default: "ops/reports/drift"
          description: "Directory for week-YYYY-WW.json artifact"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.drift.read"
        - "tools.jobs.run.ops"

  # === Weekly Incident Intelligence Digest (every Monday 08:00) ===
  # Weekly incident digest (internal runner), scheduled for Mondays 08:00 UTC.
  # NOTE(review): the cron guidance comment earlier in this file also lists
  # this job — confirm it is not triggered twice (registry schedule + cron).
  - id: "weekly_incident_digest"
    title: "Weekly Incident Intelligence Digest"
    description: "Generates weekly incident digest: correlation stats, recurrence tables (7d/30d), and recommendations. Saves md+json to ops/reports/incidents/weekly/."
    tags: ["incidents", "intelligence", "scheduled", "weekly"]
    runner: "internal"
    schedule: "0 8 * * 1"    # Monday 08:00 UTC
    timeout_sec: 120
    concurrency: 1
    on_failure: "log_and_continue"
    inputs_schema:
      type: "object"
      properties:
        # This schema has no `output_dir` input (and forbids undeclared ones),
        # so the description names the fixed directory listed under
        # output_artifacts instead.
        save_artifacts:
          type: "boolean"
          default: true
          description: "Write md+json artifacts to ops/reports/incidents/weekly/"
        workspace_id:
          type: "string"
          default: "default"
        agent_id:
          type: "string"
          default: "sofiia"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.oncall.incident_write"
        - "tools.jobs.run.ops"
    output_artifacts:
      - pattern: "ops/reports/incidents/weekly/YYYY-WW.json"
      - pattern: "ops/reports/incidents/weekly/YYYY-WW.md"

  # === Alert Triage Loop (scheduled, every 5 min, 0 LLM tokens) ===
  # Alert triage loop: runs ops/scripts/alert_triage_loop.py on a five-minute
  # cron with concurrency limited to 1. All inputs are optional.
  - id: "alert_triage_loop"
    title: "Alert Triage Loop"
    description: >-
      Poll unacked alerts and create/update incidents deterministically.
      0 LLM tokens in steady state (llm_mode=off).
    tags:
      - "alerts"
      - "incidents"
      - "scheduled"
    runner: "script"
    command_ref: "ops/scripts/alert_triage_loop.py"
    schedule: "*/5 * * * *"  # every 5 minutes
    timeout_sec: 240  # seconds; shorter than the 5-minute schedule interval
    concurrency: 1
    on_failure: "log_and_continue"
    inputs_schema:
      type: "object"
      properties:
        policy_profile:
          type: "string"
          default: "default"
          description: "Routing policy profile"
        # Declared default is false, unlike the audit_* tasks.
        dry_run:
          type: "boolean"
          default: false
          description: "Simulate without writes"
        workspace_id:
          type: "string"
          default: "default"
        agent_id:
          type: "string"
          default: "sofiia"
      required: []
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.alerts.read"
        - "tools.alerts.ack"
        - "tools.oncall.incident_write"

  # === Deploy (requires explicit entitlement) ===
  # Canary deploy: runs ops/canary_all.sh; `service` and `version` are required.
  # NOTE(review): smoke_all points at the same command_ref (ops/canary_all.sh) —
  # confirm both tasks are meant to invoke that script.
  - id: "deploy_canary"
    title: "Deploy canary"
    description: "Deploy canary version of services"
    tags: ["deploy"]
    service: "infrastructure"
    runner: "script"
    command_ref: "ops/canary_all.sh"
    timeout_sec: 600  # seconds
    inputs_schema:
      type: "object"
      properties:
        service:
          type: "string"
          description: "Service to deploy"
        version:
          type: "string"
          description: "Version tag to deploy"
        # Optional; bounded to 1..100.
        percentage:
          type: "integer"
          minimum: 1
          maximum: 100
          default: 10
      required: ["service", "version"]
      # Reject undeclared inputs, as most other script-runner tasks in this
      # registry do, so a deploy receives only the allowlisted inputs.
      additionalProperties: false
    permissions:
      entitlements_required:
        - "tools.jobs.run.deploy"
    dry_run_behavior: "show_plan"

  # === Risk History & Digest ===

  # Hourly risk snapshot (internal runner) for one environment.
  - id: "hourly_risk_snapshot"
    title: "Hourly Risk Snapshot"
    description: >-
      Compute and persist risk scores for all known services into
      risk_history store.
    tags:
      - "risk"
      - "ops"
      - "scheduled"
    service: "infrastructure"
    runner: "internal"
    schedule: "0 * * * *"  # every hour
    timeout_sec: 120  # seconds
    # No `required` list or `additionalProperties` restriction is declared.
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum:
            - "prod"
            - "staging"
          default: "prod"
          description: "Environment to snapshot"
    permissions:
      entitlements_required:
        - "tools.risk.write"
    dry_run_behavior: "report_only"

  # Daily risk digest (internal runner).
  # NOTE(review): the description cites policy.digest.daily_hour_utc while
  # `schedule` is fixed at 09:00 — confirm which one the scheduler honors.
  - id: "daily_risk_digest"
    title: "Daily Risk Digest"
    description: >-
      Generate daily risk digest (md+json) in ops/reports/risk/.
      Runs at policy.digest.daily_hour_utc (default 09:00 UTC).
    tags:
      - "risk"
      - "ops"
      - "digest"
      - "scheduled"
    service: "infrastructure"
    runner: "internal"
    schedule: "0 9 * * *"  # daily at 09:00 UTC
    timeout_sec: 60  # seconds
    # No `required` list or `additionalProperties` restriction is declared.
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum:
            - "prod"
            - "staging"
          default: "prod"
        date:
          type: "string"
          description: "Override date (YYYY-MM-DD). Default: today UTC."
    permissions:
      entitlements_required:
        - "tools.risk.write"
    dry_run_behavior: "report_only"

  # Daily retention job for risk_history records (internal runner).
  - id: "risk_history_cleanup"
    title: "Risk History Cleanup"
    description: "Delete risk_history records older than retention_days (default 90d)."
    tags:
      - "risk"
      - "ops"
      - "retention"
      - "scheduled"
    service: "infrastructure"
    runner: "internal"
    schedule: "20 3 * * *"  # daily at 03:20 UTC
    timeout_sec: 60  # seconds
    # No `required` list or `additionalProperties` restriction is declared.
    inputs_schema:
      type: "object"
      properties:
        # Bounded to 7..365 days.
        retention_days:
          type: "integer"
          minimum: 7
          maximum: 365
          default: 90
          description: "Retention period in days"
    permissions:
      entitlements_required:
        - "tools.risk.write"
    dry_run_behavior: "report_only"

  # Weekly Architecture Pressure digest (internal runner), Mondays 06:00 UTC.
  - id: "weekly_platform_priority_digest"
    title: "Weekly Platform Priority Digest"
    description: >-
      Generate Architecture Pressure digest for all services.
      Outputs ops/reports/platform/YYYY-WW.md + .json.
      Auto-creates architecture-review followups for services with
      pressure >= require_arch_review_at.
    tags:
      - "pressure"
      - "architecture"
      - "digest"
      - "scheduled"
    service: "infrastructure"
    runner: "internal"
    schedule: "0 6 * * 1"  # every Monday at 06:00 UTC
    timeout_sec: 120  # seconds
    # No `required` list or `additionalProperties` restriction is declared.
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum:
            - "prod"
            - "staging"
            - "dev"
          default: "prod"
        auto_followup:
          type: "boolean"
          default: true
          description: "Auto-create architecture-review followups"
        top_n:
          type: "integer"
          default: 10
    permissions:
      entitlements_required:
        - "tools.pressure.write"
    dry_run_behavior: "report_only"

  # Weekly backlog generation (internal runner), Mondays 06:20 UTC — 20 minutes
  # after weekly_platform_priority_digest's 06:00 schedule.
  - id: "weekly_backlog_generate"
    title: "Weekly Backlog Auto-Generation"
    description: >-
      Auto-generate Engineering Backlog items from latest weekly Platform
      Priority Digest. Runs after weekly_platform_priority_digest
      (06:00 UTC Monday).
    tags:
      - "backlog"
      - "platform"
      - "scheduled"
    service: "infrastructure"
    runner: "internal"
    schedule: "20 6 * * 1"  # every Monday at 06:20 UTC (20 min after digest)
    timeout_sec: 120  # seconds
    # No `required` list or `additionalProperties` restriction is declared.
    inputs_schema:
      type: "object"
      properties:
        env:
          type: "string"
          enum:
            - "prod"
            - "staging"
            - "dev"
          default: "prod"
        week_str:
          type: "string"
          description: "Override ISO week (YYYY-WNN). Default: current week."
    permissions:
      entitlements_required:
        - "tools.backlog.admin"
    dry_run_behavior: "report_only"

  # Daily retention job for finished backlog items (internal runner).
  - id: "daily_backlog_cleanup"
    title: "Daily Backlog Cleanup"
    description: "Remove done/canceled backlog items older than retention_days (default 180d)."
    tags:
      - "backlog"
      - "ops"
      - "retention"
      - "scheduled"
    service: "infrastructure"
    runner: "internal"
    schedule: "40 3 * * *"  # daily at 03:40 UTC
    timeout_sec: 60  # seconds
    # No `required` list or `additionalProperties` restriction is declared.
    inputs_schema:
      type: "object"
      properties:
        # Bounded to 7..730 days.
        retention_days:
          type: "integer"
          minimum: 7
          maximum: 730
          default: 180
    permissions:
      entitlements_required:
        - "tools.backlog.admin"
    dry_run_behavior: "report_only"