docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (17 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions

View File

@@ -0,0 +1,114 @@
# alert_routing_policy.yml
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
# Key design: llm_mode=off means 0 LLM tokens in steady state.

defaults:
  poll_interval_seconds: 300  # 5 min
  max_alerts_per_run: 25
  only_unacked: true
  # Safety valves (avoid runaway incident creation on alert storm)
  max_incidents_per_run: 5
  max_triages_per_run: 5
  dedupe_window_minutes_default: 120
  ack_note_prefix: "alert_triage_loop"
  # LLM gating — off = 0 tokens in steady state
  llm_mode: "off"  # off | local | remote
  llm_on:
    triage: false
    postmortem: false

# NOTE(review): rules below overlap (e.g. a prod P2 "oom" alert matches both the
# resource-critical rule and the lower-severity digest-only rule). Presumably the
# triage graph evaluates top-to-bottom with first-match-wins, as backlog_policy
# documents for its own rules — confirm in alert_triage_graph before reordering.
routing:
  # ─── HARD AUTO: prod P0/P1 → create incident + deterministic triage ─────────
  - match:
      env_in: ["prod"]
      severity_in: ["P0", "P1"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # deterministic | llm
      incident_severity_cap: "P1"
      dedupe_window_minutes: 180
      attach_alert_artifact: true
      ack: true
  # ─── Security alerts: auto incident + (optional) LLM triage ─────────────────
  - match:
      kind_in: ["security"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # flip to llm once stable
      incident_severity_cap: "P0"
      dedupe_window_minutes: 360
      attach_alert_artifact: true
      ack: true
  # ─── Resource-critical: OOM/crashloop/disk in prod|staging ──────────────────
  - match:
      kind_in: ["oom", "crashloop", "disk"]
      env_in: ["prod", "staging"]
      severity_in: ["P0", "P1", "P2"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 240
      attach_alert_artifact: true
      ack: true
  # ─── Staging P1: auto incident, no triage (save resources) ─────────────────
  - match:
      env_in: ["staging"]
      severity_in: ["P1"]
    actions:
      auto_incident: true
      auto_triage: false
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 120
      attach_alert_artifact: true
      ack: true
  # ─── Deploy events: digest-only ──────────────────────────────────────────────
  - match:
      kind_in: ["deploy"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true
  # ─── Lower severity: digest-only ─────────────────────────────────────────────
  - match:
      severity_in: ["P2", "P3", "INFO"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true

# ─── Kind normalization (aliases Monitor may use) ────────────────────────────
kind_map:
  latency: ["latency", "p95_latency", "p99_latency", "slow_response"]
  error_rate: ["error_rate", "5xx_rate", "http_errors"]
  slo_breach: ["slo_breach", "slo", "slo_violation"]
  crashloop: ["crashloop", "restart_loop", "oom_kill"]
  oom: ["oom", "out_of_memory", "memory_pressure"]
  disk: ["disk", "disk_full", "disk_pressure", "pvc_full"]
  security: ["security", "unauthorized", "injection", "brute_force"]

# ─── Per-kind severity caps for incidents created by the loop ─────────────────
# NOTE(review): no caps listed for oom/crashloop/disk — presumably those fall
# back to the rule-level incident_severity_cap above; confirm intended.
severity_caps:
  deploy: "P2"
  latency: "P1"
  error_rate: "P1"
  slo_breach: "P1"
  security: "P0"

# ─── Signature dedupe settings ────────────────────────────────────────────────
signature:
  use_kind: true
  use_fingerprint: true
  use_node_label: false  # true = per-node incidents (noisier)
  normalize_title: true  # strip numbers/timestamps from title before hash

View File

@@ -0,0 +1,51 @@
# Architecture Pressure Policy — DAARION.city
#
# Deterministic structural health index: measures long-term architectural strain.
# Risk = short-term stability. Pressure = long-term structural debt.
#
# All thresholds / weights configurable here; no LLM, no external calls.

defaults:
  lookback_days: 30
  top_n: 10

# Per-signal additive weights
weights:
  recurrence_high_30d: 20  # high-recurrence bucket present in 30d
  recurrence_warn_30d: 10  # warn-level recurrence in 30d
  regressions_30d: 15  # each positive delta_24h event in 30d
  escalations_30d: 12  # each escalation event in 30d
  followups_created_30d: 8  # each new followup created in 30d
  followups_overdue: 15  # current overdue followups (snapshot)
  drift_failures_30d: 10  # drift gate fail/warn events in 30d
  dependency_high_30d: 10  # dependency scan HIGH/CRITICAL findings in 30d

# Score → band mapping
bands:
  low_max: 20
  medium_max: 45
  high_max: 70
  # above high_max → critical

# Priority rules for automatic follow-up creation
priority_rules:
  require_arch_review_at: 70  # pressure score >= this → requires_arch_review=true
  auto_create_followup: true  # create a follow-up when require_arch_review triggered
  followup_priority: "P1"
  followup_due_days: 14
  followup_owner: "cto"
  # Dedupe key: arch_review:{YYYY-WW}:{service}
  # Prevents duplicate creation within the same ISO week

# Release gate behaviour
release_gate:
  platform_review_required:
    enabled: true
    warn_at: 60
    fail_at: 85  # only blocks if gate profile is "strict"

# Digest settings
digest:
  output_dir: "ops/reports/platform"
  max_chars: 12000
  top_n_in_digest: 10

86
config/backlog_policy.yml Normal file
View File

@@ -0,0 +1,86 @@
# Engineering Backlog Policy — DAARION.city
#
# Governs auto-generation of platform backlog items from Risk/Pressure digests,
# workflow transitions, ownership, and storage retention.
#
# No LLM. Deterministic generation. Source of truth for engineering priorities.

defaults:
  env: "prod"
  retention_days: 180
  max_items_per_run: 50

# Dedupe scheme: prevents duplicate creation within the same ISO week
dedupe:
  scheme: "YYYY-WW"  # weekly deduplication window
  key_fields: ["service", "category", "env"]
  key_prefix: "platform_backlog"
  # Final key: platform_backlog:{YYYY-WW}:{env}:{service}:{category}

# Per-category defaults
categories:
  arch_review:
    priority: "P1"
    due_days: 14
  refactor:
    priority: "P1"
    due_days: 21
  slo_hardening:
    priority: "P2"
    due_days: 30
  cleanup_followups:
    priority: "P2"
    due_days: 14
  security:
    priority: "P0"
    due_days: 7

# Auto-generation rules (evaluated per-service top-to-bottom; first match wins per category)
generation:
  weekly_from_pressure_digest: true
  daily_from_risk_digest: false
  rules:
    - name: "arch_review_required"
      when:
        pressure_requires_arch_review: true
      create:
        category: "arch_review"
        title_template: "[ARCH] Review required: {service}"
    - name: "high_pressure_refactor"
      when:
        pressure_band_in: ["high", "critical"]
        risk_band_in: ["high", "critical"]
      create:
        category: "refactor"
        title_template: "[REF] Reduce pressure & risk: {service}"
    - name: "slo_violations"
      when:
        risk_has_slo_violations: true
      create:
        category: "slo_hardening"
        title_template: "[SLO] Fix violations: {service}"
    - name: "followup_backlog"
      when:
        followups_overdue_gt: 0
      create:
        category: "cleanup_followups"
        title_template: "[OPS] Close overdue followups: {service}"

# Owner assignments (default + service-level overrides)
ownership:
  default_owner: "oncall"
  overrides:
    gateway: "cto"

# Workflow state machine
workflow:
  statuses: ["open", "in_progress", "blocked", "done", "canceled"]
  allowed_transitions:
    open: ["in_progress", "blocked", "canceled"]
    in_progress: ["blocked", "done", "canceled"]
    blocked: ["open", "in_progress", "canceled"]
    done: []
    canceled: []

133
config/cost_weights.yml Normal file
View File

@@ -0,0 +1,133 @@
# Cost Weights — DAARION FinOps MVP
#
# "cost_units" = cost_per_call + duration_ms * cost_per_ms
# These are RELATIVE units for ranking, not actual dollars.
#
# Update weights as actual cost data becomes available.

defaults:
  cost_per_call: 1.0  # baseline: 1 unit per call
  cost_per_ms: 0.001  # 0.001 units per ms elapsed

tools:
  # ─── Heavy GPU/compute (high cost) ───────────────────────────────────────
  comfy_generate_video:
    cost_per_call: 120.0
    cost_per_ms: 0.005
    category: media
  comfy_generate_image:
    cost_per_call: 50.0
    cost_per_ms: 0.003
    category: media
  # ─── Release / governance tools ──────────────────────────────────────────
  pr_reviewer_tool:
    cost_per_call: 10.0
    cost_per_ms: 0.002
    category: release
  contract_tool:
    cost_per_call: 5.0
    cost_per_ms: 0.001
    category: release
  threatmodel_tool:
    cost_per_call: 5.0
    cost_per_ms: 0.001
    category: release
  dependency_scanner_tool:
    cost_per_call: 3.0
    cost_per_ms: 0.001
    category: release
  drift_analyzer_tool:
    cost_per_call: 4.0
    cost_per_ms: 0.001
    category: release
  cost_analyzer_tool:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: finops
  # ─── Observability (moderate cost, often called) ─────────────────────────
  observability_tool:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: observability
  # ─── Jobs / orchestration ────────────────────────────────────────────────
  job_orchestrator_tool:
    cost_per_call: 3.0
    cost_per_ms: 0.001
    category: ops
  # ─── Web / external (network cost) ───────────────────────────────────────
  web_search:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: web
  web_extract:
    cost_per_call: 1.5
    cost_per_ms: 0.001
    category: web
  crawl4ai_scrape:
    cost_per_call: 3.0
    cost_per_ms: 0.001
    category: web
  # ─── Knowledge / memory (low cost) ───────────────────────────────────────
  memory_search:
    cost_per_call: 0.5
    cost_per_ms: 0.0005
    category: memory
  remember_fact:
    cost_per_call: 0.5
    cost_per_ms: 0.0005
    category: memory
  graph_query:
    cost_per_call: 0.5
    cost_per_ms: 0.0005
    category: memory
  kb_tool:
    cost_per_call: 1.0
    cost_per_ms: 0.001
    category: knowledge
  # ─── Repo / code tools ───────────────────────────────────────────────────
  repo_tool:
    cost_per_call: 1.5
    cost_per_ms: 0.001
    category: dev
  config_linter_tool:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: release
  # ─── Oncall / incident ───────────────────────────────────────────────────
  oncall_tool:
    cost_per_call: 1.0
    cost_per_ms: 0.001
    category: ops

# ─── Anomaly detection thresholds ────────────────────────────────────────────
anomaly:
  # Spike: window_cost / baseline_avg_cost >= ratio_threshold
  spike_ratio_threshold: 3.0
  # Must have at least this many calls in window to be an anomaly
  min_calls_threshold: 10

# High-priority tools for cost_watch gate in release_check
priority_tools:
  - comfy_generate_video
  - comfy_generate_image
  - pr_reviewer_tool
  - job_orchestrator_tool
  - observability_tool

View File

@@ -0,0 +1,192 @@
# Data Governance & Privacy Policy — DAARION.city
#
# Used by data_governance_tool to scan for PII/secrets/logging/retention risks.
# Severity: "error" = high risk (still warning-only in gate_mode=warning_only).
# "warning" = medium risk.
# "info" = low risk / informational.

# ─── Retention policies ───────────────────────────────────────────────────────
retention:
  audit_jsonl_days: 30
  audit_postgres_days: 90
  memory_events_days: 90
  logs_days: 14
  # Large output threshold: if audit out_size >= this, flag as anomaly
  large_output_bytes: 65536  # 64KB

# ─── PII patterns ─────────────────────────────────────────────────────────────
pii_patterns:
  email:
    regex: "(?i)\\b[A-Z0-9._%+\\-]+@[A-Z0-9.\\-]+\\.[A-Z]{2,}\\b"
    severity: "warning"
    id: "DG-PII-001"
    description: "Email address detected"
  phone_ua_intl:
    regex: "\\b\\+?[0-9][0-9\\-\\s()]{7,}[0-9]\\b"
    severity: "warning"
    id: "DG-PII-002"
    description: "Phone-like number detected"
  credit_card:
    regex: "\\b(?:\\d[ \\-]*?){13,19}\\b"
    severity: "error"
    id: "DG-PII-003"
    description: "Credit card-like number detected"
  passport_like:
    regex: "\\b[A-Z]{2}\\d{6,7}\\b"
    severity: "warning"
    id: "DG-PII-004"
    description: "Passport-like identifier detected"
  tax_id_ua:
    regex: "\\b\\d{10}\\b"
    severity: "info"
    id: "DG-PII-005"
    description: "Possible Ukrainian tax ID (10 digits)"

# ─── Extra secret patterns (supplement tool_governance._SECRET_PATTERNS) ──────
secret_patterns:
  inherit_from_tool_governance: true
  extra:
    - name: "private_key_block"
      regex: "-----BEGIN [A-Z ]*PRIVATE KEY-----"
      severity: "error"
      id: "DG-SEC-001"
    - name: "aws_mfa_token"
      regex: "(?i)mfa[_\\-]?token[\\s=:]+['\"`]?[\\dA-Z]{6,8}['\"`]?"
      severity: "warning"
      id: "DG-SEC-002"
    - name: "pem_certificate"
      regex: "-----BEGIN CERTIFICATE-----"
      severity: "info"
      id: "DG-SEC-003"

# ─── Logging safety rules ─────────────────────────────────────────────────────
logging_rules:
  # Field names that must NOT appear unmasked in logger calls
  forbid_logging_fields:
    - password
    - passwd
    - token
    - secret
    - private_key
    - api_key
    - access_key
    - credential
    - auth_header
    - bearer
  # Fields that should appear as hash-only (warn if logged raw)
  sensitive_fields_warn:
    - user_id
    - chat_id
    - telegram_id
    - session_id
    - workspace_id
  # Calls that indicate redaction is applied (good)
  redaction_calls:
    - redact
    - mask
    - sanitize
    - anonymize
    - _hash
    - sha256
  # Payload field names that indicate raw content is being logged/stored
  raw_payload_indicators:
    - payload
    - diff_text
    - openapi_text
    - request_body
    - response_body
    - prompt
    - messages
    - content
    - transcript
    - conversation
    - full_text

# ─── Storage / retention keywords ─────────────────────────────────────────────
storage_keywords:
  write_patterns:
    - save_message
    - store_event
    - insert_record
    - append_event
    - write_event
    - write_record
    - persist
    - bulk_insert
    - executemany
  retention_indicators:
    - ttl
    - expire
    - retention
    - cleanup
    - delete_old
    - purge
    - rotate
    - max_age
    - expiry
  context_window: 20  # lines before/after to search for retention indicator

# ─── Scan paths ───────────────────────────────────────────────────────────────
paths:
  include:
    - "services/"
    - "docs/"
    - "ops/"
    - "config/"
  exclude:
    - "**/node_modules/**"
    - "**/.git/**"
    - "**/dist/**"
    - "**/build/**"
    - "**/.venv/**"
    - "**/__pycache__/**"
    - "**/*.pyc"
    - "**/*.lock"  # dependency lock files (high false-positive risk)
    - "**/*.min.js"

# File extensions to scan
# NOTE(review): ".env.example" is a suffix rather than an extension, and it also
# matches the ".env.*" pattern under never_scan below — confirm which rule the
# scanner applies first.
scan_extensions:
  - ".py"
  - ".ts"
  - ".js"
  - ".yml"
  - ".yaml"
  - ".json"
  - ".env.example"
  - ".md"
  - ".txt"
  - ".sh"

# Never scan these (sensitive or binary)
never_scan:
  - "*.env"
  - ".env.*"
  - "*.pem"
  - "*.key"
  - "*.pfx"
  - "*.p12"
  - "*.crt"

# ─── Gate behaviour ───────────────────────────────────────────────────────────
severity_behavior:
  # warning_only: gate always pass=True (adds recommendations only)
  # strict: gate pass=False on any error finding
  gate_mode: "warning_only"
  recommend_on:
    - "warning"
    - "error"

# ─── Limits ───────────────────────────────────────────────────────────────────
limits:
  max_files_fast: 200
  max_files_full: 500
  max_bytes_per_file: 262144  # 256KB
  max_findings: 200  # cap before truncating
  max_evidence_chars: 200  # mask and truncate evidence snippets

View File

@@ -0,0 +1,37 @@
# Incident Escalation Policy
# Controls deterministic escalation and auto-resolve candidate logic.

defaults:
  window_minutes: 60

escalation:
  # Escalate when the same signature storms
  occurrences_thresholds:
    P2_to_P1: 10  # occurrences_60m to escalate P2 → P1
    P1_to_P0: 25  # occurrences_60m to escalate P1 → P0
  triage_thresholds_24h:
    P2_to_P1: 3  # triage_count_24h to escalate P2 → P1
    P1_to_P0: 6  # triage_count_24h to escalate P1 → P0
  severity_cap: "P0"  # never escalate above this
  create_followup_on_escalate: true
  followup:
    priority: "P1"
    due_hours: 24
    owner: "oncall"
    message_template: "Escalated due to alert storm: occurrences={occurrences_60m}, triages_24h={triage_count_24h}"

auto_resolve:
  # Candidates only in MVP — do not auto-close P0/P1
  no_alerts_minutes_for_candidate: 60
  close_allowed_severities: ["P2", "P3"]
  auto_close: false  # set true carefully in staging only
  candidate_event_type: "note"
  candidate_message: "Auto-resolve candidate: no alerts observed in {no_alerts_minutes} minutes for this signature"

alert_loop_slo:
  claim_to_ack_p95_seconds: 60  # p95 latency from claim → ack
  failed_rate_pct: 5  # max % of failed/(acked+failed) in window
  processing_stuck_minutes: 15  # alerts in processing beyond this → stuck

View File

@@ -0,0 +1,88 @@
# Incident Intelligence Policy
# Controls correlation scoring, recurrence detection, and digest generation.

correlation:
  lookback_days: 30
  max_related: 10
  min_score: 20  # discard matches below this
  rules:
    - name: "same_signature"
      weight: 100
      match:
        signature: true
    - name: "same_service_and_kind"
      weight: 60
      match:
        same_service: true
        same_kind: true
    - name: "same_service_time_cluster"
      weight: 40
      match:
        same_service: true
        within_minutes: 180
    - name: "same_kind_cross_service"
      weight: 30
      match:
        same_kind: true
        within_minutes: 120

recurrence:
  windows_days: [7, 30]
  thresholds:
    signature:
      warn: 3  # ≥ 3 occurrences in window → warn
      high: 6  # ≥ 6 occurrences in window → high
    kind:
      warn: 5
      high: 10
  top_n: 15  # top N per category
  # Deterministic recommendations per recurrence level
  recommendations:
    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
    signature_warn: "Review root cause history; consider adding monitoring threshold"
    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"

digest:
  weekly_day: "Mon"
  include_closed: true
  include_open: true
  output_dir: "ops/reports/incidents"
  markdown_max_chars: 8000
  top_incidents: 20  # max incidents in weekly listing

# ── Root-Cause Buckets ─────────────────────────────────────────────────────
buckets:
  mode: "service_kind"  # service_kind | signature_prefix
  signature_prefix_len: 12
  top_n: 10
  # NOTE(review): integer keys (window length in days) — consumer must look
  # these up as ints, not strings.
  min_count:
    7: 3  # bucket must have ≥ 3 incidents in last 7d
    30: 6  # or ≥ 6 in last 30d
  include_statuses: ["open", "mitigating", "resolved", "closed"]

# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
autofollowups:
  enabled: true
  only_when_high: true  # only create for HIGH recurrence buckets
  owner: "oncall"
  priority: "P1"
  due_days: 7
  max_followups_per_bucket_per_week: 1  # dedupe by week+bucket_key
  dedupe_key_prefix: "intel_recur"

# ── Release Gate: recurrence_watch ────────────────────────────────────────
release_gate:
  recurrence_watch:
    enabled: true
    service_scope: "target_service"  # target_service | all
    windows_days: [7, 30]
    fail_on:
      severity_in: ["P0", "P1"]  # used only in strict mode
      high_recurrence: true
    warn_on:
      warn_recurrence: true

View File

@@ -0,0 +1,143 @@
# Network Allowlist for Tool HTTP Calls
# Tools that make outbound HTTP requests MUST use only hosts/IPs listed here.
# Any request to unlisted hosts is blocked by tool_governance.py middleware.
#
# Format per tool:
#   hosts:    exact hostname or IP
#   prefixes: URL prefix match (for paths)

# ─── Observability Sources ────────────────────────────────────────────────────
observability_tool:
  description: "Prometheus, Loki, Tempo datasources"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "prometheus"
    - "loki"
    - "tempo"
    - "monitoring"
    - "144.76.224.179"  # NODA1 monitoring
  ports_allowed: [9090, 3100, 3200, 9080]
  schemes: ["http", "https"]

# ─── Oncall / Service Health ──────────────────────────────────────────────────
oncall_tool:
  description: "Internal service health endpoints only"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "gateway"
    - "router"
    - "memory"
    - "qdrant"
    - "nats"
    - "144.76.224.179"  # NODA1
    - "212.8.58.133"  # NODA3
  ports_allowed: [80, 443, 8000, 8080, 8222, 9000, 9100, 9102, 9200, 9300, 9400]
  schemes: ["http", "https"]

# ─── Web Search / Extract ─────────────────────────────────────────────────────
web_search:
  description: "Search provider APIs"
  hosts:
    - "api.duckduckgo.com"
    - "serpapi.com"
    - "api.bing.microsoft.com"
    - "customsearch.googleapis.com"
  schemes: ["https"]

web_extract:
  description: "Any public HTTPS URL (user-provided)"
  allow_any_public: true  # Allow any non-private IP
  block_private_ranges: true  # Block RFC1918 / loopback / link-local
  schemes: ["https"]

crawl4ai_scrape:
  description: "Crawl4AI service + public URLs"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "crawl4ai"
  ports_allowed: [11235]
  allow_any_public: true
  block_private_ranges: true
  schemes: ["http", "https"]

# ─── Memory / Graph ───────────────────────────────────────────────────────────
memory_search:
  description: "Memory service + Qdrant"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "memory-service"
    - "qdrant"
    - "144.76.224.179"
  ports_allowed: [6333, 8001, 8100]
  schemes: ["http", "https"]

graph_query:
  description: "Neo4j bolt/http"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "neo4j"
  ports_allowed: [7474, 7687]
  schemes: ["http", "https", "bolt", "bolt+s"]

# ─── ComfyUI / Image Generation ──────────────────────────────────────────────
comfy_generate_image:
  description: "ComfyUI on NODA3"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "212.8.58.133"
  ports_allowed: [8188]
  schemes: ["http"]

comfy_generate_video:
  description: "ComfyUI video on NODA3"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "212.8.58.133"
  ports_allowed: [8188]
  schemes: ["http"]

# ─── LLM Providers ────────────────────────────────────────────────────────────
# (Used by router/gateway, not direct tool calls, but documented for reference)
llm_providers:
  description: "External LLM APIs"
  hosts:
    - "api.x.ai"  # xAI Grok
    - "open.bigmodel.cn"  # GLM-5 Z.AI
    - "api.deepseek.com"  # DeepSeek
    - "api.openai.com"  # OpenAI fallback
  schemes: ["https"]

# ─── Presentation Service ─────────────────────────────────────────────────────
presentation_create:
  description: "Presentation rendering service"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "presentation-service"
  ports_allowed: [8080, 9500]
  schemes: ["http", "https"]

# ─── Dependency Scanner ───────────────────────────────────────────────────────
dependency_scanner_tool:
  description: "OSV.dev API for vulnerability lookups (online mode only)"
  hosts:
    - "api.osv.dev"
  schemes: ["https"]
  # Only used when vuln_mode=online; offline_cache requires no outbound

# ─── Private IP Ranges (always blocked for allow_any_public tools) ────────────
private_ip_ranges:
  - "10.0.0.0/8"
  - "172.16.0.0/12"
  - "192.168.0.0/16"
  - "127.0.0.0/8"
  - "169.254.0.0/16"
  - "::1/128"
  - "fc00::/7"

View File

@@ -0,0 +1,49 @@
# Observability Data Sources Configuration
# These are internal URLs - never expose to external networks

prometheus:
  # Prometheus server URL (internal network)
  base_url: "http://prometheus:9090"
  # Allowed PromQL query prefixes (security)
  allow_promql_prefixes:
    - "sum("
    - "rate("
    - "histogram_quantile("
    - "avg("
    - "max("
    - "min("
    - "count("
    - "irate("
    - "last_over_time("
    - "present_over_time("

loki:
  # Loki log server URL (internal network)
  base_url: "http://loki:3100"

tempo:
  # Tempo trace server URL (internal network)
  base_url: "http://tempo:3200"

# Limits configuration
limits:
  # Maximum time window for queries (hours)
  max_time_window_hours: 24
  # Maximum series returned
  max_series: 200
  # Maximum points in range query
  max_points: 2000
  # Maximum bytes in response
  max_bytes: 300000
  # Query timeout (seconds)
  timeout_seconds: 5

# Environment variables (override URLs)
#   PROMETHEUS_URL
#   LOKI_URL
#   TEMPO_URL

View File

@@ -0,0 +1,133 @@
# Release Gate Policy — DAARION.city
#
# Controls strictness of each gate per deployment profile.
#
# Modes:
#   off    — gate is fully skipped (no call, no output)
#   warn   — gate always pass=True; findings become recommendations only
#   strict — gate can fail release (pass=False) when fail_on conditions are met
#
# Profiles: dev | staging | prod
# Set via release_check input `gate_profile` (default: dev).

profiles:
  dev:
    description: "Development: strict for security gates, warn for governance"
    gates:
      pr_review:
        mode: "strict"
      config_lint:
        mode: "strict"
      dependency_scan:
        mode: "strict"
        fail_on_severities: ["CRITICAL", "HIGH"]
      contract_diff:
        mode: "strict"
      threat_model:
        mode: "strict"
      smoke:
        mode: "warn"
      drift:
        mode: "warn"
      slo_watch:
        mode: "warn"
      followup_watch:
        mode: "warn"
        fail_on: ["P0", "P1"]
      privacy_watch:
        mode: "warn"
      cost_watch:
        mode: "warn"
      recurrence_watch:
        mode: "warn"
      risk_watch:
        mode: "warn"
      risk_delta_watch:
        mode: "warn"
      platform_review_required:
        mode: "warn"
  staging:
    description: "Staging: strict security + strict privacy on errors"
    gates:
      pr_review:
        mode: "strict"
      config_lint:
        mode: "strict"
      dependency_scan:
        mode: "strict"
        fail_on_severities: ["CRITICAL", "HIGH"]
      contract_diff:
        mode: "strict"
      threat_model:
        mode: "strict"
      smoke:
        mode: "warn"
      drift:
        mode: "strict"
      slo_watch:
        mode: "strict"  # Don't deploy if SLO currently breached
      followup_watch:
        mode: "strict"
        fail_on: ["P0", "P1"]
      privacy_watch:
        mode: "strict"
        fail_on: ["error"]
      cost_watch:
        mode: "warn"
      recurrence_watch:
        mode: "strict"  # Block staging deploy if P0/P1 high recurrence
        fail_on:
          severity_in: ["P0", "P1"]
          high_recurrence: true
      risk_watch:
        mode: "strict"  # Block staging if score >= fail_at for p0_services
      risk_delta_watch:
        mode: "strict"  # Block staging for p0_services when delta >= fail_delta
      platform_review_required:
        mode: "warn"  # warn-first: never blocks staging by default
  prod:
    description: "Production: maximum strictness across all gates"
    gates:
      pr_review:
        mode: "strict"
      config_lint:
        mode: "strict"
      dependency_scan:
        mode: "strict"
        fail_on_severities: ["CRITICAL", "HIGH", "MEDIUM"]
      contract_diff:
        mode: "strict"
      threat_model:
        mode: "strict"
      smoke:
        mode: "strict"
      drift:
        mode: "strict"
      slo_watch:
        mode: "warn"  # Warn: don't automatically block prod deploys on SLO
      followup_watch:
        mode: "warn"
        fail_on: ["P0"]
      privacy_watch:
        mode: "strict"
        fail_on: ["error"]
      cost_watch:
        mode: "warn"
      recurrence_watch:
        mode: "warn"  # Warn only in prod (accumulate data first)
      risk_watch:
        mode: "warn"  # Warn only in prod
      risk_delta_watch:
        mode: "warn"  # Warn only in prod
      platform_review_required:
        mode: "warn"  # Start conservative in prod

# ─── Defaults (used if profile or gate not found) ────────────────────────────
defaults:
  mode: "warn"
  # privacy_watch default fail_on (for strict mode):
  privacy_fail_on: ["error"]
  # cost_watch is never strict by default
  cost_always_warn: true

View File

@@ -0,0 +1,80 @@
# Risk Attribution Policy — DAARION.city
#
# Deterministic attribution: risk spike → likely causes.
# LLM enrichment is OFF by default; local only on regression triggers.

defaults:
  lookback_hours: 24
  max_causes: 5
  llm_mode: "off"  # off | local | remote
  llm_max_chars_in: 3500
  llm_max_chars_out: 800

# LLM enrichment triggers — only if ALL conditions are met
llm_triggers:
  risk_delta_warn: 10  # delta_24h >= 10
  risk_delta_fail: 20  # delta_24h >= 20 (fail-level)
  band_in: ["high", "critical"]

# Per-cause scoring weights (additive)
weights:
  deploy: 30
  dependency: 25
  drift: 25
  incident_storm: 20
  slo_violation: 15
  followups_overdue: 10
  alert_loop_degraded: 10

# Per-signal detection config
signals:
  deploy:
    # Alert kinds that indicate a deploy event
    kinds: ["deploy", "deployment", "rollout", "canary"]
  dependency:
    # Release gate names whose fail/warn counts as a dependency signal
    release_gate_names: ["dependency_scan", "deps"]
  drift:
    release_gate_names: ["drift", "config_drift"]
  incident_storm:
    thresholds:
      # occurrences in last 60min across all alert signatures for the service
      occurrences_60m_warn: 10
      # escalations (Escalated events) in last 24h
      escalations_24h_warn: 2
  slo:
    require_active_violation: true

# Confidence bands (minimum score to reach that band)
output:
  confidence_bands:
    high: 60  # score >= 60 → high confidence
    medium: 35  # score >= 35 → medium
    # below 35 → low

# Change Timeline config
timeline:
  enabled: true
  lookback_hours: 24
  max_items: 30
  include_types: ["deploy", "dependency", "drift", "incident", "slo", "followup", "alert_loop", "release_gate"]
  time_bucket_minutes: 5  # coalesce same-type events within 5-min windows

# Evidence linking
evidence_linking:
  enabled: true
  max_refs_per_cause: 10

# LLM local endpoint config (only used when llm_mode=local)
llm_local:
  endpoint: "http://localhost:11434/api/generate"
  model: "llama3"
  timeout_seconds: 15
  # Hardening guards
  model_allowlist: ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"]
  max_calls_per_digest: 3
  per_day_dedupe: true  # key: risk_enrich:{YYYY-MM-DD}:{service}:{env}

89
config/risk_policy.yml Normal file
View File

@@ -0,0 +1,89 @@
# Service Risk Index Policy — DAARION.city
#
# Controls how Risk Scores are computed, classified, and gated.
# All scoring is deterministic: no LLM required.

defaults:
  window_hours: 24
  recurrence_windows_days: [7, 30]
  slo_window_minutes: 60

thresholds:
  bands:
    low_max: 20
    medium_max: 50
    high_max: 80
  risk_watch:  # defaults, overridable per service below
    warn_at: 50  # score >= warn_at → recommendations
    fail_at: 80  # score >= fail_at → gate fails (strict mode only)

weights:
  open_incidents:
    P0: 50
    P1: 25
    P2: 10
    P3: 5
  recurrence:
    signature_warn_7d: 10
    signature_high_7d: 20
    kind_warn_7d: 8
    kind_high_7d: 15
    signature_high_30d: 10
    kind_high_30d: 8
  followups:
    overdue_P0: 20
    overdue_P1: 12
    overdue_other: 6
  slo:
    violation: 10  # per active violation
  alerts_loop:
    slo_violation: 10  # per alert-loop SLO violation
  escalation:
    escalations_24h:
      warn: 5  # score added if escalations_24h >= 1
      high: 12  # score added if escalations_24h >= 3

# Per-service risk gate overrides (lower/higher fail_at)
service_overrides:
  gateway:
    risk_watch:
      fail_at: 75  # gateway is critical: fail earlier
  router:
    risk_watch:
      fail_at: 80

# Services treated as P0 (always subject to strict risk_watch in staging)
p0_services:
  - gateway
  - router

# ─── History & Snapshotting ────────────────────────────────────────────────────
history:
  snapshot_interval_minutes: 60
  retention_days: 90
  max_services_per_run: 50

# ─── Trend analysis ───────────────────────────────────────────────────────────
trend:
  delta_windows_hours: [24, 168]  # 24h and 7d
  volatility_window_hours: 168  # stddev computed over last 7d
  regression_threshold:
    delta_24h_warn: 10  # score rose >= 10 points in 24h → warn
    delta_24h_fail: 20  # score rose >= 20 points in 24h → fail (strict)
    delta_7d_warn: 15
    delta_7d_fail: 30

# ─── Daily Digest ─────────────────────────────────────────────────────────────
digest:
  daily_hour_utc: 9  # generate at 09:00 UTC
  output_dir: "ops/reports/risk"
  markdown_max_chars: 8000
  top_n: 10

# ─── Risk Delta release gate ──────────────────────────────────────────────────
release_gate:
  risk_delta_watch:
    enabled: true
    default_warn_delta_24h: 10
    default_fail_delta_24h: 20
    p0_services_strict: true

View File

@@ -0,0 +1,52 @@
# Aurora (Autonomous Media Forensics)
Role:
- Lead media forensics for video, audio, and photo evidence inside AISTALK.
- Extract usable evidence from low-quality media while preserving reproducibility.
Modes:
- `tactical`: fast triage for operational clarity.
  - prioritize turnaround and readability
  - lightweight pipelines and lower cost
  - output is advisory (not courtroom-grade)
- `forensic`: evidence-grade processing.
  - prioritize reproducibility and auditability
  - mandatory input/output hashing and immutable processing log
  - chain-of-custody notes + signing metadata
Capabilities:
- Video: denoise, deblur, super-resolution, stabilization, frame interpolation.
- Face-focused enhancement: controlled face restoration with clear model attribution.
- Audio: denoise, speech intelligibility improvement, deepfake risk signals.
- Photo: artifact cleanup, upscale, metadata/EXIF integrity review.
Internal sub-pipeline handles:
- `Clarity`: global video enhancement.
- `Vera`: face restoration and face-quality diagnostics.
- `Echo`: audio cleaning/transcription/deepfake heuristics.
- `Pixis`: photo restoration and metadata checks.
- `Kore`: forensic packaging (hashes, chain-of-custody, signature metadata).
Output contract (strict JSON for downstream graphing):
```json
{
"agent": "Aurora",
"mode": "tactical | forensic",
"job_id": "aurora_YYYYMMDD_###",
"input_file": {"name": "file.ext", "hash": "sha256:..."},
"processing_log": [
{"step": "denoise", "model": "model_name", "time_ms": 0}
],
"output_files": [
{"type": "video|audio|photo|forensic_log", "url": "https://...", "hash": "sha256:..."}
],
"digital_signature": "ed25519:... | null"
}
```
Boundaries:
- No deceptive deepfake generation or identity manipulation.
- Never present AI-enhanced output as untouched original evidence.
- Flag uncertainty and potential enhancement artifacts explicitly.
- Do not provide final legal conclusions; require expert human review for court use.
- Preserve originals; never destructively overwrite source evidence.

64
config/slo_policy.yml Normal file
View File

@@ -0,0 +1,64 @@
# SLO Policy — DAARION.city
#
# Defines Service Level Objectives per service.
# Used by observability_tool.slo_snapshot and incident_triage_graph slo_context node.
#
# Fields:
#   error_rate_pct — max allowed error rate (%)
#   latency_p95_ms — max p95 latency (milliseconds)
#   window_minutes — default observation window (default: 60)

defaults:
  window_minutes: 60
  error_rate_pct: 1.0
  latency_p95_ms: 300

services:
  gateway:
    error_rate_pct: 1.0
    latency_p95_ms: 300
  router:
    error_rate_pct: 0.5
    latency_p95_ms: 200
  memory-service:
    error_rate_pct: 1.0
    latency_p95_ms: 400
  sofiia-supervisor:
    error_rate_pct: 1.0
    latency_p95_ms: 500

# ─── Voice SLO profiles ───────────────────────────────────────────────────────
# Two profiles aligned with router-config.yml selection_policies.
# Measured via Prometheus metrics emitted by sofiia-console /api/telemetry/voice
# and memory-service voice_endpoints.py.
#
# Prometheus metrics:
#   voice_ttfa_ms{voice_profile}        — Time-to-first-audio (BFF → first playable)
#   voice_e2e_ms{voice_profile}         — User stops speaking → audio plays
#   voice_tts_first_ms{voice_profile}   — First-sentence TTS synthesis
#   voice_tts_compute_ms{engine,voice}  — Memory-service internal TTS
#   voice_queue_underflows_total        — Playback starvation events
voice_slo:
  voice_fast_uk:
    description: "Fast profile: gemma3 → qwen3.5 fallback"
    ttfa_ms_p95: 5000  # TTFA p95 ≤ 5s
    e2e_ms_p95: 9000  # E2E p95 ≤ 9s
    tts_first_ms_p95: 2000  # TTS synthesis p95 ≤ 2s
    underflow_rate_pct: 1.0  # starvation events per 100 voice turns ≤ 1%
    tts_error_rate_pct: 0.5  # edge-tts failures ≤ 0.5%
    window_minutes: 10
  voice_quality_uk:
    description: "Quality profile: qwen3.5 → qwen3:14b fallback"
    ttfa_ms_p95: 7000
    e2e_ms_p95: 12000
    tts_first_ms_p95: 2000  # TTS itself is the same engine
    underflow_rate_pct: 2.0  # slightly relaxed (longer LLM → more gap risk)
    tts_error_rate_pct: 0.5
    window_minutes: 10

# Canary thresholds (runtime health check, stricter)
# NOTE(review): placed at top level here; confirm whether the consumer expects
# this nested under voice_slo instead.
canary:
  tts_polina_max_ms: 3000  # live Polina synthesis ≤ 3s
  tts_ostap_max_ms: 3000  # live Ostap synthesis ≤ 3s
  min_audio_bytes: 1000  # valid audio is never empty/tiny