docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (17 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions

View File

@@ -0,0 +1,114 @@
# alert_routing_policy.yml
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
# Key design: llm_mode=off means 0 LLM tokens in steady state.

defaults:
  poll_interval_seconds: 300  # 5 min
  max_alerts_per_run: 25
  only_unacked: true
  # Safety valves (avoid runaway incident creation on alert storm)
  max_incidents_per_run: 5
  max_triages_per_run: 5
  dedupe_window_minutes_default: 120
  ack_note_prefix: "alert_triage_loop"
  # LLM gating — off = 0 tokens in steady state
  llm_mode: "off"  # off | local | remote
  llm_on:
    triage: false
    postmortem: false

# NOTE(review): rules below overlap (e.g. a prod P2 "oom" alert matches both the
# resource-critical rule and the lower-severity digest-only rule). Presumably the
# triage graph evaluates top-to-bottom with first-match-wins, as backlog_policy
# documents for its own rules — confirm in alert_triage_graph before reordering.
routing:
  # ─── HARD AUTO: prod P0/P1 → create incident + deterministic triage ─────────
  - match:
      env_in: ["prod"]
      severity_in: ["P0", "P1"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # deterministic | llm
      incident_severity_cap: "P1"
      dedupe_window_minutes: 180
      attach_alert_artifact: true
      ack: true
  # ─── Security alerts: auto incident + (optional) LLM triage ─────────────────
  - match:
      kind_in: ["security"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # flip to llm once stable
      incident_severity_cap: "P0"
      dedupe_window_minutes: 360
      attach_alert_artifact: true
      ack: true
  # ─── Resource-critical: OOM/crashloop/disk in prod|staging ──────────────────
  - match:
      kind_in: ["oom", "crashloop", "disk"]
      env_in: ["prod", "staging"]
      severity_in: ["P0", "P1", "P2"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 240
      attach_alert_artifact: true
      ack: true
  # ─── Staging P1: auto incident, no triage (save resources) ─────────────────
  - match:
      env_in: ["staging"]
      severity_in: ["P1"]
    actions:
      auto_incident: true
      auto_triage: false
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 120
      attach_alert_artifact: true
      ack: true
  # ─── Deploy events: digest-only ──────────────────────────────────────────────
  - match:
      kind_in: ["deploy"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true
  # ─── Lower severity: digest-only ─────────────────────────────────────────────
  - match:
      severity_in: ["P2", "P3", "INFO"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true

# ─── Kind normalization (aliases Monitor may use) ────────────────────────────
kind_map:
  latency: ["latency", "p95_latency", "p99_latency", "slow_response"]
  error_rate: ["error_rate", "5xx_rate", "http_errors"]
  slo_breach: ["slo_breach", "slo", "slo_violation"]
  crashloop: ["crashloop", "restart_loop", "oom_kill"]
  oom: ["oom", "out_of_memory", "memory_pressure"]
  disk: ["disk", "disk_full", "disk_pressure", "pvc_full"]
  security: ["security", "unauthorized", "injection", "brute_force"]

# ─── Per-kind severity caps for incidents created by the loop ─────────────────
# NOTE(review): no caps listed for oom/crashloop/disk — presumably those fall
# back to the rule-level incident_severity_cap above; confirm intended.
severity_caps:
  deploy: "P2"
  latency: "P1"
  error_rate: "P1"
  slo_breach: "P1"
  security: "P0"

# ─── Signature dedupe settings ────────────────────────────────────────────────
signature:
  use_kind: true
  use_fingerprint: true
  use_node_label: false  # true = per-node incidents (noisier)
  normalize_title: true  # strip numbers/timestamps from title before hash

View File

@@ -0,0 +1,51 @@
# Architecture Pressure Policy — DAARION.city
#
# Deterministic structural health index: measures long-term architectural strain.
# Risk = short-term stability. Pressure = long-term structural debt.
#
# All thresholds / weights configurable here; no LLM, no external calls.

defaults:
  lookback_days: 30
  top_n: 10

# Per-signal additive weights
weights:
  recurrence_high_30d: 20  # high-recurrence bucket present in 30d
  recurrence_warn_30d: 10  # warn-level recurrence in 30d
  regressions_30d: 15  # each positive delta_24h event in 30d
  escalations_30d: 12  # each escalation event in 30d
  followups_created_30d: 8  # each new followup created in 30d
  followups_overdue: 15  # current overdue followups (snapshot)
  drift_failures_30d: 10  # drift gate fail/warn events in 30d
  dependency_high_30d: 10  # dependency scan HIGH/CRITICAL findings in 30d

# Score → band mapping
bands:
  low_max: 20
  medium_max: 45
  high_max: 70
  # above high_max → critical

# Priority rules for automatic follow-up creation
priority_rules:
  require_arch_review_at: 70  # pressure score >= this → requires_arch_review=true
  auto_create_followup: true  # create a follow-up when require_arch_review triggered
  followup_priority: "P1"
  followup_due_days: 14
  followup_owner: "cto"
  # Dedupe key: arch_review:{YYYY-WW}:{service}
  # Prevents duplicate creation within the same ISO week

# Release gate behaviour
release_gate:
  platform_review_required:
    enabled: true
    warn_at: 60
    fail_at: 85  # only blocks if gate profile is "strict"

# Digest settings
digest:
  output_dir: "ops/reports/platform"
  max_chars: 12000
  top_n_in_digest: 10

86
config/backlog_policy.yml Normal file
View File

@@ -0,0 +1,86 @@
# Engineering Backlog Policy — DAARION.city
#
# Governs auto-generation of platform backlog items from Risk/Pressure digests,
# workflow transitions, ownership, and storage retention.
#
# No LLM. Deterministic generation. Source of truth for engineering priorities.

defaults:
  env: "prod"
  retention_days: 180
  max_items_per_run: 50

# Dedupe scheme: prevents duplicate creation within the same ISO week
dedupe:
  scheme: "YYYY-WW"  # weekly deduplication window
  key_fields: ["service", "category", "env"]
  key_prefix: "platform_backlog"
  # Final key: platform_backlog:{YYYY-WW}:{env}:{service}:{category}

# Per-category defaults
categories:
  arch_review:
    priority: "P1"
    due_days: 14
  refactor:
    priority: "P1"
    due_days: 21
  slo_hardening:
    priority: "P2"
    due_days: 30
  cleanup_followups:
    priority: "P2"
    due_days: 14
  security:
    priority: "P0"
    due_days: 7

# Auto-generation rules (evaluated per-service top-to-bottom; first match wins per category)
generation:
  weekly_from_pressure_digest: true
  daily_from_risk_digest: false
  rules:
    - name: "arch_review_required"
      when:
        pressure_requires_arch_review: true
      create:
        category: "arch_review"
        title_template: "[ARCH] Review required: {service}"
    - name: "high_pressure_refactor"
      when:
        pressure_band_in: ["high", "critical"]
        risk_band_in: ["high", "critical"]
      create:
        category: "refactor"
        title_template: "[REF] Reduce pressure & risk: {service}"
    - name: "slo_violations"
      when:
        risk_has_slo_violations: true
      create:
        category: "slo_hardening"
        title_template: "[SLO] Fix violations: {service}"
    - name: "followup_backlog"
      when:
        followups_overdue_gt: 0
      create:
        category: "cleanup_followups"
        title_template: "[OPS] Close overdue followups: {service}"

# Owner assignments (default + service-level overrides)
ownership:
  default_owner: "oncall"
  overrides:
    gateway: "cto"

# Workflow state machine
workflow:
  statuses: ["open", "in_progress", "blocked", "done", "canceled"]
  allowed_transitions:
    open: ["in_progress", "blocked", "canceled"]
    in_progress: ["blocked", "done", "canceled"]
    blocked: ["open", "in_progress", "canceled"]
    done: []
    canceled: []

133
config/cost_weights.yml Normal file
View File

@@ -0,0 +1,133 @@
# Cost Weights — DAARION FinOps MVP
#
# "cost_units" = cost_per_call + duration_ms * cost_per_ms
# These are RELATIVE units for ranking, not actual dollars.
#
# Update weights as actual cost data becomes available.

defaults:
  cost_per_call: 1.0  # baseline: 1 unit per call
  cost_per_ms: 0.001  # 0.001 units per ms elapsed

tools:
  # ─── Heavy GPU/compute (high cost) ───────────────────────────────────────
  comfy_generate_video:
    cost_per_call: 120.0
    cost_per_ms: 0.005
    category: media
  comfy_generate_image:
    cost_per_call: 50.0
    cost_per_ms: 0.003
    category: media
  # ─── Release / governance tools ──────────────────────────────────────────
  pr_reviewer_tool:
    cost_per_call: 10.0
    cost_per_ms: 0.002
    category: release
  contract_tool:
    cost_per_call: 5.0
    cost_per_ms: 0.001
    category: release
  threatmodel_tool:
    cost_per_call: 5.0
    cost_per_ms: 0.001
    category: release
  dependency_scanner_tool:
    cost_per_call: 3.0
    cost_per_ms: 0.001
    category: release
  drift_analyzer_tool:
    cost_per_call: 4.0
    cost_per_ms: 0.001
    category: release
  cost_analyzer_tool:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: finops
  # ─── Observability (moderate cost, often called) ─────────────────────────
  observability_tool:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: observability
  # ─── Jobs / orchestration ────────────────────────────────────────────────
  job_orchestrator_tool:
    cost_per_call: 3.0
    cost_per_ms: 0.001
    category: ops
  # ─── Web / external (network cost) ───────────────────────────────────────
  web_search:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: web
  web_extract:
    cost_per_call: 1.5
    cost_per_ms: 0.001
    category: web
  crawl4ai_scrape:
    cost_per_call: 3.0
    cost_per_ms: 0.001
    category: web
  # ─── Knowledge / memory (low cost) ───────────────────────────────────────
  memory_search:
    cost_per_call: 0.5
    cost_per_ms: 0.0005
    category: memory
  remember_fact:
    cost_per_call: 0.5
    cost_per_ms: 0.0005
    category: memory
  graph_query:
    cost_per_call: 0.5
    cost_per_ms: 0.0005
    category: memory
  kb_tool:
    cost_per_call: 1.0
    cost_per_ms: 0.001
    category: knowledge
  # ─── Repo / code tools ───────────────────────────────────────────────────
  repo_tool:
    cost_per_call: 1.5
    cost_per_ms: 0.001
    category: dev
  config_linter_tool:
    cost_per_call: 2.0
    cost_per_ms: 0.001
    category: release
  # ─── Oncall / incident ───────────────────────────────────────────────────
  oncall_tool:
    cost_per_call: 1.0
    cost_per_ms: 0.001
    category: ops

# ─── Anomaly detection thresholds ────────────────────────────────────────────
anomaly:
  # Spike: window_cost / baseline_avg_cost >= ratio_threshold
  spike_ratio_threshold: 3.0
  # Must have at least this many calls in window to be an anomaly
  min_calls_threshold: 10

# High-priority tools for cost_watch gate in release_check
priority_tools:
  - comfy_generate_video
  - comfy_generate_image
  - pr_reviewer_tool
  - job_orchestrator_tool
  - observability_tool

View File

@@ -0,0 +1,192 @@
# Data Governance & Privacy Policy — DAARION.city
#
# Used by data_governance_tool to scan for PII/secrets/logging/retention risks.
# Severity: "error" = high risk (still warning-only in gate_mode=warning_only).
# "warning" = medium risk.
# "info" = low risk / informational.

# ─── Retention policies ───────────────────────────────────────────────────────
retention:
  audit_jsonl_days: 30
  audit_postgres_days: 90
  memory_events_days: 90
  logs_days: 14
  # Large output threshold: if audit out_size >= this, flag as anomaly
  large_output_bytes: 65536  # 64KB

# ─── PII patterns ─────────────────────────────────────────────────────────────
pii_patterns:
  email:
    regex: "(?i)\\b[A-Z0-9._%+\\-]+@[A-Z0-9.\\-]+\\.[A-Z]{2,}\\b"
    severity: "warning"
    id: "DG-PII-001"
    description: "Email address detected"
  phone_ua_intl:
    regex: "\\b\\+?[0-9][0-9\\-\\s()]{7,}[0-9]\\b"
    severity: "warning"
    id: "DG-PII-002"
    description: "Phone-like number detected"
  credit_card:
    regex: "\\b(?:\\d[ \\-]*?){13,19}\\b"
    severity: "error"
    id: "DG-PII-003"
    description: "Credit card-like number detected"
  passport_like:
    regex: "\\b[A-Z]{2}\\d{6,7}\\b"
    severity: "warning"
    id: "DG-PII-004"
    description: "Passport-like identifier detected"
  tax_id_ua:
    regex: "\\b\\d{10}\\b"
    severity: "info"
    id: "DG-PII-005"
    description: "Possible Ukrainian tax ID (10 digits)"

# ─── Extra secret patterns (supplement tool_governance._SECRET_PATTERNS) ──────
secret_patterns:
  inherit_from_tool_governance: true
  extra:
    - name: "private_key_block"
      regex: "-----BEGIN [A-Z ]*PRIVATE KEY-----"
      severity: "error"
      id: "DG-SEC-001"
    - name: "aws_mfa_token"
      regex: "(?i)mfa[_\\-]?token[\\s=:]+['\"`]?[\\dA-Z]{6,8}['\"`]?"
      severity: "warning"
      id: "DG-SEC-002"
    - name: "pem_certificate"
      regex: "-----BEGIN CERTIFICATE-----"
      severity: "info"
      id: "DG-SEC-003"

# ─── Logging safety rules ─────────────────────────────────────────────────────
logging_rules:
  # Field names that must NOT appear unmasked in logger calls
  forbid_logging_fields:
    - password
    - passwd
    - token
    - secret
    - private_key
    - api_key
    - access_key
    - credential
    - auth_header
    - bearer
  # Fields that should appear as hash-only (warn if logged raw)
  sensitive_fields_warn:
    - user_id
    - chat_id
    - telegram_id
    - session_id
    - workspace_id
  # Calls that indicate redaction is applied (good)
  redaction_calls:
    - redact
    - mask
    - sanitize
    - anonymize
    - _hash
    - sha256
  # Payload field names that indicate raw content is being logged/stored
  raw_payload_indicators:
    - payload
    - diff_text
    - openapi_text
    - request_body
    - response_body
    - prompt
    - messages
    - content
    - transcript
    - conversation
    - full_text

# ─── Storage / retention keywords ─────────────────────────────────────────────
storage_keywords:
  write_patterns:
    - save_message
    - store_event
    - insert_record
    - append_event
    - write_event
    - write_record
    - persist
    - bulk_insert
    - executemany
  retention_indicators:
    - ttl
    - expire
    - retention
    - cleanup
    - delete_old
    - purge
    - rotate
    - max_age
    - expiry
  context_window: 20  # lines before/after to search for retention indicator

# ─── Scan paths ───────────────────────────────────────────────────────────────
paths:
  include:
    - "services/"
    - "docs/"
    - "ops/"
    - "config/"
  exclude:
    - "**/node_modules/**"
    - "**/.git/**"
    - "**/dist/**"
    - "**/build/**"
    - "**/.venv/**"
    - "**/__pycache__/**"
    - "**/*.pyc"
    - "**/*.lock"  # dependency lock files (high false-positive risk)
    - "**/*.min.js"

# File extensions to scan
# NOTE(review): ".env.example" is a suffix rather than an extension, and it also
# matches the ".env.*" pattern under never_scan below — confirm which rule the
# scanner applies first.
scan_extensions:
  - ".py"
  - ".ts"
  - ".js"
  - ".yml"
  - ".yaml"
  - ".json"
  - ".env.example"
  - ".md"
  - ".txt"
  - ".sh"

# Never scan these (sensitive or binary)
never_scan:
  - "*.env"
  - ".env.*"
  - "*.pem"
  - "*.key"
  - "*.pfx"
  - "*.p12"
  - "*.crt"

# ─── Gate behaviour ───────────────────────────────────────────────────────────
severity_behavior:
  # warning_only: gate always pass=True (adds recommendations only)
  # strict: gate pass=False on any error finding
  gate_mode: "warning_only"
  recommend_on:
    - "warning"
    - "error"

# ─── Limits ───────────────────────────────────────────────────────────────────
limits:
  max_files_fast: 200
  max_files_full: 500
  max_bytes_per_file: 262144  # 256KB
  max_findings: 200  # cap before truncating
  max_evidence_chars: 200  # mask and truncate evidence snippets

View File

@@ -0,0 +1,37 @@
# Incident Escalation Policy
# Controls deterministic escalation and auto-resolve candidate logic.

defaults:
  window_minutes: 60

escalation:
  # Escalate when the same signature storms
  occurrences_thresholds:
    P2_to_P1: 10  # occurrences_60m to escalate P2 → P1
    P1_to_P0: 25  # occurrences_60m to escalate P1 → P0
  triage_thresholds_24h:
    P2_to_P1: 3  # triage_count_24h to escalate P2 → P1
    P1_to_P0: 6  # triage_count_24h to escalate P1 → P0
  severity_cap: "P0"  # never escalate above this
  create_followup_on_escalate: true
  followup:
    priority: "P1"
    due_hours: 24
    owner: "oncall"
    message_template: "Escalated due to alert storm: occurrences={occurrences_60m}, triages_24h={triage_count_24h}"

auto_resolve:
  # Candidates only in MVP — do not auto-close P0/P1
  no_alerts_minutes_for_candidate: 60
  close_allowed_severities: ["P2", "P3"]
  auto_close: false  # set true carefully in staging only
  candidate_event_type: "note"
  candidate_message: "Auto-resolve candidate: no alerts observed in {no_alerts_minutes} minutes for this signature"

alert_loop_slo:
  claim_to_ack_p95_seconds: 60  # p95 latency from claim → ack
  failed_rate_pct: 5  # max % of failed/(acked+failed) in window
  processing_stuck_minutes: 15  # alerts in processing beyond this → stuck

View File

@@ -0,0 +1,88 @@
# Incident Intelligence Policy
# Controls correlation scoring, recurrence detection, and digest generation.

correlation:
  lookback_days: 30
  max_related: 10
  min_score: 20  # discard matches below this
  rules:
    - name: "same_signature"
      weight: 100
      match:
        signature: true
    - name: "same_service_and_kind"
      weight: 60
      match:
        same_service: true
        same_kind: true
    - name: "same_service_time_cluster"
      weight: 40
      match:
        same_service: true
        within_minutes: 180
    - name: "same_kind_cross_service"
      weight: 30
      match:
        same_kind: true
        within_minutes: 120

recurrence:
  windows_days: [7, 30]
  thresholds:
    signature:
      warn: 3  # ≥ 3 occurrences in window → warn
      high: 6  # ≥ 6 occurrences in window → high
    kind:
      warn: 5
      high: 10
  top_n: 15  # top N per category
  # Deterministic recommendations per recurrence level
  recommendations:
    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
    signature_warn: "Review root cause history; consider adding monitoring threshold"
    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"

digest:
  weekly_day: "Mon"
  include_closed: true
  include_open: true
  output_dir: "ops/reports/incidents"
  markdown_max_chars: 8000
  top_incidents: 20  # max incidents in weekly listing

# ── Root-Cause Buckets ─────────────────────────────────────────────────────
buckets:
  mode: "service_kind"  # service_kind | signature_prefix
  signature_prefix_len: 12
  top_n: 10
  # NOTE(review): integer keys (window length in days) — consumer must look
  # these up as ints, not strings.
  min_count:
    7: 3  # bucket must have ≥ 3 incidents in last 7d
    30: 6  # or ≥ 6 in last 30d
  include_statuses: ["open", "mitigating", "resolved", "closed"]

# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
autofollowups:
  enabled: true
  only_when_high: true  # only create for HIGH recurrence buckets
  owner: "oncall"
  priority: "P1"
  due_days: 7
  max_followups_per_bucket_per_week: 1  # dedupe by week+bucket_key
  dedupe_key_prefix: "intel_recur"

# ── Release Gate: recurrence_watch ────────────────────────────────────────
release_gate:
  recurrence_watch:
    enabled: true
    service_scope: "target_service"  # target_service | all
    windows_days: [7, 30]
    fail_on:
      severity_in: ["P0", "P1"]  # used only in strict mode
      high_recurrence: true
    warn_on:
      warn_recurrence: true

View File

@@ -0,0 +1,143 @@
# Network Allowlist for Tool HTTP Calls
# Tools that make outbound HTTP requests MUST use only hosts/IPs listed here.
# Any request to unlisted hosts is blocked by tool_governance.py middleware.
#
# Format per tool:
#   hosts:    exact hostname or IP
#   prefixes: URL prefix match (for paths)

# ─── Observability Sources ────────────────────────────────────────────────────
observability_tool:
  description: "Prometheus, Loki, Tempo datasources"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "prometheus"
    - "loki"
    - "tempo"
    - "monitoring"
    - "144.76.224.179"  # NODA1 monitoring
  ports_allowed: [9090, 3100, 3200, 9080]
  schemes: ["http", "https"]

# ─── Oncall / Service Health ──────────────────────────────────────────────────
oncall_tool:
  description: "Internal service health endpoints only"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "gateway"
    - "router"
    - "memory"
    - "qdrant"
    - "nats"
    - "144.76.224.179"  # NODA1
    - "212.8.58.133"  # NODA3
  ports_allowed: [80, 443, 8000, 8080, 8222, 9000, 9100, 9102, 9200, 9300, 9400]
  schemes: ["http", "https"]

# ─── Web Search / Extract ─────────────────────────────────────────────────────
web_search:
  description: "Search provider APIs"
  hosts:
    - "api.duckduckgo.com"
    - "serpapi.com"
    - "api.bing.microsoft.com"
    - "customsearch.googleapis.com"
  schemes: ["https"]

web_extract:
  description: "Any public HTTPS URL (user-provided)"
  allow_any_public: true  # Allow any non-private IP
  block_private_ranges: true  # Block RFC1918 / loopback / link-local
  schemes: ["https"]

crawl4ai_scrape:
  description: "Crawl4AI service + public URLs"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "crawl4ai"
  ports_allowed: [11235]
  allow_any_public: true
  block_private_ranges: true
  schemes: ["http", "https"]

# ─── Memory / Graph ───────────────────────────────────────────────────────────
memory_search:
  description: "Memory service + Qdrant"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "memory-service"
    - "qdrant"
    - "144.76.224.179"
  ports_allowed: [6333, 8001, 8100]
  schemes: ["http", "https"]

graph_query:
  description: "Neo4j bolt/http"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "neo4j"
  ports_allowed: [7474, 7687]
  schemes: ["http", "https", "bolt", "bolt+s"]

# ─── ComfyUI / Image Generation ──────────────────────────────────────────────
comfy_generate_image:
  description: "ComfyUI on NODA3"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "212.8.58.133"
  ports_allowed: [8188]
  schemes: ["http"]

comfy_generate_video:
  description: "ComfyUI video on NODA3"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "212.8.58.133"
  ports_allowed: [8188]
  schemes: ["http"]

# ─── LLM Providers ────────────────────────────────────────────────────────────
# (Used by router/gateway, not direct tool calls, but documented for reference)
llm_providers:
  description: "External LLM APIs"
  hosts:
    - "api.x.ai"  # xAI Grok
    - "open.bigmodel.cn"  # GLM-5 Z.AI
    - "api.deepseek.com"  # DeepSeek
    - "api.openai.com"  # OpenAI fallback
  schemes: ["https"]

# ─── Presentation Service ─────────────────────────────────────────────────────
presentation_create:
  description: "Presentation rendering service"
  hosts:
    - "localhost"
    - "127.0.0.1"
    - "presentation-service"
  ports_allowed: [8080, 9500]
  schemes: ["http", "https"]

# ─── Dependency Scanner ───────────────────────────────────────────────────────
dependency_scanner_tool:
  description: "OSV.dev API for vulnerability lookups (online mode only)"
  hosts:
    - "api.osv.dev"
  schemes: ["https"]
  # Only used when vuln_mode=online; offline_cache requires no outbound

# ─── Private IP Ranges (always blocked for allow_any_public tools) ────────────
private_ip_ranges:
  - "10.0.0.0/8"
  - "172.16.0.0/12"
  - "192.168.0.0/16"
  - "127.0.0.0/8"
  - "169.254.0.0/16"
  - "::1/128"
  - "fc00::/7"

View File

@@ -0,0 +1,49 @@
# Observability Data Sources Configuration
# These are internal URLs - never expose to external networks

prometheus:
  # Prometheus server URL (internal network)
  base_url: "http://prometheus:9090"
  # Allowed PromQL query prefixes (security)
  allow_promql_prefixes:
    - "sum("
    - "rate("
    - "histogram_quantile("
    - "avg("
    - "max("
    - "min("
    - "count("
    - "irate("
    - "last_over_time("
    - "present_over_time("

loki:
  # Loki log server URL (internal network)
  base_url: "http://loki:3100"

tempo:
  # Tempo trace server URL (internal network)
  base_url: "http://tempo:3200"

# Limits configuration
limits:
  # Maximum time window for queries (hours)
  max_time_window_hours: 24
  # Maximum series returned
  max_series: 200
  # Maximum points in range query
  max_points: 2000
  # Maximum bytes in response
  max_bytes: 300000
  # Query timeout (seconds)
  timeout_seconds: 5

# Environment variables (override URLs)
#   PROMETHEUS_URL
#   LOKI_URL
#   TEMPO_URL

View File

@@ -0,0 +1,133 @@
# Release Gate Policy — DAARION.city
#
# Controls strictness of each gate per deployment profile.
#
# Modes:
#   off    — gate is fully skipped (no call, no output)
#   warn   — gate always pass=True; findings become recommendations only
#   strict — gate can fail release (pass=False) when fail_on conditions are met
#
# Profiles: dev | staging | prod
# Set via release_check input `gate_profile` (default: dev).

profiles:
  dev:
    description: "Development: strict for security gates, warn for governance"
    gates:
      pr_review:
        mode: "strict"
      config_lint:
        mode: "strict"
      dependency_scan:
        mode: "strict"
        fail_on_severities: ["CRITICAL", "HIGH"]
      contract_diff:
        mode: "strict"
      threat_model:
        mode: "strict"
      smoke:
        mode: "warn"
      drift:
        mode: "warn"
      slo_watch:
        mode: "warn"
      followup_watch:
        mode: "warn"
        fail_on: ["P0", "P1"]
      privacy_watch:
        mode: "warn"
      cost_watch:
        mode: "warn"
      recurrence_watch:
        mode: "warn"
      risk_watch:
        mode: "warn"
      risk_delta_watch:
        mode: "warn"
      platform_review_required:
        mode: "warn"
  staging:
    description: "Staging: strict security + strict privacy on errors"
    gates:
      pr_review:
        mode: "strict"
      config_lint:
        mode: "strict"
      dependency_scan:
        mode: "strict"
        fail_on_severities: ["CRITICAL", "HIGH"]
      contract_diff:
        mode: "strict"
      threat_model:
        mode: "strict"
      smoke:
        mode: "warn"
      drift:
        mode: "strict"
      slo_watch:
        mode: "strict"  # Don't deploy if SLO currently breached
      followup_watch:
        mode: "strict"
        fail_on: ["P0", "P1"]
      privacy_watch:
        mode: "strict"
        fail_on: ["error"]
      cost_watch:
        mode: "warn"
      recurrence_watch:
        mode: "strict"  # Block staging deploy if P0/P1 high recurrence
        fail_on:
          severity_in: ["P0", "P1"]
          high_recurrence: true
      risk_watch:
        mode: "strict"  # Block staging if score >= fail_at for p0_services
      risk_delta_watch:
        mode: "strict"  # Block staging for p0_services when delta >= fail_delta
      platform_review_required:
        mode: "warn"  # warn-first: never blocks staging by default
  prod:
    description: "Production: maximum strictness across all gates"
    gates:
      pr_review:
        mode: "strict"
      config_lint:
        mode: "strict"
      dependency_scan:
        mode: "strict"
        fail_on_severities: ["CRITICAL", "HIGH", "MEDIUM"]
      contract_diff:
        mode: "strict"
      threat_model:
        mode: "strict"
      smoke:
        mode: "strict"
      drift:
        mode: "strict"
      slo_watch:
        mode: "warn"  # Warn: don't automatically block prod deploys on SLO
      followup_watch:
        mode: "warn"
        fail_on: ["P0"]
      privacy_watch:
        mode: "strict"
        fail_on: ["error"]
      cost_watch:
        mode: "warn"
      recurrence_watch:
        mode: "warn"  # Warn only in prod (accumulate data first)
      risk_watch:
        mode: "warn"  # Warn only in prod
      risk_delta_watch:
        mode: "warn"  # Warn only in prod
      platform_review_required:
        mode: "warn"  # Start conservative in prod

# ─── Defaults (used if profile or gate not found) ────────────────────────────
defaults:
  mode: "warn"
  # privacy_watch default fail_on (for strict mode):
  privacy_fail_on: ["error"]
  # cost_watch is never strict by default
  cost_always_warn: true

View File

@@ -0,0 +1,80 @@
# Risk Attribution Policy — DAARION.city
#
# Deterministic attribution: risk spike → likely causes.
# LLM enrichment is OFF by default; local only on regression triggers.

defaults:
  lookback_hours: 24
  max_causes: 5
  llm_mode: "off"  # off | local | remote
  llm_max_chars_in: 3500
  llm_max_chars_out: 800

# LLM enrichment triggers — only if ALL conditions are met
llm_triggers:
  risk_delta_warn: 10  # delta_24h >= 10
  risk_delta_fail: 20  # delta_24h >= 20 (fail-level)
  band_in: ["high", "critical"]

# Per-cause scoring weights (additive)
weights:
  deploy: 30
  dependency: 25
  drift: 25
  incident_storm: 20
  slo_violation: 15
  followups_overdue: 10
  alert_loop_degraded: 10

# Per-signal detection config
signals:
  deploy:
    # Alert kinds that indicate a deploy event
    kinds: ["deploy", "deployment", "rollout", "canary"]
  dependency:
    # Release gate names whose fail/warn counts as a dependency signal
    release_gate_names: ["dependency_scan", "deps"]
  drift:
    release_gate_names: ["drift", "config_drift"]
  incident_storm:
    thresholds:
      # occurrences in last 60min across all alert signatures for the service
      occurrences_60m_warn: 10
      # escalations (Escalated events) in last 24h
      escalations_24h_warn: 2
  slo:
    require_active_violation: true

# Confidence bands (minimum score to reach that band)
output:
  confidence_bands:
    high: 60  # score >= 60 → high confidence
    medium: 35  # score >= 35 → medium
    # below 35 → low

# Change Timeline config
timeline:
  enabled: true
  lookback_hours: 24
  max_items: 30
  include_types: ["deploy", "dependency", "drift", "incident", "slo", "followup", "alert_loop", "release_gate"]
  time_bucket_minutes: 5  # coalesce same-type events within 5-min windows

# Evidence linking
evidence_linking:
  enabled: true
  max_refs_per_cause: 10

# LLM local endpoint config (only used when llm_mode=local)
llm_local:
  endpoint: "http://localhost:11434/api/generate"
  model: "llama3"
  timeout_seconds: 15
  # Hardening guards
  model_allowlist: ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"]
  max_calls_per_digest: 3
  per_day_dedupe: true  # key: risk_enrich:{YYYY-MM-DD}:{service}:{env}

89
config/risk_policy.yml Normal file
View File

@@ -0,0 +1,89 @@
# Service Risk Index Policy — DAARION.city
#
# Controls how Risk Scores are computed, classified, and gated.
# All scoring is deterministic: no LLM required.

defaults:
  window_hours: 24
  recurrence_windows_days: [7, 30]
  slo_window_minutes: 60

thresholds:
  bands:
    low_max: 20
    medium_max: 50
    high_max: 80
  risk_watch:  # defaults, overridable per service below
    warn_at: 50  # score >= warn_at → recommendations
    fail_at: 80  # score >= fail_at → gate fails (strict mode only)

weights:
  open_incidents:
    P0: 50
    P1: 25
    P2: 10
    P3: 5
  recurrence:
    signature_warn_7d: 10
    signature_high_7d: 20
    kind_warn_7d: 8
    kind_high_7d: 15
    signature_high_30d: 10
    kind_high_30d: 8
  followups:
    overdue_P0: 20
    overdue_P1: 12
    overdue_other: 6
  slo:
    violation: 10  # per active violation
  alerts_loop:
    slo_violation: 10  # per alert-loop SLO violation
  escalation:
    escalations_24h:
      warn: 5  # score added if escalations_24h >= 1
      high: 12  # score added if escalations_24h >= 3

# Per-service risk gate overrides (lower/higher fail_at)
service_overrides:
  gateway:
    risk_watch:
      fail_at: 75  # gateway is critical: fail earlier
  router:
    risk_watch:
      fail_at: 80

# Services treated as P0 (always subject to strict risk_watch in staging)
p0_services:
  - gateway
  - router

# ─── History & Snapshotting ────────────────────────────────────────────────────
history:
  snapshot_interval_minutes: 60
  retention_days: 90
  max_services_per_run: 50

# ─── Trend analysis ───────────────────────────────────────────────────────────
trend:
  delta_windows_hours: [24, 168]  # 24h and 7d
  volatility_window_hours: 168  # stddev computed over last 7d
  regression_threshold:
    delta_24h_warn: 10  # score rose >= 10 points in 24h → warn
    delta_24h_fail: 20  # score rose >= 20 points in 24h → fail (strict)
    delta_7d_warn: 15
    delta_7d_fail: 30

# ─── Daily Digest ─────────────────────────────────────────────────────────────
digest:
  daily_hour_utc: 9  # generate at 09:00 UTC
  output_dir: "ops/reports/risk"
  markdown_max_chars: 8000
  top_n: 10

# ─── Risk Delta release gate ──────────────────────────────────────────────────
release_gate:
  risk_delta_watch:
    enabled: true
    default_warn_delta_24h: 10
    default_fail_delta_24h: 20
    p0_services_strict: true

View File

@@ -0,0 +1,52 @@
# Aurora (Autonomous Media Forensics)
Role:
- Lead media forensics for video, audio, and photo evidence inside AISTALK.
- Extract usable evidence from low-quality media while preserving reproducibility.
Modes:
- `tactical`: fast triage for operational clarity.
  - prioritize turnaround and readability
  - lightweight pipelines and lower cost
  - output is advisory (not courtroom-grade)
- `forensic`: evidence-grade processing.
  - prioritize reproducibility and auditability
  - mandatory input/output hashing and immutable processing log
  - chain-of-custody notes + signing metadata
Capabilities:
- Video: denoise, deblur, super-resolution, stabilization, frame interpolation.
- Face-focused enhancement: controlled face restoration with clear model attribution.
- Audio: denoise, speech intelligibility improvement, deepfake risk signals.
- Photo: artifact cleanup, upscale, metadata/EXIF integrity review.
Internal sub-pipeline handles:
- `Clarity`: global video enhancement.
- `Vera`: face restoration and face-quality diagnostics.
- `Echo`: audio cleaning/transcription/deepfake heuristics.
- `Pixis`: photo restoration and metadata checks.
- `Kore`: forensic packaging (hashes, chain-of-custody, signature metadata).
Output contract (strict JSON for downstream graphing):
```json
{
"agent": "Aurora",
"mode": "tactical | forensic",
"job_id": "aurora_YYYYMMDD_###",
"input_file": {"name": "file.ext", "hash": "sha256:..."},
"processing_log": [
{"step": "denoise", "model": "model_name", "time_ms": 0}
],
"output_files": [
{"type": "video|audio|photo|forensic_log", "url": "https://...", "hash": "sha256:..."}
],
"digital_signature": "ed25519:... | null"
}
```
Boundaries:
- No deceptive deepfake generation or identity manipulation.
- Never present AI-enhanced output as untouched original evidence.
- Flag uncertainty and potential enhancement artifacts explicitly.
- Do not provide final legal conclusions; require expert human review for court use.
- Preserve originals; never destructively overwrite source evidence.

64
config/slo_policy.yml Normal file
View File

@@ -0,0 +1,64 @@
# SLO Policy — DAARION.city
#
# Defines Service Level Objectives per service.
# Used by observability_tool.slo_snapshot and incident_triage_graph slo_context node.
#
# Fields:
#   error_rate_pct — max allowed error rate (%)
#   latency_p95_ms — max p95 latency (milliseconds)
#   window_minutes — default observation window (default: 60)

defaults:
  window_minutes: 60
  error_rate_pct: 1.0
  latency_p95_ms: 300

services:
  gateway:
    error_rate_pct: 1.0
    latency_p95_ms: 300
  router:
    error_rate_pct: 0.5
    latency_p95_ms: 200
  memory-service:
    error_rate_pct: 1.0
    latency_p95_ms: 400
  sofiia-supervisor:
    error_rate_pct: 1.0
    latency_p95_ms: 500

# ─── Voice SLO profiles ───────────────────────────────────────────────────────
# Two profiles aligned with router-config.yml selection_policies.
# Measured via Prometheus metrics emitted by sofiia-console /api/telemetry/voice
# and memory-service voice_endpoints.py.
#
# Prometheus metrics:
#   voice_ttfa_ms{voice_profile}        — Time-to-first-audio (BFF → first playable)
#   voice_e2e_ms{voice_profile}         — User stops speaking → audio plays
#   voice_tts_first_ms{voice_profile}   — First-sentence TTS synthesis
#   voice_tts_compute_ms{engine,voice}  — Memory-service internal TTS
#   voice_queue_underflows_total        — Playback starvation events
voice_slo:
  voice_fast_uk:
    description: "Fast profile: gemma3 → qwen3.5 fallback"
    ttfa_ms_p95: 5000  # TTFA p95 ≤ 5s
    e2e_ms_p95: 9000  # E2E p95 ≤ 9s
    tts_first_ms_p95: 2000  # TTS synthesis p95 ≤ 2s
    underflow_rate_pct: 1.0  # starvation events per 100 voice turns ≤ 1%
    tts_error_rate_pct: 0.5  # edge-tts failures ≤ 0.5%
    window_minutes: 10
  voice_quality_uk:
    description: "Quality profile: qwen3.5 → qwen3:14b fallback"
    ttfa_ms_p95: 7000
    e2e_ms_p95: 12000
    tts_first_ms_p95: 2000  # TTS itself is the same engine
    underflow_rate_pct: 2.0  # slightly relaxed (longer LLM → more gap risk)
    tts_error_rate_pct: 0.5
    window_minutes: 10

# Canary thresholds (runtime health check, stricter)
# NOTE(review): placed at top level here; confirm whether the consumer expects
# this nested under voice_slo instead.
canary:
  tts_polina_max_ms: 3000  # live Polina synthesis ≤ 3s
  tts_ostap_max_ms: 3000  # live Ostap synthesis ≤ 3s
  min_audio_bytes: 1000  # valid audio is never empty/tiny