docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
114
config/alert_routing_policy.yml
Normal file
114
config/alert_routing_policy.yml
Normal file
@@ -0,0 +1,114 @@
|
||||
# alert_routing_policy.yml
|
||||
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
|
||||
# Key design: llm_mode=off means 0 LLM tokens in steady state.
|
||||
|
||||
defaults:
|
||||
poll_interval_seconds: 300 # 5 min
|
||||
max_alerts_per_run: 25
|
||||
only_unacked: true
|
||||
|
||||
# Safety valves (avoid runaway incident creation on alert storm)
|
||||
max_incidents_per_run: 5
|
||||
max_triages_per_run: 5
|
||||
dedupe_window_minutes_default: 120
|
||||
ack_note_prefix: "alert_triage_loop"
|
||||
|
||||
# LLM gating — off = 0 tokens in steady state
|
||||
llm_mode: "off" # off | local | remote
|
||||
llm_on:
|
||||
triage: false
|
||||
postmortem: false
|
||||
|
||||
routing:
|
||||
# ─── HARD AUTO: prod P0/P1 → create incident + deterministic triage ─────────
|
||||
- match:
|
||||
env_in: ["prod"]
|
||||
severity_in: ["P0", "P1"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: true
|
||||
triage_mode: "deterministic" # deterministic | llm
|
||||
incident_severity_cap: "P1"
|
||||
dedupe_window_minutes: 180
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Security alerts: auto incident + (optional) LLM triage ─────────────────
|
||||
- match:
|
||||
kind_in: ["security"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: true
|
||||
triage_mode: "deterministic" # flip to llm once stable
|
||||
incident_severity_cap: "P0"
|
||||
dedupe_window_minutes: 360
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Resource-critical: OOM/crashloop/disk in prod|staging ──────────────────
|
||||
- match:
|
||||
kind_in: ["oom", "crashloop", "disk"]
|
||||
env_in: ["prod", "staging"]
|
||||
severity_in: ["P0", "P1", "P2"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: true
|
||||
triage_mode: "deterministic"
|
||||
incident_severity_cap: "P1"
|
||||
dedupe_window_minutes: 240
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Staging P1: auto incident, no triage (save resources) ─────────────────
|
||||
- match:
|
||||
env_in: ["staging"]
|
||||
severity_in: ["P1"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: false
|
||||
triage_mode: "deterministic"
|
||||
incident_severity_cap: "P1"
|
||||
dedupe_window_minutes: 120
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Deploy events: digest-only ──────────────────────────────────────────────
|
||||
- match:
|
||||
kind_in: ["deploy"]
|
||||
actions:
|
||||
auto_incident: false
|
||||
digest_only: true
|
||||
ack: true
|
||||
|
||||
# ─── Lower severity: digest-only ─────────────────────────────────────────────
|
||||
- match:
|
||||
severity_in: ["P2", "P3", "INFO"]
|
||||
actions:
|
||||
auto_incident: false
|
||||
digest_only: true
|
||||
ack: true
|
||||
|
||||
# ─── Kind normalization (aliases Monitor may use) ────────────────────────────
|
||||
kind_map:
|
||||
latency: ["latency", "p95_latency", "p99_latency", "slow_response"]
|
||||
error_rate: ["error_rate", "5xx_rate", "http_errors"]
|
||||
slo_breach: ["slo_breach", "slo", "slo_violation"]
|
||||
crashloop: ["crashloop", "restart_loop", "oom_kill"]
|
||||
oom: ["oom", "out_of_memory", "memory_pressure"]
|
||||
disk: ["disk", "disk_full", "disk_pressure", "pvc_full"]
|
||||
security: ["security", "unauthorized", "injection", "brute_force"]
|
||||
|
||||
# ─── Per-kind severity caps for incidents created by the loop ─────────────────
|
||||
severity_caps:
|
||||
deploy: "P2"
|
||||
latency: "P1"
|
||||
error_rate: "P1"
|
||||
slo_breach: "P1"
|
||||
security: "P0"
|
||||
|
||||
# ─── Signature dedupe settings ────────────────────────────────────────────────
|
||||
signature:
|
||||
use_kind: true
|
||||
use_fingerprint: true
|
||||
use_node_label: false # true = per-node incidents (noisier)
|
||||
normalize_title: true # strip numbers/timestamps from title before hash
|
||||
51
config/architecture_pressure_policy.yml
Normal file
51
config/architecture_pressure_policy.yml
Normal file
@@ -0,0 +1,51 @@
|
||||
# Architecture Pressure Policy — DAARION.city
|
||||
#
|
||||
# Deterministic structural health index: measures long-term architectural strain.
|
||||
# Risk = short-term stability. Pressure = long-term structural debt.
|
||||
#
|
||||
# All thresholds / weights configurable here; no LLM, no external calls.
|
||||
|
||||
defaults:
|
||||
lookback_days: 30
|
||||
top_n: 10
|
||||
|
||||
# Per-signal additive weights
|
||||
weights:
|
||||
recurrence_high_30d: 20 # high-recurrence bucket present in 30d
|
||||
recurrence_warn_30d: 10 # warn-level recurrence in 30d
|
||||
regressions_30d: 15 # each positive delta_24h event in 30d
|
||||
escalations_30d: 12 # each escalation event in 30d
|
||||
followups_created_30d: 8 # each new followup created in 30d
|
||||
followups_overdue: 15 # current overdue followups (snapshot)
|
||||
drift_failures_30d: 10 # drift gate fail/warn events in 30d
|
||||
dependency_high_30d: 10 # dependency scan HIGH/CRITICAL findings in 30d
|
||||
|
||||
# Score → band mapping
|
||||
bands:
|
||||
low_max: 20
|
||||
medium_max: 45
|
||||
high_max: 70
|
||||
# above high_max → critical
|
||||
|
||||
# Priority rules for automatic follow-up creation
|
||||
priority_rules:
|
||||
require_arch_review_at: 70 # pressure score >= this → requires_arch_review=true
|
||||
auto_create_followup: true # create a follow-up when require_arch_review triggered
|
||||
followup_priority: "P1"
|
||||
followup_due_days: 14
|
||||
followup_owner: "cto"
|
||||
# Dedupe key: arch_review:{YYYY-WW}:{service}
|
||||
# Prevents duplicate creation within the same ISO week
|
||||
|
||||
# Release gate behaviour
|
||||
release_gate:
|
||||
platform_review_required:
|
||||
enabled: true
|
||||
warn_at: 60
|
||||
fail_at: 85 # only blocks if gate profile is "strict"
|
||||
|
||||
# Digest settings
|
||||
digest:
|
||||
output_dir: "ops/reports/platform"
|
||||
max_chars: 12000
|
||||
top_n_in_digest: 10
|
||||
86
config/backlog_policy.yml
Normal file
86
config/backlog_policy.yml
Normal file
@@ -0,0 +1,86 @@
|
||||
# Engineering Backlog Policy — DAARION.city
|
||||
#
|
||||
# Governs auto-generation of platform backlog items from Risk/Pressure digests,
|
||||
# workflow transitions, ownership, and storage retention.
|
||||
#
|
||||
# No LLM. Deterministic generation. Source of truth for engineering priorities.
|
||||
|
||||
defaults:
|
||||
env: "prod"
|
||||
retention_days: 180
|
||||
max_items_per_run: 50
|
||||
|
||||
# Dedupe scheme: prevents duplicate creation within the same ISO week
|
||||
dedupe:
|
||||
scheme: "YYYY-WW" # weekly deduplication window
|
||||
key_fields: ["service", "category", "env"]
|
||||
key_prefix: "platform_backlog"
|
||||
# Final key: platform_backlog:{YYYY-WW}:{env}:{service}:{category}
|
||||
|
||||
# Per-category defaults
|
||||
categories:
|
||||
arch_review:
|
||||
priority: "P1"
|
||||
due_days: 14
|
||||
refactor:
|
||||
priority: "P1"
|
||||
due_days: 21
|
||||
slo_hardening:
|
||||
priority: "P2"
|
||||
due_days: 30
|
||||
cleanup_followups:
|
||||
priority: "P2"
|
||||
due_days: 14
|
||||
security:
|
||||
priority: "P0"
|
||||
due_days: 7
|
||||
|
||||
# Auto-generation rules (evaluated per-service top-to-bottom; first match wins per category)
|
||||
generation:
|
||||
weekly_from_pressure_digest: true
|
||||
daily_from_risk_digest: false
|
||||
rules:
|
||||
- name: "arch_review_required"
|
||||
when:
|
||||
pressure_requires_arch_review: true
|
||||
create:
|
||||
category: "arch_review"
|
||||
title_template: "[ARCH] Review required: {service}"
|
||||
|
||||
- name: "high_pressure_refactor"
|
||||
when:
|
||||
pressure_band_in: ["high", "critical"]
|
||||
risk_band_in: ["high", "critical"]
|
||||
create:
|
||||
category: "refactor"
|
||||
title_template: "[REF] Reduce pressure & risk: {service}"
|
||||
|
||||
- name: "slo_violations"
|
||||
when:
|
||||
risk_has_slo_violations: true
|
||||
create:
|
||||
category: "slo_hardening"
|
||||
title_template: "[SLO] Fix violations: {service}"
|
||||
|
||||
- name: "followup_backlog"
|
||||
when:
|
||||
followups_overdue_gt: 0
|
||||
create:
|
||||
category: "cleanup_followups"
|
||||
title_template: "[OPS] Close overdue followups: {service}"
|
||||
|
||||
# Owner assignments (default + service-level overrides)
|
||||
ownership:
|
||||
default_owner: "oncall"
|
||||
overrides:
|
||||
gateway: "cto"
|
||||
|
||||
# Workflow state machine
|
||||
workflow:
|
||||
statuses: ["open", "in_progress", "blocked", "done", "canceled"]
|
||||
allowed_transitions:
|
||||
open: ["in_progress", "blocked", "canceled"]
|
||||
in_progress: ["blocked", "done", "canceled"]
|
||||
blocked: ["open", "in_progress", "canceled"]
|
||||
done: []
|
||||
canceled: []
|
||||
133
config/cost_weights.yml
Normal file
133
config/cost_weights.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
# Cost Weights — DAARION FinOps MVP
|
||||
#
|
||||
# "cost_units" = cost_per_call + duration_ms * cost_per_ms
|
||||
# These are RELATIVE units for ranking, not actual dollars.
|
||||
#
|
||||
# Update weights as actual cost data becomes available.
|
||||
|
||||
defaults:
|
||||
cost_per_call: 1.0 # baseline: 1 unit per call
|
||||
cost_per_ms: 0.001 # 0.001 units per ms elapsed
|
||||
|
||||
tools:
|
||||
# ─── Heavy GPU/compute (high cost) ───────────────────────────────────────
|
||||
comfy_generate_video:
|
||||
cost_per_call: 120.0
|
||||
cost_per_ms: 0.005
|
||||
category: media
|
||||
|
||||
comfy_generate_image:
|
||||
cost_per_call: 50.0
|
||||
cost_per_ms: 0.003
|
||||
category: media
|
||||
|
||||
# ─── Release / governance tools ──────────────────────────────────────────
|
||||
pr_reviewer_tool:
|
||||
cost_per_call: 10.0
|
||||
cost_per_ms: 0.002
|
||||
category: release
|
||||
|
||||
contract_tool:
|
||||
cost_per_call: 5.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
threatmodel_tool:
|
||||
cost_per_call: 5.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
dependency_scanner_tool:
|
||||
cost_per_call: 3.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
drift_analyzer_tool:
|
||||
cost_per_call: 4.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
cost_analyzer_tool:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: finops
|
||||
|
||||
# ─── Observability (moderate cost, often called) ─────────────────────────
|
||||
observability_tool:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: observability
|
||||
|
||||
# ─── Jobs / orchestration ────────────────────────────────────────────────
|
||||
job_orchestrator_tool:
|
||||
cost_per_call: 3.0
|
||||
cost_per_ms: 0.001
|
||||
category: ops
|
||||
|
||||
# ─── Web / external (network cost) ───────────────────────────────────────
|
||||
web_search:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: web
|
||||
|
||||
web_extract:
|
||||
cost_per_call: 1.5
|
||||
cost_per_ms: 0.001
|
||||
category: web
|
||||
|
||||
crawl4ai_scrape:
|
||||
cost_per_call: 3.0
|
||||
cost_per_ms: 0.001
|
||||
category: web
|
||||
|
||||
# ─── Knowledge / memory (low cost) ───────────────────────────────────────
|
||||
memory_search:
|
||||
cost_per_call: 0.5
|
||||
cost_per_ms: 0.0005
|
||||
category: memory
|
||||
|
||||
remember_fact:
|
||||
cost_per_call: 0.5
|
||||
cost_per_ms: 0.0005
|
||||
category: memory
|
||||
|
||||
graph_query:
|
||||
cost_per_call: 0.5
|
||||
cost_per_ms: 0.0005
|
||||
category: memory
|
||||
|
||||
kb_tool:
|
||||
cost_per_call: 1.0
|
||||
cost_per_ms: 0.001
|
||||
category: knowledge
|
||||
|
||||
# ─── Repo / code tools ───────────────────────────────────────────────────
|
||||
repo_tool:
|
||||
cost_per_call: 1.5
|
||||
cost_per_ms: 0.001
|
||||
category: dev
|
||||
|
||||
config_linter_tool:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
# ─── Oncall / incident ───────────────────────────────────────────────────
|
||||
oncall_tool:
|
||||
cost_per_call: 1.0
|
||||
cost_per_ms: 0.001
|
||||
category: ops
|
||||
|
||||
# ─── Anomaly detection thresholds ────────────────────────────────────────────
|
||||
anomaly:
|
||||
# Spike: window_cost / baseline_avg_cost >= ratio_threshold
|
||||
spike_ratio_threshold: 3.0
|
||||
# Must have at least this many calls in window to be an anomaly
|
||||
min_calls_threshold: 10
|
||||
# High-priority tools for cost_watch gate in release_check
|
||||
priority_tools:
|
||||
- comfy_generate_video
|
||||
- comfy_generate_image
|
||||
- pr_reviewer_tool
|
||||
- job_orchestrator_tool
|
||||
- observability_tool
|
||||
192
config/data_governance_policy.yml
Normal file
192
config/data_governance_policy.yml
Normal file
@@ -0,0 +1,192 @@
|
||||
# Data Governance & Privacy Policy — DAARION.city
|
||||
#
|
||||
# Used by data_governance_tool to scan for PII/secrets/logging/retention risks.
|
||||
# Severity: "error" = high risk (still warning-only in gate_mode=warning_only).
|
||||
# "warning" = medium risk.
|
||||
# "info" = low risk / informational.
|
||||
|
||||
# ─── Retention policies ───────────────────────────────────────────────────────
|
||||
retention:
|
||||
audit_jsonl_days: 30
|
||||
audit_postgres_days: 90
|
||||
memory_events_days: 90
|
||||
logs_days: 14
|
||||
# Large output threshold: if audit out_size >= this, flag as anomaly
|
||||
large_output_bytes: 65536 # 64KB
|
||||
|
||||
# ─── PII patterns ─────────────────────────────────────────────────────────────
|
||||
pii_patterns:
|
||||
email:
|
||||
regex: "(?i)\\b[A-Z0-9._%+\\-]+@[A-Z0-9.\\-]+\\.[A-Z]{2,}\\b"
|
||||
severity: "warning"
|
||||
id: "DG-PII-001"
|
||||
description: "Email address detected"
|
||||
|
||||
phone_ua_intl:
|
||||
regex: "\\b\\+?[0-9][0-9\\-\\s()]{7,}[0-9]\\b"
|
||||
severity: "warning"
|
||||
id: "DG-PII-002"
|
||||
description: "Phone-like number detected"
|
||||
|
||||
credit_card:
|
||||
regex: "\\b(?:\\d[ \\-]*?){13,19}\\b"
|
||||
severity: "error"
|
||||
id: "DG-PII-003"
|
||||
description: "Credit card-like number detected"
|
||||
|
||||
passport_like:
|
||||
regex: "\\b[A-Z]{2}\\d{6,7}\\b"
|
||||
severity: "warning"
|
||||
id: "DG-PII-004"
|
||||
description: "Passport-like identifier detected"
|
||||
|
||||
tax_id_ua:
|
||||
regex: "\\b\\d{10}\\b"
|
||||
severity: "info"
|
||||
id: "DG-PII-005"
|
||||
description: "Possible Ukrainian tax ID (10 digits)"
|
||||
|
||||
# ─── Extra secret patterns (supplement tool_governance._SECRET_PATTERNS) ──────
|
||||
secret_patterns:
|
||||
inherit_from_tool_governance: true
|
||||
extra:
|
||||
- name: "private_key_block"
|
||||
regex: "-----BEGIN [A-Z ]*PRIVATE KEY-----"
|
||||
severity: "error"
|
||||
id: "DG-SEC-001"
|
||||
- name: "aws_mfa_token"
|
||||
regex: "(?i)mfa[_\\-]?token[\\s=:]+['\"`]?[\\dA-Z]{6,8}['\"`]?"
|
||||
severity: "warning"
|
||||
id: "DG-SEC-002"
|
||||
- name: "pem_certificate"
|
||||
regex: "-----BEGIN CERTIFICATE-----"
|
||||
severity: "info"
|
||||
id: "DG-SEC-003"
|
||||
|
||||
# ─── Logging safety rules ─────────────────────────────────────────────────────
|
||||
logging_rules:
|
||||
# Field names that must NOT appear unmasked in logger calls
|
||||
forbid_logging_fields:
|
||||
- password
|
||||
- passwd
|
||||
- token
|
||||
- secret
|
||||
- private_key
|
||||
- api_key
|
||||
- access_key
|
||||
- credential
|
||||
- auth_header
|
||||
- bearer
|
||||
|
||||
# Fields that should appear as hash-only (warn if logged raw)
|
||||
sensitive_fields_warn:
|
||||
- user_id
|
||||
- chat_id
|
||||
- telegram_id
|
||||
- session_id
|
||||
- workspace_id
|
||||
|
||||
# Calls that indicate redaction is applied (good)
|
||||
redaction_calls:
|
||||
- redact
|
||||
- mask
|
||||
- sanitize
|
||||
- anonymize
|
||||
- _hash
|
||||
- sha256
|
||||
|
||||
# Payload field names that indicate raw content is being logged/stored
|
||||
raw_payload_indicators:
|
||||
- payload
|
||||
- diff_text
|
||||
- openapi_text
|
||||
- request_body
|
||||
- response_body
|
||||
- prompt
|
||||
- messages
|
||||
- content
|
||||
- transcript
|
||||
- conversation
|
||||
- full_text
|
||||
|
||||
# ─── Storage / retention keywords ─────────────────────────────────────────────
|
||||
storage_keywords:
|
||||
write_patterns:
|
||||
- save_message
|
||||
- store_event
|
||||
- insert_record
|
||||
- append_event
|
||||
- write_event
|
||||
- write_record
|
||||
- persist
|
||||
- bulk_insert
|
||||
- executemany
|
||||
retention_indicators:
|
||||
- ttl
|
||||
- expire
|
||||
- retention
|
||||
- cleanup
|
||||
- delete_old
|
||||
- purge
|
||||
- rotate
|
||||
- max_age
|
||||
- expiry
|
||||
context_window: 20 # lines before/after to search for retention indicator
|
||||
|
||||
# ─── Scan paths ───────────────────────────────────────────────────────────────
|
||||
paths:
|
||||
include:
|
||||
- "services/"
|
||||
- "docs/"
|
||||
- "ops/"
|
||||
- "config/"
|
||||
exclude:
|
||||
- "**/node_modules/**"
|
||||
- "**/.git/**"
|
||||
- "**/dist/**"
|
||||
- "**/build/**"
|
||||
- "**/.venv/**"
|
||||
- "**/__pycache__/**"
|
||||
- "**/*.pyc"
|
||||
- "**/*.lock" # dependency lock files (high false-positive risk)
|
||||
- "**/*.min.js"
|
||||
|
||||
# File extensions to scan
|
||||
scan_extensions:
|
||||
- ".py"
|
||||
- ".ts"
|
||||
- ".js"
|
||||
- ".yml"
|
||||
- ".yaml"
|
||||
- ".json"
|
||||
- ".env.example"
|
||||
- ".md"
|
||||
- ".txt"
|
||||
- ".sh"
|
||||
|
||||
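# Never scan these (sensitive or binary)
# NOTE(review): the ".env.*" pattern below also matches ".env.example", which scan_extensions lists as scannable — confirm which rule takes precedence.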
|
||||
never_scan:
|
||||
- "*.env"
|
||||
- ".env.*"
|
||||
- "*.pem"
|
||||
- "*.key"
|
||||
- "*.pfx"
|
||||
- "*.p12"
|
||||
- "*.crt"
|
||||
|
||||
# ─── Gate behaviour ───────────────────────────────────────────────────────────
|
||||
severity_behavior:
|
||||
# warning_only: gate always pass=True (adds recommendations only)
|
||||
# strict: gate pass=False on any error finding
|
||||
gate_mode: "warning_only"
|
||||
recommend_on:
|
||||
- "warning"
|
||||
- "error"
|
||||
|
||||
# ─── Limits ───────────────────────────────────────────────────────────────────
|
||||
limits:
|
||||
max_files_fast: 200
|
||||
max_files_full: 500
|
||||
max_bytes_per_file: 262144 # 256KB
|
||||
max_findings: 200 # cap before truncating
|
||||
max_evidence_chars: 200 # mask and truncate evidence snippets
|
||||
37
config/incident_escalation_policy.yml
Normal file
37
config/incident_escalation_policy.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
# Incident Escalation Policy
|
||||
# Controls deterministic escalation and auto-resolve candidate logic.
|
||||
|
||||
defaults:
|
||||
window_minutes: 60
|
||||
|
||||
escalation:
|
||||
# Escalate when the same signature storms
|
||||
occurrences_thresholds:
|
||||
P2_to_P1: 10 # occurrences_60m to escalate P2 → P1
|
||||
P1_to_P0: 25 # occurrences_60m to escalate P1 → P0
|
||||
|
||||
triage_thresholds_24h:
|
||||
P2_to_P1: 3 # triage_count_24h to escalate P2 → P1
|
||||
P1_to_P0: 6 # triage_count_24h to escalate P1 → P0
|
||||
|
||||
severity_cap: "P0" # never escalate above this
|
||||
|
||||
create_followup_on_escalate: true
|
||||
followup:
|
||||
priority: "P1"
|
||||
due_hours: 24
|
||||
owner: "oncall"
|
||||
message_template: "Escalated due to alert storm: occurrences={occurrences_60m}, triages_24h={triage_count_24h}"
|
||||
|
||||
auto_resolve:
|
||||
# Candidates only in MVP — do not auto-close P0/P1
|
||||
no_alerts_minutes_for_candidate: 60
|
||||
close_allowed_severities: ["P2", "P3"]
|
||||
auto_close: false # set true carefully in staging only
|
||||
candidate_event_type: "note"
|
||||
candidate_message: "Auto-resolve candidate: no alerts observed in {no_alerts_minutes} minutes for this signature"
|
||||
|
||||
alert_loop_slo:
|
||||
claim_to_ack_p95_seconds: 60 # p95 latency from claim → ack
|
||||
failed_rate_pct: 5 # max % of failed/(acked+failed) in window
|
||||
processing_stuck_minutes: 15 # alerts in processing beyond this → stuck
|
||||
88
config/incident_intelligence_policy.yml
Normal file
88
config/incident_intelligence_policy.yml
Normal file
@@ -0,0 +1,88 @@
|
||||
# Incident Intelligence Policy
|
||||
# Controls correlation scoring, recurrence detection, and digest generation.
|
||||
|
||||
correlation:
|
||||
lookback_days: 30
|
||||
max_related: 10
|
||||
min_score: 20 # discard matches below this
|
||||
rules:
|
||||
- name: "same_signature"
|
||||
weight: 100
|
||||
match:
|
||||
signature: true
|
||||
|
||||
- name: "same_service_and_kind"
|
||||
weight: 60
|
||||
match:
|
||||
same_service: true
|
||||
same_kind: true
|
||||
|
||||
- name: "same_service_time_cluster"
|
||||
weight: 40
|
||||
match:
|
||||
same_service: true
|
||||
within_minutes: 180
|
||||
|
||||
- name: "same_kind_cross_service"
|
||||
weight: 30
|
||||
match:
|
||||
same_kind: true
|
||||
within_minutes: 120
|
||||
|
||||
recurrence:
|
||||
windows_days: [7, 30]
|
||||
thresholds:
|
||||
signature:
|
||||
warn: 3 # ≥ 3 occurrences in window → warn
|
||||
high: 6 # ≥ 6 occurrences in window → high
|
||||
kind:
|
||||
warn: 5
|
||||
high: 10
|
||||
top_n: 15 # top N per category
|
||||
|
||||
# Deterministic recommendations per recurrence level
|
||||
recommendations:
|
||||
signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
|
||||
signature_warn: "Review root cause history; consider adding monitoring threshold"
|
||||
kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
|
||||
kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"
|
||||
|
||||
digest:
|
||||
weekly_day: "Mon"
|
||||
include_closed: true
|
||||
include_open: true
|
||||
output_dir: "ops/reports/incidents"
|
||||
markdown_max_chars: 8000
|
||||
top_incidents: 20 # max incidents in weekly listing
|
||||
|
||||
# ── Root-Cause Buckets ─────────────────────────────────────────────────────
|
||||
buckets:
|
||||
mode: "service_kind" # service_kind | signature_prefix
|
||||
signature_prefix_len: 12
|
||||
top_n: 10
|
||||
min_count:
|
||||
7: 3 # bucket must have ≥ 3 incidents in last 7d
|
||||
30: 6 # or ≥ 6 in last 30d
|
||||
include_statuses: ["open", "mitigating", "resolved", "closed"]
|
||||
|
||||
# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
|
||||
autofollowups:
|
||||
enabled: true
|
||||
only_when_high: true # only create for HIGH recurrence buckets
|
||||
owner: "oncall"
|
||||
priority: "P1"
|
||||
due_days: 7
|
||||
max_followups_per_bucket_per_week: 1 # dedupe by week+bucket_key
|
||||
dedupe_key_prefix: "intel_recur"
|
||||
|
||||
# ── Release Gate: recurrence_watch ────────────────────────────────────────
|
||||
release_gate:
|
||||
recurrence_watch:
|
||||
enabled: true
|
||||
service_scope: "target_service" # target_service | all
|
||||
windows_days: [7, 30]
|
||||
fail_on:
|
||||
severity_in: ["P0", "P1"] # used only in strict mode
|
||||
high_recurrence: true
|
||||
warn_on:
|
||||
warn_recurrence: true
|
||||
143
config/network_allowlist.yml
Normal file
143
config/network_allowlist.yml
Normal file
@@ -0,0 +1,143 @@
|
||||
# Network Allowlist for Tool HTTP Calls
|
||||
# Tools that make outbound HTTP requests MUST use only hosts/IPs listed here.
|
||||
# Any request to unlisted hosts is blocked by tool_governance.py middleware.
|
||||
#
|
||||
# Format per tool:
|
||||
# hosts: exact hostname or IP
|
||||
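# prefixes: URL prefix match (for paths) — no entry in this file currently uses this key
# Entries below also use: ports_allowed (list of ports), schemes (list of URL schemes),
# allow_any_public / block_private_ranges (booleans for tools that accept arbitrary public URLs)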
|
||||
|
||||
# ─── Observability Sources ────────────────────────────────────────────────────
|
||||
observability_tool:
|
||||
description: "Prometheus, Loki, Tempo datasources"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "prometheus"
|
||||
- "loki"
|
||||
- "tempo"
|
||||
- "monitoring"
|
||||
- "144.76.224.179" # NODA1 monitoring
|
||||
ports_allowed: [9090, 3100, 3200, 9080]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Oncall / Service Health ──────────────────────────────────────────────────
|
||||
oncall_tool:
|
||||
description: "Internal service health endpoints only"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "gateway"
|
||||
- "router"
|
||||
- "memory"
|
||||
- "qdrant"
|
||||
- "nats"
|
||||
- "144.76.224.179" # NODA1
|
||||
- "212.8.58.133" # NODA3
|
||||
ports_allowed: [80, 443, 8000, 8080, 8222, 9000, 9100, 9102, 9200, 9300, 9400]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Web Search / Extract ─────────────────────────────────────────────────────
|
||||
web_search:
|
||||
description: "Search provider APIs"
|
||||
hosts:
|
||||
- "api.duckduckgo.com"
|
||||
- "serpapi.com"
|
||||
- "api.bing.microsoft.com"
|
||||
- "customsearch.googleapis.com"
|
||||
schemes: ["https"]
|
||||
|
||||
web_extract:
|
||||
description: "Any public HTTPS URL (user-provided)"
|
||||
allow_any_public: true # Allow any non-private IP
|
||||
block_private_ranges: true # Block RFC1918 / loopback / link-local
|
||||
schemes: ["https"]
|
||||
|
||||
crawl4ai_scrape:
|
||||
description: "Crawl4AI service + public URLs"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "crawl4ai"
|
||||
ports_allowed: [11235]
|
||||
allow_any_public: true
|
||||
block_private_ranges: true
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Memory / Graph ───────────────────────────────────────────────────────────
|
||||
memory_search:
|
||||
description: "Memory service + Qdrant"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "memory-service"
|
||||
- "qdrant"
|
||||
- "144.76.224.179"
|
||||
ports_allowed: [6333, 8001, 8100]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
graph_query:
|
||||
description: "Neo4j bolt/http"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "neo4j"
|
||||
ports_allowed: [7474, 7687]
|
||||
schemes: ["http", "https", "bolt", "bolt+s"]
|
||||
|
||||
# ─── ComfyUI / Image Generation ──────────────────────────────────────────────
|
||||
comfy_generate_image:
|
||||
description: "ComfyUI on NODA3"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "212.8.58.133"
|
||||
ports_allowed: [8188]
|
||||
schemes: ["http"]
|
||||
|
||||
comfy_generate_video:
|
||||
description: "ComfyUI video on NODA3"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "212.8.58.133"
|
||||
ports_allowed: [8188]
|
||||
schemes: ["http"]
|
||||
|
||||
# ─── LLM Providers ────────────────────────────────────────────────────────────
|
||||
# (Used by router/gateway, not direct tool calls, but documented for reference)
|
||||
llm_providers:
|
||||
description: "External LLM APIs"
|
||||
hosts:
|
||||
- "api.x.ai" # xAI Grok
|
||||
- "open.bigmodel.cn" # GLM-5 Z.AI
|
||||
- "api.deepseek.com" # DeepSeek
|
||||
- "api.openai.com" # OpenAI fallback
|
||||
schemes: ["https"]
|
||||
|
||||
# ─── Presentation Service ─────────────────────────────────────────────────────
|
||||
presentation_create:
|
||||
description: "Presentation rendering service"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "presentation-service"
|
||||
ports_allowed: [8080, 9500]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Dependency Scanner ───────────────────────────────────────────────────────
|
||||
dependency_scanner_tool:
|
||||
description: "OSV.dev API for vulnerability lookups (online mode only)"
|
||||
hosts:
|
||||
- "api.osv.dev"
|
||||
schemes: ["https"]
|
||||
# Only used when vuln_mode=online; offline_cache requires no outbound
|
||||
|
||||
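# ─── Private IP Ranges (always blocked for allow_any_public tools) ────────────
# NOTE(review): block_private_ranges is described as blocking link-local, but only IPv4 link-local (169.254.0.0/16) is listed; IPv6 link-local (fe80::/10) is absent — confirm whether it should be added.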
|
||||
private_ip_ranges:
|
||||
- "10.0.0.0/8"
|
||||
- "172.16.0.0/12"
|
||||
- "192.168.0.0/16"
|
||||
- "127.0.0.0/8"
|
||||
- "169.254.0.0/16"
|
||||
- "::1/128"
|
||||
- "fc00::/7"
|
||||
49
config/observability_sources.yml
Normal file
49
config/observability_sources.yml
Normal file
@@ -0,0 +1,49 @@
|
||||
# Observability Data Sources Configuration
|
||||
# These are internal URLs - never expose to external networks
|
||||
|
||||
prometheus:
|
||||
# Prometheus server URL (internal network)
|
||||
base_url: "http://prometheus:9090"
|
||||
|
||||
# Allowed PromQL query prefixes (security)
|
||||
allow_promql_prefixes:
|
||||
- "sum("
|
||||
- "rate("
|
||||
- "histogram_quantile("
|
||||
- "avg("
|
||||
- "max("
|
||||
- "min("
|
||||
- "count("
|
||||
- "irate("
|
||||
- "last_over_time("
|
||||
- "present_over_time("
|
||||
|
||||
loki:
|
||||
# Loki log server URL (internal network)
|
||||
base_url: "http://loki:3100"
|
||||
|
||||
tempo:
|
||||
# Tempo trace server URL (internal network)
|
||||
base_url: "http://tempo:3200"
|
||||
|
||||
# Limits configuration
|
||||
limits:
|
||||
# Maximum time window for queries (hours)
|
||||
max_time_window_hours: 24
|
||||
|
||||
# Maximum series returned
|
||||
max_series: 200
|
||||
|
||||
# Maximum points in range query
|
||||
max_points: 2000
|
||||
|
||||
# Maximum bytes in response
|
||||
max_bytes: 300000
|
||||
|
||||
# Query timeout (seconds)
|
||||
timeout_seconds: 5
|
||||
|
||||
# Environment variables (override URLs)
|
||||
# PROMETHEUS_URL
|
||||
# LOKI_URL
|
||||
# TEMPO_URL
|
||||
133
config/release_gate_policy.yml
Normal file
133
config/release_gate_policy.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
# Release Gate Policy — DAARION.city
|
||||
#
|
||||
# Controls strictness of each gate per deployment profile.
|
||||
#
|
||||
# Modes:
|
||||
# off — gate is fully skipped (no call, no output)
|
||||
# warn — gate always pass=True; findings become recommendations only
|
||||
# strict — gate can fail release (pass=False) when fail_on conditions are met
|
||||
#
|
||||
# Profiles: dev | staging | prod
|
||||
# Set via release_check input `gate_profile` (default: dev).
|
||||
|
||||
profiles:
|
||||
dev:
|
||||
description: "Development: strict for security gates, warn for governance"
|
||||
gates:
|
||||
pr_review:
|
||||
mode: "strict"
|
||||
config_lint:
|
||||
mode: "strict"
|
||||
dependency_scan:
|
||||
mode: "strict"
|
||||
fail_on_severities: ["CRITICAL", "HIGH"]
|
||||
contract_diff:
|
||||
mode: "strict"
|
||||
threat_model:
|
||||
mode: "strict"
|
||||
smoke:
|
||||
mode: "warn"
|
||||
drift:
|
||||
mode: "warn"
|
||||
slo_watch:
|
||||
mode: "warn"
|
||||
followup_watch:
|
||||
mode: "warn"
|
||||
fail_on: ["P0", "P1"]
|
||||
privacy_watch:
|
||||
mode: "warn"
|
||||
cost_watch:
|
||||
mode: "warn"
|
||||
recurrence_watch:
|
||||
mode: "warn"
|
||||
risk_watch:
|
||||
mode: "warn"
|
||||
risk_delta_watch:
|
||||
mode: "warn"
|
||||
platform_review_required:
|
||||
mode: "warn"
|
||||
|
||||
staging:
|
||||
description: "Staging: strict security + strict privacy on errors"
|
||||
gates:
|
||||
pr_review:
|
||||
mode: "strict"
|
||||
config_lint:
|
||||
mode: "strict"
|
||||
dependency_scan:
|
||||
mode: "strict"
|
||||
fail_on_severities: ["CRITICAL", "HIGH"]
|
||||
contract_diff:
|
||||
mode: "strict"
|
||||
threat_model:
|
||||
mode: "strict"
|
||||
smoke:
|
||||
mode: "warn"
|
||||
drift:
|
||||
mode: "strict"
|
||||
slo_watch:
|
||||
mode: "strict" # Don't deploy if SLO currently breached
|
||||
followup_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["P0", "P1"]
|
||||
privacy_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["error"]
|
||||
cost_watch:
|
||||
mode: "warn"
|
||||
recurrence_watch:
|
||||
mode: "strict" # Block staging deploy if P0/P1 high recurrence
|
||||
fail_on:
|
||||
severity_in: ["P0", "P1"]
|
||||
high_recurrence: true
|
||||
risk_watch:
|
||||
mode: "strict" # Block staging if score >= fail_at for p0_services
|
||||
risk_delta_watch:
|
||||
mode: "strict" # Block staging for p0_services when delta >= fail_delta
|
||||
platform_review_required:
|
||||
mode: "warn" # warn-first: never blocks staging by default
|
||||
|
||||
prod:
|
||||
description: "Production: maximum strictness across all gates"
|
||||
gates:
|
||||
pr_review:
|
||||
mode: "strict"
|
||||
config_lint:
|
||||
mode: "strict"
|
||||
dependency_scan:
|
||||
mode: "strict"
|
||||
fail_on_severities: ["CRITICAL", "HIGH", "MEDIUM"]
|
||||
contract_diff:
|
||||
mode: "strict"
|
||||
threat_model:
|
||||
mode: "strict"
|
||||
smoke:
|
||||
mode: "strict"
|
||||
drift:
|
||||
mode: "strict"
|
||||
slo_watch:
|
||||
mode: "warn" # Warn: don't automatically block prod deploys on SLO
|
||||
followup_watch:
|
||||
mode: "warn"
|
||||
fail_on: ["P0"]
|
||||
privacy_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["error"]
|
||||
cost_watch:
|
||||
mode: "warn"
|
||||
recurrence_watch:
|
||||
mode: "warn" # Warn only in prod (accumulate data first)
|
||||
risk_watch:
|
||||
mode: "warn" # Warn only in prod
|
||||
risk_delta_watch:
|
||||
mode: "warn" # Warn only in prod
|
||||
platform_review_required:
|
||||
mode: "warn" # Start conservative in prod
|
||||
|
||||
# ─── Defaults (used if profile or gate not found) ────────────────────────────
|
||||
defaults:
|
||||
mode: "warn"
|
||||
# privacy_watch default fail_on (for strict mode):
|
||||
privacy_fail_on: ["error"]
|
||||
# cost_watch is never strict by default
|
||||
cost_always_warn: true
|
||||
80
config/risk_attribution_policy.yml
Normal file
80
config/risk_attribution_policy.yml
Normal file
@@ -0,0 +1,80 @@
|
||||
# Risk Attribution Policy — DAARION.city
|
||||
#
|
||||
# Deterministic attribution: risk spike → likely causes.
|
||||
# LLM enrichment is OFF by default; local only on regression triggers.
|
||||
|
||||
defaults:
|
||||
lookback_hours: 24
|
||||
max_causes: 5
|
||||
llm_mode: "off" # off | local | remote
|
||||
llm_max_chars_in: 3500
|
||||
llm_max_chars_out: 800
|
||||
|
||||
# LLM enrichment triggers — only if ALL conditions are met
|
||||
llm_triggers:
|
||||
risk_delta_warn: 10 # delta_24h >= 10
|
||||
risk_delta_fail: 20 # delta_24h >= 20 (fail-level)
|
||||
band_in: ["high", "critical"]
|
||||
|
||||
# Per-cause scoring weights (additive)
|
||||
weights:
|
||||
deploy: 30
|
||||
dependency: 25
|
||||
drift: 25
|
||||
incident_storm: 20
|
||||
slo_violation: 15
|
||||
followups_overdue: 10
|
||||
alert_loop_degraded: 10
|
||||
|
||||
# Per-signal detection config
|
||||
signals:
|
||||
deploy:
|
||||
# Alert kinds that indicate a deploy event
|
||||
kinds: ["deploy", "deployment", "rollout", "canary"]
|
||||
|
||||
dependency:
|
||||
# Release gate names whose fail/warn counts as a dependency signal
|
||||
release_gate_names: ["dependency_scan", "deps"]
|
||||
|
||||
drift:
|
||||
release_gate_names: ["drift", "config_drift"]
|
||||
|
||||
incident_storm:
|
||||
thresholds:
|
||||
# occurrences in last 60min across all alert signatures for the service
|
||||
occurrences_60m_warn: 10
|
||||
# escalations (Escalated events) in last 24h
|
||||
escalations_24h_warn: 2
|
||||
|
||||
slo:
|
||||
require_active_violation: true
|
||||
|
||||
# Confidence bands (minimum score to reach that band)
|
||||
output:
|
||||
confidence_bands:
|
||||
high: 60 # score >= 60 → high confidence
|
||||
medium: 35 # score >= 35 → medium
|
||||
# below 35 → low
|
||||
|
||||
# Change Timeline config
|
||||
timeline:
|
||||
enabled: true
|
||||
lookback_hours: 24
|
||||
max_items: 30
|
||||
include_types: ["deploy","dependency","drift","incident","slo","followup","alert_loop","release_gate"]
|
||||
time_bucket_minutes: 5 # coalesce same-type events within 5-min windows
|
||||
|
||||
# Evidence linking
|
||||
evidence_linking:
|
||||
enabled: true
|
||||
max_refs_per_cause: 10
|
||||
|
||||
# LLM local endpoint config (only used when llm_mode=local)
|
||||
llm_local:
|
||||
endpoint: "http://localhost:11434/api/generate"
|
||||
model: "llama3"
|
||||
timeout_seconds: 15
|
||||
# Hardening guards
|
||||
model_allowlist: ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"]
|
||||
max_calls_per_digest: 3
|
||||
per_day_dedupe: true # key: risk_enrich:{YYYY-MM-DD}:{service}:{env}
|
||||
89
config/risk_policy.yml
Normal file
89
config/risk_policy.yml
Normal file
@@ -0,0 +1,89 @@
|
||||
# Service Risk Index Policy — DAARION.city
|
||||
#
|
||||
# Controls how Risk Scores are computed, classified, and gated.
|
||||
# All scoring is deterministic: no LLM required.
|
||||
|
||||
defaults:
|
||||
window_hours: 24
|
||||
recurrence_windows_days: [7, 30]
|
||||
slo_window_minutes: 60
|
||||
|
||||
thresholds:
|
||||
bands:
|
||||
low_max: 20
|
||||
medium_max: 50
|
||||
high_max: 80
|
||||
risk_watch: # defaults, overridable per service below
|
||||
warn_at: 50 # score >= warn_at → recommendations
|
||||
fail_at: 80 # score >= fail_at → gate fails (strict mode only)
|
||||
|
||||
weights:
|
||||
open_incidents:
|
||||
P0: 50
|
||||
P1: 25
|
||||
P2: 10
|
||||
P3: 5
|
||||
recurrence:
|
||||
signature_warn_7d: 10
|
||||
signature_high_7d: 20
|
||||
kind_warn_7d: 8
|
||||
kind_high_7d: 15
|
||||
signature_high_30d: 10
|
||||
kind_high_30d: 8
|
||||
followups:
|
||||
overdue_P0: 20
|
||||
overdue_P1: 12
|
||||
overdue_other: 6
|
||||
slo:
|
||||
violation: 10 # per active violation
|
||||
alerts_loop:
|
||||
slo_violation: 10 # per alert-loop SLO violation
|
||||
escalation:
|
||||
escalations_24h:
|
||||
warn: 5 # score added if escalations_24h >= 1
|
||||
high: 12 # score added if escalations_24h >= 3
|
||||
|
||||
# Per-service risk gate overrides (lower/higher fail_at)
|
||||
service_overrides:
|
||||
gateway:
|
||||
risk_watch:
|
||||
fail_at: 75 # gateway is critical: fail earlier
|
||||
router:
|
||||
risk_watch:
|
||||
fail_at: 80
|
||||
|
||||
# Services treated as P0 (always subject to strict risk_watch in staging)
|
||||
p0_services:
|
||||
- gateway
|
||||
- router
|
||||
|
||||
# ─── History & Snapshotting ────────────────────────────────────────────────────
|
||||
history:
|
||||
snapshot_interval_minutes: 60
|
||||
retention_days: 90
|
||||
max_services_per_run: 50
|
||||
|
||||
# ─── Trend analysis ───────────────────────────────────────────────────────────
|
||||
trend:
|
||||
delta_windows_hours: [24, 168] # 24h and 7d
|
||||
volatility_window_hours: 168 # stddev computed over last 7d
|
||||
regression_threshold:
|
||||
delta_24h_warn: 10 # score rose >= 10 points in 24h → warn
|
||||
delta_24h_fail: 20 # score rose >= 20 points in 24h → fail (strict)
|
||||
delta_7d_warn: 15
|
||||
delta_7d_fail: 30
|
||||
|
||||
# ─── Daily Digest ─────────────────────────────────────────────────────────────
|
||||
digest:
|
||||
daily_hour_utc: 9 # generate at 09:00 UTC
|
||||
output_dir: "ops/reports/risk"
|
||||
markdown_max_chars: 8000
|
||||
top_n: 10
|
||||
|
||||
# ─── Risk Delta release gate ──────────────────────────────────────────────────
|
||||
release_gate:
|
||||
risk_delta_watch:
|
||||
enabled: true
|
||||
default_warn_delta_24h: 10
|
||||
default_fail_delta_24h: 20
|
||||
p0_services_strict: true
|
||||
52
config/roles/aistalk/aurora.md
Normal file
52
config/roles/aistalk/aurora.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Aurora (Autonomous Media Forensics)
|
||||
|
||||
Role:
|
||||
- Lead media forensics for video, audio, and photo evidence inside AISTALK.
|
||||
- Extract usable evidence from low-quality media while preserving reproducibility.
|
||||
|
||||
Modes:
|
||||
- `tactical`: fast triage for operational clarity.
|
||||
- prioritize turnaround and readability
|
||||
- lightweight pipelines and lower cost
|
||||
- output is advisory (not courtroom-grade)
|
||||
- `forensic`: evidence-grade processing.
|
||||
- prioritize reproducibility and auditability
|
||||
- mandatory input/output hashing and immutable processing log
|
||||
- chain-of-custody notes + signing metadata
|
||||
|
||||
Capabilities:
|
||||
- Video: denoise, deblur, super-resolution, stabilization, frame interpolation.
|
||||
- Face-focused enhancement: controlled face restoration with clear model attribution.
|
||||
- Audio: denoise, speech intelligibility improvement, deepfake risk signals.
|
||||
- Photo: artifact cleanup, upscale, metadata/EXIF integrity review.
|
||||
|
||||
Internal sub-pipeline handles:
|
||||
- `Clarity`: global video enhancement.
|
||||
- `Vera`: face restoration and face-quality diagnostics.
|
||||
- `Echo`: audio cleaning/transcription/deepfake heuristics.
|
||||
- `Pixis`: photo restoration and metadata checks.
|
||||
- `Kore`: forensic packaging (hashes, chain-of-custody, signature metadata).
|
||||
|
||||
Output contract (strict JSON for downstream graphing):
|
||||
```json
|
||||
{
|
||||
"agent": "Aurora",
|
||||
"mode": "tactical | forensic",
|
||||
"job_id": "aurora_YYYYMMDD_###",
|
||||
"input_file": {"name": "file.ext", "hash": "sha256:..."},
|
||||
"processing_log": [
|
||||
{"step": "denoise", "model": "model_name", "time_ms": 0}
|
||||
],
|
||||
"output_files": [
|
||||
{"type": "video|audio|photo|forensic_log", "url": "https://...", "hash": "sha256:..."}
|
||||
],
|
||||
"digital_signature": "ed25519:... | null"
|
||||
}
|
||||
```
|
||||
|
||||
Boundaries:
|
||||
- No deceptive deepfake generation or identity manipulation.
|
||||
- Never present AI-enhanced output as untouched original evidence.
|
||||
- Flag uncertainty and potential enhancement artifacts explicitly.
|
||||
- Do not provide final legal conclusions; require expert human review for court use.
|
||||
- Preserve originals; never destructively overwrite source evidence.
|
||||
64
config/slo_policy.yml
Normal file
64
config/slo_policy.yml
Normal file
@@ -0,0 +1,64 @@
|
||||
# SLO Policy — DAARION.city
|
||||
#
|
||||
# Defines Service Level Objectives per service.
|
||||
# Used by observability_tool.slo_snapshot and incident_triage_graph slo_context node.
|
||||
#
|
||||
# Fields:
|
||||
# error_rate_pct — max allowed error rate (%)
|
||||
# latency_p95_ms — max p95 latency (milliseconds)
|
||||
# window_minutes — observation window in minutes (default: 60)
|
||||
|
||||
defaults:
|
||||
window_minutes: 60
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 300
|
||||
|
||||
services:
|
||||
gateway:
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 300
|
||||
router:
|
||||
error_rate_pct: 0.5
|
||||
latency_p95_ms: 200
|
||||
memory-service:
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 400
|
||||
sofiia-supervisor:
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 500
|
||||
|
||||
# ─── Voice SLO profiles ───────────────────────────────────────────────────────
|
||||
# Two profiles aligned with router-config.yml selection_policies.
|
||||
# Measured via Prometheus metrics emitted by sofiia-console /api/telemetry/voice
|
||||
# and memory-service voice_endpoints.py.
|
||||
#
|
||||
# Prometheus metrics:
|
||||
# voice_ttfa_ms{voice_profile} — Time-to-first-audio (BFF → first playable)
|
||||
# voice_e2e_ms{voice_profile} — User stops speaking → audio plays
|
||||
# voice_tts_first_ms{voice_profile} — First-sentence TTS synthesis
|
||||
# voice_tts_compute_ms{engine,voice} — Memory-service internal TTS
|
||||
# voice_queue_underflows_total — Playback starvation events
|
||||
voice_slo:
|
||||
voice_fast_uk:
|
||||
description: "Fast profile: gemma3 → qwen3.5 fallback"
|
||||
ttfa_ms_p95: 5000 # TTFA p95 ≤ 5s
|
||||
e2e_ms_p95: 9000 # E2E p95 ≤ 9s
|
||||
tts_first_ms_p95: 2000 # TTS synthesis p95 ≤ 2s
|
||||
underflow_rate_pct: 1.0 # playback starvation events ≤ 1 per 100 voice turns (≤ 1%)
|
||||
tts_error_rate_pct: 0.5 # edge-tts failures ≤ 0.5%
|
||||
window_minutes: 10
|
||||
|
||||
voice_quality_uk:
|
||||
description: "Quality profile: qwen3.5 → qwen3:14b fallback"
|
||||
ttfa_ms_p95: 7000
|
||||
e2e_ms_p95: 12000
|
||||
tts_first_ms_p95: 2000 # TTS itself is the same engine
|
||||
underflow_rate_pct: 2.0 # slightly relaxed (longer LLM → more gap risk)
|
||||
tts_error_rate_pct: 0.5
|
||||
window_minutes: 10
|
||||
|
||||
# Canary thresholds (runtime health check, stricter)
|
||||
canary:
|
||||
tts_polina_max_ms: 3000 # live Polina synthesis ≤ 3s
|
||||
tts_ostap_max_ms: 3000 # live Ostap synthesis ≤ 3s
|
||||
min_audio_bytes: 1000 # valid audio is never empty/tiny
|
||||
Reference in New Issue
Block a user