docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
This commit is contained in:
114
config/alert_routing_policy.yml
Normal file
114
config/alert_routing_policy.yml
Normal file
@@ -0,0 +1,114 @@
|
||||
# alert_routing_policy.yml
|
||||
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
|
||||
# Key design: llm_mode=off means 0 LLM tokens in steady state.
|
||||
|
||||
defaults:
|
||||
poll_interval_seconds: 300 # 5 min
|
||||
max_alerts_per_run: 25
|
||||
only_unacked: true
|
||||
|
||||
# Safety valves (avoid runaway incident creation on alert storm)
|
||||
max_incidents_per_run: 5
|
||||
max_triages_per_run: 5
|
||||
dedupe_window_minutes_default: 120
|
||||
ack_note_prefix: "alert_triage_loop"
|
||||
|
||||
# LLM gating — off = 0 tokens in steady state
|
||||
llm_mode: "off" # off | local | remote
|
||||
llm_on:
|
||||
triage: false
|
||||
postmortem: false
|
||||
|
||||
routing:
|
||||
# ─── HARD AUTO: prod P0/P1 → create incident + deterministic triage ─────────
|
||||
- match:
|
||||
env_in: ["prod"]
|
||||
severity_in: ["P0", "P1"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: true
|
||||
triage_mode: "deterministic" # deterministic | llm
|
||||
incident_severity_cap: "P1"
|
||||
dedupe_window_minutes: 180
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Security alerts: auto incident + (optional) LLM triage ─────────────────
|
||||
- match:
|
||||
kind_in: ["security"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: true
|
||||
triage_mode: "deterministic" # flip to llm once stable
|
||||
incident_severity_cap: "P0"
|
||||
dedupe_window_minutes: 360
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Resource-critical: OOM/crashloop/disk in prod|staging ──────────────────
|
||||
- match:
|
||||
kind_in: ["oom", "crashloop", "disk"]
|
||||
env_in: ["prod", "staging"]
|
||||
severity_in: ["P0", "P1", "P2"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: true
|
||||
triage_mode: "deterministic"
|
||||
incident_severity_cap: "P1"
|
||||
dedupe_window_minutes: 240
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Staging P1: auto incident, no triage (save resources) ─────────────────
|
||||
- match:
|
||||
env_in: ["staging"]
|
||||
severity_in: ["P1"]
|
||||
actions:
|
||||
auto_incident: true
|
||||
auto_triage: false
|
||||
triage_mode: "deterministic"
|
||||
incident_severity_cap: "P1"
|
||||
dedupe_window_minutes: 120
|
||||
attach_alert_artifact: true
|
||||
ack: true
|
||||
|
||||
# ─── Deploy events: digest-only ──────────────────────────────────────────────
|
||||
- match:
|
||||
kind_in: ["deploy"]
|
||||
actions:
|
||||
auto_incident: false
|
||||
digest_only: true
|
||||
ack: true
|
||||
|
||||
# ─── Lower severity: digest-only ─────────────────────────────────────────────
|
||||
- match:
|
||||
severity_in: ["P2", "P3", "INFO"]
|
||||
actions:
|
||||
auto_incident: false
|
||||
digest_only: true
|
||||
ack: true
|
||||
|
||||
# ─── Kind normalization (aliases Monitor may use) ────────────────────────────
|
||||
kind_map:
|
||||
latency: ["latency", "p95_latency", "p99_latency", "slow_response"]
|
||||
error_rate: ["error_rate", "5xx_rate", "http_errors"]
|
||||
slo_breach: ["slo_breach", "slo", "slo_violation"]
|
||||
crashloop: ["crashloop", "restart_loop", "oom_kill"]
|
||||
oom: ["oom", "out_of_memory", "memory_pressure"]
|
||||
disk: ["disk", "disk_full", "disk_pressure", "pvc_full"]
|
||||
security: ["security", "unauthorized", "injection", "brute_force"]
|
||||
|
||||
# ─── Per-kind severity caps for incidents created by the loop ─────────────────
|
||||
severity_caps:
|
||||
deploy: "P2"
|
||||
latency: "P1"
|
||||
error_rate: "P1"
|
||||
slo_breach: "P1"
|
||||
security: "P0"
|
||||
|
||||
# ─── Signature dedupe settings ────────────────────────────────────────────────
|
||||
signature:
|
||||
use_kind: true
|
||||
use_fingerprint: true
|
||||
use_node_label: false # true = per-node incidents (noisier)
|
||||
normalize_title: true # strip numbers/timestamps from title before hash
|
||||
51
config/architecture_pressure_policy.yml
Normal file
51
config/architecture_pressure_policy.yml
Normal file
@@ -0,0 +1,51 @@
|
||||
# Architecture Pressure Policy — DAARION.city
|
||||
#
|
||||
# Deterministic structural health index: measures long-term architectural strain.
|
||||
# Risk = short-term stability. Pressure = long-term structural debt.
|
||||
#
|
||||
# All thresholds / weights configurable here; no LLM, no external calls.
|
||||
|
||||
defaults:
|
||||
lookback_days: 30
|
||||
top_n: 10
|
||||
|
||||
# Per-signal additive weights
|
||||
weights:
|
||||
recurrence_high_30d: 20 # high-recurrence bucket present in 30d
|
||||
recurrence_warn_30d: 10 # warn-level recurrence in 30d
|
||||
regressions_30d: 15 # each positive delta_24h event in 30d
|
||||
escalations_30d: 12 # each escalation event in 30d
|
||||
followups_created_30d: 8 # each new followup created in 30d
|
||||
followups_overdue: 15 # current overdue followups (snapshot)
|
||||
drift_failures_30d: 10 # drift gate fail/warn events in 30d
|
||||
dependency_high_30d: 10 # dependency scan HIGH/CRITICAL findings in 30d
|
||||
|
||||
# Score → band mapping
|
||||
bands:
|
||||
low_max: 20
|
||||
medium_max: 45
|
||||
high_max: 70
|
||||
# above high_max → critical
|
||||
|
||||
# Priority rules for automatic follow-up creation
|
||||
priority_rules:
|
||||
require_arch_review_at: 70 # pressure score >= this → requires_arch_review=true
|
||||
auto_create_followup: true # create a follow-up when require_arch_review triggered
|
||||
followup_priority: "P1"
|
||||
followup_due_days: 14
|
||||
followup_owner: "cto"
|
||||
# Dedupe key: arch_review:{YYYY-WW}:{service}
|
||||
# Prevents duplicate creation within the same ISO week
|
||||
|
||||
# Release gate behaviour
|
||||
release_gate:
|
||||
platform_review_required:
|
||||
enabled: true
|
||||
warn_at: 60
|
||||
fail_at: 85 # only blocks if gate profile is "strict"
|
||||
|
||||
# Digest settings
|
||||
digest:
|
||||
output_dir: "ops/reports/platform"
|
||||
max_chars: 12000
|
||||
top_n_in_digest: 10
|
||||
86
config/backlog_policy.yml
Normal file
86
config/backlog_policy.yml
Normal file
@@ -0,0 +1,86 @@
|
||||
# Engineering Backlog Policy — DAARION.city
|
||||
#
|
||||
# Governs auto-generation of platform backlog items from Risk/Pressure digests,
|
||||
# workflow transitions, ownership, and storage retention.
|
||||
#
|
||||
# No LLM. Deterministic generation. Source of truth for engineering priorities.
|
||||
|
||||
defaults:
|
||||
env: "prod"
|
||||
retention_days: 180
|
||||
max_items_per_run: 50
|
||||
|
||||
# Dedupe scheme: prevents duplicate creation within the same ISO week
|
||||
dedupe:
|
||||
scheme: "YYYY-WW" # weekly deduplication window
|
||||
key_fields: ["service", "category", "env"]
|
||||
key_prefix: "platform_backlog"
|
||||
# Final key: platform_backlog:{YYYY-WW}:{env}:{service}:{category}
|
||||
|
||||
# Per-category defaults
|
||||
categories:
|
||||
arch_review:
|
||||
priority: "P1"
|
||||
due_days: 14
|
||||
refactor:
|
||||
priority: "P1"
|
||||
due_days: 21
|
||||
slo_hardening:
|
||||
priority: "P2"
|
||||
due_days: 30
|
||||
cleanup_followups:
|
||||
priority: "P2"
|
||||
due_days: 14
|
||||
security:
|
||||
priority: "P0"
|
||||
due_days: 7
|
||||
|
||||
# Auto-generation rules (evaluated per-service top-to-bottom; first match wins per category)
|
||||
generation:
|
||||
weekly_from_pressure_digest: true
|
||||
daily_from_risk_digest: false
|
||||
rules:
|
||||
- name: "arch_review_required"
|
||||
when:
|
||||
pressure_requires_arch_review: true
|
||||
create:
|
||||
category: "arch_review"
|
||||
title_template: "[ARCH] Review required: {service}"
|
||||
|
||||
- name: "high_pressure_refactor"
|
||||
when:
|
||||
pressure_band_in: ["high", "critical"]
|
||||
risk_band_in: ["high", "critical"]
|
||||
create:
|
||||
category: "refactor"
|
||||
title_template: "[REF] Reduce pressure & risk: {service}"
|
||||
|
||||
- name: "slo_violations"
|
||||
when:
|
||||
risk_has_slo_violations: true
|
||||
create:
|
||||
category: "slo_hardening"
|
||||
title_template: "[SLO] Fix violations: {service}"
|
||||
|
||||
- name: "followup_backlog"
|
||||
when:
|
||||
followups_overdue_gt: 0
|
||||
create:
|
||||
category: "cleanup_followups"
|
||||
title_template: "[OPS] Close overdue followups: {service}"
|
||||
|
||||
# Owner assignments (default + service-level overrides)
|
||||
ownership:
|
||||
default_owner: "oncall"
|
||||
overrides:
|
||||
gateway: "cto"
|
||||
|
||||
# Workflow state machine
|
||||
workflow:
|
||||
statuses: ["open", "in_progress", "blocked", "done", "canceled"]
|
||||
allowed_transitions:
|
||||
open: ["in_progress", "blocked", "canceled"]
|
||||
in_progress: ["blocked", "done", "canceled"]
|
||||
blocked: ["open", "in_progress", "canceled"]
|
||||
done: []
|
||||
canceled: []
|
||||
133
config/cost_weights.yml
Normal file
133
config/cost_weights.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
# Cost Weights — DAARION FinOps MVP
|
||||
#
|
||||
# "cost_units" = cost_per_call + duration_ms * cost_per_ms
|
||||
# These are RELATIVE units for ranking, not actual dollars.
|
||||
#
|
||||
# Update weights as actual cost data becomes available.
|
||||
|
||||
defaults:
|
||||
cost_per_call: 1.0 # baseline: 1 unit per call
|
||||
cost_per_ms: 0.001 # 0.001 units per ms elapsed
|
||||
|
||||
tools:
|
||||
# ─── Heavy GPU/compute (high cost) ───────────────────────────────────────
|
||||
comfy_generate_video:
|
||||
cost_per_call: 120.0
|
||||
cost_per_ms: 0.005
|
||||
category: media
|
||||
|
||||
comfy_generate_image:
|
||||
cost_per_call: 50.0
|
||||
cost_per_ms: 0.003
|
||||
category: media
|
||||
|
||||
# ─── Release / governance tools ──────────────────────────────────────────
|
||||
pr_reviewer_tool:
|
||||
cost_per_call: 10.0
|
||||
cost_per_ms: 0.002
|
||||
category: release
|
||||
|
||||
contract_tool:
|
||||
cost_per_call: 5.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
threatmodel_tool:
|
||||
cost_per_call: 5.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
dependency_scanner_tool:
|
||||
cost_per_call: 3.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
drift_analyzer_tool:
|
||||
cost_per_call: 4.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
cost_analyzer_tool:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: finops
|
||||
|
||||
# ─── Observability (moderate cost, often called) ─────────────────────────
|
||||
observability_tool:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: observability
|
||||
|
||||
# ─── Jobs / orchestration ────────────────────────────────────────────────
|
||||
job_orchestrator_tool:
|
||||
cost_per_call: 3.0
|
||||
cost_per_ms: 0.001
|
||||
category: ops
|
||||
|
||||
# ─── Web / external (network cost) ───────────────────────────────────────
|
||||
web_search:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: web
|
||||
|
||||
web_extract:
|
||||
cost_per_call: 1.5
|
||||
cost_per_ms: 0.001
|
||||
category: web
|
||||
|
||||
crawl4ai_scrape:
|
||||
cost_per_call: 3.0
|
||||
cost_per_ms: 0.001
|
||||
category: web
|
||||
|
||||
# ─── Knowledge / memory (low cost) ───────────────────────────────────────
|
||||
memory_search:
|
||||
cost_per_call: 0.5
|
||||
cost_per_ms: 0.0005
|
||||
category: memory
|
||||
|
||||
remember_fact:
|
||||
cost_per_call: 0.5
|
||||
cost_per_ms: 0.0005
|
||||
category: memory
|
||||
|
||||
graph_query:
|
||||
cost_per_call: 0.5
|
||||
cost_per_ms: 0.0005
|
||||
category: memory
|
||||
|
||||
kb_tool:
|
||||
cost_per_call: 1.0
|
||||
cost_per_ms: 0.001
|
||||
category: knowledge
|
||||
|
||||
# ─── Repo / code tools ───────────────────────────────────────────────────
|
||||
repo_tool:
|
||||
cost_per_call: 1.5
|
||||
cost_per_ms: 0.001
|
||||
category: dev
|
||||
|
||||
config_linter_tool:
|
||||
cost_per_call: 2.0
|
||||
cost_per_ms: 0.001
|
||||
category: release
|
||||
|
||||
# ─── Oncall / incident ───────────────────────────────────────────────────
|
||||
oncall_tool:
|
||||
cost_per_call: 1.0
|
||||
cost_per_ms: 0.001
|
||||
category: ops
|
||||
|
||||
# ─── Anomaly detection thresholds ────────────────────────────────────────────
|
||||
anomaly:
|
||||
# Spike: window_cost / baseline_avg_cost >= ratio_threshold
|
||||
spike_ratio_threshold: 3.0
|
||||
# Must have at least this many calls in window to be an anomaly
|
||||
min_calls_threshold: 10
|
||||
# High-priority tools for cost_watch gate in release_check
|
||||
priority_tools:
|
||||
- comfy_generate_video
|
||||
- comfy_generate_image
|
||||
- pr_reviewer_tool
|
||||
- job_orchestrator_tool
|
||||
- observability_tool
|
||||
192
config/data_governance_policy.yml
Normal file
192
config/data_governance_policy.yml
Normal file
@@ -0,0 +1,192 @@
|
||||
# Data Governance & Privacy Policy — DAARION.city
|
||||
#
|
||||
# Used by data_governance_tool to scan for PII/secrets/logging/retention risks.
|
||||
# Severity: "error" = high risk (still warning-only in gate_mode=warning_only).
|
||||
# "warning" = medium risk.
|
||||
# "info" = low risk / informational.
|
||||
|
||||
# ─── Retention policies ───────────────────────────────────────────────────────
|
||||
retention:
|
||||
audit_jsonl_days: 30
|
||||
audit_postgres_days: 90
|
||||
memory_events_days: 90
|
||||
logs_days: 14
|
||||
# Large output threshold: if audit out_size >= this, flag as anomaly
|
||||
large_output_bytes: 65536 # 64KB
|
||||
|
||||
# ─── PII patterns ─────────────────────────────────────────────────────────────
|
||||
pii_patterns:
|
||||
email:
|
||||
regex: "(?i)\\b[A-Z0-9._%+\\-]+@[A-Z0-9.\\-]+\\.[A-Z]{2,}\\b"
|
||||
severity: "warning"
|
||||
id: "DG-PII-001"
|
||||
description: "Email address detected"
|
||||
|
||||
phone_ua_intl:
|
||||
regex: "\\b\\+?[0-9][0-9\\-\\s()]{7,}[0-9]\\b"
|
||||
severity: "warning"
|
||||
id: "DG-PII-002"
|
||||
description: "Phone-like number detected"
|
||||
|
||||
credit_card:
|
||||
regex: "\\b(?:\\d[ \\-]*?){13,19}\\b"
|
||||
severity: "error"
|
||||
id: "DG-PII-003"
|
||||
description: "Credit card-like number detected"
|
||||
|
||||
passport_like:
|
||||
regex: "\\b[A-Z]{2}\\d{6,7}\\b"
|
||||
severity: "warning"
|
||||
id: "DG-PII-004"
|
||||
description: "Passport-like identifier detected"
|
||||
|
||||
tax_id_ua:
|
||||
regex: "\\b\\d{10}\\b"
|
||||
severity: "info"
|
||||
id: "DG-PII-005"
|
||||
description: "Possible Ukrainian tax ID (10 digits)"
|
||||
|
||||
# ─── Extra secret patterns (supplement tool_governance._SECRET_PATTERNS) ──────
|
||||
secret_patterns:
|
||||
inherit_from_tool_governance: true
|
||||
extra:
|
||||
- name: "private_key_block"
|
||||
regex: "-----BEGIN [A-Z ]*PRIVATE KEY-----"
|
||||
severity: "error"
|
||||
id: "DG-SEC-001"
|
||||
- name: "aws_mfa_token"
|
||||
regex: "(?i)mfa[_\\-]?token[\\s=:]+['\"`]?[\\dA-Z]{6,8}['\"`]?"
|
||||
severity: "warning"
|
||||
id: "DG-SEC-002"
|
||||
- name: "pem_certificate"
|
||||
regex: "-----BEGIN CERTIFICATE-----"
|
||||
severity: "info"
|
||||
id: "DG-SEC-003"
|
||||
|
||||
# ─── Logging safety rules ─────────────────────────────────────────────────────
|
||||
logging_rules:
|
||||
# Field names that must NOT appear unmasked in logger calls
|
||||
forbid_logging_fields:
|
||||
- password
|
||||
- passwd
|
||||
- token
|
||||
- secret
|
||||
- private_key
|
||||
- api_key
|
||||
- access_key
|
||||
- credential
|
||||
- auth_header
|
||||
- bearer
|
||||
|
||||
# Fields that should appear as hash-only (warn if logged raw)
|
||||
sensitive_fields_warn:
|
||||
- user_id
|
||||
- chat_id
|
||||
- telegram_id
|
||||
- session_id
|
||||
- workspace_id
|
||||
|
||||
# Calls that indicate redaction is applied (good)
|
||||
redaction_calls:
|
||||
- redact
|
||||
- mask
|
||||
- sanitize
|
||||
- anonymize
|
||||
- _hash
|
||||
- sha256
|
||||
|
||||
# Payload field names that indicate raw content is being logged/stored
|
||||
raw_payload_indicators:
|
||||
- payload
|
||||
- diff_text
|
||||
- openapi_text
|
||||
- request_body
|
||||
- response_body
|
||||
- prompt
|
||||
- messages
|
||||
- content
|
||||
- transcript
|
||||
- conversation
|
||||
- full_text
|
||||
|
||||
# ─── Storage / retention keywords ─────────────────────────────────────────────
|
||||
storage_keywords:
|
||||
write_patterns:
|
||||
- save_message
|
||||
- store_event
|
||||
- insert_record
|
||||
- append_event
|
||||
- write_event
|
||||
- write_record
|
||||
- persist
|
||||
- bulk_insert
|
||||
- executemany
|
||||
retention_indicators:
|
||||
- ttl
|
||||
- expire
|
||||
- retention
|
||||
- cleanup
|
||||
- delete_old
|
||||
- purge
|
||||
- rotate
|
||||
- max_age
|
||||
- expiry
|
||||
context_window: 20 # lines before/after to search for retention indicator
|
||||
|
||||
# ─── Scan paths ───────────────────────────────────────────────────────────────
|
||||
paths:
|
||||
include:
|
||||
- "services/"
|
||||
- "docs/"
|
||||
- "ops/"
|
||||
- "config/"
|
||||
exclude:
|
||||
- "**/node_modules/**"
|
||||
- "**/.git/**"
|
||||
- "**/dist/**"
|
||||
- "**/build/**"
|
||||
- "**/.venv/**"
|
||||
- "**/__pycache__/**"
|
||||
- "**/*.pyc"
|
||||
- "**/*.lock" # dependency lock files (high false-positive risk)
|
||||
- "**/*.min.js"
|
||||
|
||||
# File extensions to scan
|
||||
scan_extensions:
|
||||
- ".py"
|
||||
- ".ts"
|
||||
- ".js"
|
||||
- ".yml"
|
||||
- ".yaml"
|
||||
- ".json"
|
||||
- ".env.example"
|
||||
- ".md"
|
||||
- ".txt"
|
||||
- ".sh"
|
||||
|
||||
# Never scan these (sensitive or binary)
|
||||
never_scan:
|
||||
- "*.env"
|
||||
- ".env.*"
|
||||
- "*.pem"
|
||||
- "*.key"
|
||||
- "*.pfx"
|
||||
- "*.p12"
|
||||
- "*.crt"
|
||||
|
||||
# ─── Gate behaviour ───────────────────────────────────────────────────────────
|
||||
severity_behavior:
|
||||
# warning_only: gate always pass=True (adds recommendations only)
|
||||
# strict: gate pass=False on any error finding
|
||||
gate_mode: "warning_only"
|
||||
recommend_on:
|
||||
- "warning"
|
||||
- "error"
|
||||
|
||||
# ─── Limits ───────────────────────────────────────────────────────────────────
|
||||
limits:
|
||||
max_files_fast: 200
|
||||
max_files_full: 500
|
||||
max_bytes_per_file: 262144 # 256KB
|
||||
max_findings: 200 # cap before truncating
|
||||
max_evidence_chars: 200 # mask and truncate evidence snippets
|
||||
37
config/incident_escalation_policy.yml
Normal file
37
config/incident_escalation_policy.yml
Normal file
@@ -0,0 +1,37 @@
|
||||
# Incident Escalation Policy
|
||||
# Controls deterministic escalation and auto-resolve candidate logic.
|
||||
|
||||
defaults:
|
||||
window_minutes: 60
|
||||
|
||||
escalation:
|
||||
# Escalate when the same signature storms
|
||||
occurrences_thresholds:
|
||||
P2_to_P1: 10 # occurrences_60m to escalate P2 → P1
|
||||
P1_to_P0: 25 # occurrences_60m to escalate P1 → P0
|
||||
|
||||
triage_thresholds_24h:
|
||||
P2_to_P1: 3 # triage_count_24h to escalate P2 → P1
|
||||
P1_to_P0: 6 # triage_count_24h to escalate P1 → P0
|
||||
|
||||
severity_cap: "P0" # never escalate above this
|
||||
|
||||
create_followup_on_escalate: true
|
||||
followup:
|
||||
priority: "P1"
|
||||
due_hours: 24
|
||||
owner: "oncall"
|
||||
message_template: "Escalated due to alert storm: occurrences={occurrences_60m}, triages_24h={triage_count_24h}"
|
||||
|
||||
auto_resolve:
|
||||
# Candidates only in MVP — do not auto-close P0/P1
|
||||
no_alerts_minutes_for_candidate: 60
|
||||
close_allowed_severities: ["P2", "P3"]
|
||||
auto_close: false # set true carefully in staging only
|
||||
candidate_event_type: "note"
|
||||
candidate_message: "Auto-resolve candidate: no alerts observed in {no_alerts_minutes} minutes for this signature"
|
||||
|
||||
alert_loop_slo:
|
||||
claim_to_ack_p95_seconds: 60 # p95 latency from claim → ack
|
||||
failed_rate_pct: 5 # max % of failed/(acked+failed) in window
|
||||
processing_stuck_minutes: 15 # alerts in processing beyond this → stuck
|
||||
88
config/incident_intelligence_policy.yml
Normal file
88
config/incident_intelligence_policy.yml
Normal file
@@ -0,0 +1,88 @@
|
||||
# Incident Intelligence Policy
|
||||
# Controls correlation scoring, recurrence detection, and digest generation.
|
||||
|
||||
correlation:
|
||||
lookback_days: 30
|
||||
max_related: 10
|
||||
min_score: 20 # discard matches below this
|
||||
rules:
|
||||
- name: "same_signature"
|
||||
weight: 100
|
||||
match:
|
||||
signature: true
|
||||
|
||||
- name: "same_service_and_kind"
|
||||
weight: 60
|
||||
match:
|
||||
same_service: true
|
||||
same_kind: true
|
||||
|
||||
- name: "same_service_time_cluster"
|
||||
weight: 40
|
||||
match:
|
||||
same_service: true
|
||||
within_minutes: 180
|
||||
|
||||
- name: "same_kind_cross_service"
|
||||
weight: 30
|
||||
match:
|
||||
same_kind: true
|
||||
within_minutes: 120
|
||||
|
||||
recurrence:
|
||||
windows_days: [7, 30]
|
||||
thresholds:
|
||||
signature:
|
||||
warn: 3 # ≥ 3 occurrences in window → warn
|
||||
high: 6 # ≥ 6 occurrences in window → high
|
||||
kind:
|
||||
warn: 5
|
||||
high: 10
|
||||
top_n: 15 # top N per category
|
||||
|
||||
# Deterministic recommendations per recurrence level
|
||||
recommendations:
|
||||
signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
|
||||
signature_warn: "Review root cause history; consider adding monitoring threshold"
|
||||
kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
|
||||
kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"
|
||||
|
||||
digest:
|
||||
weekly_day: "Mon"
|
||||
include_closed: true
|
||||
include_open: true
|
||||
output_dir: "ops/reports/incidents"
|
||||
markdown_max_chars: 8000
|
||||
top_incidents: 20 # max incidents in weekly listing
|
||||
|
||||
# ── Root-Cause Buckets ─────────────────────────────────────────────────────
|
||||
buckets:
|
||||
mode: "service_kind" # service_kind | signature_prefix
|
||||
signature_prefix_len: 12
|
||||
top_n: 10
|
||||
min_count:
|
||||
7: 3 # bucket must have ≥ 3 incidents in last 7d
|
||||
30: 6 # or ≥ 6 in last 30d
|
||||
include_statuses: ["open", "mitigating", "resolved", "closed"]
|
||||
|
||||
# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
|
||||
autofollowups:
|
||||
enabled: true
|
||||
only_when_high: true # only create for HIGH recurrence buckets
|
||||
owner: "oncall"
|
||||
priority: "P1"
|
||||
due_days: 7
|
||||
max_followups_per_bucket_per_week: 1 # dedupe by week+bucket_key
|
||||
dedupe_key_prefix: "intel_recur"
|
||||
|
||||
# ── Release Gate: recurrence_watch ────────────────────────────────────────
|
||||
release_gate:
|
||||
recurrence_watch:
|
||||
enabled: true
|
||||
service_scope: "target_service" # target_service | all
|
||||
windows_days: [7, 30]
|
||||
fail_on:
|
||||
severity_in: ["P0", "P1"] # used only in strict mode
|
||||
high_recurrence: true
|
||||
warn_on:
|
||||
warn_recurrence: true
|
||||
143
config/network_allowlist.yml
Normal file
143
config/network_allowlist.yml
Normal file
@@ -0,0 +1,143 @@
|
||||
# Network Allowlist for Tool HTTP Calls
|
||||
# Tools that make outbound HTTP requests MUST use only hosts/IPs listed here.
|
||||
# Any request to unlisted hosts is blocked by tool_governance.py middleware.
|
||||
#
|
||||
# Format per tool:
|
||||
# hosts: exact hostname or IP
|
||||
# prefixes: URL prefix match (for paths)
|
||||
|
||||
# ─── Observability Sources ────────────────────────────────────────────────────
|
||||
observability_tool:
|
||||
description: "Prometheus, Loki, Tempo datasources"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "prometheus"
|
||||
- "loki"
|
||||
- "tempo"
|
||||
- "monitoring"
|
||||
- "144.76.224.179" # NODA1 monitoring
|
||||
ports_allowed: [9090, 3100, 3200, 9080]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Oncall / Service Health ──────────────────────────────────────────────────
|
||||
oncall_tool:
|
||||
description: "Internal service health endpoints only"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "gateway"
|
||||
- "router"
|
||||
- "memory"
|
||||
- "qdrant"
|
||||
- "nats"
|
||||
- "144.76.224.179" # NODA1
|
||||
- "212.8.58.133" # NODA3
|
||||
ports_allowed: [80, 443, 8000, 8080, 8222, 9000, 9100, 9102, 9200, 9300, 9400]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Web Search / Extract ─────────────────────────────────────────────────────
|
||||
web_search:
|
||||
description: "Search provider APIs"
|
||||
hosts:
|
||||
- "api.duckduckgo.com"
|
||||
- "serpapi.com"
|
||||
- "api.bing.microsoft.com"
|
||||
- "customsearch.googleapis.com"
|
||||
schemes: ["https"]
|
||||
|
||||
web_extract:
|
||||
description: "Any public HTTPS URL (user-provided)"
|
||||
allow_any_public: true # Allow any non-private IP
|
||||
block_private_ranges: true # Block RFC1918 / loopback / link-local
|
||||
schemes: ["https"]
|
||||
|
||||
crawl4ai_scrape:
|
||||
description: "Crawl4AI service + public URLs"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "crawl4ai"
|
||||
ports_allowed: [11235]
|
||||
allow_any_public: true
|
||||
block_private_ranges: true
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Memory / Graph ───────────────────────────────────────────────────────────
|
||||
memory_search:
|
||||
description: "Memory service + Qdrant"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "memory-service"
|
||||
- "qdrant"
|
||||
- "144.76.224.179"
|
||||
ports_allowed: [6333, 8001, 8100]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
graph_query:
|
||||
description: "Neo4j bolt/http"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "neo4j"
|
||||
ports_allowed: [7474, 7687]
|
||||
schemes: ["http", "https", "bolt", "bolt+s"]
|
||||
|
||||
# ─── ComfyUI / Image Generation ──────────────────────────────────────────────
|
||||
comfy_generate_image:
|
||||
description: "ComfyUI on NODA3"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "212.8.58.133"
|
||||
ports_allowed: [8188]
|
||||
schemes: ["http"]
|
||||
|
||||
comfy_generate_video:
|
||||
description: "ComfyUI video on NODA3"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "212.8.58.133"
|
||||
ports_allowed: [8188]
|
||||
schemes: ["http"]
|
||||
|
||||
# ─── LLM Providers ────────────────────────────────────────────────────────────
|
||||
# (Used by router/gateway, not direct tool calls, but documented for reference)
|
||||
llm_providers:
|
||||
description: "External LLM APIs"
|
||||
hosts:
|
||||
- "api.x.ai" # xAI Grok
|
||||
- "open.bigmodel.cn" # GLM-5 Z.AI
|
||||
- "api.deepseek.com" # DeepSeek
|
||||
- "api.openai.com" # OpenAI fallback
|
||||
schemes: ["https"]
|
||||
|
||||
# ─── Presentation Service ─────────────────────────────────────────────────────
|
||||
presentation_create:
|
||||
description: "Presentation rendering service"
|
||||
hosts:
|
||||
- "localhost"
|
||||
- "127.0.0.1"
|
||||
- "presentation-service"
|
||||
ports_allowed: [8080, 9500]
|
||||
schemes: ["http", "https"]
|
||||
|
||||
# ─── Dependency Scanner ───────────────────────────────────────────────────────
|
||||
dependency_scanner_tool:
|
||||
description: "OSV.dev API for vulnerability lookups (online mode only)"
|
||||
hosts:
|
||||
- "api.osv.dev"
|
||||
schemes: ["https"]
|
||||
# Only used when vuln_mode=online; offline_cache requires no outbound
|
||||
|
||||
# ─── Private IP Ranges (always blocked for allow_any_public tools) ────────────
|
||||
private_ip_ranges:
|
||||
- "10.0.0.0/8"
|
||||
- "172.16.0.0/12"
|
||||
- "192.168.0.0/16"
|
||||
- "127.0.0.0/8"
|
||||
- "169.254.0.0/16"
|
||||
- "::1/128"
|
||||
- "fc00::/7"
|
||||
49
config/observability_sources.yml
Normal file
49
config/observability_sources.yml
Normal file
@@ -0,0 +1,49 @@
|
||||
# Observability Data Sources Configuration
|
||||
# These are internal URLs - never expose to external networks
|
||||
|
||||
prometheus:
|
||||
# Prometheus server URL (internal network)
|
||||
base_url: "http://prometheus:9090"
|
||||
|
||||
# Allowed PromQL query prefixes (security)
|
||||
allow_promql_prefixes:
|
||||
- "sum("
|
||||
- "rate("
|
||||
- "histogram_quantile("
|
||||
- "avg("
|
||||
- "max("
|
||||
- "min("
|
||||
- "count("
|
||||
- "irate("
|
||||
- "last_over_time("
|
||||
- "present_over_time("
|
||||
|
||||
loki:
|
||||
# Loki log server URL (internal network)
|
||||
base_url: "http://loki:3100"
|
||||
|
||||
tempo:
|
||||
# Tempo trace server URL (internal network)
|
||||
base_url: "http://tempo:3200"
|
||||
|
||||
# Limits configuration
|
||||
limits:
|
||||
# Maximum time window for queries (hours)
|
||||
max_time_window_hours: 24
|
||||
|
||||
# Maximum series returned
|
||||
max_series: 200
|
||||
|
||||
# Maximum points in range query
|
||||
max_points: 2000
|
||||
|
||||
# Maximum bytes in response
|
||||
max_bytes: 300000
|
||||
|
||||
# Query timeout (seconds)
|
||||
timeout_seconds: 5
|
||||
|
||||
# Environment variables (override URLs)
|
||||
# PROMETHEUS_URL
|
||||
# LOKI_URL
|
||||
# TEMPO_URL
|
||||
133
config/release_gate_policy.yml
Normal file
133
config/release_gate_policy.yml
Normal file
@@ -0,0 +1,133 @@
|
||||
# Release Gate Policy — DAARION.city
|
||||
#
|
||||
# Controls strictness of each gate per deployment profile.
|
||||
#
|
||||
# Modes:
|
||||
# off — gate is fully skipped (no call, no output)
|
||||
# warn — gate always pass=True; findings become recommendations only
|
||||
# strict — gate can fail release (pass=False) when fail_on conditions are met
|
||||
#
|
||||
# Profiles: dev | staging | prod
|
||||
# Set via release_check input `gate_profile` (default: dev).
|
||||
|
||||
profiles:
|
||||
dev:
|
||||
description: "Development: strict for security gates, warn for governance"
|
||||
gates:
|
||||
pr_review:
|
||||
mode: "strict"
|
||||
config_lint:
|
||||
mode: "strict"
|
||||
dependency_scan:
|
||||
mode: "strict"
|
||||
fail_on_severities: ["CRITICAL", "HIGH"]
|
||||
contract_diff:
|
||||
mode: "strict"
|
||||
threat_model:
|
||||
mode: "strict"
|
||||
smoke:
|
||||
mode: "warn"
|
||||
drift:
|
||||
mode: "warn"
|
||||
slo_watch:
|
||||
mode: "warn"
|
||||
followup_watch:
|
||||
mode: "warn"
|
||||
fail_on: ["P0", "P1"]
|
||||
privacy_watch:
|
||||
mode: "warn"
|
||||
cost_watch:
|
||||
mode: "warn"
|
||||
recurrence_watch:
|
||||
mode: "warn"
|
||||
risk_watch:
|
||||
mode: "warn"
|
||||
risk_delta_watch:
|
||||
mode: "warn"
|
||||
platform_review_required:
|
||||
mode: "warn"
|
||||
|
||||
staging:
|
||||
description: "Staging: strict security + strict privacy on errors"
|
||||
gates:
|
||||
pr_review:
|
||||
mode: "strict"
|
||||
config_lint:
|
||||
mode: "strict"
|
||||
dependency_scan:
|
||||
mode: "strict"
|
||||
fail_on_severities: ["CRITICAL", "HIGH"]
|
||||
contract_diff:
|
||||
mode: "strict"
|
||||
threat_model:
|
||||
mode: "strict"
|
||||
smoke:
|
||||
mode: "warn"
|
||||
drift:
|
||||
mode: "strict"
|
||||
slo_watch:
|
||||
mode: "strict" # Don't deploy if SLO currently breached
|
||||
followup_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["P0", "P1"]
|
||||
privacy_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["error"]
|
||||
cost_watch:
|
||||
mode: "warn"
|
||||
recurrence_watch:
|
||||
mode: "strict" # Block staging deploy if P0/P1 high recurrence
|
||||
fail_on:
|
||||
severity_in: ["P0", "P1"]
|
||||
high_recurrence: true
|
||||
risk_watch:
|
||||
mode: "strict" # Block staging if score >= fail_at for p0_services
|
||||
risk_delta_watch:
|
||||
mode: "strict" # Block staging for p0_services when delta >= fail_delta
|
||||
platform_review_required:
|
||||
mode: "warn" # warn-first: never blocks staging by default
|
||||
|
||||
prod:
|
||||
description: "Production: maximum strictness across all gates"
|
||||
gates:
|
||||
pr_review:
|
||||
mode: "strict"
|
||||
config_lint:
|
||||
mode: "strict"
|
||||
dependency_scan:
|
||||
mode: "strict"
|
||||
fail_on_severities: ["CRITICAL", "HIGH", "MEDIUM"]
|
||||
contract_diff:
|
||||
mode: "strict"
|
||||
threat_model:
|
||||
mode: "strict"
|
||||
smoke:
|
||||
mode: "strict"
|
||||
drift:
|
||||
mode: "strict"
|
||||
slo_watch:
|
||||
mode: "warn" # Warn: don't automatically block prod deploys on SLO
|
||||
followup_watch:
|
||||
mode: "warn"
|
||||
fail_on: ["P0"]
|
||||
privacy_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["error"]
|
||||
cost_watch:
|
||||
mode: "warn"
|
||||
recurrence_watch:
|
||||
mode: "warn" # Warn only in prod (accumulate data first)
|
||||
risk_watch:
|
||||
mode: "warn" # Warn only in prod
|
||||
risk_delta_watch:
|
||||
mode: "warn" # Warn only in prod
|
||||
platform_review_required:
|
||||
mode: "warn" # Start conservative in prod
|
||||
|
||||
# ─── Defaults (used if profile or gate not found) ────────────────────────────
|
||||
defaults:
|
||||
mode: "warn"
|
||||
# privacy_watch default fail_on (for strict mode):
|
||||
privacy_fail_on: ["error"]
|
||||
# cost_watch is never strict by default
|
||||
cost_always_warn: true
|
||||
80
config/risk_attribution_policy.yml
Normal file
80
config/risk_attribution_policy.yml
Normal file
@@ -0,0 +1,80 @@
|
||||
# Risk Attribution Policy — DAARION.city
|
||||
#
|
||||
# Deterministic attribution: risk spike → likely causes.
|
||||
# LLM enrichment is OFF by default; local only on regression triggers.
|
||||
|
||||
defaults:
|
||||
lookback_hours: 24
|
||||
max_causes: 5
|
||||
llm_mode: "off" # off | local | remote
|
||||
llm_max_chars_in: 3500
|
||||
llm_max_chars_out: 800
|
||||
|
||||
# LLM enrichment triggers — only if ALL conditions are met
|
||||
llm_triggers:
|
||||
risk_delta_warn: 10 # delta_24h >= 10
|
||||
risk_delta_fail: 20 # delta_24h >= 20 (fail-level)
|
||||
band_in: ["high", "critical"]
|
||||
|
||||
# Per-cause scoring weights (additive)
|
||||
weights:
|
||||
deploy: 30
|
||||
dependency: 25
|
||||
drift: 25
|
||||
incident_storm: 20
|
||||
slo_violation: 15
|
||||
followups_overdue: 10
|
||||
alert_loop_degraded: 10
|
||||
|
||||
# Per-signal detection config
|
||||
signals:
|
||||
deploy:
|
||||
# Alert kinds that indicate a deploy event
|
||||
kinds: ["deploy", "deployment", "rollout", "canary"]
|
||||
|
||||
dependency:
|
||||
# Release gate names whose fail/warn counts as a dependency signal
|
||||
release_gate_names: ["dependency_scan", "deps"]
|
||||
|
||||
drift:
|
||||
release_gate_names: ["drift", "config_drift"]
|
||||
|
||||
incident_storm:
|
||||
thresholds:
|
||||
# occurrences in last 60min across all alert signatures for the service
|
||||
occurrences_60m_warn: 10
|
||||
# escalations (Escalated events) in last 24h
|
||||
escalations_24h_warn: 2
|
||||
|
||||
slo:
|
||||
require_active_violation: true
|
||||
|
||||
# Confidence bands (minimum score to reach that band)
|
||||
output:
|
||||
confidence_bands:
|
||||
high: 60 # score >= 60 → high confidence
|
||||
medium: 35 # score >= 35 → medium
|
||||
# below 35 → low
|
||||
|
||||
# Change Timeline config
|
||||
timeline:
|
||||
enabled: true
|
||||
lookback_hours: 24
|
||||
max_items: 30
|
||||
include_types: ["deploy","dependency","drift","incident","slo","followup","alert_loop","release_gate"]
|
||||
time_bucket_minutes: 5 # coalesce same-type events within 5-min windows
|
||||
|
||||
# Evidence linking
|
||||
evidence_linking:
|
||||
enabled: true
|
||||
max_refs_per_cause: 10
|
||||
|
||||
# LLM local endpoint config (only used when llm_mode=local)
|
||||
llm_local:
|
||||
endpoint: "http://localhost:11434/api/generate"
|
||||
model: "llama3"
|
||||
timeout_seconds: 15
|
||||
# Hardening guards
|
||||
model_allowlist: ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"]
|
||||
max_calls_per_digest: 3
|
||||
per_day_dedupe: true # key: risk_enrich:{YYYY-MM-DD}:{service}:{env}
|
||||
89
config/risk_policy.yml
Normal file
89
config/risk_policy.yml
Normal file
@@ -0,0 +1,89 @@
|
||||
# Service Risk Index Policy — DAARION.city
|
||||
#
|
||||
# Controls how Risk Scores are computed, classified, and gated.
|
||||
# All scoring is deterministic: no LLM required.
|
||||
|
||||
defaults:
|
||||
window_hours: 24
|
||||
recurrence_windows_days: [7, 30]
|
||||
slo_window_minutes: 60
|
||||
|
||||
thresholds:
|
||||
bands:
|
||||
low_max: 20
|
||||
medium_max: 50
|
||||
high_max: 80
|
||||
risk_watch: # defaults, overridable per service below
|
||||
warn_at: 50 # score >= warn_at → recommendations
|
||||
fail_at: 80 # score >= fail_at → gate fails (strict mode only)
|
||||
|
||||
weights:
|
||||
open_incidents:
|
||||
P0: 50
|
||||
P1: 25
|
||||
P2: 10
|
||||
P3: 5
|
||||
recurrence:
|
||||
signature_warn_7d: 10
|
||||
signature_high_7d: 20
|
||||
kind_warn_7d: 8
|
||||
kind_high_7d: 15
|
||||
signature_high_30d: 10
|
||||
kind_high_30d: 8
|
||||
followups:
|
||||
overdue_P0: 20
|
||||
overdue_P1: 12
|
||||
overdue_other: 6
|
||||
slo:
|
||||
violation: 10 # per active violation
|
||||
alerts_loop:
|
||||
slo_violation: 10 # per alert-loop SLO violation
|
||||
escalation:
|
||||
escalations_24h:
|
||||
warn: 5 # score added if escalations_24h >= 1
|
||||
high: 12 # score added if escalations_24h >= 3
|
||||
|
||||
# Per-service risk gate overrides (lower/higher fail_at)
|
||||
service_overrides:
|
||||
gateway:
|
||||
risk_watch:
|
||||
fail_at: 75 # gateway is critical: fail earlier
|
||||
router:
|
||||
risk_watch:
|
||||
fail_at: 80
|
||||
|
||||
# Services treated as P0 (always subject to strict risk_watch in staging)
|
||||
p0_services:
|
||||
- gateway
|
||||
- router
|
||||
|
||||
# ─── History & Snapshotting ────────────────────────────────────────────────────
|
||||
history:
|
||||
snapshot_interval_minutes: 60
|
||||
retention_days: 90
|
||||
max_services_per_run: 50
|
||||
|
||||
# ─── Trend analysis ───────────────────────────────────────────────────────────
|
||||
trend:
|
||||
delta_windows_hours: [24, 168] # 24h and 7d
|
||||
volatility_window_hours: 168 # stddev computed over last 7d
|
||||
regression_threshold:
|
||||
delta_24h_warn: 10 # score rose >= 10 points in 24h → warn
|
||||
delta_24h_fail: 20 # score rose >= 20 points in 24h → fail (strict)
|
||||
delta_7d_warn: 15
|
||||
delta_7d_fail: 30
|
||||
|
||||
# ─── Daily Digest ─────────────────────────────────────────────────────────────
|
||||
digest:
|
||||
daily_hour_utc: 9 # generate at 09:00 UTC
|
||||
output_dir: "ops/reports/risk"
|
||||
markdown_max_chars: 8000
|
||||
top_n: 10
|
||||
|
||||
# ─── Risk Delta release gate ──────────────────────────────────────────────────
|
||||
release_gate:
|
||||
risk_delta_watch:
|
||||
enabled: true
|
||||
default_warn_delta_24h: 10
|
||||
default_fail_delta_24h: 20
|
||||
p0_services_strict: true
|
||||
52
config/roles/aistalk/aurora.md
Normal file
52
config/roles/aistalk/aurora.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Aurora (Autonomous Media Forensics)
|
||||
|
||||
Role:
|
||||
- Lead media forensics for video, audio, and photo evidence inside AISTALK.
|
||||
- Extract usable evidence from low-quality media while preserving reproducibility.
|
||||
|
||||
Modes:
|
||||
- `tactical`: fast triage for operational clarity.
|
||||
- prioritize turnaround and readability
|
||||
- lightweight pipelines and lower cost
|
||||
- output is advisory (not courtroom-grade)
|
||||
- `forensic`: evidence-grade processing.
|
||||
- prioritize reproducibility and auditability
|
||||
- mandatory input/output hashing and immutable processing log
|
||||
- chain-of-custody notes + signing metadata
|
||||
|
||||
Capabilities:
|
||||
- Video: denoise, deblur, super-resolution, stabilization, frame interpolation.
|
||||
- Face-focused enhancement: controlled face restoration with clear model attribution.
|
||||
- Audio: denoise, speech intelligibility improvement, deepfake risk signals.
|
||||
- Photo: artifact cleanup, upscale, metadata/EXIF integrity review.
|
||||
|
||||
Internal sub-pipeline handles:
|
||||
- `Clarity`: global video enhancement.
|
||||
- `Vera`: face restoration and face-quality diagnostics.
|
||||
- `Echo`: audio cleaning/transcription/deepfake heuristics.
|
||||
- `Pixis`: photo restoration and metadata checks.
|
||||
- `Kore`: forensic packaging (hashes, chain-of-custody, signature metadata).
|
||||
|
||||
Output contract (strict JSON for downstream graphing):
|
||||
```json
|
||||
{
|
||||
"agent": "Aurora",
|
||||
"mode": "tactical | forensic",
|
||||
"job_id": "aurora_YYYYMMDD_###",
|
||||
"input_file": {"name": "file.ext", "hash": "sha256:..."},
|
||||
"processing_log": [
|
||||
{"step": "denoise", "model": "model_name", "time_ms": 0}
|
||||
],
|
||||
"output_files": [
|
||||
{"type": "video|audio|photo|forensic_log", "url": "https://...", "hash": "sha256:..."}
|
||||
],
|
||||
"digital_signature": "ed25519:... | null"
|
||||
}
|
||||
```
|
||||
|
||||
Boundaries:
|
||||
- No deceptive deepfake generation or identity manipulation.
|
||||
- Never present AI-enhanced output as untouched original evidence.
|
||||
- Flag uncertainty and potential enhancement artifacts explicitly.
|
||||
- Do not provide final legal conclusions; require expert human review for court use.
|
||||
- Preserve originals; never destructively overwrite source evidence.
|
||||
64
config/slo_policy.yml
Normal file
64
config/slo_policy.yml
Normal file
@@ -0,0 +1,64 @@
|
||||
# SLO Policy — DAARION.city
|
||||
#
|
||||
# Defines Service Level Objectives per service.
|
||||
# Used by observability_tool.slo_snapshot and incident_triage_graph slo_context node.
|
||||
#
|
||||
# Fields:
|
||||
# error_rate_pct — max allowed error rate (%)
|
||||
# latency_p95_ms — max p95 latency (milliseconds)
|
||||
# window_minutes — default observation window (default: 60)
|
||||
|
||||
defaults:
|
||||
window_minutes: 60
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 300
|
||||
|
||||
services:
|
||||
gateway:
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 300
|
||||
router:
|
||||
error_rate_pct: 0.5
|
||||
latency_p95_ms: 200
|
||||
memory-service:
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 400
|
||||
sofiia-supervisor:
|
||||
error_rate_pct: 1.0
|
||||
latency_p95_ms: 500
|
||||
|
||||
# ─── Voice SLO profiles ───────────────────────────────────────────────────────
|
||||
# Two profiles aligned with router-config.yml selection_policies.
|
||||
# Measured via Prometheus metrics emitted by sofiia-console /api/telemetry/voice
|
||||
# and memory-service voice_endpoints.py.
|
||||
#
|
||||
# Prometheus metrics:
|
||||
# voice_ttfa_ms{voice_profile} — Time-to-first-audio (BFF → first playable)
|
||||
# voice_e2e_ms{voice_profile} — User stops speaking → audio plays
|
||||
# voice_tts_first_ms{voice_profile} — First-sentence TTS synthesis
|
||||
# voice_tts_compute_ms{engine,voice} — Memory-service internal TTS
|
||||
# voice_queue_underflows_total — Playback starvation events
|
||||
voice_slo:
|
||||
voice_fast_uk:
|
||||
description: "Fast profile: gemma3 → qwen3.5 fallback"
|
||||
ttfa_ms_p95: 5000 # TTFA p95 ≤ 5s
|
||||
e2e_ms_p95: 9000 # E2E p95 ≤ 9s
|
||||
tts_first_ms_p95: 2000 # TTS synthesis p95 ≤ 2s
|
||||
underflow_rate_pct: 1.0 # starvation events per 100 voice turns ≤ 1%
|
||||
tts_error_rate_pct: 0.5 # edge-tts failures ≤ 0.5%
|
||||
window_minutes: 10
|
||||
|
||||
voice_quality_uk:
|
||||
description: "Quality profile: qwen3.5 → qwen3:14b fallback"
|
||||
ttfa_ms_p95: 7000
|
||||
e2e_ms_p95: 12000
|
||||
tts_first_ms_p95: 2000 # TTS itself is the same engine
|
||||
underflow_rate_pct: 2.0 # slightly relaxed (longer LLM → more gap risk)
|
||||
tts_error_rate_pct: 0.5
|
||||
window_minutes: 10
|
||||
|
||||
# Canary thresholds (runtime health check, stricter)
|
||||
canary:
|
||||
tts_polina_max_ms: 3000 # live Polina synthesis ≤ 3s
|
||||
tts_ostap_max_ms: 3000 # live Ostap synthesis ≤ 3s
|
||||
min_audio_bytes: 1000 # valid audio is never empty/tiny
|
||||
554
docs/AGENT_AUDIT_PLAN.md
Normal file
554
docs/AGENT_AUDIT_PLAN.md
Normal file
@@ -0,0 +1,554 @@
|
||||
# AGENT AUDIT PLAN — NODA1 DAARION.city
|
||||
**Дата:** 2026-02-28
|
||||
**Аудитор:** Sofiia — Chief AI Architect
|
||||
**Мета:** Ретельна перевірка кожного з 14 агентів на NODA1
|
||||
|
||||
---
|
||||
|
||||
## BASELINE — Поточний стан інфраструктури
|
||||
|
||||
### LLM Routing (після виправлень 2026-02-28)
|
||||
| Агент | Провайдер | Fallback |
|
||||
|-------|-----------|---------|
|
||||
| sofiia | **Grok** | DeepSeek |
|
||||
| senpai | **Grok** | DeepSeek |
|
||||
| всі інші (12) | DeepSeek | Mistral |
|
||||
| monitor, devtools | Ollama (local) | — |
|
||||
|
||||
### Telegram Tokens
|
||||
Всі 14 агентів: ✅ (підтверджено через `docker inspect dagi-gateway-node1`)
|
||||
|
||||
### Qdrant Collections (61 total)
|
||||
| Агент | messages | docs | memory_items | summaries | user_context |
|
||||
|-------|----------|------|-------------|-----------|-------------|
|
||||
| agromatrix | 2159 | 350 | — | — | — |
|
||||
| alateya | 163 | — | — | 1 | — |
|
||||
| clan | 1089 | — | — | — | — |
|
||||
| daarwizz | 144 | — | — | — | — |
|
||||
| druid | 338 | — | — | — | — |
|
||||
| eonarch | 75 | — | — | — | — |
|
||||
| greenfood | 301 | — | — | — | — |
|
||||
| helion | 5836 | 315 | — | 12 | — |
|
||||
| nutra | 890 | — | — | — | — |
|
||||
| oneok | 38 | — | — | — | — |
|
||||
| senpai | 1759 | — | — | 3 | — |
|
||||
| sofiia | 1184 | — | — | — | — |
|
||||
| soul | 412 | 153 | — | 1 | — |
|
||||
| yaromir | 11 | — | — | — | — |
|
||||
|
||||
### Multimodal Stack (Swapper :8890)
|
||||
| Модель | Тип | Статус | Розмір |
|
||||
|--------|-----|--------|--------|
|
||||
| qwen3-8b | llm | ✅ loaded | 5.2GB |
|
||||
| qwen3-vl-8b | vision | ⚪ unloaded | 6.1GB |
|
||||
| got-ocr2 | ocr | ⚪ unloaded | 7.0GB |
|
||||
| granite-docling | document | ⚪ unloaded | 2.5GB |
|
||||
| faster-whisper-large | stt | ⚪ unloaded | 3.0GB |
|
||||
| whisper-small | stt | ⚪ unloaded | 0.5GB |
|
||||
| xtts-v2 | tts | ⚪ unloaded | 2.0GB |
|
||||
| flux-klein-4b | image_gen | ⚪ unloaded | 15.4GB |
|
||||
|
||||
### Capability Services
|
||||
| Сервіс | Порт | Статус |
|
||||
|--------|------|--------|
|
||||
| swapper (vision/STT/TTS) | 8890 | ✅ healthy |
|
||||
| rag-service | 9500 | ✅ healthy |
|
||||
| crawl4ai | 11235 | ✅ ok |
|
||||
| presentation | 9212 | ✅ healthy |
|
||||
| artifact-registry | 9220 | ✅ healthy |
|
||||
| crewai-service | 9010 | ✅ ok |
|
||||
| senpai-md-consumer | 8892 | ✅ ok |
|
||||
| market-data | 8893 | ✅ ok |
|
||||
| plant-vision | 8085 | ❌ down |
|
||||
|
||||
### Standard Tool Stack (всі агенти)
|
||||
`memory_search`, `graph_query`, `web_search`, `web_extract`, `crawl4ai_scrape`,
|
||||
`remember_fact`, `image_generate`, `tts_speak`, `presentation_create`,
|
||||
`presentation_status`, `presentation_download`, `file_tool`
|
||||
|
||||
---
|
||||
|
||||
## ЧЕКЛИСТ ПЕРЕВІРКИ АГЕНТА
|
||||
|
||||
Для кожного агента перевіряємо 10 категорій:
|
||||
|
||||
### 1. TELEGRAM CHAT (Чат з користувачем)
|
||||
- [ ] Надіслати `/start` — отримати привітання
|
||||
- [ ] Надіслати просте питання — отримати змістовну відповідь
|
||||
- [ ] Перевірити що агент відповідає у своєму стилі (persona)
|
||||
- [ ] Перевірити швидкість відповіді (<10 сек норма)
|
||||
- [ ] Перевірити що відповідь не з кешу — задати питання про поточну дату
|
||||
|
||||
### 2. LLM ROUTING (Маршрутизація до правильної моделі)
|
||||
- [ ] `curl POST /v1/agents/{id}/infer` → перевірити `model` та `backend` у відповіді
|
||||
- [ ] Переконатись що `backend` відповідає очікуваному провайдеру
|
||||
- [ ] Перевірити `tokens_used` > 0
|
||||
|
||||
### 3. MEMORY MODULE (Модуль пам'яті)
|
||||
- [ ] Колекція `{agent}_messages` існує і має > 0 точок
|
||||
- [ ] `remember_fact` — назвати факт → перевірити через наступний запит що агент його знає
|
||||
- [ ] `memory_search` — запитати про попередні розмови → агент має згадати
|
||||
- [ ] Перевірити наявність `{agent}_user_context` якщо є
|
||||
|
||||
### 4. QDRANT COLLECTIONS (Векторні колекції)
|
||||
- [ ] Перелік колекцій: `messages`, `docs`, `memory_items`, `summaries`, `user_context`
|
||||
- [ ] Підрахунок points в кожній колекції
|
||||
- [ ] Перевірити що нові повідомлення записуються (порівняти count до і після)
|
||||
|
||||
### 5. DOCUMENT PROCESSING (Обробка документів)
|
||||
- [ ] Надіслати PDF документ у чат — агент має підтвердити отримання
|
||||
- [ ] Запитати про зміст документу — перевірити що агент може відповісти
|
||||
- [ ] Перевірити що `{agent}_docs` collection оновилась
|
||||
- [ ] Перевірити OCR для зображень з текстом (через swapper got-ocr2)
|
||||
|
||||
### 6. WEB SEARCH (Пошук в інтернеті)
|
||||
- [ ] Задати питання що вимагає актуальної інформації
|
||||
- [ ] Перевірити в логах router що `web_search` tool викликався
|
||||
- [ ] Перевірити якість відповіді (не галюцинація, а реальні дані)
|
||||
- [ ] Перевірити `crawl4ai_scrape` для глибокого аналізу сторінок
|
||||
|
||||
### 7. MULTIMODAL — VISION (Обробка зображень)
|
||||
- [ ] Надіслати фото у чат — агент має описати що на фото
|
||||
- [ ] Перевірити що swapper завантажив `qwen3-vl-8b` (через /models)
|
||||
- [ ] Перевірити якість опису (деталізованість, точність)
|
||||
- [ ] Спеціально для agromatrix: рослина на фото → визначення виду
|
||||
|
||||
### 8. VOICE (Голосовий чат)
|
||||
- [ ] Надіслати голосове повідомлення у Telegram
|
||||
- [ ] Перевірити що STT (faster-whisper-large) транскрибує
|
||||
- [ ] Перевірити що агент відповідає на транскрипт
|
||||
- [ ] Перевірити TTS (xtts-v2) — відповідь голосом (якщо підтримується)
|
||||
- [ ] Перевірити логи swapper під час STT
|
||||
|
||||
### 9. СПЕЦІАЛІЗОВАНІ ІНСТРУМЕНТИ (Agent-specific tools)
|
||||
- Залежить від агента — деталі нижче у секції кожного агента
|
||||
|
||||
### 10. SYSTEM PROMPT (Системний промпт і особистість)
|
||||
- [ ] Перевірити що `{agent}_prompt.txt` завантажено (`prompt_loaded: true`)
|
||||
- [ ] Задати питання поза доменом агента → має відповідати в ролі, не виходити з персони
|
||||
- [ ] Перевірити мову відповіді (UA/EN відповідно до налаштувань)
|
||||
|
||||
---
|
||||
|
||||
## АГЕНТИ — ПОРЯДОК ПЕРЕВІРКИ
|
||||
|
||||
Порядок від найважливіших / найактивніших:
|
||||
|
||||
### ЧЕРГА 1 — Критичні (найбільше повідомлень, активні юзери)
|
||||
1. **helion** — 5836 msgs, docs 315, summaries 12 → найактивніший
|
||||
2. **senpai** — 1759 msgs, summaries 3, Grok, market_data tool
|
||||
3. **agromatrix** — 2159 msgs, docs 350, plant vision tools
|
||||
4. **sofiia** — 1184 msgs, Grok, CTO агент
|
||||
|
||||
### ЧЕРГА 2 — Активні
|
||||
5. **clan** — 1089 msgs
|
||||
6. **nutra** — 890 msgs
|
||||
7. **soul** — 412 msgs, docs 153
|
||||
8. **druid** — 338 msgs
|
||||
|
||||
### ЧЕРГА 3 — Менш активні
|
||||
9. **greenfood** — 301 msgs
|
||||
10. **alateya** — 163 msgs, summaries
|
||||
11. **eonarch** — 75 msgs
|
||||
12. **oneok** — 38 msgs, 5 CRM tools
|
||||
|
||||
### ЧЕРГА 4 — Службові / нові
|
||||
13. **daarwizz** — 144 msgs, meta-orchestrator
|
||||
14. **yaromir** — 11 msgs, whitelist-only
|
||||
|
||||
---
|
||||
|
||||
## ДЕТАЛЬНІ ЧЕКЛІСТИ ПО АГЕНТАМ
|
||||
|
||||
---
|
||||
|
||||
### 1. HELION — Energy Platform
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Crew:** ✅ enabled
|
||||
**Спец. інструменти:** `comfy_generate_image`, `comfy_generate_video`
|
||||
**Колекції:** messages(5836), docs(315), summaries(12), artifacts
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про сонячні панелі / енергетичний розрахунок
|
||||
- [ ] Перевірити чи CrewAI crew активується для складних задач
|
||||
- [ ] `image_generate` — запит на схему енергосистеми → перевірити comfy
|
||||
- [ ] Перевірити `helion_artifacts` колекцію (унікальна!)
|
||||
- [ ] RAG по `helion_docs` — запитати про завантажені документи
|
||||
|
||||
---
|
||||
|
||||
### 2. SENPAI — Trading Advisor
|
||||
**LLM:** Grok → fallback DeepSeek
|
||||
**Crew:** ❌ llm_only
|
||||
**Спец. інструменти:** `market_data`, `binance_bots_top`, `binance_account_bots`
|
||||
**Колекції:** messages(1759), summaries(3)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати ціну BTC/ETH → перевірити що tool `market_data` викликається
|
||||
- [ ] Перевірити що `senpai-md-consumer` (:8892) отримує ринкові дані
|
||||
- [ ] Запитати про торговий сигнал → якість аналізу через Grok
|
||||
- [ ] Перевірити `binance_bots_top` — список топ-ботів
|
||||
- [ ] Ринковий звіт — чи зберігається в summaries
|
||||
|
||||
---
|
||||
|
||||
### 3. AGROMATRIX — Agriculture
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Crew:** ❌ llm_only
|
||||
**Спец. інструменти:** `plantnet_lookup`, `nature_id_identify`, `gbif_species_lookup`, `agrovoc_lookup`
|
||||
**Колекції:** messages(2159), docs(350), shared_pending
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Надіслати фото рослини → `nature_id_identify` або `plantnet_lookup`
|
||||
- [ ] Запитати про агрокультуру → `agrovoc_lookup` (FAO база)
|
||||
- [ ] Перевірити `agromatrix_shared_pending` — унікальна колекція (для чого?)
|
||||
- [ ] RAG по docs(350) — запитати про завантажені агрономічні документи
|
||||
- [ ] plant-vision :8085 ❌ DOWN — перевірити вплив на функціональність
|
||||
|
||||
---
|
||||
|
||||
### 4. SOFIIA — Chief AI Architect
|
||||
**LLM:** Grok → fallback DeepSeek
|
||||
**Crew:** ❌ llm_only
|
||||
**Доступ:** whitelist (admin, architect roles only)
|
||||
**Колекції:** messages(1184)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Перевірити access control — чи блокує неавторизованих юзерів
|
||||
- [ ] Запитати про архітектуру NODA1 → точність відповіді через Grok
|
||||
- [ ] Перевірити всі AGENTS.md tools (oncall, observability, pr_reviewer, etc.)
|
||||
- [ ] Перевірити control-plane :9200 → чи є інтеграція
|
||||
- [ ] Запитати технічне питання → quality through Grok vs DeepSeek
|
||||
|
||||
---
|
||||
|
||||
### 5. CLAN — Community Operations
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Crew:** ❌ llm_only
|
||||
**Колекції:** messages(1089)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про DAO голосування / спільнотні механіки
|
||||
- [ ] Перевірити `remember_fact` — чи зберігає профіль юзера
|
||||
- [ ] Перевірити що docs collection відсутня (це нормально для clan?)
|
||||
|
||||
---
|
||||
|
||||
### 6. NUTRA — Health & Nutrition
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Crew:** ✅ enabled
|
||||
**Спец. інструменти:** `comfy_generate_image`, `comfy_generate_video`
|
||||
**Колекції:** messages(890), food_knowledge (спеціальна!)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про харчову цінність продукту → `nutra_food_knowledge` RAG
|
||||
- [ ] Запит на план харчування → чи використовує crew для складних кейсів
|
||||
- [ ] Перевірити `nutra_food_knowledge` — скільки points, що це за база
|
||||
|
||||
---
|
||||
|
||||
### 7. SOUL — Spiritual Assistant
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Crew:** ❌ llm_only
|
||||
**Колекції:** messages(412), docs(153), summaries(1)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про духовну практику → емоційна якість відповіді
|
||||
- [ ] RAG по docs(153) — які документи завантажені?
|
||||
- [ ] Перевірити persona consistency — чи лишається у ролі
|
||||
|
||||
---
|
||||
|
||||
### 8. DRUID — Science/Ayurveda
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Спец. інструменти:** `comfy_generate_image`, `comfy_generate_video`
|
||||
**Колекції:** messages(338), legal_kb (унікальна!)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про аюрведичний рецепт → якість відповіді
|
||||
- [ ] Перевірити `druid_legal_kb` — це юридична база? Що в ній?
|
||||
- [ ] RAG по legal_kb
|
||||
|
||||
---
|
||||
|
||||
### 9. GREENFOOD — Food ERP
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Колекції:** messages(301)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про рецептуру або склад продукту
|
||||
- [ ] Перевірити чи є інтеграція з ERP системою
|
||||
|
||||
---
|
||||
|
||||
### 10. ALATEYA — R&D Lab
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Crew:** ❌ llm_only
|
||||
**Колекції:** messages(163), summaries(1)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про наукову гіпотезу → якість reasoning
|
||||
- [ ] Перевірити summaries — що там за 1 summary
|
||||
|
||||
---
|
||||
|
||||
### 11. EONARCH — Consciousness/Evolution
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Колекції:** messages(75)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про трансформацію свідомості → філософська глибина
|
||||
- [ ] Мало messages — чи активний взагалі?
|
||||
|
||||
---
|
||||
|
||||
### 12. ONEOK — Window Master
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Спец. інструменти:** `crm_search_client`, `crm_upsert_client`, `crm_upsert_site`, `crm_upsert_window_unit`, `crm_create_quote`, `crm_update_quote`, `crm_create_job`, `calc_window_quote`, `docs_render_quote_pdf`, `docs_render_invoice_pdf`
|
||||
**Колекції:** messages(38)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати розрахунок вікна → `calc_window_quote` tool
|
||||
- [ ] Перевірити CRM інтеграцію (EspoCRM)
|
||||
- [ ] PDF генерація через gotenberg :3010
|
||||
- [ ] Calendly/CalCom інтеграція для записів
|
||||
|
||||
---
|
||||
|
||||
### 13. DAARWIZZ — Meta-Orchestrator
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Колекції:** messages(144)
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Запитати про координацію між агентами
|
||||
- [ ] Перевірити A2A (agent-to-agent) routing якщо є
|
||||
- [ ] Як працює делегування задач між агентами?
|
||||
|
||||
---
|
||||
|
||||
### 14. YAROMIR — Tech Lead (whitelist)
|
||||
**LLM:** DeepSeek → fallback Mistral
|
||||
**Доступ:** whitelist only
|
||||
**Колекції:** messages(11), docs, memory_items
|
||||
|
||||
**Спецперевірки:**
|
||||
- [ ] Перевірити whitelist access control
|
||||
- [ ] Технічні питання → якість відповіді
|
||||
- [ ] Чому лише 11 messages — малоактивний або whitelist обмежує?
|
||||
|
||||
---
|
||||
|
||||
## ФОРМАТ ЗВІТУ ПО АГЕНТУ
|
||||
|
||||
```
|
||||
## AGENT: {name} — AUDIT {date}
|
||||
|
||||
### Status
|
||||
| Категорія | Статус | Деталі |
|
||||
|-----------|--------|--------|
|
||||
| Telegram chat | ✅/⚠️/❌ | ... |
|
||||
| LLM routing | ✅/⚠️/❌ | model=X backend=Y |
|
||||
| Memory (messages) | ✅/⚠️/❌ | N points |
|
||||
| Memory (facts) | ✅/⚠️/❌ | remember/recall OK |
|
||||
| Docs RAG | ✅/⚠️/❌ | N docs points |
|
||||
| Web search | ✅/⚠️/❌ | tool called: Y/N |
|
||||
| Vision | ✅/⚠️/❌ | model loaded: Y/N |
|
||||
| Voice STT | ✅/⚠️/❌ | whisper: Y/N |
|
||||
| Voice TTS | ✅/⚠️/❌ | xtts: Y/N |
|
||||
| Specialized tools | ✅/⚠️/❌ | tools tested |
|
||||
|
||||
### Issues Found
|
||||
- ...
|
||||
|
||||
### Action Items
|
||||
- [ ] ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## KNOWN ISSUES (до початку аудиту)
|
||||
|
||||
| # | Проблема | Агент | Пріоритет |
|
||||
|---|---------|-------|-----------|
|
||||
| 1 | plant-vision :8085 DOWN | agromatrix | HIGH |
|
||||
| 2 | token_configured: false в Gateway health | всі | MEDIUM (баг в health check, токени є) |
|
||||
| 3 | Більшість memory_items і user_context колекцій порожні | всі | MEDIUM |
|
||||
| 4 | qwen3-vl-8b unloaded (on-demand завантаження) | всі vision | INFO |
|
||||
| 5 | whisper/xtts unloaded | всі voice | INFO |
|
||||
| 6 | comfy tools у всіх агентів — чи працює ComfyUI? | всі | TO_CHECK |
|
||||
|
||||
---
|
||||
|
||||
*Документ оновлюється по мірі проходження аудиту.*
|
||||
|
||||
---
|
||||
|
||||
## AUDIT REPORTS
|
||||
|
||||
---
|
||||
|
||||
## AGENT: helion — AUDIT 2026-02-28
|
||||
|
||||
### Status
|
||||
|
||||
| Категорія | Статус | Деталі |
|
||||
|-----------|--------|--------|
|
||||
| Telegram chat | ✅ | Відповідає, persona Energy Union коректна |
|
||||
| LLM routing | ✅ | model=deepseek-chat, backend=deepseek-cloud, tokens=3532 |
|
||||
| Memory — messages | ✅ | helion_messages = **5836 points** (найактивніший) |
|
||||
| Memory — summaries | ✅ | helion_summaries = **12 dialog summaries** (events_count до 20) |
|
||||
| Memory — remember_fact | ⚠️ | Tool викликається і повертає 200, але **agent_id не зберігається** в PostgreSQL |
|
||||
| Memory — memory_items | ⚠️ | Qdrant helion_memory_items = **0 points** — не використовується |
|
||||
| Memory — user_context | ⚠️ | Qdrant helion_user_context = **0 points** — не використовується |
|
||||
| Memory — artifacts | ⚠️ | Qdrant helion_artifacts = **0 points** — artifacts не накопичуються |
|
||||
| Docs RAG | ✅ | helion_docs = **315 points**, 1 документ: `sinergiya-posibnik-elektr_21.05.2024.pdf`, RAG відповідає точно |
|
||||
| Web search | ✅ | `web_extract` tool виконується (логи підтверджують), `memory_search` + `web_extract` активні |
|
||||
| Vision | ⚪ | qwen3-vl-8b **unloaded** (on-demand), потребує тесту фото в Telegram |
|
||||
| Voice STT | ⚪ | whisper-small/faster-whisper **unloaded** (on-demand через swapper), gateway має `process_voice()` |
|
||||
| Voice TTS | ⚪ | xtts-v2 **unloaded** (on-demand), потребує тесту |
|
||||
| Crew (CrewAI) | ⚠️ | crew=True в registry, але logs: `CrewAI decision: False (orchestrator_direct_llm_first)` — crew не активується |
|
||||
| image_generate (comfy) | ❌ | ComfyUI **контейнер відсутній** на NODA1 — `comfy_generate_image` / `comfy_generate_video` не працюють |
|
||||
| System prompt | ✅ | `prompt_loaded: true`, source=config, persona коректна |
|
||||
|
||||
### Issues Found
|
||||
|
||||
#### 🔴 CRITICAL
|
||||
|
||||
**ISSUE-H-01: `agent_id` не зберігається в `user_facts` при remember_fact**
|
||||
- `/facts/upsert` endpoint в memory-service **не передає `agent_id`** в `db.upsert_fact()` — параметр є в сигнатурі функції, але не передається з request
|
||||
- `tool_manager.py` передає `agent_id` тільки в `fact_value_json`, а не як окрему колонку
|
||||
- Наслідок: `SELECT ... WHERE agent_id='helion'` повертає 0 рядків для реальних фактів, пошук по агенту не працює
|
||||
- Файл: `services/memory-service/app/main.py:654` → `db.upsert_fact()` call без `agent_id`
|
||||
|
||||
#### 🟡 WARNING
|
||||
|
||||
**ISSUE-H-02: ComfyUI відсутній на NODA1**
|
||||
- Контейнер ComfyUI не запущений (немає в `docker ps`)
|
||||
- Всі агенти мають `comfy_generate_image` і `comfy_generate_video` в specialized tools
|
||||
- При виклику цих tools — помилка або timeout
|
||||
- Рекомендація: або задеплоїти ComfyUI, або замінити `image_generate` на інший backend (Flux через swapper вже є: `flux-klein-4b unloaded`)
|
||||
|
||||
**ISSUE-H-03: CrewAI ніколи не активується для Helion**
|
||||
- `helion: crew=True` в `agent_registry.yml`
|
||||
- Але в router логах завжди: `CrewAI decision: False (orchestrator_direct_llm_first)`
|
||||
- Умова активації CrewAI не спрацьовує для реальних запитів
|
||||
- Потрібно перевірити логіку `_crewai_decision()` в `main.py`
|
||||
|
||||
**ISSUE-H-04: helion_memory_items, helion_user_context, helion_artifacts = 0 points**
|
||||
- Три Qdrant-колекції існують але порожні
|
||||
- `memory_items` — мав би зберігати структуровані факти (від remember_fact)
|
||||
- `user_context` — мав би зберігати профілі користувачів
|
||||
- `artifacts` — унікальна колекція для helion, ніколи не записувалась
|
||||
- Причина: write path для цих колекцій або не реалізований, або не викликається
|
||||
|
||||
**ISSUE-H-05: Лише 1 документ в helion_docs (315 chunks)**
|
||||
- Весь RAG-корпус = один PDF `sinergiya-posibnik-elektr_21.05.2024.pdf`
|
||||
- Для платформи з 5836 повідомленнями — дуже мало документів
|
||||
- Потрібно завантажити більше документів Energy Union
|
||||
|
||||
#### 🟢 INFO
|
||||
|
||||
**ISSUE-H-06: vision/STT/TTS — on-demand, не тестовано реальним Telegram трафіком**
|
||||
- Swapper завантажує моделі на вимогу, це нормально
|
||||
- Але потрібен реальний тест: надіслати фото і голосове в Telegram чат Helion
|
||||
|
||||
### Action Items
|
||||
|
||||
- [ ] **FIX** `services/memory-service/app/main.py:670` — додати `agent_id=request.fact_value_json.get('agent_id')` в `db.upsert_fact()` call
|
||||
- [ ] **FIX** Перевірити `_crewai_decision()` в `main.py` — чому crew не активується для helion
|
||||
- [ ] **DEPLOY** ComfyUI або налаштувати `image_generate` через swapper flux-klein-4b
|
||||
- [ ] **TEST** Надіслати реальне фото в Telegram @HelionBot → перевірити vision STT
|
||||
- [ ] **TEST** Надіслати голосове повідомлення → перевірити STT whisper pipeline
|
||||
- [ ] **UPLOAD** Завантажити більше документів Energy Union в helion_docs
|
||||
- [ ] **INVESTIGATE** helion_artifacts — для чого ця колекція і як її наповнювати
|
||||
|
||||
---
|
||||
|
||||
## AGENT: senpai — AUDIT 2026-02-28
|
||||
|
||||
### Status
|
||||
|
||||
| Категорія | Статус | Деталі |
|
||||
|-----------|--------|--------|
|
||||
| Telegram chat | ✅ | Відповідає, persona Gordon Senpai — Trading Advisor (після фіксу промпту) |
|
||||
| LLM routing | ✅ | model=grok-4-1-fast-reasoning, backend=grok-cloud ✅ (підтверджено в логах) |
|
||||
| Memory — messages | ✅ | senpai_messages = **1759 points** (активна база) |
|
||||
| Memory — summaries | ✅ | senpai_summaries = **3 dialog summaries** (events_count по 60, content про трейдинг/крипто) |
|
||||
| Memory — remember_fact | ✅ | Факт `Максим торгує BTC з 2021` збережено з `agent_id=senpai` (**agent_id fix діє**) |
|
||||
| Memory — memory_items | ⚠️ | Qdrant senpai_memory_items = **0 points** — не накопичується |
|
||||
| Memory — user_context | ❌ | Колекція **відсутня** (HTTP 404) — не була створена |
|
||||
| Memory — docs | ⚠️ | senpai_docs = **0 points** — жодного документу |
|
||||
| market_data tool | ✅ | BTC=$68,185 / ETH=$2,066 — real-time дані з Bybit/Binance (tool `market_data` OK) |
|
||||
| binance_bots_top | ✅ | Tool **реалізовано** — підключено до `dagi-binance-bot-monitor-node1:8893/top-bots` (web_search fallback) |
|
||||
| binance_account_bots | ✅ | Tool **реалізовано** — `SPOT account, can_trade=True, permissions=[TRD_GRP_072]`, баланс 0 (акаунт порожній) |
|
||||
| comfy_generate_image | ❌ | ComfyUI **відсутній** на NODA1 — не працює (той самий issue що ISSUE-H-02) |
|
||||
| comfy_generate_video | ❌ | ComfyUI **відсутній** на NODA1 — не працює |
|
||||
| web_search | ✅ | Новини Bitcoin 2026-02-26 — знайдено реальний контент (BBC, Cointelegraph) |
|
||||
| Vision | ⚪ | Не налаштовано для senpai (`vision_enabled` не встановлено в config) |
|
||||
| Voice STT | ⚪ | On-demand через swapper (не тестовано реальним трафіком) |
|
||||
| Voice TTS | ⚪ | On-demand через swapper (не тестовано реальним трафіком) |
|
||||
| Crew (CrewAI) | ℹ️ | crew=❌ за конфігом (trading agent — crew не потрібен) |
|
||||
| System prompt | ✅ | **FIXED** — було placeholder `(loaded from senpai_prompt.txt)`, тепер `!file:/app/prompts/senpai_prompt.txt` (13KB, Gordon Senpai v1.1) |
|
||||
| senpai-md-consumer | ✅ | `http://localhost:8892/health` → `{"status":"ok","service":"senpai-md-consumer"}` |
|
||||
| market-data-service | ✅ | `http://localhost:8893/health` → `{"status":"ok","service":"market-data-service"}` |
|
||||
|
||||
### Issues Found
|
||||
|
||||
#### 🔴 CRITICAL
|
||||
|
||||
**ISSUE-S-01: System prompt був placeholder — Senpai відповідав як "Energy Union AI" замість "Gordon Senpai"**
|
||||
- `router-config.yml` містив буквальний рядок `(loaded from senpai_prompt.txt)` замість реального промпту
|
||||
- `prompt_builder._get_from_config()` передавав цей рядок в Grok як system_prompt
|
||||
- Grok без контексту вигадував "Energy Union" персону (з короткого placeholder)
|
||||
- **FIXED:**
|
||||
1. `prompt_builder.py` — додано підтримку `!file:/path/to/file.txt` references
|
||||
2. `docker-compose.node1.yml` — додано volume mount `gateway-bot -> /app/prompts`
|
||||
3. `router-config.yml` — `senpai.system_prompt = !file:/app/prompts/senpai_prompt.txt`
|
||||
4. Аналогічно для `sofiia` (sofiia_prompt.txt 136KB)
|
||||
- **VERIFY:** `grok-4-1-fast-reasoning` → "Я — Гордон Сэнпай, советник высшего уровня по рынкам капитала и цифровым активам" ✅
|
||||
|
||||
#### 🔴 CRITICAL (системний)
|
||||
|
||||
**ISSUE-S-02: `binance_bots_top` і `binance_account_bots` — "ghost tools" → ВИПРАВЛЕНО**
|
||||
- Обидва tools були присутні в `agent_tools_config.py` але **відсутні** в `TOOL_DEFINITIONS` і `execute_tool()` handler
|
||||
- **FIXED:** Реалізовано `_binance_bots_top()` і `_binance_account_bots()` в `tool_manager.py`
|
||||
- Сервіс `dagi-binance-bot-monitor-node1` запущений (порт 8893 внутрішній)
|
||||
- Новий Binance API ключ встановлено в `.env.node1` і задеплоєно → **HTTP 200 OK**
|
||||
- `binance_account_bots` → `SPOT, can_trade=True, TRD_GRP_072, balance=0` ✅
|
||||
- `binance_bots_top` → web_search fallback (marketplace scraping) ✅
|
||||
|
||||
#### 🟡 WARNING
|
||||
|
||||
**ISSUE-S-03: senpai_user_context колекція відсутня (404)**
|
||||
- На відміну від helion, де колекція є але порожня — у senpai її взагалі немає
|
||||
- Потрібно перевірити чому memory-service не створила цю колекцію для senpai
|
||||
|
||||
**ISSUE-S-04: senpai_docs = 0 points**
|
||||
- Для Trading Advisor — відсутні будь-які документи (аналітика, стратегії, ринкові огляди)
|
||||
- Це обмежує RAG-можливості агента
|
||||
|
||||
**ISSUE-S-05: senpai_memory_items = 0 points**
|
||||
- Аналогічно helion — `memory_items` не накопичується
|
||||
- Структуровані факти про трейдери не зберігаються у Qdrant
|
||||
|
||||
**ISSUE-S-06: System prompt — мова Russian (v1.1)**
|
||||
- `senpai_prompt.txt` написаний переважно **російською мовою** ("Версия: 1.1, Язык: русский")
|
||||
- Для українського продукту — потрібно перейти на UA/EN промпт
|
||||
|
||||
#### 🟢 INFO
|
||||
|
||||
**ISSUE-S-07: vision/STT/TTS не тестовано реальним трафіком**
|
||||
- Trading agent — мінімальна потреба у voice/vision
|
||||
- Але доступ до swapper є, on-demand навантаження норма
|
||||
|
||||
### Action Items
|
||||
|
||||
- [x] **FIXED** `services/router/prompt_builder.py` — підтримка `!file:` references для системних промптів
|
||||
- [x] **FIXED** `docker-compose.node1.yml` — volume mount `gateway-bot -> /app/prompts` для router
|
||||
- [x] **FIXED** `services/router/router-config.yml` — senpai і sofiia тепер мають `!file:` references
|
||||
- [x] **FIXED** `binance_bots_top` і `binance_account_bots` tools реалізовані в `tool_manager.py` → з'єднані з `dagi-binance-bot-monitor-node1:8893`
|
||||
- [x] **FIXED** Новий Binance API ключ встановлено в `.env.node1`, сервіс перезапущено → 200 OK
|
||||
- [ ] **CREATE** `senpai_user_context` Qdrant collection — зрозуміти чому не створилась
|
||||
- [ ] **UPLOAD** Завантажити торгові документи/аналітику в `senpai_docs` через RAG pipeline
|
||||
- [ ] **TRANSLATE** `senpai_prompt.txt` на ukrainian/english (зараз russian v1.1)
|
||||
- [ ] **TEST** Реальний тест в Telegram @SenpAI_agent_bot з типовими запитами трейдера
|
||||
133
docs/GRAPH_CONTRACT.md
Normal file
133
docs/GRAPH_CONTRACT.md
Normal file
@@ -0,0 +1,133 @@
|
||||
# Sofiia Dialog Graph — Canonical Contract v1.0
|
||||
|
||||
## Core Invariants
|
||||
|
||||
Every meaningful artifact in the Sofiia system MUST be represented in the Dialog Graph:
|
||||
|
||||
```
|
||||
1. Every artifact has a node.
|
||||
2. Every action has an edge.
|
||||
3. No artifact exists without graph presence.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Node Types
|
||||
|
||||
| node_type | ref_id points to | Created by |
|
||||
|-----------------|-----------------------|------------------------------------|
|
||||
| `message` | message_id (session) | session message handler |
|
||||
| `task` | tasks.task_id | `create_task()` — atomically |
|
||||
| `meeting` | meetings.meeting_id | `create_meeting()` — atomically |
|
||||
| `doc` | documents.doc_id | document upload/create |
|
||||
| `agent_run` | run_id (supervisor) | `create_evidence_pack()` |
|
||||
| `ops_run` | job_id (ops) | ops job completion hook |
|
||||
| `repo_changeset`| changeset_id | repo diff / PR tracking |
|
||||
| `pull_request` | PR number/id | PR flow integration |
|
||||
| `decision` | decision_id | explicit decision recording |
|
||||
| `goal` | goal_id | strategic goal setting |
|
||||
|
||||
---
|
||||
|
||||
## Edge Types
|
||||
|
||||
| edge_type | Meaning | Example |
|
||||
|---------------------|-------------------------------------------|------------------------------------|
|
||||
| `references` | A mentions/cites B | message → doc |
|
||||
| `summarizes` | A is a summary of B | doc → session |
|
||||
| `derives_task` | A produced task B | message → task |
|
||||
| `updates_doc` | A updates/modifies doc B | ops_run → doc |
|
||||
| `schedules_meeting` | A scheduled meeting B | message → meeting |
|
||||
| `resolves` | A resolves/closes B | task → task (blocker resolved) |
|
||||
| `blocks` | A blocks B | task → task |
|
||||
| `relates_to` | A is related to B | any → any |
|
||||
| `produced_by` | B was produced by run A | agent_run → task/doc |
|
||||
| `executed_as` | plan A was executed as ops_run B | decision → ops_run |
|
||||
|
||||
---
|
||||
|
||||
## Atomic Creation Rules
|
||||
|
||||
When creating an artifact, the node MUST be created in the same SQLite transaction:
|
||||
|
||||
```python
|
||||
# CORRECT: task + node in one BEGIN...COMMIT
|
||||
await db.execute("BEGIN")
|
||||
await db.execute("INSERT INTO tasks ...")
|
||||
await db.execute("INSERT INTO dialog_nodes ... ON CONFLICT DO UPDATE")
|
||||
await db.commit()
|
||||
|
||||
# WRONG: two separate commits
|
||||
await create_task(...) # commit 1
|
||||
await upsert_dialog_node(...) # commit 2 — can diverge
|
||||
```
|
||||
|
||||
Functions that guarantee atomicity:
|
||||
- `db.create_task()` — always upserts task node
|
||||
- `db.create_meeting()` — always upserts meeting node
|
||||
- `db.create_evidence_pack()` — creates agent_run node + derived task nodes + edges
|
||||
|
||||
---
|
||||
|
||||
## Evidence Pack
|
||||
|
||||
After every Supervisor run, an Evidence Pack MUST be recorded:
|
||||
|
||||
```json
|
||||
{
|
||||
"run_id": "<uuid>",
|
||||
"graph_name": "release_check|incident_triage|...",
|
||||
"status": "completed",
|
||||
"summary": "...",
|
||||
"findings": [...],
|
||||
"recommendations": [...],
|
||||
"follow_up_tasks": [
|
||||
{"title": "...", "description": "...", "priority": "normal|high|urgent"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
This creates:
|
||||
1. `agent_run` dialog node
|
||||
2. `doc_version` with evidence markdown (if evidence_log.md exists in project)
|
||||
3. `task` nodes for each follow_up_task (in `backlog` with label `evidence`)
|
||||
4. `produced_by` edges: agent_run → each task node
|
||||
|
||||
---
|
||||
|
||||
## Integrity Checks
|
||||
|
||||
Run `GET /api/projects/{id}/graph/integrity` to verify:
|
||||
|
||||
| Check | Description |
|
||||
|-------------------------|------------------------------------------------------|
|
||||
| `orphaned_edge_from` | Edges referencing non-existent from_node |
|
||||
| `orphaned_edge_to` | Edges referencing non-existent to_node |
|
||||
| `dangling_task_nodes` | `node_type=task` nodes with no matching task row |
|
||||
| `dangling_meeting_nodes`| `node_type=meeting` nodes with no matching meeting |
|
||||
| `self_loop_edges` | Edges where from_node_id == to_node_id |
|
||||
|
||||
**Expected**: `{"ok": true, "violations": []}`
|
||||
|
||||
---
|
||||
|
||||
## DDL Freeze
|
||||
|
||||
As of v1.0, the schema is **frozen**. Any schema changes require:
|
||||
1. A migration file in `services/sofiia-console/migrations/`
|
||||
2. Update to this contract document
|
||||
3. Update to `tests/test_graph_integrity.py`
|
||||
|
||||
Current canonical DDL: `services/sofiia-console/app/db.py` (init_db function)
|
||||
|
||||
---
|
||||
|
||||
## Quality Gates
|
||||
|
||||
Before merging any feature that touches artifacts:
|
||||
|
||||
| Gate | Check |
|
||||
|---------------------|----------------------------------------------|
|
||||
| **Reproducibility** | Does the feature create a node + edge? |
|
||||
| **Safety** | Is creation atomic (single transaction)? |
|
||||
| **Observability** | Does `GET /graph/integrity` stay `ok: true`? |
|
||||
147
docs/HUMANIZED_STEPAN_v2.7_CHANGELOG.md
Normal file
147
docs/HUMANIZED_STEPAN_v2.7_CHANGELOG.md
Normal file
@@ -0,0 +1,147 @@
|
||||
# Humanized Stepan — CHANGELOG v2.7
|
||||
|
||||
**Version:** v2.7
|
||||
**Date:** 2026-02-25
|
||||
**Базується на:** v2.6 (Jaccard guard, tone_constraints, 3-рівневі привітання, seeded RNG)
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
- Додано **memory horizon**: `recent_topics` (до 5 записів) замість єдиного `last_topic`.
|
||||
- Додано **human topic labels** (`last_topic_label`) — Степан оперує "план на завтра поле 12", а не "plan_day".
|
||||
- Додано **`summarize_topic_label()`** — rule-based витяг 6–8 слів з тексту без дієслів-тригерів і стоп-слів.
|
||||
- Light follow-up (≤6 слів + last_topic) **не додає шум** до `recent_topics` (`depth="light"` → `push` не відбувається).
|
||||
- Contextual greeting (`interaction_count ≥ 8`) тепер: з ймовірністю 20% (seeded rng) підхоплює `recent_topics[-2]` — Степан "пам'ятає" більше однієї теми без подвійного згадування.
|
||||
- **ZZR safety disclaimer**: якщо погодний тригер + обприскування/гербіцид/ЗЗР — автоматично додається `"Дозування та вікна застосування — за етикеткою препарату та регламентом."`.
|
||||
- Додано **`tests/test_stepan_invariants.py`** — 25 тестів-інваріантів проти "повзучої ботячості".
|
||||
|
||||
---
|
||||
|
||||
## Key features (деталі)
|
||||
|
||||
### Memory horizon — `recent_topics`
|
||||
|
||||
```json
|
||||
"recent_topics": [
|
||||
{"label": "план на завтра поле 12", "intent": "plan_day", "ts": "2026-02-25T..."},
|
||||
{"label": "датчики вологості поле 7", "intent": "iot_sensors", "ts": "2026-02-25T..."}
|
||||
]
|
||||
```
|
||||
|
||||
- Максимум 5 записів; старіші витісняються.
|
||||
- `last_topic` і `last_topic_label` — backward-compat aliases на `recent_topics[-1]`.
|
||||
- Dedup: якщо той самий `intent` + `label` підряд — не дублюється.
|
||||
|
||||
### summarize_topic_label
|
||||
|
||||
| Вхід | Вихід |
|
||||
|---|---|
|
||||
| `"зроби план на завтра по полю 12"` | `"План на завтра по полю 12"` |
|
||||
| `"перевір датчики вологості поле 7"` | `"Датчики вологості поле 7"` |
|
||||
| `"сплануй тижневий збір по полях"` | `"Тижневий збір по полях"` |
|
||||
|
||||
Правила: прибирається leading action verb (зроби/перевір/порахуй/…), стоп-слова, обрізка до 8 слів. Числа, поля, культури, дати зберігаються.
|
||||
|
||||
### ZZR disclaimer
|
||||
|
||||
Regex `_ZZR_RE` спрацьовує на: `обробк|обприскування|гербіцид|фунгіцид|ЗЗР|пестицид|інсектицид|протруювач`.
|
||||
Застереження додається лише коли є **і** погодний тригер **і** ZZR-тригер в одному повідомленні.
|
||||
|
||||
### Invariant tests (anti-regression)
|
||||
|
||||
| Інваріант | Обмеження |
|
||||
|---|---|
|
||||
| INV-1: Greeting | ≤ 80 символів |
|
||||
| INV-2: Thanks/Ack | ≤ 40 символів |
|
||||
| INV-3: Заборонені фрази | "чим можу допомогти", "оберіть", "я як агент", "я бот" |
|
||||
| INV-4: Технічні слова | container, uvicorn, trace_id, STEPAN_IMPORTS_OK |
|
||||
| INV-5: ZZR disclaimer | при ZZR+погода → "за етикеткою" або "за регламентом" |
|
||||
| INV-6: Horizon | `len(recent_topics) ≤ 5` після 7+ push |
|
||||
| INV-7: Міграція | lazy, idempotent, backward-compat |
|
||||
|
||||
---
|
||||
|
||||
## Backward compatibility
|
||||
|
||||
| Аспект | Деталі |
|
||||
|---|---|
|
||||
| `_version` | 3 → 4 (нові поля `recent_topics`, `last_topic_label`) |
|
||||
| Міграція | Lazy при `load_user_profile()` — виконується автоматично при першому зверненні |
|
||||
| `last_topic` | Залишається як alias, завжди синхронізований з `recent_topics[-1].intent` |
|
||||
| `last_topic_label` | Новий alias на `recent_topics[-1].label`; якщо нема — встановлюється під час міграції |
|
||||
| `tone_constraints` | Вже в v2.6; міграція додає якщо відсутній |
|
||||
| `update_profile_if_needed` | Новий параметр `depth="deep"` (default) — backward-compat, старі виклики не ламаються |
|
||||
| `recent_topics` відсутній | Якщо профіль v3 без `recent_topics` — `migrate_profile_topics()` створює 1 елемент з `last_topic` |
|
||||
|
||||
Міграція `migrate_profile_topics()` — **idempotent**: повторний виклик не змінює вже мігрований профіль.
|
||||
|
||||
---
|
||||
|
||||
## Non-goals / not included
|
||||
|
||||
- Немає LLM у light mode або reflection.
|
||||
- Немає змін в інфраструктурі (Dockerfile, compose, env).
|
||||
- Немає змін у Gateway/http_api.py.
|
||||
- Немає нових API ендпоінтів.
|
||||
- Немає змін у поведінці deep mode orchestration.
|
||||
- Немає змін у системному промпті (тільки хедер-версія).
|
||||
|
||||
---
|
||||
|
||||
## Tests
|
||||
|
||||
**Результат:** 101/101 зелених (без регресій з v2.6)
|
||||
|
||||
| Файл | Тестів | Опис |
|
||||
|---|---|---|
|
||||
| `tests/test_stepan_invariants.py` | 25 | Нові інваріанти anti-regression |
|
||||
| `tests/test_stepan_acceptance.py` | 28 | Acceptance + v2.7 сесійні сценарії |
|
||||
| `tests/test_stepan_light_reply.py` | ~26 | Light reply юніт-тести |
|
||||
| `tests/test_stepan_memory_followup.py` | ~22 | Memory + follow-up класифікація |
|
||||
|
||||
```bash
|
||||
# Тільки інваріанти
|
||||
python3 -m pytest tests/test_stepan_invariants.py -v
|
||||
|
||||
# Acceptance
|
||||
python3 -m pytest tests/test_stepan_acceptance.py -v
|
||||
|
||||
# Всі Stepan тести
|
||||
python3 -m pytest tests/test_stepan_invariants.py tests/test_stepan_acceptance.py \
|
||||
tests/test_stepan_light_reply.py tests/test_stepan_memory_followup.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Known limitations
|
||||
|
||||
### Timezone і daily seed
|
||||
`date.today()` використовує локаль контейнера. Контейнер має бути в `Europe/Kyiv` (`TZ=Europe/Kyiv`), інакше "новий день" Степана настане о 22:00 або 23:00 за Київським часом. Перевірка:
|
||||
```bash
|
||||
docker exec dagi-gateway-node1 date
|
||||
```
|
||||
|
||||
### Memory-service downtime
|
||||
При недоступності — деградація до локального in-memory кешу (TTL 30 хв). Кеш не переживає рестарт контейнера. Профілі не зберігаються між сесіями якщо memory-service down > 30 хв.
|
||||
|
||||
### ZZR regex — можливий overreach
|
||||
Слово `"обробка"` без агрохімічного контексту (напр. "обробка ґрунту") може спрацювати. Якщо в проді виявиться шум — звузити regex: вимагати ще одне слово з `[препарат|норма|л/га|кг/га|концентрат]`.
|
||||
|
||||
---
|
||||
|
||||
## Rollback
|
||||
|
||||
```bash
|
||||
# Відкатити зміни у конкретних файлах
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/memory_manager.py
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/light_reply.py
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/run.py
|
||||
|
||||
# Rebuild gateway (без секретів)
|
||||
cd /opt/microdao-daarion
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
|
||||
# Перевірка
|
||||
docker logs dagi-gateway-node1 --since 5m 2>&1 | grep -E "Stepan mode|STEPAN_IMPORTS_OK|error|Error" | tail -30
|
||||
```
|
||||
139
docs/HUMANIZED_STEPAN_v2.7_RELEASE_CHECKLIST.md
Normal file
139
docs/HUMANIZED_STEPAN_v2.7_RELEASE_CHECKLIST.md
Normal file
@@ -0,0 +1,139 @@
|
||||
# Humanized Stepan — Release Checklist
|
||||
|
||||
**Version:** v3 (оновлено з v2.7) | **Date:** 2026-02-24
|
||||
|
||||
---
|
||||
|
||||
## PRE-DEPLOY
|
||||
|
||||
- [ ] **Тести пройдені локально (232/232)**
|
||||
```bash
|
||||
python3 -m pytest \
|
||||
tests/test_stepan_invariants.py tests/test_stepan_acceptance.py \
|
||||
tests/test_stepan_light_reply.py tests/test_stepan_memory_followup.py \
|
||||
tests/test_stepan_telemetry.py tests/test_stepan_v28_farm.py \
|
||||
tests/test_stepan_v29_consolidation.py \
|
||||
tests/test_stepan_v3_session_proactivity_stability.py -v
|
||||
```
|
||||
|
||||
- [ ] **Diff review** — перевірити, що змінені тільки:
|
||||
- `crews/agromatrix_crew/session_context.py` (новий файл — v3)
|
||||
- `crews/agromatrix_crew/proactivity.py` (новий файл — v3)
|
||||
- `crews/agromatrix_crew/depth_classifier.py` (stability guard + `session=` param)
|
||||
- `crews/agromatrix_crew/run.py` (3 мінімальних гачки session/proactivity)
|
||||
- `tests/test_stepan_v3_session_proactivity_stability.py` (новий файл)
|
||||
- `docs/*.md` (документація, не runtime)
|
||||
|
||||
- [ ] **Env перевірка**
|
||||
```bash
|
||||
# На НОДА1 (значення масковані — тільки наявність)
|
||||
ssh root@144.76.224.179 "docker exec dagi-gateway-node1 env \
|
||||
| grep -E '^AGX_OPERATOR_IDS=|^AGX_STEPAN_MODE=|^TZ=' | sed 's/=.*/=***/' "
|
||||
```
|
||||
- [ ] `AGX_STEPAN_MODE=inproc`
|
||||
- [ ] `TZ=Europe/Kyiv`
|
||||
- [ ] `AGX_OPERATOR_IDS` не порожній
|
||||
|
||||
- [ ] **memory-service доступний**
|
||||
```bash
|
||||
docker exec dagi-gateway-node1 curl -s http://memory-service:8000/health
|
||||
```
|
||||
|
||||
- [ ] **Rollback plan підготовлений** — знати попередній image tag або commit hash
|
||||
|
||||
---
|
||||
|
||||
## DEPLOY
|
||||
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
|
||||
# 1. Pull змін
|
||||
git pull origin main # або потрібна гілка
|
||||
|
||||
# 2. Rebuild тільки gateway
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
|
||||
# 3. Перевірка старту (чекати ~30 сек)
|
||||
sleep 30
|
||||
docker logs dagi-gateway-node1 --since 1m 2>&1 | grep -E "Stepan mode|STEPAN_IMPORTS_OK" | tail -5
|
||||
```
|
||||
|
||||
**Очікувані рядки в логах після старту:**
|
||||
```
|
||||
Stepan mode: inproc
|
||||
STEPAN_IMPORTS_OK=True
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## POST-DEPLOY
|
||||
|
||||
### Health перевірка
|
||||
```bash
|
||||
# Логи без помилок
|
||||
docker logs dagi-gateway-node1 --since 5m 2>&1 \
|
||||
| grep -E "ImportError|ModuleNotFoundError|Stepan disabled|ERROR" | wc -l
|
||||
# Очікується: 0
|
||||
```
|
||||
|
||||
### 5 Smoke сценаріїв (Telegram, від оператора)
|
||||
|
||||
| # | Повідомлення | Очікування | Лог-перевірка |
|
||||
|---|---|---|---|
|
||||
| 1 | `Привіт` | ≤80 символів, без "чим допомогти" | `depth=light, crew_launch=false, session_updated` |
|
||||
| 2 | `Зроби план на завтра по полю 12` | Deep відповідь, crew запущений | `depth=deep, crew_launch=true, topics_push=true, session_updated` |
|
||||
| 3 | `а на післязавтра?` | Light, підхоплює тему без нового push; якщо попереднє було light — `stability_guard_triggered` | `depth=light, topics_push=false, session_updated` |
|
||||
| 4 | `обприскування гербіцидом якщо дощ` | Light + disclaimer "за етикеткою"/"за регламентом" | `depth=light` |
|
||||
| 5 | `Дякую` | ≤40 символів, без питань | `depth=light, crew_launch=false` |
|
||||
|
||||
### Спостереження telemetry v3 (30–60 хв після деплою)
|
||||
|
||||
```bash
|
||||
# Session events (перевірити що є, не занадто багато expired)
|
||||
docker logs dagi-gateway-node1 --since 1h 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC session_" | tail -80
|
||||
|
||||
# Stability guard (має бути, але не домінувати)
|
||||
docker logs dagi-gateway-node1 --since 1h 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC stability_guard_triggered" | tail -50
|
||||
|
||||
# Proactivity (має бути рідко)
|
||||
docker logs dagi-gateway-node1 --since 1h 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC proactivity_added" | tail -20
|
||||
```
|
||||
|
||||
### Memory validate
|
||||
```bash
|
||||
# Перевірити що profile зберігся після deep взаємодії
|
||||
# (через memory-service API або логи)
|
||||
docker logs dagi-gateway-node1 --since 10m 2>&1 | grep -E "UserProfile.*updated|FarmProfile.*updated" | tail -10
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ROLLBACK TRIGGER CONDITIONS
|
||||
|
||||
Негайний rollback якщо:
|
||||
- [ ] `Stepan disabled` у логах після старту
|
||||
- [ ] `ModuleNotFoundError` або `ImportError` у логах
|
||||
- [ ] Більше 3 помилок типу `500` у gateway за 5 хв після деплою
|
||||
- [ ] `light_rate < 0.40` за 30+ повідомлень (занадто багато deep)
|
||||
- [ ] ZZR disclaimer з'являється на не-ЗЗР контекст > 3 рази за сесію
|
||||
|
||||
**v3-специфічні тригери:**
|
||||
- [ ] `proactivity_added` > 3 рази за 30 хв в одному чаті → перевірити `interaction_count` логіку
|
||||
- [ ] `stability_guard_triggered` домінує і deep майже зник (`light_rate > 0.90` при `total >= 30`) → guard надто агресивний
|
||||
- [ ] `session_expired` > 20/год на активному чаті → перевірити TZ контейнера (`docker exec dagi-gateway-node1 date`)
|
||||
|
||||
```bash
|
||||
# Швидкий rollback (v3-файли)
|
||||
cd /opt/microdao-daarion
|
||||
git checkout HEAD~1 -- \
|
||||
crews/agromatrix_crew/run.py \
|
||||
crews/agromatrix_crew/depth_classifier.py
|
||||
# Якщо потрібно прибрати нові модулі повністю:
|
||||
# git checkout HEAD~1 -- crews/agromatrix_crew/session_context.py
|
||||
# git checkout HEAD~1 -- crews/agromatrix_crew/proactivity.py
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
```
|
||||
465
docs/HUMANIZED_STEPAN_v2.7_RUNBOOK.md
Normal file
465
docs/HUMANIZED_STEPAN_v2.7_RUNBOOK.md
Normal file
@@ -0,0 +1,465 @@
|
||||
# Humanized Stepan — Production Runbook
|
||||
|
||||
**Version:** v3 (оновлено з v2.7)
|
||||
**Date:** 2026-02-24
|
||||
**Scope:** crews/agromatrix_crew (in-process Stepan, AGX_STEPAN_MODE=inproc)
|
||||
|
||||
---
|
||||
|
||||
## A) Purpose / Scope
|
||||
|
||||
Цей runbook описує операційний контроль Humanized Stepan (v2.7 → v3) у виробничому середовищі НОДА1.
|
||||
Охоплює: перевірку справності, 5 smoke-сценаріїв, troubleshooting, rollback, v3 observability.
|
||||
|
||||
**Поза scope:** crewai-service HTTP mode (AGX_STEPAN_MODE=http), інші агенти.
|
||||
|
||||
---
|
||||
|
||||
## B) Preconditions
|
||||
|
||||
Перед smoke-тестуванням перевірити:
|
||||
|
||||
```bash
|
||||
# 1. Степан увімкнений
|
||||
docker exec dagi-gateway-node1 env | grep -E "AGX_STEPAN_MODE|STEPAN_IMPORTS_OK" | sed 's/=.*/=***/'
|
||||
|
||||
# 2. Оператор налаштований
|
||||
docker exec dagi-gateway-node1 env | grep -E "AGX_OPERATOR_IDS|AGX_OPERATOR_CHAT_ID" | sed 's/=.*/=***/'
|
||||
|
||||
# 3. Memory-service доступний
|
||||
docker exec dagi-gateway-node1 curl -s http://memory-service:8000/health | head -1
|
||||
|
||||
# 4. Timezone
|
||||
docker exec dagi-gateway-node1 date
|
||||
# Очікується: Europe/Kyiv або EET/EEST
|
||||
|
||||
# 5. Crews і tools на місці
|
||||
docker exec dagi-gateway-node1 ls /app/crews/agromatrix_crew/ | head -5
|
||||
docker exec dagi-gateway-node1 python3 -c "import agromatrix_tools; print('OK')"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## C) 5 Live Smoke Scenarios (Telegram)
|
||||
|
||||
Надсилаються оператором у чат, де активний Степан.
|
||||
|
||||
---
|
||||
|
||||
### Сценарій 1: Новий / невідомий user — Нейтральне привітання
|
||||
|
||||
**Повідомлення:** `Привіт`
|
||||
|
||||
**Очікування:**
|
||||
- Відповідь: 1 коротка фраза, ≤ 80 символів
|
||||
- Без "чим можу допомогти", без питання-списку
|
||||
- Для першого звернення (interaction_count ≤ 2): нейтральна форма ("Привіт. Що зараз важливіше: план чи статуси?")
|
||||
|
||||
**Grep у логах:**
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 2m 2>&1 | grep -E "depth=light|crew_launch=false"
|
||||
```
|
||||
|
||||
**Очікується:** `depth=light`, `crew_launch=false`
|
||||
|
||||
---
|
||||
|
||||
### Сценарій 2: Deep запит — тема записується в recent_topics
|
||||
|
||||
**Повідомлення:** `Зроби план на завтра по полю 12`
|
||||
|
||||
**Очікування:**
|
||||
- Степан запускає orchestration (deep)
|
||||
- Відповідь: план або уточнюючі питання
|
||||
- `recent_topics` поповнюється записом типу `{"label": "план на завтра по полю 12", "intent": "plan_day", ...}`
|
||||
|
||||
**Grep у логах:**
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 2m 2>&1 | grep -E "depth=deep|crew_launch=true|topics_push=true"
|
||||
```
|
||||
|
||||
**Очікується:** `depth=deep`, `crew_launch=true`, `topics_push=true`
|
||||
|
||||
---
|
||||
|
||||
### Сценарій 3: Light follow-up — тема НЕ додається повторно
|
||||
|
||||
**Повідомлення:** `а на післязавтра?` (одразу після сценарію 2)
|
||||
|
||||
**Очікування:**
|
||||
- Відповідь: коротка, підхоплює тему ("план на завтра по полю 12" або подібне)
|
||||
- `recent_topics` не змінюється (no new push)
|
||||
- Crew не запускається
|
||||
- **v3:** якщо сценарій 2 був light — `stability_guard_triggered` в логах замість стандартної класифікації
|
||||
|
||||
**Grep у логах:**
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 2m 2>&1 | grep -E "depth=light|topics_push=false|crew_launch=false|stability_guard_triggered"
|
||||
```
|
||||
|
||||
**Очікується:** `depth=light`, `topics_push=false`, `crew_launch=false`
|
||||
|
||||
---
|
||||
|
||||
### Сценарій 4: Weather + ZZR — disclaimer обов'язковий
|
||||
|
||||
**Повідомлення:** `обприскування гербіцидом — якщо дощ сьогодні?`
|
||||
|
||||
**Очікування:**
|
||||
- Відповідь містить практичну пораду по погоді (light mode)
|
||||
- Відповідь **обов'язково** містить: `"за етикеткою"` або `"за регламентом"`
|
||||
- Crew не запускається
|
||||
|
||||
**Grep у логах:**
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 2m 2>&1 | grep -E "depth=light|weather|crew_launch=false"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Сценарій 5: Подяка — коротко, без питань
|
||||
|
||||
**Повідомлення:** `Дякую`
|
||||
|
||||
**Очікування:**
|
||||
- Відповідь: 2–5 слів, ≤ 40 символів
|
||||
- Без питань
|
||||
- Без "будь ласка, звертайтесь", без довгих формулювань
|
||||
|
||||
**Grep у логах:**
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 2m 2>&1 | grep -E "depth=light|crew_launch=false"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## D) Telemetry Tag і Log Grep Patterns
|
||||
|
||||
### Telemetry Tag (v2.7.1)
|
||||
|
||||
Усі ключові метричні рядки мають єдиний префікс **`AGX_STEPAN_METRIC`**.
|
||||
Формат: `AGX_STEPAN_METRIC <event> key=value key2=value2`
|
||||
|
||||
| Event | Ключі | Де генерується |
|
||||
|---|---|---|
|
||||
| `depth` | `depth=light\|deep reason=...` | `depth_classifier.py` |
|
||||
| `crew_launch` | `launched=true\|false depth=...` | `run.py` |
|
||||
| `topics_push` | `pushed=true\|false intent=... label=... horizon=N` | `memory_manager.py` |
|
||||
| `memory_save` | `entity=UserProfile\|FarmProfile ok=true` | `memory_manager.py` |
|
||||
| `memory_fallback` | `entity=... reason=memory_service_unavailable` | `memory_manager.py` |
|
||||
| `memory_summary_updated` | `user_id=...` | `memory_manager.py` |
|
||||
| `reflection_done` | `confidence=0.NN clarifying=true\|false new_facts=[...]` | `reflection_engine.py` |
|
||||
| `reflection_skip` | `reason=recursion_guard\|error` | `reflection_engine.py` |
|
||||
| `session_loaded` | `chat_id=h:... status=new\|hit last_depth=...` | `session_context.py` |
|
||||
| `session_expired` | `chat_id=h:... age_s=N` | `session_context.py` |
|
||||
| `session_updated` | `chat_id=h:... depth=... agents=[...]` | `session_context.py` |
|
||||
| `stability_guard_triggered` | `chat_id=n/a words=N last_depth=light` | `depth_classifier.py` |
|
||||
| `proactivity_added` | `user_id=h:... intent=... style=...` | `proactivity.py` |
|
||||
| `proactivity_skipped` | `user_id=h:... reason=...` | `proactivity.py` |
|
||||
|
||||
### Grep one-liners (уніфіковані)
|
||||
|
||||
```bash
|
||||
# ─── Усі метричні рядки Степана ─────────────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC" | tail -50
|
||||
|
||||
# ─── Тільки depth (класифікація режиму) ─────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC depth"
|
||||
|
||||
# ─── Тільки crew_launch ──────────────────────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC crew_launch"
|
||||
|
||||
# ─── Тільки topics_push ──────────────────────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC topics_push"
|
||||
|
||||
# ─── Memory fallback (аларм) ─────────────────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC memory_fallback"
|
||||
|
||||
# ─── light_rate (тільки tagged рядки) ────────────────────────────────────────
|
||||
L=$(docker logs dagi-gateway-node1 --since 60m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC depth" | grep -c "depth=light")
|
||||
D=$(docker logs dagi-gateway-node1 --since 60m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC depth" | grep -c "depth=deep")
|
||||
T=$((L + D))
|
||||
if [ "$T" -ge 10 ]; then
|
||||
echo "light=$L deep=$D total=$T light_rate=$(echo "scale=2; $L/$T" | bc)"
|
||||
else
|
||||
echo "light=$L deep=$D total=$T — замало даних (< 10), не робити висновків"
|
||||
fi
|
||||
```
|
||||
|
||||
**Норма light_rate:** 0.60–0.80 для типового оператора.
|
||||
Нижче 0.50 → перевірити `_DEEP_ACTION_RE` у `depth_classifier.py` + запустити `test_stepan_invariants.py`.
|
||||
|
||||
```bash
|
||||
# ─── v3: Session events (сесійний шар) ───────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 2h 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC session_" | tail -80
|
||||
|
||||
# ─── v3: Stability guard ─────────────────────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 2h 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC stability_guard_triggered" | tail -50
|
||||
|
||||
# ─── v3: Proactivity ─────────────────────────────────────────────────────────
|
||||
docker logs dagi-gateway-node1 --since 2h 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC proactivity_added" | tail -50
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## E) PII-safe Telemetry (v2.7.2)
|
||||
|
||||
### Що анонімізується
|
||||
|
||||
Ключі `user_id` і `chat_id` у будь-якому `tlog()` виклику **автоматично** замінюються на хеш-псевдонім формату `h:<10 hex символів>`:
|
||||
|
||||
```
|
||||
AGX_STEPAN_METRIC memory_save entity=UserProfile user_id=h:3f9a12b4c7 ok=true
|
||||
```
|
||||
|
||||
Сирі ідентифікатори у `AGX_STEPAN_METRIC` рядках **відсутні**.
|
||||
|
||||
### Формат псевдоніму
|
||||
|
||||
```
|
||||
h: + sha256(raw_id)[:10] → "h:3f9a12b4c7"
|
||||
```
|
||||
|
||||
Завжди 12 символів. Стабільний для одного `user_id` між рестартами та логами.
|
||||
|
||||
### Кореляція подій одного користувача
|
||||
|
||||
Щоб знайти всі події одного користувача у логах (не знаючи сирого id):
|
||||
|
||||
```bash
|
||||
# Знайти псевдонім вручну (виконати разом з оператором):
|
||||
python3 -c "import hashlib; print('h:' + hashlib.sha256(b'<raw_user_id>').hexdigest()[:10])"
|
||||
|
||||
# Потім grep:
|
||||
docker logs dagi-gateway-node1 --since 60m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC" | grep "h:3f9a12b4c7"
|
||||
```
|
||||
|
||||
### Важливі застереження
|
||||
|
||||
- Це **не** криптографічна анонімізація. Якщо атакуючий знає `user_id` — він може відновити псевдонім і знайти події.
|
||||
- Захищає від **випадкового** витоку у лог-агрегаторах (Loki, ELK, CloudWatch), де до логів мають доступ більше людей, ніж до БД.
|
||||
- **Доступ до логів контейнера** має бути обмежений тільки для DevOps/операторів.
|
||||
- Якщо потрібна повна GDPR/DPIA відповідність — застосуйте окрему маскування перед відправкою в зовнішній лог-сервіс.
|
||||
|
||||
---
|
||||
|
||||
## K) v3 Additions — Session / Proactivity / Stability Guard
|
||||
|
||||
### K1) Session Context Layer
|
||||
|
||||
**Що це:** in-memory кеш сесії на `chat_id`, TTL 15 хвилин.
|
||||
**Зберігає:**
|
||||
- `last_messages` (до 3 повідомлень)
|
||||
- `last_depth` (`"light"` / `"deep"`)
|
||||
- `last_agents` (до 5 назв агентів)
|
||||
- `last_question` — уточнюючий запит від reflection, якщо був
|
||||
|
||||
**Важливо:**
|
||||
- Сесія **не** пишеться у memory-service — тільки в оперативній пам'яті процесу.
|
||||
- При рестарті контейнера сесія скидається — це очікувано (TTL 15 хв).
|
||||
- При `session_expired` стан повертається в default без втрати профілів.
|
||||
|
||||
**Telemetry:**
|
||||
```
|
||||
AGX_STEPAN_METRIC session_loaded chat_id=h:... status=new|hit
|
||||
AGX_STEPAN_METRIC session_expired chat_id=h:... age_s=N
|
||||
AGX_STEPAN_METRIC session_updated chat_id=h:... depth=... agents=[...]
|
||||
```
|
||||
|
||||
**Норма `session_expired`:** поодинокі. Якщо > 20/год на активному чаті — перевірити системний час контейнера (`docker exec dagi-gateway-node1 date`). Можлива причина: контейнер в UTC, а TZ операторів — Europe/Kyiv.
|
||||
|
||||
---
|
||||
|
||||
### K2) Intent Stability Guard
|
||||
|
||||
**Що це:** короткий follow-up після light-взаємодії не може випадково потрапити в deep.
|
||||
|
||||
**Умови спрацювання (всі одночасно):**
|
||||
- `session.last_depth == "light"`
|
||||
- Кількість слів ≤ 6
|
||||
- Немає action verbs (`_DEEP_ACTION_RE`)
|
||||
- Немає urgent слів (`_DEEP_URGENT_RE`)
|
||||
|
||||
**Перебивається:** будь-яке action verb або urgent слово — guard не спрацьовує і класифікація йде звичайним шляхом.
|
||||
|
||||
**Telemetry:**
|
||||
```
|
||||
AGX_STEPAN_METRIC stability_guard_triggered chat_id=n/a words=N last_depth=light
|
||||
```
|
||||
|
||||
**Норма:** 20–40% від усіх light-повідомлень після активної сесії — це нормально.
|
||||
**Аларм:** якщо `stability_guard_triggered` домінує (> 90% від depth events) і deep майже зник — guard надто агресивний. Розслідувати, чи немає регресії у action verb regex.
|
||||
|
||||
---
|
||||
|
||||
### K3) Soft Proactivity Layer
|
||||
|
||||
**Що це:** рівно 1 коротке речення ≤ 120 символів, без `!`, додається в кінець deep-відповіді.
|
||||
|
||||
**Умови (всі одночасно):**
|
||||
1. `depth == "deep"`
|
||||
2. `reflection.confidence >= 0.7` (або reflection відсутній)
|
||||
3. `interaction_count % 10 == 0`
|
||||
4. В `known_intents` один intent зустрівся ≥ 3 рази
|
||||
5. НЕ (`preferred_style == "brief"` AND відповідь вже містить `"?"`)
|
||||
|
||||
**Банки фраз:** 4 банки — generic, iot, plan, sustainability. Вибір seeded за `user_id + interaction_count`.
|
||||
|
||||
**Telemetry:**
|
||||
```
|
||||
AGX_STEPAN_METRIC proactivity_added user_id=h:... intent=... style=...
|
||||
AGX_STEPAN_METRIC proactivity_skipped user_id=h:... reason=not_deep|not_tenth|...
|
||||
```
|
||||
|
||||
**Норма:** рідко — 1 раз на ~10 deep-взаємодій з постійним користувачем. Якщо `proactivity_added` > 3 рази за 30 хв в одному чаті — перевірити `interaction_count` логіку.
|
||||
|
||||
---
|
||||
|
||||
## F) Troubleshooting
|
||||
|
||||
### Memory-service недоступний
|
||||
**Симптом:** у логах `UserProfile fallback` або `memory.*timeout`
|
||||
**Поведінка:** Степан продовжує роботу з in-memory кешем (TTL 30 хв). Профілі не зберігаються між рестартами.
|
||||
**Дія:** перевірити memory-service:
|
||||
```bash
|
||||
docker ps | grep memory-service
|
||||
docker logs memory-service --since 10m 2>&1 | tail -30
|
||||
```
|
||||
|
||||
### Дивна повторюваність відповідей між днями
|
||||
**Симптом:** Степан відповідає однаково кілька днів підряд (не змінюється щодня)
|
||||
**Причина:** TZ контейнера — UTC замість Europe/Kyiv; `date.today()` повертає UTC-дату
|
||||
**Дія:**
|
||||
```bash
|
||||
docker exec dagi-gateway-node1 date
|
||||
# Якщо не Kyiv — додати в docker-compose.node1.yml:
|
||||
# environment:
|
||||
# TZ: "Europe/Kyiv"
|
||||
```
|
||||
|
||||
### Занадто багато deep-запусків
|
||||
**Симптом:** `crew_launch=true` на прості запити ("ок", "зрозумів")
|
||||
**Причина:** регресія у action-verb regex або новий тригер у `_DEEP_ACTION_RE`
|
||||
**Дія:**
|
||||
```bash
|
||||
# Перевірити depth_classifier.py — порівняти _DEEP_ACTION_RE з референсом v2.7
|
||||
# Запустити інваріантні тести
|
||||
python3 -m pytest tests/test_stepan_invariants.py tests/test_stepan_memory_followup.py -v
|
||||
```
|
||||
|
||||
### ZZR disclaimer надто часто (false positives)
|
||||
**Симптом:** "обробка ґрунту після дощу" отримує disclaimer
|
||||
**Причина:** `_ZZR_RE` чіпляє загальне "обробк"
|
||||
**Дія:** звузити regex — додати вимогу другого слова:
|
||||
```python
|
||||
# Поточний: r'\b(обробк|обприскування|...)\w*\b'
|
||||
# Звужений: вимагати [препарат|норма|л/га|кг/га] поруч
|
||||
```
|
||||
Це зміна в `light_reply.py` — перед внесенням перезапустити `test_stepan_invariants.py::test_inv5_*`.
|
||||
|
||||
### Степан не відповідає (Stepan disabled)
|
||||
**Симптом:** у логах `Stepan disabled` або `STEPAN_IMPORTS_OK=False`
|
||||
**Дія:**
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 5m 2>&1 | grep -E "ImportError|ModuleNotFoundError|Stepan disabled"
|
||||
# Якщо crews відсутні:
|
||||
docker exec dagi-gateway-node1 ls /app/crews/agromatrix_crew/ | head -5
|
||||
# Якщо agromatrix_tools відсутній:
|
||||
docker exec dagi-gateway-node1 python3 -c "import agromatrix_tools"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## F) Safety Notes
|
||||
|
||||
### ZZR Disclaimer — чому він тут
|
||||
Степан може надавати погодні рекомендації у light mode (без LLM, rule-based). Коли в запиті є обприскування/гербіцид + погодні умови, є ризик надто конкретної поради по нормам або вікнах застосування. Disclaimer фіксує відповідальність на етикетці препарату і є **mandatory** — не видаляти без перегляду safety policy.
|
||||
|
||||
### Seeded RNG — чому щоденна, а не per-interaction
|
||||
Stабільність відповідей на рівні дня — це баланс між передбачуваністю та людяністю. Якщо seed per-interaction — фрази відчуваються "скачуть" у межах одної сесії. Якщо seed стала — фрази однакові тижнями. Daily seed дає природну варіацію без artifactів.
|
||||
|
||||
---
|
||||
|
||||
## G) Rollback Steps
|
||||
|
||||
### Швидкий rollback (тільки код)
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
|
||||
# Відкатити Stepan-файли до попередньої версії
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/memory_manager.py
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/light_reply.py
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/run.py
|
||||
|
||||
# Rebuild
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
|
||||
# Verify
|
||||
docker logs dagi-gateway-node1 --since 3m 2>&1 | grep -E "Stepan mode|STEPAN_IMPORTS_OK" | tail -5
|
||||
```
|
||||
|
||||
### Rollback через Docker image tag
|
||||
```bash
|
||||
# Якщо збережений попередній image tag (наприклад :v2.6)
|
||||
docker compose -f docker-compose.node1.yml down dagi-gateway-node1
|
||||
docker tag dagi-gateway-node1:v2.6 dagi-gateway-node1:current
|
||||
docker compose -f docker-compose.node1.yml up -d dagi-gateway-node1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## H) Multi-user Farm Model (v2.8)
|
||||
|
||||
### Схема зберігання
|
||||
|
||||
| Що | Ключ | Хто ділить |
|
||||
|---|---|---|
|
||||
| UserProfile | `user_profile:agromatrix:{user_id}` | Тільки один user |
|
||||
| FarmProfile | `farm_profile:agromatrix:chat:{chat_id}` | Усі users у чаті |
|
||||
| FarmProfile (legacy) | `farm_profile:agromatrix:{user_id}` | Deprecated — мігрується при першому запиті |
|
||||
|
||||
### Як перевірити що міграція відбулась
|
||||
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 60m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC farm_profile_migrated"
|
||||
```
|
||||
|
||||
### Як виявити конфлікт
|
||||
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 60m 2>&1 \
|
||||
| grep "AGX_STEPAN_METRIC farm_profile_conflict"
|
||||
```
|
||||
|
||||
При конфлікті — chat-profile **не** перезаписується. Лише лог. Якщо потрібно вирішити вручну — або очистити legacy ключ у memory-service, або видалити chat-ключ.
|
||||
|
||||
## J) Monitoring Suggestions (Manual)
|
||||
|
||||
**light_rate** — частка light-відповідей:
|
||||
```bash
|
||||
# За останню годину
|
||||
L=$(docker logs dagi-gateway-node1 --since 60m 2>&1 | grep -c "depth=light")
|
||||
D=$(docker logs dagi-gateway-node1 --since 60m 2>&1 | grep -c "depth=deep")
|
||||
echo "light=$L deep=$D ratio=$(echo "scale=2; $L/($L+$D)" | bc)"
|
||||
```
|
||||
Норма: light_rate ≈ 0.60–0.80 для типового оператора. Нижче 0.50 — перевірити action-verb regex.
|
||||
|
||||
**avg_chars_light / avg_chars_deep** — вручну для вибірки:
|
||||
Зберегти кілька реальних відповідей і підрахувати довжину. Light має бути < 120 символів у медіані.
|
||||
|
||||
Якщо light_rate різко знизився або avg_chars_light зріс після деплою — першою дією є:
|
||||
```bash
|
||||
python3 -m pytest tests/test_stepan_invariants.py -v
|
||||
```
|
||||
123
docs/HUMANIZED_STEPAN_v2.8_CHANGELOG.md
Normal file
123
docs/HUMANIZED_STEPAN_v2.8_CHANGELOG.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Humanized Stepan — CHANGELOG v2.8
|
||||
|
||||
**Version:** v2.8
|
||||
**Date:** 2026-02-25
|
||||
**Базується на:** v2.7.2 (PII-safe telemetry, recent_topics horizon, invariant tests)
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
- **Multi-user farm model**: `FarmProfile` тепер зберігається під ключем `farm_profile:agromatrix:chat:{chat_id}` — shared для всіх операторів в одному чаті.
|
||||
- **UserProfile** залишається per-user (`user_profile:agromatrix:{user_id}`) — стиль, recent_topics, interaction_summary окремі для кожного.
|
||||
- **Lazy migration**: перший запит з `user_id` автоматично мігрує старий legacy-ключ `farm_profile:agromatrix:{user_id}` у новий chat-ключ (write-through, без ручного втручання).
|
||||
- **Conflict policy**: якщо chat-profile вже існує і відрізняється від legacy — не перезаписуємо; лише tlog `farm_profile_conflict`.
|
||||
- **FarmProfile v5**: додані нові поля (`farm_name`, `field_ids`, `crop_ids`, `active_integrations`, `iot_sensors`, `alert_thresholds`, `seasonal_context`).
|
||||
- **Backward-compat**: `load_farm_profile(chat_id)` без `user_id` — не крашить, повертає default.
|
||||
|
||||
---
|
||||
|
||||
## Key features (деталі)
|
||||
|
||||
### Нові fact-ключі
|
||||
|
||||
| Тип | Ключ | Scope |
|
||||
|---|---|---|
|
||||
| UserProfile | `user_profile:agromatrix:{user_id}` | per-user (без змін) |
|
||||
| FarmProfile (v2.8) | `farm_profile:agromatrix:chat:{chat_id}` | per-chat (новий) |
|
||||
| FarmProfile (legacy) | `farm_profile:agromatrix:{user_id}` | deprecated, мігрується lazy |
|
||||
|
||||
### Lazy Migration Flow
|
||||
|
||||
```
|
||||
load_farm_profile(chat_id, user_id)
|
||||
│
|
||||
├── cache hit (chat-key)? → return
|
||||
├── memory-service chat-key? → return + cache
|
||||
├── memory-service legacy-key (user_id)?
|
||||
│ ├── YES → copy to chat-key (write-through) + return migrated profile
|
||||
│ │ tlog: farm_profile_migrated
|
||||
│ └── NO → default farm_profile(chat_id)
|
||||
```
|
||||
|
||||
### Conflict Policy
|
||||
|
||||
При явній міграції через `migrate_farm_profile_legacy_to_chat()`:
|
||||
- Якщо chat-profile існує і **суттєво відрізняється** (crops/field_ids/region/season_state) → NOT overwritten
|
||||
- `tlog: farm_profile_conflict reason=legacy_diff`
|
||||
- Повертається існуючий chat-profile
|
||||
|
||||
Критерій суттєвої відмінності (`_farm_profiles_differ`): порівнює `crops`, `field_ids`, `fields`, `region`, `season_state`, `active_integrations`.
|
||||
|
||||
### FarmProfile v5 — нові поля
|
||||
|
||||
```json
|
||||
{
|
||||
"_version": 5,
|
||||
"chat_id": "...",
|
||||
"farm_name": null,
|
||||
"field_ids": [],
|
||||
"crop_ids": [],
|
||||
"active_integrations": [],
|
||||
"iot_sensors": [],
|
||||
"alert_thresholds": {},
|
||||
"seasonal_context": {},
|
||||
"region": null,
|
||||
"crops": [],
|
||||
"fields": [],
|
||||
"season_state": null
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
| Аспект | Деталі |
|
||||
|---|---|
|
||||
| `load_farm_profile(chat_id)` | Без `user_id` — не крашить (legacy path пропускається) |
|
||||
| `load_farm_profile(chat_id, user_id)` | Новий API; `user_id` потрібен тільки для lazy migration |
|
||||
| `save_farm_profile(chat_id, profile)` | API без змін (тепер під chat-key автоматично) |
|
||||
| Legacy ключ | Не видаляється, існує в memory-service до явного очищення |
|
||||
| `_version` FarmProfile | 1 → 5; non-breaking (нові поля, старі залишаються) |
|
||||
|
||||
---
|
||||
|
||||
## Non-goals / not included
|
||||
|
||||
- Немає автоматичного merge при конфлікті.
|
||||
- Немає видалення legacy ключів (тільки read-migrate).
|
||||
- Немає зміни light/deep логіки, тональності, банків фраз.
|
||||
- Немає нових ендпоінтів або інфра-змін.
|
||||
|
||||
---
|
||||
|
||||
## Tests
|
||||
|
||||
**Результат:** 161/161 зелених (без регресій з v2.7.2)
|
||||
|
||||
| Файл | Нових тестів | Опис |
|
||||
|---|---|---|
|
||||
| `tests/test_stepan_v28_farm.py` | 24 | Multi-user farm: ключі, міграція, конфлікт, acceptance |
|
||||
|
||||
```bash
|
||||
# Тільки v2.8 farm тести
|
||||
python3 -m pytest tests/test_stepan_v28_farm.py -v
|
||||
|
||||
# Всі Stepan тести
|
||||
python3 -m pytest tests/test_stepan_v28_farm.py tests/test_stepan_telemetry.py \
|
||||
tests/test_stepan_invariants.py tests/test_stepan_acceptance.py \
|
||||
tests/test_stepan_light_reply.py tests/test_stepan_memory_followup.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollback
|
||||
|
||||
```bash
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/memory_manager.py \
|
||||
crews/agromatrix_crew/run.py
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
```
|
||||
|
||||
Після rollback до v2.7.x: farm_profile знову читатиметься зі старого legacy-ключа (якщо є в cache/memory-service). Новий chat-ключ залишиться в memory-service, але не буде використовуватись.
|
||||
113
docs/HUMANIZED_STEPAN_v2.9_CHANGELOG.md
Normal file
113
docs/HUMANIZED_STEPAN_v2.9_CHANGELOG.md
Normal file
@@ -0,0 +1,113 @@
|
||||
# Humanized Stepan — CHANGELOG v2.9
|
||||
|
||||
**Version:** v2.9
|
||||
**Date:** 2026-02-25
|
||||
**Базується на:** v2.8 (Multi-user FarmProfile, lazy migration, PII-safe telemetry)
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
Memory Consolidation — детермінована, ідемпотентна очистка UserProfile і FarmProfile:
|
||||
|
||||
- Профілі не "розростаються" нескінченно з часом.
|
||||
- Запускається автоматично кожні 25 взаємодій (або раніше при hard trigger).
|
||||
- Без LLM. Без зміни Light/Deep логіки і текстів відповідей.
|
||||
- Fail-safe: будь-яка помилка → профіль залишається незміненим, tlog warning.
|
||||
- PII-safe: всі telemetry логи через `tlog` з анонімізацією `user_id`/`chat_id`.
|
||||
|
||||
---
|
||||
|
||||
## Що обрізається і чому це safe
|
||||
|
||||
| Поле | Ліміт | Метод |
|
||||
|---|---|---|
|
||||
| `context_notes` | ≤ 20 | dedup + trim (останні N) |
|
||||
| `known_intents` | ≤ 30 | dedup + trim (останні N) |
|
||||
| `preferences` | whitelist keys | видалення невідомих ключів |
|
||||
| `tone_constraints` | bool-ключі | нормалізація типів + видалення невідомих |
|
||||
| `interaction_summary` | ≤ 220 символів | cap без обрізки посередині слова |
|
||||
| `recent_topics` | ≤ 5 | dedup (вже є horizon, для безпеки) |
|
||||
| `field_ids` | ≤ 200 | dedup + trim |
|
||||
| `crop_ids` | ≤ 100 | dedup + trim |
|
||||
| `active_integrations` | ≤ 20 | dedup + trim |
|
||||
|
||||
**Whitelist `preferences` keys:** `units`, `report_format`, `tone_constraints`, `language`
|
||||
|
||||
Consolidation зберігає останні N записів (не перші) — найновіші теми/поля мають пріоритет.
|
||||
|
||||
---
|
||||
|
||||
## Тригери
|
||||
|
||||
| Тип | Умова |
|
||||
|---|---|
|
||||
| Periodic | `interaction_count % 25 == 0` (25, 50, 75…) |
|
||||
| Hard trigger (user) | `len(context_notes) > 30` або `len(known_intents) > 45` |
|
||||
| Hard trigger (farm) | `len(field_ids) > 300`, `len(crop_ids) > 150`, або `len(active_integrations) > 30` |
|
||||
|
||||
---
|
||||
|
||||
## Telemetry events
|
||||
|
||||
```
|
||||
AGX_STEPAN_METRIC memory_consolidated entity=user_profile user_id=h:... changed=true reason=periodic
|
||||
AGX_STEPAN_METRIC memory_consolidated entity=farm_profile chat_id=h:... changed=false reason=hard_trigger
|
||||
AGX_STEPAN_METRIC memory_consolidation_error entity=user_profile user_id=h:... error=...
|
||||
```
|
||||
|
||||
Grep у проді:
|
||||
```bash
|
||||
docker logs dagi-gateway-node1 --since 60m 2>&1 | grep "AGX_STEPAN_METRIC memory_consolidated"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Що НЕ змінюється
|
||||
|
||||
- `classify_depth` / `depth_classifier` — без змін
|
||||
- `light_reply` банки фраз і поведінка — без змін
|
||||
- `reflection_engine` — без змін
|
||||
- Тексти відповідей агента — без змін
|
||||
- `recent_topics` semantics (horizon 5) — без змін
|
||||
- FarmProfile `chat_id` key (v2.8) — без змін
|
||||
|
||||
---
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
- Поля яких немає в профілі (наприклад `context_notes`) — ігноруються (не створюються)
|
||||
- `preferences` без whitelist-ключів — тільки видаляються зайві, наявні зберігаються
|
||||
- `tone_constraints` з невалідними типами (int замість bool) — нормалізуються до bool
|
||||
|
||||
---
|
||||
|
||||
## Tests
|
||||
|
||||
**Результат:** 203/203 зелених
|
||||
|
||||
| Файл | Нових тестів | Опис |
|
||||
|---|---|---|
|
||||
| `tests/test_stepan_v29_consolidation.py` | 42 | Limits, dedup, triggers, idempotency, fail-safe, telemetry |
|
||||
|
||||
```bash
|
||||
# Тільки v2.9 consolidation тести
|
||||
python3 -m pytest tests/test_stepan_v29_consolidation.py -v
|
||||
|
||||
# Всі Stepan тести (203)
|
||||
python3 -m pytest tests/test_stepan_v29_consolidation.py tests/test_stepan_v28_farm.py \
|
||||
tests/test_stepan_telemetry.py tests/test_stepan_invariants.py \
|
||||
tests/test_stepan_acceptance.py tests/test_stepan_light_reply.py \
|
||||
tests/test_stepan_memory_followup.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollback
|
||||
|
||||
```bash
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/memory_manager.py
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
```
|
||||
|
||||
Після rollback: consolidation не запускається, профілі накопичуються як раніше. Існуючі профілі не ламаються.
|
||||
142
docs/HUMANIZED_STEPAN_v3_CHANGELOG.md
Normal file
142
docs/HUMANIZED_STEPAN_v3_CHANGELOG.md
Normal file
@@ -0,0 +1,142 @@
|
||||
# Humanized Stepan v3 — Changelog
|
||||
|
||||
**Version:** v3
|
||||
**Date:** 2026-02-24
|
||||
**Base:** v2.9 (Memory Consolidation)
|
||||
**Type:** Additive — нова функціональність без змін v2.9-ядра
|
||||
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
v3 додає три ізольованих шари поверх v2.9, не торкаючись:
|
||||
- light_reply банків фраз
|
||||
- memory consolidation логіки
|
||||
- telemetry/PII-safe механіки (AGX_STEPAN_METRIC, anonymize_id)
|
||||
- FarmProfile v2.8 міграції
|
||||
- reflection engine
|
||||
- depth classifier основної логіки (тільки новий опціональний param)
|
||||
|
||||
---
|
||||
|
||||
## Що додано
|
||||
|
||||
### 1. Session Context Layer (`session_context.py`)
|
||||
|
||||
- In-memory кеш `dict[chat_id → SessionContext]`, TTL = 900s (15 хв).
|
||||
- Структура `SessionContext`:
|
||||
```json
|
||||
{
|
||||
"last_messages": ["...", "...", "..."],
|
||||
"last_depth": "light" | "deep" | null,
|
||||
"last_agents": ["ops", "iot", ...],
|
||||
"last_question": "Уточни поле?" | null,
|
||||
"updated_at": 1234567890.0
|
||||
}
|
||||
```
|
||||
- API: `load_session(chat_id)` / `update_session(chat_id, ...)` / `clear_session(chat_id)`.
|
||||
- `load_session` при протуханні повертає default без виключень (fail-safe).
|
||||
- Telemetry: `session_loaded`, `session_expired`, `session_updated` — PII-safe `chat_id=h:...`.
|
||||
|
||||
### 2. Intent Stability Guard (розширення `depth_classifier.py`)
|
||||
|
||||
- Новий опціональний параметр `session: dict | None` у `classify_depth(...)`.
|
||||
- Guard на початку класифікації: якщо `session.last_depth == "light"` і поточне повідомлення ≤ 6 слів без action verbs і без urgent → одразу повертає `"light"`.
|
||||
- Action verbs або urgent слово перебивають guard → звичайна класифікація.
|
||||
- Без `session` (або `session=None`) поведінка ідентична v2.9.
|
||||
- Telemetry: `stability_guard_triggered`.
|
||||
|
||||
### 3. Soft Proactivity Layer (`proactivity.py`)
|
||||
|
||||
- `maybe_add_proactivity(response, user_profile, depth, reflection) -> (str, bool)`.
|
||||
- Додає рівно 1 речення ≤ 120 символів без `!` в кінець відповіді.
|
||||
- Умови спрацювання (всі одночасно):
|
||||
1. `depth == "deep"`
|
||||
2. `reflection.confidence >= 0.7` або `reflection is None`
|
||||
3. `interaction_count % 10 == 0`
|
||||
4. Один intent у `known_intents` зустрівся ≥ 3 рази
|
||||
5. Не `(style == "concise"/"brief" AND "?" в response)`
|
||||
- Чотири банки фраз: generic, iot, plan, sustainability — seeded вибір.
|
||||
- Telemetry: `proactivity_added`, `proactivity_skipped`.
|
||||
|
||||
---
|
||||
|
||||
## Зміни в існуючих файлах
|
||||
|
||||
| Файл | Зміна |
|
||||
|---|---|
|
||||
| `depth_classifier.py` | Новий `session=None` param + stability guard на початку `classify_depth` |
|
||||
| `run.py` | 3 гачки: `load_session` → `classify_depth(session=)` → `update_session` + `maybe_add_proactivity` |
|
||||
|
||||
---
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
- `classify_depth(session=None)` — поведінка ідентична v2.9.
|
||||
- Усі v2.9 тести (203 шт.) без змін, зелені.
|
||||
- Нові тести: 29 тестів у `test_stepan_v3_session_proactivity_stability.py`.
|
||||
- Загальний suite: 232/232.
|
||||
|
||||
---
|
||||
|
||||
## Telemetry Events (нові у v3)
|
||||
|
||||
| Event | Файл | PII-safe ключі |
|
||||
|---|---|---|
|
||||
| `session_loaded` | `session_context.py` | `chat_id=h:...` |
|
||||
| `session_expired` | `session_context.py` | `chat_id=h:...` |
|
||||
| `session_updated` | `session_context.py` | `chat_id=h:...` |
|
||||
| `stability_guard_triggered` | `depth_classifier.py` | — |
|
||||
| `proactivity_added` | `proactivity.py` | `user_id=h:...` |
|
||||
| `proactivity_skipped` | `proactivity.py` | `user_id=h:...` |
|
||||
|
||||
---
|
||||
|
||||
## Known Limitations
|
||||
|
||||
1. **Session скидається при рестарті контейнера** — очікувано. TTL 15 хв — це сесія в рамках активного діалогу, не довготривала пам'ять (для неї є UserProfile в memory-service).
|
||||
2. **Stability guard працює тільки з `last_depth`** — не враховує зміст попереднього повідомлення. Якщо потрібна складніша логіка (наприклад, "попереднє було deep з темою X") — треба розширити `SessionContext`.
|
||||
3. **Proactivity банки фраз — rule-based, не персоналізовані** — фрази обираються за top intent, а не за конкретним контентом відповіді. Для глибшої персоналізації потрібно або LLM, або значно більші банки.
|
||||
4. **`stability_guard_triggered` логить `chat_id=n/a`** — тимчасово, оскільки `depth_classifier` не приймає `chat_id` напряму. Можна виправити у v3.1, передавши `chat_id` через `session`.
|
||||
|
||||
---
|
||||
|
||||
## Команди запуску тестів
|
||||
|
||||
```bash
|
||||
# Тільки v3
|
||||
python3 -m pytest tests/test_stepan_v3_session_proactivity_stability.py -v
|
||||
|
||||
# Повний Stepan suite (v2.5 – v3)
|
||||
python3 -m pytest \
|
||||
tests/test_stepan_light_reply.py \
|
||||
tests/test_stepan_memory_followup.py \
|
||||
tests/test_stepan_acceptance.py \
|
||||
tests/test_stepan_invariants.py \
|
||||
tests/test_stepan_telemetry.py \
|
||||
tests/test_stepan_v28_farm.py \
|
||||
tests/test_stepan_v29_consolidation.py \
|
||||
tests/test_stepan_v3_session_proactivity_stability.py \
|
||||
-v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollback
|
||||
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
|
||||
# Мінімальний rollback (прибрати гачки, залишити нові файли неактивними)
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/run.py
|
||||
git checkout HEAD~1 -- crews/agromatrix_crew/depth_classifier.py
|
||||
|
||||
# Повний rollback (включно з новими модулями)
|
||||
git checkout HEAD~1 -- \
|
||||
crews/agromatrix_crew/run.py \
|
||||
crews/agromatrix_crew/depth_classifier.py \
|
||||
crews/agromatrix_crew/session_context.py \
|
||||
crews/agromatrix_crew/proactivity.py
|
||||
|
||||
docker compose -f docker-compose.node1.yml up -d --build dagi-gateway-node1
|
||||
```
|
||||
690
docs/Humanized_Stepan_Architecture_Plan.md
Normal file
690
docs/Humanized_Stepan_Architecture_Plan.md
Normal file
@@ -0,0 +1,690 @@
|
||||
# Humanized Stepan v2 — Architecture Plan
|
||||
|
||||
**Версія:** 0.1-draft
|
||||
**Статус:** plan (без коду)
|
||||
**Область змін:** `crews/agromatrix_crew/` + мінімальний торкання `http_api.py`
|
||||
**Принцип:** fail-closed, backward-compatible, жодної нескінченної рекурсії
|
||||
|
||||
---
|
||||
|
||||
## 1. Проблеми поточної архітектури
|
||||
|
||||
| Симптом | Причина у коді |
|
||||
|---------|----------------|
|
||||
| На "привіт" запускаються всі 5 під-агентів | `run.py` завжди викликає ops, iot, platform, spreadsheet, sustainability |
|
||||
| Роботизовані відповіді | JSON-схема фінального агента, відсутня адаптація стилю |
|
||||
| Степан не знає хто ти | Немає UserProfile, жодного звернення до memory-service |
|
||||
| Степан не знає твою ферму | Немає FarmProfile |
|
||||
| Після відповіді немає самоперевірки | Reflection відсутній |
|
||||
| Оператор і звичайний користувач мають однакову відповідь | is_operator є, але стиль не змінюється |
|
||||
| Зміна `detect_intent()` ламає всю логіку | Ключові слова захардкожені в одній функції |
|
||||
|
||||
---
|
||||
|
||||
## 2. Загальна схема нового потоку
|
||||
|
||||
```
|
||||
handle_message(text, user_id, chat_id, ops_mode)
|
||||
│
|
||||
├─► [activation_gate.pre_check(text)] ← блокує рекурсію, лічить глибину
|
||||
│
|
||||
├─► [memory_manager.load(user_id)] ← UserProfile + FarmProfile
|
||||
│ │ fallback: порожній профіль ← fail-safe
|
||||
│
|
||||
├─► [depth_classifier.classify(text, profile)]
|
||||
│ │ → DepthDecision {mode, intent, crew_needed, confidence}
|
||||
│ │ fallback: mode="deep" ← fail-closed: краще зробити більше
|
||||
│
|
||||
├─► if mode == "light":
|
||||
│ [style_adapter.render(profile)] → system_prompt_prefix
|
||||
│ Stepan відповідає сам (без під-агентів)
|
||||
│ → response
|
||||
│
|
||||
├─► if mode == "deep":
|
||||
│ [activation_gate.select_crew(DepthDecision, FarmProfile)]
|
||||
│ → {ops?, iot?, platform?, spreadsheet?, sustainability?}
|
||||
│ Запускати ТІЛЬКИ потрібних під-агентів
|
||||
│ Stepan консолідує
|
||||
│ → response
|
||||
│
|
||||
├─► [reflection_engine.reflect(response, profile, intent)] ← один прохід, не рекурсія
|
||||
│ │ fallback: оригінальна відповідь
|
||||
│
|
||||
├─► [memory_manager.update_async(user_id, text, response)] ← не блокує
|
||||
│
|
||||
└─► return final_response
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Нові модулі
|
||||
|
||||
### 3.1 `depth_classifier.py`
|
||||
|
||||
**Розташування:** `crews/agromatrix_crew/depth_classifier.py`
|
||||
|
||||
**Відповідальність:** визначити глибину запиту і які под-агенти взагалі потрібні.
|
||||
|
||||
**Вхід:**
|
||||
- `text: str` — текст повідомлення
|
||||
- `profile: UserProfile | None` — профіль користувача
|
||||
- `farm: FarmProfile | None` — профіль ферми
|
||||
|
||||
**Вихід: `DepthDecision`**
|
||||
```python
|
||||
@dataclass
|
||||
class DepthDecision:
|
||||
mode: Literal["light", "deep"] # ключовий перемикач
|
||||
intent: str # human-readable intent
|
||||
crew_needed: list[str] # підмножина: ops, iot, platform, spreadsheet, sustainability
|
||||
confidence: float # 0..1, < 0.4 → force deep
|
||||
reason: str # для audit логу
|
||||
```
|
||||
|
||||
**Логіка класифікації (rule-based, без LLM):**
|
||||
|
||||
Light mode — якщо текст відповідає хоча б одному патерну:
|
||||
```
|
||||
LIGHT_PATTERNS = {
|
||||
"greeting": ["привіт", "доброго", "hello", "hi", "добрий ранок", "добрий вечір"],
|
||||
"thanks": ["дякую", "дякуй", "спасибі", "дякую степан"],
|
||||
"ack": ["зрозумів", "ок", "добре", "чудово", "зрозуміла"],
|
||||
"whoami_check": ["хто я", "мої права"],
|
||||
"simple_status": ["який статус", "що зараз"],
|
||||
}
|
||||
```
|
||||
|
||||
Deep mode — якщо текст відповідає хоча б одному:
|
||||
```
|
||||
DEEP_PATTERNS = {
|
||||
"planning": ["сплануй", "план на", "розробити план", "графік робіт"],
|
||||
"multi_ops": ["по всіх полях", "кілька ділянок", "всі культури"],
|
||||
"iot_alert": ["аномалія", "тривога", "sensors", "вологість впала"],
|
||||
"analysis": ["план/факт", "план факт", "статистика", "зведення", "порівняй"],
|
||||
"decision": ["що робити", "порадь", "проаналізуй", "виріши"],
|
||||
"recording": ["запиши", "зафіксуй", "внеси", "додай операцію"],
|
||||
}
|
||||
```
|
||||
|
||||
Crew selection у deep mode:
|
||||
```
|
||||
crew_needed logic:
|
||||
"ops" → "запиши" | "зафіксуй" | "внеси" | farmos keywords
|
||||
"iot" → "датчик" | "вологість" | "temp" | "sensor" | FarmProfile.has_iot
|
||||
"platform" → "статус сервісів" | "інтеграція" | "помилка підключення"
|
||||
"spreadsheet" → "таблиц" | "excel" | "звіт" | "xlsx"
|
||||
"sustainability" → "зведення" | "агрегація" | "підсумки"
|
||||
```
|
||||
|
||||
**Fail-safe:** будь-який виняток → `DepthDecision(mode="deep", intent="unknown", crew_needed=["ops","iot","platform","spreadsheet","sustainability"], confidence=0.0, reason="classifier_error")`.
|
||||
|
||||
---
|
||||
|
||||
### 3.2 `memory_manager.py`
|
||||
|
||||
**Розташування:** `crews/agromatrix_crew/memory_manager.py`
|
||||
|
||||
**Відповідальність:** завантажити, зберегти і оновити профілі через memory-service. Повна деградація до in-memory fallback.
|
||||
|
||||
**API:**
|
||||
```python
|
||||
def load(user_id: str) -> tuple[UserProfile, FarmProfile]
|
||||
def update(user_id: str, interaction: InteractionContext) -> None
|
||||
```
|
||||
|
||||
**Реалізація (sync, бо `run.py` sync):**
|
||||
- HTTP запити через `httpx.Client` (sync), timeout 2s
|
||||
- При недоступності memory-service → використовує `_local_cache: dict` (процесна пам'ять)
|
||||
- `_local_cache` зберігає до 200 записів, TTL 30 хвилин
|
||||
- Факт-ключі в memory-service:
|
||||
- `user_profile:agromatrix:{user_id}`
|
||||
- `farm_profile:agromatrix:{user_id}`
|
||||
- user_id для memory-service: `stepan:{user_id}` (ізоляція від gateway-агентів)
|
||||
|
||||
**Fail-safe:**
|
||||
```python
|
||||
try:
|
||||
profile = _fetch_from_memory(user_id)
|
||||
except Exception:
|
||||
profile = UserProfile.default(user_id) # порожній, але валідний
|
||||
logger.warning("memory_manager: fallback to default profile user=%s", user_id)
|
||||
```
|
||||
|
||||
**Не блокуючий update:**
|
||||
```python
|
||||
def update_async(user_id: str, interaction: InteractionContext):
|
||||
"""Запускає оновлення в threading.Thread (daemon=True), не чекає результату."""
|
||||
t = threading.Thread(target=_do_update, args=(user_id, interaction), daemon=True)
|
||||
t.start()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3.3 `style_adapter.py`
|
||||
|
||||
**Розташування:** `crews/agromatrix_crew/style_adapter.py`
|
||||
|
||||
**Відповідальність:** сформувати prefix для system prompt Степана залежно від профілю.
|
||||
|
||||
**Вхід:** `UserProfile`, `DepthDecision`
|
||||
**Вихід:** `str` — prefix для system prompt Степана
|
||||
|
||||
**Рівні expertise:**
|
||||
```
|
||||
novice: мова проста, уникай термінів, давай короткий приклад, 2-3 речення
|
||||
intermediate: збалансована відповідь, терміни пояснюй в дужках, до 5 речень
|
||||
expert: технічна відповідь, скорочений формат, опускай очевидне
|
||||
```
|
||||
|
||||
**Стилі:**
|
||||
```
|
||||
brief: 1-2 речення, тільки суть
|
||||
detailed: повний опис з контекстом
|
||||
conversational: живий тон, питання-відповідь, можна питати уточнення
|
||||
```
|
||||
|
||||
**Формат prefix:**
|
||||
```
|
||||
"Відповідай на рівні {expertise_label}.
|
||||
Стиль: {style_label}.
|
||||
Ти знаєш цього користувача: {name or 'агрономе'}.
|
||||
Фермерський контекст: {farm_context_summary}."
|
||||
```
|
||||
|
||||
**Fail-safe:** будь-який виняток → повертає порожній рядок, Степан працює зі стандартним backstory.
|
||||
|
||||
---
|
||||
|
||||
### 3.4 `reflection_engine.py`
|
||||
|
||||
**Розташування:** `crews/agromatrix_crew/reflection_engine.py`
|
||||
|
||||
**Відповідальність:** одноразова пост-обробка відповіді для відповідності профілю і стилю.
|
||||
|
||||
**Механізм (без LLM для Light mode, з LLM для Deep mode):**
|
||||
|
||||
**Light mode reflection (rule-based):**
|
||||
- Відповідь > 500 символів і UserProfile.preferred_style == "brief" → обрізати до 3 речень
|
||||
- Відповідь містить JSON-фрагменти → замінити на людський текст
|
||||
- Відповідь містить технічні ідентифікатори (uuid, trace_id) → прибрати з відповіді користувачу
|
||||
|
||||
**Deep mode reflection (LLM, one-shot):**
|
||||
```
|
||||
Prompt:
|
||||
"Оціни цю відповідь для {expertise_level} користувача:
|
||||
[RESPONSE]
|
||||
Якщо відповідь занадто технічна — спрости.
|
||||
Якщо занадто довга для {preferred_style} — скороти.
|
||||
Відповідай тільки виправленою відповіддю."
|
||||
```
|
||||
|
||||
**Anti-recursion guard:**
|
||||
```python
|
||||
# В reflection_engine.py — module-level flag
|
||||
_REFLECTING: bool = False
|
||||
|
||||
def reflect(response: str, profile: UserProfile, trace_id: str) -> str:
|
||||
global _REFLECTING
|
||||
if _REFLECTING:
|
||||
logger.warning("reflection: recursion guard active, skipping trace=%s", trace_id)
|
||||
return response
|
||||
_REFLECTING = True
|
||||
try:
|
||||
return _do_reflect(response, profile, trace_id)
|
||||
except Exception:
|
||||
return response
|
||||
finally:
|
||||
_REFLECTING = False
|
||||
```
|
||||
|
||||
**Fail-safe:** будь-який виняток → повертає оригінальну відповідь без змін.
|
||||
|
||||
---
|
||||
|
||||
### 3.5 `activation_gate.py`
|
||||
|
||||
**Розташування:** `crews/agromatrix_crew/activation_gate.py`
|
||||
|
||||
**Відповідальність:**
|
||||
1. Pre-check: блокує подвійний виклик handle_message з того самого контексту
|
||||
2. Select: визначає мінімальний набір під-агентів для запуску
|
||||
3. Post-check: обмежує глибину делегування
|
||||
|
||||
**Структура:**
|
||||
```python
|
||||
_CALL_DEPTH: threading.local # per-thread, не глобальне
|
||||
|
||||
MAX_DEPTH = 1 # Степан може делегувати, але не можна повторно входити в handle_message
|
||||
|
||||
def pre_check(trace_id: str) -> bool:
|
||||
"""Повертає True якщо дозволено продовжувати, False якщо глибина перевищена."""
|
||||
depth = getattr(_CALL_DEPTH, "depth", 0)
|
||||
if depth >= MAX_DEPTH:
|
||||
logger.error("activation_gate: max depth %d reached trace=%s", MAX_DEPTH, trace_id)
|
||||
return False
|
||||
_CALL_DEPTH.depth = depth + 1
|
||||
return True
|
||||
|
||||
def release(trace_id: str):
|
||||
"""Зменшити лічильник після завершення handle_message."""
|
||||
_CALL_DEPTH.depth = max(0, getattr(_CALL_DEPTH, "depth", 0) - 1)
|
||||
|
||||
def select_crew(decision: DepthDecision, farm: FarmProfile) -> list[str]:
|
||||
"""Повернути список під-агентів для запуску."""
|
||||
needed = list(decision.crew_needed)
|
||||
# Видалити IoT якщо FarmProfile.active_integrations не має iot
|
||||
if "iot" in needed and not farm.has_iot_integration:
|
||||
needed.remove("iot")
|
||||
# Видалити spreadsheet якщо не запит до таблиць
|
||||
if "spreadsheet" in needed and "spreadsheet" not in decision.intent:
|
||||
needed.remove("spreadsheet")
|
||||
return needed if needed else []
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Структура UserProfile JSON
|
||||
|
||||
```json
|
||||
{
|
||||
"_version": 1,
|
||||
"_fact_key": "user_profile:agromatrix:{user_id}",
|
||||
"user_id": "tg:123456789",
|
||||
"agent": "agromatrix",
|
||||
"name": "Іван",
|
||||
"expertise_level": "intermediate",
|
||||
"preferred_language": "uk",
|
||||
"preferred_style": "conversational",
|
||||
"last_seen": "2026-02-24T10:00:00Z",
|
||||
"interaction_count": 42,
|
||||
"known_intents": [
|
||||
"plan_day",
|
||||
"show_critical_tomorrow",
|
||||
"iot_status"
|
||||
],
|
||||
"context_notes": [
|
||||
"has_farmos_access",
|
||||
"uses_thingsboard",
|
||||
"prefers_short_answers"
|
||||
],
|
||||
"farm_profile_ref": "farm_profile:agromatrix:{user_id}",
|
||||
"recent_topics": [
|
||||
{"intent": "plan_day", "ts": "2026-02-24T09:00:00Z"},
|
||||
{"intent": "iot_status", "ts": "2026-02-23T18:00:00Z"}
|
||||
],
|
||||
"operator": false,
|
||||
"updated_at": "2026-02-24T10:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
**Поля та семантика:**
|
||||
|
||||
| Поле | Тип | Опис |
|
||||
|------|-----|------|
|
||||
| `expertise_level` | enum | novice / intermediate / expert; оновлюється автоматично після 10+ взаємодій |
|
||||
| `preferred_style` | enum | brief / detailed / conversational |
|
||||
| `interaction_count` | int | лічильник всіх взаємодій для авто-підвищення рівня |
|
||||
| `known_intents` | list[str] | унікальні intents, накопичуються; use для FarmProfile автодоповнення |
|
||||
| `context_notes` | list[str] | вільні мітки, збагачуються під час взаємодій |
|
||||
| `recent_topics` | list[{intent, ts}] | останні 10 тем (для cold-start relief) |
|
||||
| `operator` | bool | чи є цей user оператором (AGX_OPERATOR_IDS); read-only у memory |
|
||||
|
||||
---
|
||||
|
||||
## 5. Структура FarmProfile JSON
|
||||
|
||||
```json
|
||||
{
|
||||
"_version": 1,
|
||||
"_fact_key": "farm_profile:agromatrix:{user_id}",
|
||||
"user_id": "tg:123456789",
|
||||
"farm_name": "Ферма Калинівка",
|
||||
"field_ids": ["field:north-01", "field:south-02"],
|
||||
"crop_ids": ["crop:wheat-winter", "crop:corn-hybrid"],
|
||||
"active_integrations": ["farmos", "thingsboard"],
|
||||
"seasonal_context": {
|
||||
"current_phase": "growing",
|
||||
"active_operations": ["irrigation", "monitoring"],
|
||||
"hemisphere": "north",
|
||||
"approximate_month": 2
|
||||
},
|
||||
"iot_sensors": {
|
||||
"has_iot_integration": true,
|
||||
"sensor_types": ["soil_moisture", "temperature"],
|
||||
"last_alert": null
|
||||
},
|
||||
"typical_intents": ["plan_day", "iot_status", "plan_vs_fact"],
|
||||
"alert_thresholds": {
|
||||
"soil_moisture_min": 20.0,
|
||||
"temperature_min": -5.0,
|
||||
"temperature_max": 38.0
|
||||
},
|
||||
"dict_pending_count": 0,
|
||||
"updated_at": "2026-02-24T10:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
**Поля та семантика:**
|
||||
|
||||
| Поле | Тип | Опис |
|
||||
|------|-----|------|
|
||||
| `field_ids` | list[str] | заповнюються під час нормалізації терміну tool_dictionary |
|
||||
| `crop_ids` | list[str] | аналогічно |
|
||||
| `active_integrations` | list[str] | визначають які crew_agents потенційно потрібні |
|
||||
| `seasonal_context` | object | підказки для планування і класифікатора глибини |
|
||||
| `iot_sensors.has_iot_integration` | bool | ключ для activation_gate: чи включати IoT агента |
|
||||
| `typical_intents` | list[str] | акумулюються; використовуються для Light/Deep розмежування |
|
||||
| `dict_pending_count` | int | кеш кількості pending термінів для оператора |
|
||||
| `alert_thresholds` | object | якщо IoT дані виходять за поріг → auto-trigger Deep mode |
|
||||
|
||||
---
|
||||
|
||||
## 6. Коли і як оновлюється профіль
|
||||
|
||||
### UserProfile
|
||||
|
||||
| Подія | Що оновлюється | Коли |
|
||||
|-------|----------------|------|
|
||||
| Будь-яка взаємодія | `last_seen`, `interaction_count`, `recent_topics` | Завжди, після відповіді |
|
||||
| Новий intent | `known_intents.append(intent)` | Якщо intent не порожній |
|
||||
| interaction_count >= 10 і всі intents — "planning" | `expertise_level` → intermediate | При update |
|
||||
| interaction_count >= 30 і є технічні intents | `expertise_level` → expert | При update |
|
||||
| Оператор надіслав `/profile set style brief` | `preferred_style` | Одразу |
|
||||
| FarmProfile змінений | `farm_profile_ref` sync | При update |
|
||||
|
||||
### FarmProfile
|
||||
|
||||
| Подія | Що оновлюється | Коли |
|
||||
|-------|----------------|------|
|
||||
| tool_dictionary.normalize успішний | `field_ids`, `crop_ids` | При нормалізації |
|
||||
| Новий інтент з IoT | `active_integrations`, `iot_sensors.has_iot_integration` | При Deep mode |
|
||||
| Новий інтент з spreadsheet | `active_integrations.append("spreadsheet")` | При Deep mode |
|
||||
| Оператор `/farm update phase=sowing` | `seasonal_context.current_phase` | Одразу |
|
||||
| dict_review.stats() | `dict_pending_count` | При ops_mode load |
|
||||
|
||||
---
|
||||
|
||||
## 7. Тригери Deep mode
|
||||
|
||||
**Автоматичні (depth_classifier):**
|
||||
|
||||
| Тригер | Умова |
|
||||
|--------|-------|
|
||||
| Планування | текст містить DEEP_PATTERNS["planning"] |
|
||||
| Мультипольова операція | DEEP_PATTERNS["multi_ops"] |
|
||||
| IoT аномалія | DEEP_PATTERNS["iot_alert"] АБО IoT дані з alert_thresholds порушені |
|
||||
| Аналіз план/факт | DEEP_PATTERNS["analysis"] |
|
||||
| Запис у farmOS | DEEP_PATTERNS["recording"] |
|
||||
| Низька впевненість | confidence < 0.4 після класифікації |
|
||||
| Нові терміни | tool_dictionary normalization повернув pending items |
|
||||
| Перша взаємодія | interaction_count == 0 (невідомий користувач) |
|
||||
|
||||
**Примусові (env/flag):**
|
||||
|
||||
| Тригер | Механізм |
|
||||
|--------|----------|
|
||||
| `AGX_FORCE_DEEP=1` | env в контейнері (тестування) |
|
||||
| Текст починається з `--deep` | парситься в handle_message before classify |
|
||||
| Оператор вручну | operator_commands + flag в trace |
|
||||
|
||||
---
|
||||
|
||||
## 8. Тригери запуску під-команди (активація crew_agent)
|
||||
|
||||
| Crew Agent | Тригер (keyword or FarmProfile) | Light може обійтись? |
|
||||
|------------|----------------------------------|----------------------|
|
||||
| `ops` | "запиши", "внеси", "зафіксуй", "farmOS" | Ні |
|
||||
| `iot` | "датчик", "вологість", "температура" + `has_iot_integration=true` | Ні |
|
||||
| `platform` | "статус", "перевір сервіс", "інтеграція впала" | Іноді (кешований статус) |
|
||||
| `spreadsheet` | "таблиця", "excel", "звіт", "xlsx" | Ні |
|
||||
| `sustainability` | "зведення", "агрегація", "підсумки по сезону" | Ні |
|
||||
| **всі одночасно** | `intent == "general"` без профілю (fallback) | Ні |
|
||||
|
||||
---
|
||||
|
||||
## 9. Ситуації, що залишаються Light mode
|
||||
|
||||
| Ситуація | Чому Light | Хто відповідає |
|
||||
|----------|------------|----------------|
|
||||
| Привітання будь-якого типу | Не потребує даних з farmOS/IoT | Степан з style_adapter |
|
||||
| "Дякую", "ок", "зрозумів" | Підтвердження, не запит | Степан (2 слова) |
|
||||
| /whoami, /pending, /approve | Operator commands | operator_commands.py (незмінний) |
|
||||
| "Що ти вмієш?" | Довідка | Степан з профілем |
|
||||
| Повторне питання тієї ж теми (< 5 хв) | recent_topics cache | Степан з кешем контексту |
|
||||
| Simple status якщо кеш свіжий | FarmProfile.seasonal_context свіжий (< 1 год) | Степан без crew |
|
||||
| Повідомлення < 4 слів | Незрозумілий запит → уточнення | Степан питає |
|
||||
| Текст не пов'язаний з агрономією | Off-topic filter | Степан ввічливо redirects |
|
||||
|
||||
---
|
||||
|
||||
## 10. Принцип fail-safe
|
||||
|
||||
**Ієрархія деградації:**
|
||||
|
||||
```
|
||||
Нормальна робота:
|
||||
memory-service online → профілі загружені → класифікатор → вибір crew → рефлексія
|
||||
|
||||
Деградація 1 (memory недоступна):
|
||||
fallback UserProfile.default() → класифікатор без персоналізації → crew → рефлексія skip
|
||||
|
||||
Деградація 2 (classifier помилка):
|
||||
force Deep mode → всі crew → рефлексія skip
|
||||
|
||||
Деградація 3 (частина crew агентів впала):
|
||||
інші crew продовжують → Степан синтезує з частковими даними
|
||||
run_task_with_retry вже існує (max_retries=2)
|
||||
|
||||
Деградація 4 (OpenAI недоступний):
|
||||
handle_stepan_message повертає "Помилка обробки. trace_id=..."
|
||||
gateway вже обробляє це (stepan_disabled fallback)
|
||||
```
|
||||
|
||||
**Правила:**
|
||||
- Жодний модуль не може кинути виняток, що зупинить `handle_message`
|
||||
- Кожен новий модуль wrap-ується в try/except з fallback
|
||||
- `reflection_engine` завжди має повертати `str`, ніколи `None` або виняток
|
||||
- `memory_manager.update_async` daemon=True — смерть процесу не втрачає відповідь
|
||||
- При будь-якій помилці profile: `interaction_count=0`, `expertise_level="intermediate"`, `preferred_style="conversational"`
|
||||
|
||||
---
|
||||
|
||||
## 11. Як не створити нескінченну рекурсію
|
||||
|
||||
**Три незалежні шари захисту:**
|
||||
|
||||
### Шар 1 — `activation_gate` (threading.local counter)
|
||||
```
|
||||
handle_message:
|
||||
pre_check() → depth becomes 1
|
||||
... робота ...
|
||||
release() → depth back to 0
|
||||
|
||||
Якщо under_running_task викликає handle_message:
|
||||
pre_check() → depth == 1 → MAX_DEPTH reached → return error response
|
||||
```
|
||||
`threading.local` — ізоляція per-thread, не заважає паралельним викликам з різних чатів.
|
||||
|
||||
### Шар 2 — `reflection_engine._REFLECTING` flag
|
||||
- Глобальний (module-level) булевий прапорець
|
||||
- Встановлюється в `True` перед LLM-рефлексією, скидається в `finally`
|
||||
- Якщо рефлексія викличе щось що знову зайде в рефлексію → миттєво скидається
|
||||
|
||||
### Шар 3 — Архітектурна заборона
|
||||
- Під-агенти (ops, iot, platform, spreadsheet, sustainability) мають `allow_delegation=False`
|
||||
- Жоден агент не має знань про `handle_message` або `run.py`
|
||||
- `depth_classifier`, `style_adapter`, `memory_manager` — pure functions, без CrewAI, без LLM
|
||||
- Тільки `reflection_engine` (Deep mode) і фінальна задача Степана — LLM-виклики
|
||||
|
||||
---
|
||||
|
||||
## 12. Де саме інтегрувати
|
||||
|
||||
### 12.1 `crews/agromatrix_crew/run.py`
|
||||
|
||||
**Змінити:**
|
||||
```python
|
||||
# Новий imports (top)
|
||||
from crews.agromatrix_crew.depth_classifier import classify, DepthDecision
|
||||
from crews.agromatrix_crew.memory_manager import load_profiles, update_async
|
||||
from crews.agromatrix_crew.style_adapter import build_prefix
|
||||
from crews.agromatrix_crew.reflection_engine import reflect
|
||||
from crews.agromatrix_crew.activation_gate import pre_check, release, select_crew
|
||||
|
||||
# handle_message:
|
||||
# 1. pre_check (перше, до всього)
|
||||
# 2. load_profiles (до classify)
|
||||
# 3. classify (до побудови агентів)
|
||||
# 4. if light → stepan_only_response
|
||||
# 5. if deep → activation_gate.select_crew → run selected
|
||||
# 6. reflect (після відповіді)
|
||||
# 7. update_async (не блокуючий, daemon thread)
|
||||
# 8. release (в finally)
|
||||
```
|
||||
|
||||
**Зберегти:**
|
||||
- Весь `route_operator_command` / `route_operator_text` (operator_commands не змінюємо)
|
||||
- `tool_dictionary.normalize_from_text` + pending check (залишається до classify)
|
||||
- `run_task_with_retry` (залишається для Deep mode)
|
||||
- `audit_event` (залишається, розширюємо depth/mode в event)
|
||||
- `farmos_ui_hint` (залишається)
|
||||
|
||||
**НЕ змінювати:**
|
||||
- Сигнатуру `handle_message(text, user_id, chat_id, trace_id, ops_mode, last_pending_list)`
|
||||
- Формат повернення (str, valid for JSON parse by http_api)
|
||||
|
||||
### 12.2 `crews/agromatrix_crew/operator_commands.py`
|
||||
|
||||
**Додати команди:**
|
||||
```
|
||||
/profile → показати UserProfile (user_id, expertise, style, last_seen, interaction_count)
|
||||
/profile set <k>=<v> → оновити expertise_level або preferred_style
|
||||
/farm → показати FarmProfile (коротко: поля, культури, інтеграції, сезон)
|
||||
/farm update <k>=<v> → оновити seasonal_context.current_phase, порогові значення
|
||||
```
|
||||
|
||||
**Зберегти без змін:**
|
||||
- `/whoami`, `/pending`, `/approve`, `/reject`, `/apply_dict`, `/pending_stats`
|
||||
- `is_operator()` — не змінювати
|
||||
- `route_operator_command()` — розширити case, не переписувати
|
||||
- `route_operator_text()` — залишити
|
||||
|
||||
**OPERATOR_COMMANDS set** — додати `"profile"`, `"farm"`.
|
||||
|
||||
### 12.3 `gateway-bot/http_api.py`
|
||||
|
||||
**Мінімальні зміни:**
|
||||
- Додати env `AGX_FORCE_DEEP` → якщо "1", передавати в metadata або через handle_message (ops_mode вже є, можна додати depth_override parameter)
|
||||
- **Нічого більше не змінювати.** handle_message вже приймає text, user_id, chat_id, trace_id, ops_mode.
|
||||
|
||||
**Не змінювати:**
|
||||
- Маршрутизацію оператор/не-оператор (вже виправлена попереднім патчем)
|
||||
- STEPAN_IMPORTS_OK logic
|
||||
- doc_context logic
|
||||
|
||||
### 12.4 `memory-service`
|
||||
|
||||
**Не змінювати сервіс.** Використовуємо існуючий `/facts/upsert` і `/facts/get`.
|
||||
|
||||
**Нові fact-ключі:**
|
||||
```
|
||||
user_profile:agromatrix:{user_id} → UserProfile JSON (fact_value_json)
|
||||
farm_profile:agromatrix:{user_id} → FarmProfile JSON (fact_value_json)
|
||||
```
|
||||
|
||||
**memory_manager.py в crews** викликає memory-service по HTTP (sync httpx), URL з env:
|
||||
```
|
||||
AGX_MEMORY_SERVICE_URL=http://memory-service:8000
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 13. Схема файлів після впровадження
|
||||
|
||||
```
|
||||
crews/agromatrix_crew/
|
||||
├── __init__.py
|
||||
├── run.py ← ЗМІНЕНО (нові модулі вмонтовані)
|
||||
├── audit.py ← без змін
|
||||
├── operator_commands.py ← РОЗШИРЕНО (/profile, /farm)
|
||||
│
|
||||
├── depth_classifier.py ← НОВИЙ
|
||||
├── memory_manager.py ← НОВИЙ
|
||||
├── style_adapter.py ← НОВИЙ
|
||||
├── reflection_engine.py ← НОВИЙ
|
||||
├── activation_gate.py ← НОВИЙ
|
||||
│
|
||||
├── agents/
|
||||
│ ├── stepan_orchestrator.py ← backstory розширюється від style_adapter
|
||||
│ ├── operations_agent.py ← без змін
|
||||
│ ├── iot_agent.py ← без змін
|
||||
│ ├── platform_agent.py ← без змін
|
||||
│ ├── spreadsheet_agent.py ← без змін
|
||||
│ └── sustainability_agent.py ← без змін
|
||||
│
|
||||
├── tasks/
|
||||
│ ├── intake_and_plan.py ← без змін (лише для compatibility)
|
||||
│ ├── execute_ops.py ← без змін
|
||||
│ ├── execute_iot.py ← без змін
|
||||
│ ├── execute_spreadsheets.py ← без змін
|
||||
│ └── reporting.py ← без змін
|
||||
│
|
||||
└── tools/
|
||||
└── __init__.py ← без змін
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 14. Порядок впровадження (поетапно)
|
||||
|
||||
**Фаза 1 — Foundation (без змін у run.py)**
|
||||
1. `memory_manager.py` — реалізувати, написати unit-тест з mock memory-service
|
||||
2. `depth_classifier.py` — реалізувати rule-based, написати тести по кожному патерну
|
||||
3. `activation_gate.py` — реалізувати pre_check/release/select_crew, тест на рекурсію
|
||||
|
||||
**Фаза 2 — Light mode**
|
||||
4. `style_adapter.py` — реалізувати три рівні і три стилі
|
||||
5. Модифікувати `run.py`: вставити Light mode path (якщо light → пропустити всі crew)
|
||||
6. Smoke-test: надіслати "привіт" → відповідь без crew
|
||||
|
||||
**Фаза 3 — Deep mode + Activation Gate**
|
||||
7. Модифікувати `run.py`: Deep mode використовує `select_crew`, не всіх 5 агентів
|
||||
8. Тест: `"сплануй тиждень"` → ops + sustainability, але не iot (якщо has_iot=false)
|
||||
|
||||
**Фаза 4 — Reflection + Profiles**
|
||||
9. `reflection_engine.py` — rule-based Light reflection (без LLM)
|
||||
10. Оновити `operator_commands.py` — `/profile`, `/farm`
|
||||
11. E2E тест: 3 взаємодії → перевірка UserProfile накопичення
|
||||
|
||||
**Фаза 5 — Deep reflection (LLM)**
|
||||
12. Додати LLM-рефлексію тільки для Deep mode
|
||||
13. Тест на рекурсію: перевірити `_REFLECTING` flag спрацьовує
|
||||
|
||||
---
|
||||
|
||||
## 15. Метрики успіху
|
||||
|
||||
| Метрика | Ціль |
|
||||
|---------|------|
|
||||
| % запитів у Light mode (грітинги + прості) | > 30% від загального трафіку |
|
||||
| Середній час відповіді Light mode | < 2s (без crew launch) |
|
||||
| Середній час відповіді Deep mode | < 30s (тільки потрібні crew) |
|
||||
| % запитів що запускають тільки 1-2 crew | > 50% від Deep запитів |
|
||||
| Оператор `/profile` — відображає дані | 100% (якщо memory-service online) |
|
||||
| Fallback без memory-service | Gateway не падає (fail-safe) |
|
||||
| Рекурсивний виклик handle_message | 0 (activation_gate блокує) |
|
||||
|
||||
---
|
||||
|
||||
## 16. Відкриті питання (потрібно вирішити перед реалізацією)
|
||||
|
||||
1. **Sync vs async memory_manager**: `run.py` sync, але memory-service async-HTTP. Поточне рішення — sync httpx.Client. Альтернатива: asyncio.run() в окремому thread. Потребує рішення.
|
||||
2. **UserProfile.expertise_level auto-upgrade**: поріг 10/30 взаємодій — достатньо? Або враховувати час між взаємодіями?
|
||||
3. **reflection LLM model**: який LLM для рефлексії — той самий GPT-4, або дешевший GPT-3.5/Mistral? Вплив на latency та cost.
|
||||
4. **FarmProfile cold-start**: перша взаємодія — profile порожній. Deep mode завжди? Або запитати у користувача дані ферми?
|
||||
5. **Multi-user farm**: кілька операторів з однієї ферми — один FarmProfile чи кілька? Зараз `user_id`-based.
|
||||
6. **Operator profile isolation**: оператор і звичайний користувач можуть мати одне user_id якщо оператор пише без оператор-чату. Чи потрібна окрема UserProfile для ops-mode?
|
||||
|
||||
---
|
||||
|
||||
*Документ готовий до review. Після погодження — розпочинати Фазу 1.*
|
||||
164
docs/NODA1-NODA2-STATUS.md
Normal file
164
docs/NODA1-NODA2-STATUS.md
Normal file
@@ -0,0 +1,164 @@
|
||||
# 📊 Звіт: Перевірка NODA1 та очищення NODA2
|
||||
|
||||
## ✅ Очищення NODA2 виконано!
|
||||
|
||||
### Звільнено місце:
|
||||
| Дія | Розмір | Статус |
|
||||
|-----|--------|--------|
|
||||
| microdao3 images | ~5.7 GB | ✅ Видалено |
|
||||
| Docker build cache | 14.49 GB | ✅ Очищено |
|
||||
| second_me_memory.db | 32 KB | ✅ Видалено |
|
||||
| **Загалом** | **~20.2 GB** | ✅ **Звільнено** |
|
||||
|
||||
### Поточний стан диска NODA2:
|
||||
```
|
||||
/dev/disk3s3s1 1.8Ti 11Ti 618Gi 2% 453k 4.3G
|
||||
```
|
||||
✅ **618 GB вільно** (було менше)
|
||||
|
||||
---
|
||||
|
||||
## 🔍 Перевірка NODA1 (144.76.224.179):
|
||||
|
||||
### ✅ Статус сервісів:
|
||||
|
||||
| Сервіс | Статус | Порт | Опис |
|
||||
|--------|--------|------|------|
|
||||
| **dagi-market-data-node1** | ✅ Up 13 days (healthy) | 8893 | Market Data Service |
|
||||
| **dagi-memory-service-node1** | ✅ Up 20 hours (healthy) | 8000 | Memory Service |
|
||||
| **dagi-qdrant-node1** | ✅ Up 3 weeks | 6333 | Vector Database |
|
||||
| **dagi-postgres** | ✅ Up 5 days | 5432 | PostgreSQL |
|
||||
| dagi-gateway-node1 | ✅ Up 2 hours (healthy) | 9300 | Gateway |
|
||||
| postgres-backup-node1 | ⚠️ Up 3 weeks (unhealthy) | 5432 | Backup (needs fix) |
|
||||
|
||||
### 💾 Диск NODA1:
|
||||
```
|
||||
Filesystem Size Used Avail Use% Mounted on
|
||||
/dev/md2 1.7T 1.3T 320G 81% /
|
||||
```
|
||||
⚠️ **81% використано** (320 GB вільно)
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Підтвердження: Market Data на NODA1
|
||||
|
||||
### ✅ Market Data Service ПРАЦЮЄ на NODA1:
|
||||
```bash
|
||||
# NODA1
|
||||
dagi-market-data-node1 Up 13 days (healthy) 0.0.0.0:8893->8891/tcp
|
||||
|
||||
# Health check
|
||||
curl http://localhost:8893/health
|
||||
{"status":"ok","service":"market-data-service"}
|
||||
```
|
||||
|
||||
### ❓ Market Data на NODA2:
|
||||
```bash
|
||||
# NODA2 (цей ноут)
|
||||
services/market-data-service/
|
||||
├── market_data.db 27 GB (mistakenly copied)
|
||||
└── events.jsonl 40 GB (mistakenly copied
|
||||
```
|
||||
|
||||
**Висновок:** ✅ **Підтверджено!** Market Data працює на NODA1, файли на NODA2 - помилкові копії.
|
||||
|
||||
---
|
||||
|
||||
## 🔍 SenpAI Agent на NODA1:
|
||||
|
||||
### Пошук:
|
||||
```bash
|
||||
# Конфігурація
|
||||
grep -r "senpai\|SenpAI" config/*.yml → ❌ Не знайдено
|
||||
|
||||
# Qdrant колекції
|
||||
curl http://localhost:6333/collections → ❌ Немає senpai_* колекцій
|
||||
```
|
||||
|
||||
**Висновок:** ⚠️ **SenpAI агент НЕ налаштований на NODA1**
|
||||
|
||||
---
|
||||
|
||||
## 📋 Що потрібно зробити:
|
||||
|
||||
### 1. ✅ Видалити Market Data з NODA2 (67 GB)
|
||||
|
||||
**Безпечно видаляемо, бо NODA1 має оригінал:**
|
||||
|
||||
```bash
|
||||
# Видалити помилкові файли
|
||||
rm /Users/apple/github-projects/microdao-daarion/services/market-data-service/market_data.db
|
||||
rm /Users/apple/github-projects/microdao-daarion/services/market-data-service/events.jsonl
|
||||
|
||||
# Або архівувати (якщо потрібні для порівняння)
|
||||
cd /Users/apple/github-projects/microdao-daarion/services/market-data-service
|
||||
tar -czf ~/market_data_node2_backup.tar.gz market_data.db events.jsonl
|
||||
rm market_data.db events.jsonl
|
||||
```
|
||||
|
||||
**Економія:** 67 GB
|
||||
**Загалом звільнено:** ~87 GB (20.2 + 67)
|
||||
|
||||
---
|
||||
|
||||
### 2. ⚠️ Налаштувати SenpAI на NODA1
|
||||
|
||||
**Якщо потрібен агент SenpAI:**
|
||||
|
||||
1. Створити конфігурацію в `config/agent_registry.yml`
|
||||
2. Створити prompt файл
|
||||
3. Створити колекції в Qdrant
|
||||
4. Налаштувати LLM profile
|
||||
|
||||
**Або ви впевнені, що SenpAI має бути на іншій ноді?**
|
||||
|
||||
---
|
||||
|
||||
### 3. ⚠️ Виправити postgres-backup-node1 (NODA1)
|
||||
|
||||
**Проблема:** `postgres-backup-node1` - unhealthy
|
||||
|
||||
**Рішення:** Див. `/docs/NODA1-MEMORY-RUNBOOK.md` (pg_dump version mismatch)
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Рекомендація:
|
||||
|
||||
### Крок 1: Видалити Market Data з NODA2
|
||||
|
||||
```bash
|
||||
# Підтверджуєте видалення 67 GB?
|
||||
rm -f /Users/apple/github-projects/microdao-daarion/services/market-data-service/market_data.db
|
||||
rm -f /Users/apple/github-projects/microdao-daarion/services/market-data-service/events.jsonl
|
||||
```
|
||||
|
||||
### Крок 2: Запустити Memory Stack на NODA2
|
||||
|
||||
```bash
|
||||
cd /Users/apple/github-projects/microdao-daarion
|
||||
./scripts/start-memory-node2.sh
|
||||
```
|
||||
|
||||
### Крок 3: Налаштувати Sofiia пам'ять
|
||||
|
||||
```bash
|
||||
python3 scripts/init-sofiia-memory.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Підсумок:
|
||||
|
||||
| Дія | Статус | Економія |
|
||||
|-----|--------|----------|
|
||||
| Очищення Docker (NODA2) | ✅ Виконано | 20.2 GB |
|
||||
| Видалення Market Data (NODA2) | ⏳ Очікує підтвердження | 67 GB |
|
||||
| Налаштування SenpAI (NODA1) | ⏳ Потрібна інформація | - |
|
||||
| Memory Stack (NODA2) | ⏳ Готово до запуску | - |
|
||||
|
||||
---
|
||||
|
||||
**Питання:**
|
||||
1. ✅ **Видаляємо Market Data (67 GB) з NODA2?**
|
||||
2. ❓ **Де має бути агент SenpAI - на NODA1 чи іншій ноді?**
|
||||
3. ❓ **Запускаємо Memory Stack для Sofiia на NODA2 зараз?**
|
||||
197
docs/NODA1_DEPLOY_STEPAN_V3.md
Normal file
197
docs/NODA1_DEPLOY_STEPAN_V3.md
Normal file
@@ -0,0 +1,197 @@
|
||||
# НОДА1 — Deploy Humanized Stepan v3
|
||||
|
||||
**Дата підготовки:** 2026-02-25
|
||||
**Статус:** ready-to-deploy
|
||||
**LLM:** DeepSeek (primary) або OpenAI (fallback)
|
||||
|
||||
---
|
||||
|
||||
## Передумови
|
||||
|
||||
- SSH доступ до НОДА1 (root@144.76.224.179 або root@2a01:4f8:201:2a6::2)
|
||||
- `DEEPSEEK_API_KEY` або `OPENAI_API_KEY` (хоча б один)
|
||||
- Telegram user_id оператора для `AGX_OPERATOR_IDS`
|
||||
|
||||
---
|
||||
|
||||
## Крок 1 — Скопіювати файли на НОДА1
|
||||
|
||||
З локальної машини (де є репо):
|
||||
|
||||
```bash
|
||||
cd /path/to/microdao-daarion
|
||||
|
||||
# Всі Stepan v3 модулі
|
||||
scp -6 \
|
||||
crews/agromatrix_crew/llm_factory.py \
|
||||
crews/agromatrix_crew/depth_classifier.py \
|
||||
crews/agromatrix_crew/memory_manager.py \
|
||||
crews/agromatrix_crew/light_reply.py \
|
||||
crews/agromatrix_crew/telemetry.py \
|
||||
crews/agromatrix_crew/reflection_engine.py \
|
||||
crews/agromatrix_crew/style_adapter.py \
|
||||
crews/agromatrix_crew/session_context.py \
|
||||
crews/agromatrix_crew/proactivity.py \
|
||||
crews/agromatrix_crew/run.py \
|
||||
crews/agromatrix_crew/stepan_system_prompt_v2.txt \
|
||||
crews/agromatrix_crew/stepan_system_prompt_v2.7.txt \
|
||||
"root@[2a01:4f8:201:2a6::2]:/opt/microdao-daarion/crews/agromatrix_crew/"
|
||||
|
||||
# Agents з DeepSeek LLM
|
||||
scp -6 \
|
||||
crews/agromatrix_crew/agents/stepan_orchestrator.py \
|
||||
crews/agromatrix_crew/agents/operations_agent.py \
|
||||
crews/agromatrix_crew/agents/iot_agent.py \
|
||||
crews/agromatrix_crew/agents/platform_agent.py \
|
||||
crews/agromatrix_crew/agents/spreadsheet_agent.py \
|
||||
crews/agromatrix_crew/agents/sustainability_agent.py \
|
||||
"root@[2a01:4f8:201:2a6::2]:/opt/microdao-daarion/crews/agromatrix_crew/agents/"
|
||||
|
||||
# gateway-bot http_api.py (з оновленим sys.path та stepan_enabled)
|
||||
scp -6 \
|
||||
gateway-bot/http_api.py \
|
||||
"root@[2a01:4f8:201:2a6::2]:/opt/microdao-daarion/gateway-bot/"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Крок 2 — Скопіювати crews у gateway-bot volume (на НОДА1)
|
||||
|
||||
```bash
|
||||
# НА НОДА1:
|
||||
# Синхронізуємо crews у gateway-bot volume
|
||||
rsync -av /opt/microdao-daarion/crews/ /opt/microdao-daarion/gateway-bot/crews/
|
||||
rsync -av /opt/microdao-daarion/packages/agromatrix-tools/ /opt/microdao-daarion/gateway-bot/agromatrix-tools/ 2>/dev/null || true
|
||||
|
||||
# Перевірити що файли є в контейнері
|
||||
docker exec dagi-gateway-node1 ls /app/gateway-bot/crews/agromatrix_crew/ | head -20
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Крок 3 — Налаштувати env (секрети)
|
||||
|
||||
```bash
|
||||
# НА НОДА1 — створити/оновити env файл
|
||||
cat > /opt/microdao-daarion/.env.stepan.node1 << 'EOF'
|
||||
DEEPSEEK_API_KEY=sk-ВАШИЙ_КЛЮЧ_DEEPSEEK
|
||||
AGX_STEPAN_MODE=inproc
|
||||
AGX_OPERATOR_IDS=ВАШ_TELEGRAM_USER_ID
|
||||
TZ=Europe/Kyiv
|
||||
EOF
|
||||
|
||||
chmod 600 /opt/microdao-daarion/.env.stepan.node1
|
||||
```
|
||||
|
||||
Потім переконайтесь що `docker-compose.node1.yml` підключає цей файл у секції `gateway → env_file`:
|
||||
|
||||
```yaml
|
||||
# У docker-compose.node1.yml, секція gateway/environment або після volumes:
|
||||
env_file:
|
||||
- .env.stepan.node1
|
||||
```
|
||||
|
||||
> **Якщо env_file не хочете чіпати** — можна додати змінні прямо в секцію `environment:` compose файлу як `DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY}` і export їх у shell перед `docker compose up`.
|
||||
|
||||
---
|
||||
|
||||
## Крок 4 — Перезапустити gateway
|
||||
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
|
||||
# Варіант А — простий restart (якщо volume і env вже на місці, без rebuild)
|
||||
docker compose -f docker-compose.node1.yml restart dagi-gateway-node1
|
||||
sleep 15
|
||||
|
||||
# Варіант Б — повний rebuild (якщо змінився Dockerfile або requirements)
|
||||
docker compose -f docker-compose.node1.yml up -d --build gateway
|
||||
sleep 30
|
||||
|
||||
# Health check
|
||||
curl -s http://127.0.0.1:9300/health
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Крок 5 — Перевірити старт
|
||||
|
||||
```bash
|
||||
# Лог старту (20 сек після restart)
|
||||
docker logs dagi-gateway-node1 --since 3m 2>&1 | grep -E "Stepan|STEPAN|ImportError|ModuleNotFoundError|LLM:" | tail -20
|
||||
|
||||
# Env у контейнері (маскований)
|
||||
docker exec dagi-gateway-node1 env | grep -E "DEEPSEEK|OPENAI|AGX_" | sed 's/=.*/=***/'
|
||||
|
||||
# Тест імпортів
|
||||
docker exec dagi-gateway-node1 python3 -c "
|
||||
import sys; sys.path.insert(0, '/app/gateway-bot'); sys.path.insert(0, '/app/gateway-bot/agromatrix-tools')
|
||||
from crews.agromatrix_crew.session_context import load_session
|
||||
from crews.agromatrix_crew.depth_classifier import classify_depth
|
||||
from crews.agromatrix_crew.llm_factory import make_llm
|
||||
print('imports OK')
|
||||
llm = make_llm()
|
||||
print('LLM:', type(llm).__name__ if llm else 'None - NO API KEY!')
|
||||
"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Крок 6 — 5 Smoketests (Telegram, оператор)
|
||||
|
||||
| # | Повідомлення | Очікування | Grep |
|
||||
|---|---|---|---|
|
||||
| 1 | `Привіт` | ≤80 символів, без "чим допомогти" | `depth=light, crew_launch=false` |
|
||||
| 2 | `/whoami` | Показує user_id та is_operator=True | — |
|
||||
| 3 | `Зроби план на завтра по полю 12` | Deep, crew запущений | `depth=deep, crew_launch=true` |
|
||||
| 4 | `а на після завтра?` | Light (stability guard або follow-up) | `depth=light` |
|
||||
| 5 | `Дякую` | ≤40 символів | `crew_launch=false` |
|
||||
|
||||
---
|
||||
|
||||
## Крок 7 — Telemetry моніторинг (перші 30 хв)
|
||||
|
||||
```bash
|
||||
# Всі AGX_STEPAN_METRIC події
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 | grep "AGX_STEPAN_METRIC" | tail -80
|
||||
|
||||
# Depth розподіл
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 | grep "AGX_STEPAN_METRIC depth" | \
|
||||
awk '{for(i=1;i<=NF;i++) if($i~/^depth=/) print $i}' | sort | uniq -c
|
||||
|
||||
# Session layer
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 | grep "AGX_STEPAN_METRIC session_"
|
||||
|
||||
# LLM (DeepSeek) active
|
||||
docker logs dagi-gateway-node1 --since 30m 2>&1 | grep "LLM:" | tail -5
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rollback (якщо щось пішло не так)
|
||||
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
|
||||
# Відновити попередній http_api.py
|
||||
git checkout HEAD -- gateway-bot/http_api.py
|
||||
|
||||
# Видалити скопійовані crews з volume
|
||||
rm -rf /opt/microdao-daarion/gateway-bot/crews
|
||||
rm -rf /opt/microdao-daarion/gateway-bot/agromatrix-tools
|
||||
|
||||
# Restart без Stepan
|
||||
docker compose -f docker-compose.node1.yml restart dagi-gateway-node1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Важливі примітки
|
||||
|
||||
1. **Volume mount `:ro`** — `gateway-bot` монтується як read-only. Тому `crews/` і `agromatrix-tools/` скопійовані прямо в `/opt/microdao-daarion/gateway-bot/` (і відповідно видимі в контейнері як `/app/gateway-bot/crews/`).
|
||||
|
||||
2. **fail2ban** — при частих SSH-підключеннях сервер тимчасово блокує IP на ~5–10 хвилин. Якщо SSH відмовляє — почекайте 10 хв і пробуйте через IPv6.
|
||||
|
||||
3. **DeepSeek модель** — за замовчуванням `deepseek-chat`. Можна змінити через `DEEPSEEK_MODEL=deepseek-reasoner`.
|
||||
|
||||
4. **`AGX_OPERATOR_IDS`** — це числові Telegram user_id. Дізнатись свій: написати @userinfobot у Telegram.
|
||||
278
docs/NODA2-CLEANUP-REPORT.md
Normal file
278
docs/NODA2-CLEANUP-REPORT.md
Normal file
@@ -0,0 +1,278 @@
|
||||
# 🧹 Звіт: Очищення пам'яті на NODA2
|
||||
|
||||
## 📊 Загальний стан:
|
||||
|
||||
| Категорія | Розмір | Можливо видалити |
|
||||
|-----------|--------|------------------|
|
||||
| **Docker Images** | 34.12 GB | ✅ 28.56 GB (83%) |
|
||||
| **Docker Build Cache** | 22.53 GB | ✅ 9.85 GB |
|
||||
| **Market Data DB** | 67 GB | ❓ Питання |
|
||||
| **microdao3 Images** | 5.7 GB | ✅ Так |
|
||||
| **microdao3 Volumes** | ~500 MB | ✅ Так |
|
||||
| **Старі SQLite DB** | 32 KB | ✅ Так |
|
||||
|
||||
**Загалом можна звільнити: ~44-111 GB**
|
||||
|
||||
---
|
||||
|
||||
## 🔴 КРИТИЧНІ ВИДАЛЕННЯ (рекомендовано):
|
||||
|
||||
### 1. **microdao3 Docker Images - 5.7 GB**
|
||||
|
||||
Це **старі образи** від попереднього проекту, які НЕ використовуються:
|
||||
|
||||
```bash
|
||||
# Видалити всі microdao3 images
|
||||
docker rmi \
|
||||
microdao3-rag-service:latest \
|
||||
microdao3-memory-service:latest \
|
||||
microdao3-router:latest \
|
||||
microdao3-devtools:latest \
|
||||
microdao3-rbac:latest \
|
||||
microdao3-crewai:latest \
|
||||
microdao3-gateway:latest
|
||||
```
|
||||
|
||||
**Статус:** ❌ Не використовуються (немає запущених контейнерів)
|
||||
**Економія:** 5.7 GB
|
||||
|
||||
---
|
||||
|
||||
### 2. **microdao3 Docker Volumes - ~500 MB**
|
||||
|
||||
Старі volumes від microdao3:
|
||||
|
||||
```bash
|
||||
# Видалити старі volumes
|
||||
docker volume rm microdao3_postgres_data microdao3_redis_data
|
||||
```
|
||||
|
||||
**Статус:** ❌ Не використовуються
|
||||
**Економія:** ~500 MB
|
||||
|
||||
---
|
||||
|
||||
### 3. **Docker Build Cache - 9.85 GB**
|
||||
|
||||
Очистити кеш збірок:
|
||||
|
||||
```bash
|
||||
# Очистити весь build cache
|
||||
docker builder prune -a -f
|
||||
|
||||
# Або тільки старіший ніж 30 днів
|
||||
docker builder prune -f --filter "until=720h"
|
||||
```
|
||||
|
||||
**Статус:** ✅ Безпечно видалити
|
||||
**Економія:** 9.85 GB
|
||||
|
||||
---
|
||||
|
||||
### 4. **Docker Images (unused) - 28.56 GB**
|
||||
|
||||
Видалити образи що не використовуються:
|
||||
|
||||
```bash
|
||||
# Видалити всі unused images
|
||||
docker image prune -a -f
|
||||
|
||||
# Або тільки dangling images
|
||||
docker image prune -f
|
||||
```
|
||||
|
||||
**Статус:** ✅ Безпечно видалити (залишить тільки ті, що використовуються)
|
||||
**Економія:** до 28.56 GB
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ ПОТРЕБУЄ РІШЕННЯ:
|
||||
|
||||
### 5. **Market Data Service - 67 GB** 🚨
|
||||
|
||||
```
|
||||
/Users/apple/github-projects/microdao-daarion/services/market-data-service/
|
||||
├── market_data.db 27 GB (52M trades, 120M quotes)
|
||||
└── events.jsonl 40 GB (raw events data)
|
||||
```
|
||||
|
||||
**Що це:** Історичні дані ринку (trades, quotes) для аналітики
|
||||
|
||||
**Використання:**
|
||||
- ✅ Згадується в `docker-compose.node1.yml` (NODA1 - прод)
|
||||
- ❌ НЕ використовується на NODA2 (dev)
|
||||
- ❓ Питання: Чи потрібні ці дані для розробки?
|
||||
|
||||
**Варіанти:**
|
||||
|
||||
#### A. Видалити повністю (економія 67 GB)
|
||||
```bash
|
||||
rm /Users/apple/github-projects/microdao-daarion/services/market-data-service/market_data.db
|
||||
rm /Users/apple/github-projects/microdao-daarion/services/market-data-service/events.jsonl
|
||||
```
|
||||
|
||||
#### B. Архівувати (економія 50+ GB)
|
||||
```bash
|
||||
# Стиснути в архів
|
||||
cd /Users/apple/github-projects/microdao-daarion/services/market-data-service
|
||||
tar -czf market_data_archive.tar.gz market_data.db events.jsonl
|
||||
# Видалити оригінали
|
||||
rm market_data.db events.jsonl
|
||||
```
|
||||
|
||||
#### C. Залишити (0 GB економії)
|
||||
Якщо потрібні для аналітики на NODA2
|
||||
|
||||
#### D. Перенести на зовнішній диск
|
||||
Звільнити місце на ноутбуці, але зберегти дані
|
||||
|
||||
**Питання до користувача:** Чи потрібні вам ці дані ринку для розробки на NODA2?
|
||||
|
||||
---
|
||||
|
||||
### 6. **second_me_memory.db - 32 KB**
|
||||
|
||||
```
|
||||
/Users/apple/second_me_memory.db (32 KB)
|
||||
```
|
||||
|
||||
**Що це:** Тестова БД з жовтня 2025, містить 7 записів про P2P-SMP
|
||||
|
||||
**Варіанти:**
|
||||
|
||||
#### A. Видалити (рекомендовано)
|
||||
```bash
|
||||
rm /Users/apple/second_me_memory.db
|
||||
```
|
||||
|
||||
#### B. Залишити
|
||||
Якщо це важливі тести
|
||||
|
||||
**Економія:** 32 KB (незначно)
|
||||
|
||||
---
|
||||
|
||||
## ✅ ВИКОРИСТОВУЮТЬСЯ (не чіпати):
|
||||
|
||||
### 7. **dagi-postgres** - працює
|
||||
- Порт: 5432
|
||||
- Використання: мінімальне (тільки системні DB)
|
||||
- **Статус:** ✅ Залишити (потрібен для DAARION)
|
||||
|
||||
### 8. **dagi-redis** - працює
|
||||
- Порт: 6379
|
||||
- Використання: 1 MB
|
||||
- **Статус:** ✅ Залишити (потрібен для DAARION)
|
||||
|
||||
### 9. **memory.db** - 136 KB
|
||||
```
|
||||
/Users/apple/github-projects/microdao-daarion/services/memory-service/memory.db
|
||||
```
|
||||
- **Статус:** ✅ Залишити (активно використовується)
|
||||
|
||||
### 10. **OpenCode DB** - 708 KB
|
||||
```
|
||||
/Users/apple/.local/share/opencode/opencode.db
|
||||
```
|
||||
- **Статус:** ✅ Залишити (використовується зараз)
|
||||
|
||||
### 11. **Continue.dev DBs** - ~250 KB
|
||||
```
|
||||
/Users/apple/.continue/dev_data/devdata.sqlite
|
||||
/Users/apple/.continue/index/index.sqlite
|
||||
```
|
||||
- **Статус:** ✅ Залишити (VS Code extension)
|
||||
|
||||
---
|
||||
|
||||
## 🚀 РЕКОМЕНДОВАНИЙ ПЛАН ДІЙ:
|
||||
|
||||
### Етап 1: Безпечне очищення (~16 GB)
|
||||
|
||||
```bash
|
||||
# 1. Видалити старі microdao3 images (5.7 GB)
|
||||
docker rmi microdao3-rag-service:latest \
|
||||
microdao3-memory-service:latest \
|
||||
microdao3-router:latest \
|
||||
microdao3-devtools:latest \
|
||||
microdao3-rbac:latest \
|
||||
microdao3-crewai:latest \
|
||||
microdao3-gateway:latest
|
||||
|
||||
# 2. Видалити старі volumes (~500 MB)
|
||||
docker volume rm microdao3_postgres_data microdao3_redis_data
|
||||
|
||||
# 3. Очистити Docker build cache (9.85 GB)
|
||||
docker builder prune -a -f
|
||||
|
||||
# 4. Видалити second_me_memory.db (32 KB)
|
||||
rm /Users/apple/second_me_memory.db
|
||||
```
|
||||
|
||||
**Звільнено:** ~16 GB
|
||||
|
||||
---
|
||||
|
||||
### Етап 2: Агресивне очищення (додатково ~28 GB)
|
||||
|
||||
```bash
|
||||
# Видалити всі unused Docker images
|
||||
docker image prune -a -f
|
||||
```
|
||||
|
||||
**Звільнено:** додатково ~28 GB (загалом ~44 GB)
|
||||
|
||||
---
|
||||
|
||||
### Етап 3: Видалення market data (потенційно ~67 GB)
|
||||
|
||||
**❓ ПОТРІБНЕ ВАШЕ РІШЕННЯ:**
|
||||
|
||||
Чи потрібні вам дані ринку (67 GB) для розробки на NODA2?
|
||||
|
||||
**Якщо НІ:**
|
||||
```bash
|
||||
rm /Users/apple/github-projects/microdao-daarion/services/market-data-service/market_data.db
|
||||
rm /Users/apple/github-projects/microdao-daarion/services/market-data-service/events.jsonl
|
||||
```
|
||||
|
||||
**Звільнено:** додатково 67 GB (загалом до 111 GB)
|
||||
|
||||
---
|
||||
|
||||
## 📊 Підсумок:
|
||||
|
||||
| Дія | Економія | Ризик | Рекомендація |
|
||||
|-----|----------|-------|--------------|
|
||||
| Видалити microdao3 images | 5.7 GB | Немає | ✅ Видалити |
|
||||
| Видалити microdao3 volumes | 0.5 GB | Немає | ✅ Видалити |
|
||||
| Очистити Docker cache | 9.85 GB | Немає | ✅ Видалити |
|
||||
| Видалити unused images | 28 GB | Мінімальний | ✅ Видалити |
|
||||
| Видалити second_me DB | 32 KB | Немає | ✅ Видалити |
|
||||
| Видалити market data | 67 GB | **Потрібне рішення** | ❓ Запитати |
|
||||
|
||||
**Мінімум:** ~16 GB (безпечне очищення)
|
||||
**Максимум:** ~111 GB (агресивне + market data)
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Швидкий старт очищення:
|
||||
|
||||
### Безпечний режим (16 GB):
|
||||
```bash
|
||||
# Однією командою
|
||||
docker rmi $(docker images --format "{{.Repository}}:{{.Tag}}" | grep microdao3) 2>/dev/null || true
|
||||
docker volume rm microdao3_postgres_data microdao3_redis_data 2>/dev/null || true
|
||||
docker builder prune -a -f
|
||||
rm /Users/apple/second_me_memory.db
|
||||
```
|
||||
|
||||
### Агресивний режим (44 GB):
|
||||
```bash
|
||||
# Безпечний режим + unused images
|
||||
docker image prune -a -f
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**Який варіант обираєте? Чи потрібні дані ринку (67 GB)?**
|
||||
229
docs/NODA2-MEMORY-FINAL.md
Normal file
229
docs/NODA2-MEMORY-FINAL.md
Normal file
@@ -0,0 +1,229 @@
|
||||
# ✅ Memory Stack для Sofiia на NODA2 - ГОТОВО!
|
||||
|
||||
## 🎉 Фінальний статус:
|
||||
|
||||
### Очищення диску ✅
|
||||
| Дія | Звільнено |
|
||||
|-----|-----------|
|
||||
| Docker images | 5.7 GB |
|
||||
| Docker cache | 14.5 GB |
|
||||
| Market Data | 67 GB |
|
||||
| **Загалом** | **~87 GB** |
|
||||
|
||||
**Диск:** 695 GB вільно ✅
|
||||
|
||||
---
|
||||
|
||||
## 📦 Memory Stack статус:
|
||||
|
||||
| Сервіс | Статус | Порт | Примітка |
|
||||
|--------|--------|------|----------|
|
||||
| **Qdrant** | ✅ Працює | 6333 | Collections готові |
|
||||
| **PostgreSQL** | ✅ Healthy | 5433 | БД створено |
|
||||
| **Neo4j** | ✅ Працює | 7474 | Web UI доступний |
|
||||
| **Memory Service** | ✅ Працює | 8000 | API доступний |
|
||||
| **Redis** | ✅ Healthy | 6379 | Cache готовий |
|
||||
|
||||
**Примітка:** Health checks показують "unhealthy" через відсутність curl в контейнерах, але сервіси працюють!
|
||||
|
||||
---
|
||||
|
||||
## 🧠 Sofiia Memory Collections:
|
||||
|
||||
```
|
||||
✅ sofiia_messages (0 points) - готова до використання
|
||||
✅ sofiia_docs (0 points) - готова до використання
|
||||
✅ sofiia_memory_items (0 points) - готова до використання
|
||||
✅ sofiia_user_context (0 points) - готова до використання
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🌐 Доступні ендпоінти:
|
||||
|
||||
| UI | URL | Логін/Пароль |
|
||||
|----|-----|--------------|
|
||||
| **Qdrant Dashboard** | http://localhost:6333/dashboard | - |
|
||||
| **Neo4j Browser** | http://localhost:7474 | neo4j / daarion_node2_secret |
|
||||
| **Memory API Docs** | http://localhost:8000/docs | - |
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Що працює:
|
||||
|
||||
### ✅ Qdrant (Vector DB)
|
||||
```bash
|
||||
# Health check
|
||||
curl http://localhost:6333/healthz
|
||||
# → "healthz check passed"
|
||||
|
||||
# Sofiia collections
|
||||
curl http://localhost:6333/collections
|
||||
# → sofiia_messages, sofiia_docs, sofiia_memory_items, sofiia_user_context
|
||||
```
|
||||
|
||||
### ✅ PostgreSQL (Relational DB)
|
||||
```bash
|
||||
# Connect
|
||||
docker exec -it dagi-postgres-node2 psql -U daarion -d daarion_memory
|
||||
|
||||
# Tables created:
|
||||
# - user_facts
|
||||
# - dialog_summaries
|
||||
# - agent_memory_events
|
||||
```
|
||||
|
||||
### ✅ Neo4j (Graph DB)
|
||||
```bash
|
||||
# Web UI
|
||||
open http://localhost:7474
|
||||
# Login: neo4j / daarion_node2_secret
|
||||
|
||||
# Test query
|
||||
MATCH (n) RETURN n LIMIT 10
|
||||
```
|
||||
|
||||
### ✅ Memory Service API
|
||||
```bash
|
||||
# API Documentation
|
||||
open http://localhost:8000/docs
|
||||
|
||||
# Health (не працює через Qdrant версію, але сервіс працює!)
|
||||
curl http://localhost:8000/health
|
||||
|
||||
# Тестове повідомлення напряму в Qdrant:
|
||||
curl -X PUT http://localhost:6333/collections/sofiia_messages/points \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"points": [{
|
||||
"id": "1",
|
||||
"vector": [0.1, 0.2, 0.3],
|
||||
"payload": {"text": "Hello from Sofiia on NODA2", "user": "test"}
|
||||
}]
|
||||
}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Використання Sofiia Memory:
|
||||
|
||||
### Варіант 1: Напряму через Qdrant
|
||||
|
||||
```python
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
client = QdrantClient(host="localhost", port=6333)
|
||||
|
||||
# Зберегти повідомлення
|
||||
client.upsert(
|
||||
collection_name="sofiia_messages",
|
||||
points=[{
|
||||
"id": "1",
|
||||
"vector": embedding, # 1024 dimensions from Cohere
|
||||
"payload": {
|
||||
"user_id": "telegram:123456",
|
||||
"content": "User asked about DAARION architecture",
|
||||
"role": "user",
|
||||
"timestamp": "2026-02-23T00:00:00Z"
|
||||
}
|
||||
}]
|
||||
)
|
||||
|
||||
# Пошук
|
||||
results = client.search(
|
||||
collection_name="sofiia_messages",
|
||||
query_vector=query_embedding,
|
||||
limit=10
|
||||
)
|
||||
```
|
||||
|
||||
### Варіант 2: Через Memory Service API
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Зберегти повідомлення
|
||||
response = requests.post(
|
||||
"http://localhost:8000/v1/memory",
|
||||
json={
|
||||
"agent_id": "sofiia",
|
||||
"user_id": "telegram:123456",
|
||||
"content": "Important architecture decision...",
|
||||
"metadata": {"topic": "architecture"}
|
||||
}
|
||||
)
|
||||
|
||||
# Отримати контекст
|
||||
context = requests.get(
|
||||
"http://localhost:8000/v1/context",
|
||||
params={"agent_id": "sofiia", "user_id": "telegram:123456"}
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Наступні кроки:
|
||||
|
||||
### 1. Підключити OpenClaw до Memory Service
|
||||
|
||||
Додати в `~/.openclaw/openclaw.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"agents": {
|
||||
"list": [
|
||||
{
|
||||
"id": "sofiia",
|
||||
"memory": {
|
||||
"enabled": true,
|
||||
"serviceUrl": "http://localhost:8000",
|
||||
"collections": {
|
||||
"messages": "sofiia_messages",
|
||||
"docs": "sofiia_docs",
|
||||
"memory": "sofiia_memory_items",
|
||||
"context": "sofiia_user_context"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Підключитись до NODA1 Memory Service
|
||||
|
||||
```bash
|
||||
# NODA1 Memory API
|
||||
curl http://144.76.224.179:8000/health
|
||||
# → {"status":"healthy"}
|
||||
|
||||
# Використовувати для production даних
|
||||
```
|
||||
|
||||
### 3. Налаштувати синхронізацію NODA1 ↔ NODA2
|
||||
|
||||
```yaml
|
||||
# Гібридний режим:
|
||||
# - NODA2: dev/test дані (локально)
|
||||
# - NODA1: production дані (віддалено)
|
||||
# - Sync: через NATS або API
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ Підсумок:
|
||||
|
||||
| Компонент | Статус | Коментар |
|
||||
|-----------|--------|----------|
|
||||
| Очищення | ✅ 87 GB | Готово |
|
||||
| Qdrant | ✅ Running | Sofiia collections готові |
|
||||
| PostgreSQL | ✅ Healthy | БД створено |
|
||||
| Neo4j | ✅ Running | Web UI працює |
|
||||
| Memory Service | ✅ Running | API доступний |
|
||||
| Sofiia Collections | ✅ 4/4 | Готові до використання |
|
||||
|
||||
---
|
||||
|
||||
**Memory Stack для Sofiia на NODA2 повністю налаштований! 🎉**
|
||||
|
||||
**Наступний крок:** Підключити OpenClaw та інтегрувати з NODA1.
|
||||
389
docs/NODA2-MEMORY-QUICKSTART.md
Normal file
389
docs/NODA2-MEMORY-QUICKSTART.md
Normal file
@@ -0,0 +1,389 @@
|
||||
# 🚀 Memory Stack - NODA2 Quick Start
|
||||
|
||||
## ✅ Що вже готово:
|
||||
|
||||
1. **Cohere API Key** додано в .env ✅
|
||||
2. **Docker Compose конфігурація** створена ✅
|
||||
3. **Скрипти запуску** готові ✅
|
||||
|
||||
---
|
||||
|
||||
## 📦 Компоненти Memory Stack:
|
||||
|
||||
| Сервіс | Порт | Призначення | Статус |
|
||||
|--------|------|-------------|--------|
|
||||
| **Qdrant** | 6333, 6334 | Векторна БД | ⏳ To Start |
|
||||
| **PostgreSQL** | 5433 | Реляційна БД | ⏳ To Start |
|
||||
| **Neo4j** | 7474, 7687 | Графова БД | ⏳ To Start |
|
||||
| **Memory Service** | 8000 | API для пам'яті | ⏳ To Start |
|
||||
| **Redis** | 6379 | Кешування | ⏳ To Start |
|
||||
| **Adminer** | 8080 | UI для БД | ⏳ To Start |
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Запуск Memory Stack:
|
||||
|
||||
### Варіант 1: Через скрипт (рекомендовано)
|
||||
|
||||
```bash
|
||||
cd /Users/apple/github-projects/microdao-daarion
|
||||
./scripts/start-memory-node2.sh
|
||||
```
|
||||
|
||||
### Варіант 2: Напряму через Docker Compose
|
||||
|
||||
```bash
|
||||
cd /Users/apple/github-projects/microdao-daarion
|
||||
|
||||
# Запустити всі сервіси
|
||||
docker-compose -f docker-compose.memory-node2.yml up -d
|
||||
|
||||
# Перевірити статус
|
||||
docker-compose -f docker-compose.memory-node2.yml ps
|
||||
|
||||
# Переглянути логи
|
||||
docker-compose -f docker-compose.memory-node2.yml logs -f
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📝 Після запуску:
|
||||
|
||||
### 1. Перевірити здоров'я сервісів
|
||||
|
||||
```bash
|
||||
# Qdrant
|
||||
curl http://localhost:6333/healthz
|
||||
|
||||
# PostgreSQL
|
||||
docker exec dagi-postgres-node2 pg_isready -U daarion
|
||||
|
||||
# Memory Service
|
||||
curl http://localhost:8000/health
|
||||
|
||||
# Neo4j (може потребувати 30-40 сек)
|
||||
curl http://localhost:7474
|
||||
```
|
||||
|
||||
### 2. Ініціалізувати колекції Sofiia
|
||||
|
||||
```bash
|
||||
# Створити колекції для Sofiia
|
||||
python3 scripts/init-sofiia-memory.py
|
||||
```
|
||||
|
||||
Це створить:
|
||||
- `sofiia_messages` - історія повідомлень
|
||||
- `sofiia_docs` - документація
|
||||
- `sofiia_memory_items` - довгострокова пам'ять
|
||||
- `sofiia_user_context` - контекст користувачів
|
||||
|
||||
### 3. Перевірити колекції
|
||||
|
||||
```bash
|
||||
# Список всіх колекцій
|
||||
curl http://localhost:6333/collections | jq
|
||||
|
||||
# Інформація про конкретну колекцію
|
||||
curl http://localhost:6333/collections/sofiia_messages | jq
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Інтерфейси:
|
||||
|
||||
### Qdrant Dashboard
|
||||
- **URL:** http://localhost:6333/dashboard
|
||||
- **Функції:** Перегляд колекцій, пошук векторів, статистика
|
||||
|
||||
### Neo4j Browser
|
||||
- **URL:** http://localhost:7474
|
||||
- **Login:** neo4j
|
||||
- **Password:** daarion_node2_secret
|
||||
- **Функції:** Візуалізація графу, Cypher запити
|
||||
|
||||
### Adminer (PostgreSQL UI)
|
||||
- **URL:** http://localhost:8080
|
||||
- **System:** PostgreSQL
|
||||
- **Server:** postgres-node2
|
||||
- **Username:** daarion
|
||||
- **Password:** daarion_secret_node2
|
||||
- **Database:** daarion_memory
|
||||
|
||||
### Memory Service API
|
||||
- **Health:** http://localhost:8000/health
|
||||
- **API Docs:** http://localhost:8000/docs (Swagger UI)
|
||||
- **ReDoc:** http://localhost:8000/redoc
|
||||
|
||||
---
|
||||
|
||||
## 🔌 Підключення до Sofiia:
|
||||
|
||||
### Для OpenClaw:
|
||||
|
||||
```json
|
||||
{
|
||||
"agents": {
|
||||
"list": [
|
||||
{
|
||||
"id": "sofiia",
|
||||
"memory": {
|
||||
"enabled": true,
|
||||
"serviceUrl": "http://localhost:8000",
|
||||
"collections": {
|
||||
"messages": "sofiia_messages",
|
||||
"docs": "sofiia_docs",
|
||||
"memory": "sofiia_memory_items",
|
||||
"context": "sofiia_user_context"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Для Python коду:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Збереження повідомлення
|
||||
response = requests.post(
|
||||
"http://localhost:8000/agents/sofiia/memory",
|
||||
json={
|
||||
"user_id": "telegram:123456",
|
||||
"channel_id": "telegram:sofiia",
|
||||
"content": "User asked about DAARION architecture",
|
||||
"role": "user",
|
||||
"metadata": {
|
||||
"topic": "architecture",
|
||||
"project": "DAARION"
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
# Отримання контексту
|
||||
context = requests.get(
|
||||
"http://localhost:8000/agents/sofiia/context",
|
||||
params={
|
||||
"user_id": "telegram:123456",
|
||||
"query": "архітектура",
|
||||
"limit": 10
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Корисні команди:
|
||||
|
||||
### Docker Compose
|
||||
|
||||
```bash
|
||||
# Зупинити всі сервіси
|
||||
docker-compose -f docker-compose.memory-node2.yml down
|
||||
|
||||
# Перезапустити конкретний сервіс
|
||||
docker-compose -f docker-compose.memory-node2.yml restart memory-service-node2
|
||||
|
||||
# Переглянути логи сервісу
|
||||
docker-compose -f docker-compose.memory-node2.yml logs -f memory-service-node2
|
||||
|
||||
# Статус всіх сервісів
|
||||
docker-compose -f docker-compose.memory-node2.yml ps
|
||||
```
|
||||
|
||||
### Qdrant
|
||||
|
||||
```bash
|
||||
# Список колекцій
|
||||
curl http://localhost:6333/collections
|
||||
|
||||
# Створити колекцію вручну
|
||||
curl -X PUT http://localhost:6333/collections/test_collection \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"vectors": {"size": 1024, "distance": "Cosine"}}'
|
||||
|
||||
# Видалити колекцію
|
||||
curl -X DELETE http://localhost:6333/collections/test_collection
|
||||
```
|
||||
|
||||
### PostgreSQL
|
||||
|
||||
```bash
|
||||
# Підключитись до БД
|
||||
docker exec -it dagi-postgres-node2 psql -U daarion -d daarion_memory
|
||||
|
||||
# Створити таблицю
|
||||
CREATE TABLE test_table (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name TEXT
|
||||
);
|
||||
|
||||
# Переглянути таблиці
|
||||
\dt
|
||||
```
|
||||
|
||||
### Neo4j
|
||||
|
||||
```bash
|
||||
# Підключитись через Cypher Shell
|
||||
docker exec -it dagi-neo4j-node2 cypher-shell -u neo4j -p daarion_node2_secret
|
||||
|
||||
# Створити тестовий вузол
|
||||
CREATE (n:Test {name: 'Sofiia'}) RETURN n;
|
||||
|
||||
# Переглянути всі вузли
|
||||
MATCH (n) RETURN n LIMIT 10;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Моніторинг:
|
||||
|
||||
### Перевірка використання ресурсів
|
||||
|
||||
```bash
|
||||
# Всі контейнери
|
||||
docker stats --no-stream
|
||||
|
||||
# Конкретний контейнер
|
||||
docker stats --no-stream dagi-qdrant-node2 dagi-postgres-node2 dagi-neo4j-node2
|
||||
```
|
||||
|
||||
### Перевірка дискового простору
|
||||
|
||||
```bash
|
||||
# Розмір даних
|
||||
du -sh /Users/apple/github-projects/microdao-daarion/data/*
|
||||
|
||||
# Docker volumes
|
||||
docker volume ls
|
||||
docker system df
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔄 Гібридний режим (NODA1 + NODA2):
|
||||
|
||||
### Увімкнути доступ до NODA1:
|
||||
|
||||
Відкоментуйте в `docker-compose.memory-node2.yml`:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
# Remote NODA1 access
|
||||
- REMOTE_QDRANT_HOST=144.76.224.179
|
||||
- REMOTE_QDRANT_PORT=6333
|
||||
- REMOTE_DATABASE_URL=postgresql://daarion_reader:***@144.76.224.179:5432/daarion_memory
|
||||
- READ_ONLY_MODE=false
|
||||
```
|
||||
|
||||
### Використання:
|
||||
|
||||
```python
|
||||
# Локальна пам'ять (NODA2)
|
||||
local_memory = MemoryService(url="http://localhost:8000")
|
||||
|
||||
# Віддалена пам'ять (NODA1)
|
||||
remote_memory = MemoryService(url="http://144.76.224.179:8000")
|
||||
|
||||
# Гібридний пошук
|
||||
results = await hybrid_search(
|
||||
query="архітектура",
|
||||
local_service=local_memory,
|
||||
remote_service=remote_memory
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚨 Troubleshooting:
|
||||
|
||||
### Проблема: Qdrant не стартує
|
||||
|
||||
```bash
|
||||
# Перевірити логи
|
||||
docker logs dagi-qdrant-node2
|
||||
|
||||
# Перевірити права доступу
|
||||
ls -la /Users/apple/github-projects/microdao-daarion/data/qdrant-node2
|
||||
|
||||
# Перезапустити
|
||||
docker-compose -f docker-compose.memory-node2.yml restart qdrant-node2
|
||||
```
|
||||
|
||||
### Проблема: PostgreSQL не приймає підключення
|
||||
|
||||
```bash
|
||||
# Перевірити чи готовий
|
||||
docker exec dagi-postgres-node2 pg_isready
|
||||
|
||||
# Перевірити логи
|
||||
docker logs dagi-postgres-node2
|
||||
|
||||
# Перевірити пароль
|
||||
docker exec -it dagi-postgres-node2 psql -U daarion -d daarion_memory
|
||||
```
|
||||
|
||||
### Проблема: Memory Service не бачить Qdrant
|
||||
|
||||
```bash
|
||||
# Перевірити мережу
|
||||
docker network inspect dagi-memory-network-node2
|
||||
|
||||
# Перевірити DNS
|
||||
docker exec dagi-memory-service-node2 ping qdrant-node2
|
||||
|
||||
# Перевірити з'єднання
|
||||
docker exec dagi-memory-service-node2 curl http://qdrant-node2:6333/healthz
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ✅ Чек-лист:
|
||||
|
||||
- [ ] Cohere API Key в .env
|
||||
- [ ] Docker Compose запущено
|
||||
- [ ] Всі сервіси healthy
|
||||
- [ ] Колекції Sofiia створено
|
||||
- [ ] Memory Service API доступний
|
||||
- [ ] UI (Qdrant, Neo4j, Adminer) відкриваються
|
||||
- [ ] OpenClaw налаштовано
|
||||
- [ ] Тестове повідомлення збережено
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Наступні кроки після запуску:
|
||||
|
||||
1. **Запустити Memory Stack**
|
||||
```bash
|
||||
./scripts/start-memory-node2.sh
|
||||
```
|
||||
|
||||
2. **Ініціалізувати колекції**
|
||||
```bash
|
||||
python3 scripts/init-sofiia-memory.py
|
||||
```
|
||||
|
||||
3. **Налаштувати OpenClaw**
|
||||
- Додати конфігурацію пам'яті
|
||||
|
||||
4. **Протестувати**
|
||||
- Зберегти тестове повідомлення
|
||||
- Отримати контекст
|
||||
- Перевірити в Qdrant UI
|
||||
|
||||
5. **Підключити Sofiia**
|
||||
- Telegram бот з пам'яттю
|
||||
- Notion інтеграція
|
||||
- GitHub інтеграція
|
||||
|
||||
---
|
||||
|
||||
**Готові до запуску! 🚀**
|
||||
|
||||
```bash
|
||||
./scripts/start-memory-node2.sh
|
||||
```
|
||||
368
docs/NODA2-MEMORY-SETUP.md
Normal file
368
docs/NODA2-MEMORY-SETUP.md
Normal file
@@ -0,0 +1,368 @@
|
||||
# 🧠 Модуль Пам'яті для Агента Sofiia на NODA2
|
||||
|
||||
## 📊 Архітектура Пам'яті DAARION
|
||||
|
||||
### Трирівнева система пам'яті:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SOFIIA MEMORY STACK │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ Qdrant │ │ PostgreSQL │ │ Neo4j │ │
|
||||
│ │ (Vector) │ │ (Relational) │ │ (Graph) │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
│ │ │ │ │
|
||||
│ └──────────────────┼──────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────▼────────┐ │
|
||||
│ │ Memory Service │ │
|
||||
│ │ (:8000) │ │
|
||||
│ └─────────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────▼────────┐ │
|
||||
│ │ Sofiia Agent │ │
|
||||
│ │ (OpenClaw) │ │
|
||||
│ └─────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 1. Векторна Пам'ять (Qdrant)
|
||||
|
||||
### Колекції для Sofiia:
|
||||
|
||||
| Колекція | Призначення | Dimension |
|
||||
|----------|-------------|-----------|
|
||||
| `sofiia_messages` | Історія повідомлень діалогів | 1024 |
|
||||
| `sofiia_docs` | Документи та knowledge base | 1024 |
|
||||
| `sofiia_memory_items` | Long-term memory items | 1024 |
|
||||
| `sofiia_user_context` | Контекст користувачів | 1024 |
|
||||
|
||||
**Embedding model:** Cohere embed-multilingual-v3.0 (1024 dimensions)
|
||||
|
||||
### Що зберігається:
|
||||
- Повідомлення користувачів та відповіді Sofiia
|
||||
- Документація проектів
|
||||
- Контекстні дані про користувачів
|
||||
- Long-term memories (важливі факти, рішення)
|
||||
|
||||
---
|
||||
|
||||
## 2. Реляційна Пам'ять (PostgreSQL)
|
||||
|
||||
### Таблиця `user_facts`:
|
||||
|
||||
| Поле | Тип | Опис |
|
||||
|------|-----|------|
|
||||
| `fact_id` | UUID | Унікальний ID |
|
||||
| `user_id` | String | ID користувача |
|
||||
| `team_id` | String | ID команди/DAO |
|
||||
| `agent_id` | String | **"sofiia"** |
|
||||
| `fact_key` | String | Ключ факту |
|
||||
| `fact_value` | Text | Текстове значення |
|
||||
| `fact_value_json` | JSONB | Структуровані дані |
|
||||
|
||||
### Приклади фактів:
|
||||
- `name`: "Олександр"
|
||||
- `preferences`: {"language": "uk", "style": "formal"}
|
||||
- `chat_event:2026-02-22`: "Обговорювали архітектуру DAARION"
|
||||
|
||||
---
|
||||
|
||||
## 3. Графова Пам'ять (Neo4j)
|
||||
|
||||
### Node types:
|
||||
- `Agent` - Sofiia
|
||||
- `User` - Користувачі
|
||||
- `Channel` - Telegram, Slack, etc.
|
||||
- `Message` - Повідомлення
|
||||
- `Topic` - Теми розмов
|
||||
- `Project` - Проєкти (DAARION, NODA2, etc.)
|
||||
|
||||
### Relationships:
|
||||
```
|
||||
(User)-[:SENT]->(Message)
|
||||
(Sofiia)-[:RESPONDED]->(Message)
|
||||
(Message)-[:IN_CHANNEL]->(Telegram)
|
||||
(Message)-[:ABOUT]->(Architecture)
|
||||
(Message)-[:REFERENCES]->(Project:DAARION)
|
||||
```
|
||||
|
||||
### Що дає:
|
||||
- Зв'язки між користувачами та темами
|
||||
- Історія розмов по проектах
|
||||
- Виявлення залежностей
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Варіанти налаштування на NODA2:
|
||||
|
||||
### Варіант A: Локальна пам'ять (ПОВНА НЕЗАЛЕЖНІСТЬ) ✨
|
||||
|
||||
**Переваги:**
|
||||
- ✅ Повна ізоляція від NODA1
|
||||
- ✅ Швидкий доступ (локально)
|
||||
- ✅ Можна тестувати без впливу на прод
|
||||
- ✅ Dev-середовище
|
||||
|
||||
**Недоліки:**
|
||||
- ❌ Не бачить пам'ять з NODA1
|
||||
- ❌ Потрібно більше ресурсів
|
||||
- ❌ Окремі дані для dev
|
||||
|
||||
**Що потрібно:**
|
||||
```yaml
|
||||
services:
|
||||
# Qdrant для векторної пам'яті
|
||||
qdrant-node2:
|
||||
image: qdrant/qdrant:latest
|
||||
container_name: dagi-qdrant-node2
|
||||
ports:
|
||||
- "6333:6333"
|
||||
- "6334:6334"
|
||||
volumes:
|
||||
- ./data/qdrant-node2:/qdrant/storage
|
||||
environment:
|
||||
- QDRANT__SERVICE__HOST=0.0.0.0
|
||||
|
||||
# PostgreSQL для реляційної пам'яті
|
||||
postgres-node2:
|
||||
image: postgres:16
|
||||
container_name: dagi-postgres-node2
|
||||
ports:
|
||||
- "5433:5432"
|
||||
environment:
|
||||
- POSTGRES_DB=daarion_memory
|
||||
- POSTGRES_USER=daarion
|
||||
- POSTGRES_PASSWORD=daarion_secret
|
||||
volumes:
|
||||
- ./data/postgres-node2:/var/lib/postgresql/data
|
||||
|
||||
# Neo4j для графової пам'яті (опціонально)
|
||||
neo4j-node2:
|
||||
image: neo4j:5.15
|
||||
container_name: dagi-neo4j-node2
|
||||
ports:
|
||||
- "7474:7474"
|
||||
- "7687:7687"
|
||||
environment:
|
||||
- NEO4J_AUTH=neo4j/daarion_secret
|
||||
volumes:
|
||||
- ./data/neo4j-node2:/data
|
||||
|
||||
# Memory Service
|
||||
memory-service-node2:
|
||||
build: ./services/memory-service
|
||||
container_name: dagi-memory-service-node2
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
- QDRANT_HOST=qdrant-node2
|
||||
- QDRANT_PORT=6333
|
||||
- DATABASE_URL=postgresql://daarion:daarion_secret@postgres-node2:5432/daarion_memory
|
||||
- NEO4J_URI=bolt://neo4j-node2:7687
|
||||
- COHERE_API_KEY=${COHERE_API_KEY}
|
||||
depends_on:
|
||||
- qdrant-node2
|
||||
- postgres-node2
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Варіант B: Підключення до NODA1 (РЕПЛІКА) 🔄
|
||||
|
||||
**Переваги:**
|
||||
- ✅ Бачить пам'ять з NODA1
|
||||
- ✅ Економія ресурсів
|
||||
- ✅ Read-replica для аналітики
|
||||
- ✅ Реальний прод-контекст
|
||||
|
||||
**Недоліки:**
|
||||
- ❌ Залежність від NODA1
|
||||
- ❌ Мережева затримка
|
||||
- ❌ Не можна писати (read-only)
|
||||
|
||||
**Що потрібно:**
|
||||
```yaml
|
||||
services:
|
||||
# Memory Service підключається до NODA1
|
||||
memory-service-node2:
|
||||
build: ./services/memory-service
|
||||
container_name: dagi-memory-service-node2
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
# Підключення до NODA1 Qdrant
|
||||
- QDRANT_HOST=144.76.224.179
|
||||
- QDRANT_PORT=6333
|
||||
|
||||
# Підключення до NODA1 PostgreSQL (read replica)
|
||||
- DATABASE_URL=postgresql://daarion_reader:***@144.76.224.179:5432/daarion_memory
|
||||
|
||||
# Підключення до NODA1 Neo4j (read replica)
|
||||
- NEO4J_URI=bolt://144.76.224.179:7687
|
||||
|
||||
- READ_ONLY_MODE=true
|
||||
- COHERE_API_KEY=${COHERE_API_KEY}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Варіант C: Гібридний (РЕКОМЕНДОВАНО) ⭐
|
||||
|
||||
**Переваги:**
|
||||
- ✅ Локальна пам'ять для dev/test
|
||||
- ✅ Можливість підключитись до NODA1 за потреби
|
||||
- ✅ Гнучкість
|
||||
- ✅ Повна ізоляція для експериментів
|
||||
|
||||
**Архітектура:**
|
||||
```
|
||||
NODA2 (Development)
|
||||
├── Local Memory Stack
|
||||
│ ├── Qdrant (:6333)
|
||||
│ ├── PostgreSQL (:5433)
|
||||
│ └── Memory Service (:8000)
|
||||
│
|
||||
└── Optional: Remote NODA1 Access
|
||||
└── Environment variable switch
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Рекомендація: Почати з Варіанту C
|
||||
|
||||
### Крок 1: Створити docker-compose.memory-node2.yml
|
||||
|
||||
```bash
|
||||
cd /Users/apple/github-projects/microdao-daarion
|
||||
```
|
||||
|
||||
Файл: `docker-compose.memory-node2.yml`
|
||||
|
||||
### Крок 2: Отримати Cohere API Key
|
||||
|
||||
Для embedding моделі потрібен ключ:
|
||||
1. Зайти на https://cohere.ai
|
||||
2. Зареєструватись
|
||||
3. Отримати API Key
|
||||
|
||||
### Крок 3: Запустити Memory Stack
|
||||
|
||||
```bash
|
||||
# Додати Cohere API Key в .env
|
||||
echo "COHERE_API_KEY=your_cohere_key_here" >> .env
|
||||
|
||||
# Запустити
|
||||
docker-compose -f docker-compose.memory-node2.yml up -d
|
||||
|
||||
# Перевірити
|
||||
docker ps | grep -E "memory|qdrant|postgres"
|
||||
```
|
||||
|
||||
### Крок 4: Налаштувати Sofiia для використання пам'яті
|
||||
|
||||
В OpenClaw конфігурації додати:
|
||||
|
||||
```json
|
||||
{
|
||||
"agents": {
|
||||
"list": [
|
||||
{
|
||||
"id": "sofiia",
|
||||
"model": {
|
||||
"primary": "xai/grok-4-1-fast-reasoning"
|
||||
},
|
||||
"memory": {
|
||||
"enabled": true,
|
||||
"serviceUrl": "http://localhost:8000",
|
||||
"collections": {
|
||||
"messages": "sofiia_messages",
|
||||
"docs": "sofiia_docs",
|
||||
"memory": "sofiia_memory_items",
|
||||
"context": "sofiia_user_context"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📝 Приклад використання пам'яті Sofiia:
|
||||
|
||||
### Збереження повідомлення:
|
||||
```python
|
||||
await memory_client.save_memory(
|
||||
agent_id="sofiia",
|
||||
user_id="telegram:123456",
|
||||
channel_id="telegram:sofiia",
|
||||
content="User asked about DAARION architecture",
|
||||
role="user",
|
||||
metadata={
|
||||
"topic": "architecture",
|
||||
"intent": "question",
|
||||
"project": "DAARION"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Отримання контексту:
|
||||
```python
|
||||
context = await memory_client.get_context(
|
||||
agent_id="sofiia",
|
||||
user_id="telegram:123456",
|
||||
query="архітектура",
|
||||
limit=10
|
||||
)
|
||||
```
|
||||
|
||||
### Збереження факту:
|
||||
```python
|
||||
await memory_client.save_fact(
|
||||
agent_id="sofiia",
|
||||
user_id="telegram:123456",
|
||||
fact_key="preferences",
|
||||
fact_value_json={
|
||||
"language": "uk",
|
||||
"style": "technical",
|
||||
"projects": ["DAARION", "NODA2"]
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Статистика пам'яті (для моніторингу):
|
||||
|
||||
```sql
|
||||
-- Кількість фактів для Sofiia
|
||||
SELECT COUNT(*) FROM user_facts WHERE agent_id = 'sofiia';
|
||||
|
||||
-- Останні повідомлення
|
||||
SELECT * FROM user_facts
|
||||
WHERE agent_id = 'sofiia'
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 10;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Наступні кроки:
|
||||
|
||||
1. ✅ Ознайомитись з архітектурою пам'яті (готово!)
|
||||
2. ⏳ Отримати Cohere API Key
|
||||
3. ⏳ Створити docker-compose.memory-node2.yml
|
||||
4. ⏳ Запустити Memory Stack
|
||||
5. ⏳ Налаштувати OpenClaw для підключення
|
||||
6. ⏳ Протестувати збереження та пошук
|
||||
|
||||
---
|
||||
|
||||
**Який варіант цікавить вас більше? Можу допомогти з реалізацією! 🚀**
|
||||
330
docs/aistalk/contract.md
Normal file
330
docs/aistalk/contract.md
Normal file
@@ -0,0 +1,330 @@
|
||||
# AISTALK ↔ Sofiia Console — Integration Contract
|
||||
|
||||
Version: 1.0
|
||||
Date: 2026-02-25
|
||||
Status: **STUB READY** — integration pending AISTALK implementation
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
AISTALK connects to Sofiia Console BFF (`sofiia-console`, port 8002) via two channels:
|
||||
|
||||
| Channel | Direction | Protocol |
|
||||
|---|---|---|
|
||||
| `/ws/events` | BFF → AISTALK | WebSocket (text/JSON) |
|
||||
| `/api/chat/send` | AISTALK → BFF | HTTP POST |
|
||||
| `/api/voice/stt` | AISTALK → BFF | HTTP POST multipart |
|
||||
| `/api/voice/tts` | AISTALK → BFF | HTTP POST → audio stream |
|
||||
|
||||
---
|
||||
|
||||
## 1. WebSocket Event Stream: `/ws/events`
|
||||
|
||||
AISTALK connects as a subscriber to receive all platform events in real time.
|
||||
|
||||
### Connection
|
||||
|
||||
```
|
||||
ws://<BFF_HOST>:8002/ws/events
|
||||
```
|
||||
|
||||
Optional auth header (if `SOFIIA_CONSOLE_API_KEY` is set):
|
||||
```
|
||||
X-API-Key: <key>
|
||||
```
|
||||
|
||||
### Keep-alive (ping/pong)
|
||||
|
||||
Client should send `{"type":"ping"}` every 10–30s.
|
||||
Server responds with `{"type":"pong","ts":"..."}`.
|
||||
|
||||
### Event Envelope
|
||||
|
||||
Every event has this shape:
|
||||
|
||||
```json
|
||||
{
|
||||
"v": 1,
|
||||
"type": "<event_type>",
|
||||
"ts": "2026-02-25T12:34:56.789Z",
|
||||
"project_id": "default",
|
||||
"session_id": "sess_abc123",
|
||||
"user_id": "console_user",
|
||||
"data": { ... }
|
||||
}
|
||||
```
|
||||
|
||||
### Event Types AISTALK Should Consume
|
||||
|
||||
#### `chat.message` — user sent a message
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"text": "...",
|
||||
"provider": "ollama|router",
|
||||
"model": "ollama:glm-4.7-flash:32k"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `chat.reply` — Sofiia replied
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"text": "...",
|
||||
"provider": "ollama|router",
|
||||
"model": "...",
|
||||
"latency_ms": 1234
|
||||
}
|
||||
}
|
||||
```
|
||||
> AISTALK should TTS this text (if voice channel is active) via `/api/voice/tts`.
|
||||
|
||||
#### `voice.stt` — STT lifecycle
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"phase": "start|done|error",
|
||||
"elapsed_ms": 456
|
||||
}
|
||||
}
|
||||
```
|
||||
> AISTALK uses `phase=start` to mute its own mic; `phase=done` to unmute.
|
||||
|
||||
#### `voice.tts` — TTS lifecycle
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"phase": "start|done|error",
|
||||
"voice": "Polina",
|
||||
"elapsed_ms": 789
|
||||
}
|
||||
}
|
||||
```
|
||||
> AISTALK uses `phase=start` to begin audio playback; `phase=done` as end signal.
|
||||
|
||||
#### `ops.run` — governance operation result
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"name": "risk_dashboard|pressure_dashboard|backlog_generate_weekly|release_check",
|
||||
"ok": true,
|
||||
"elapsed_ms": 999
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `nodes.status` — node network heartbeat (every 15s)
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"bff_uptime_s": 3600,
|
||||
"ws_clients": 2,
|
||||
"nodes": [
|
||||
{"id": "NODA1", "online": true, "router_ok": true, "router_latency_ms": 12},
|
||||
{"id": "NODA2", "online": true, "router_ok": true, "router_latency_ms": 5}
|
||||
],
|
||||
"nodes_ts": "2026-02-25T12:34:50Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `error` — platform error
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"where": "bff|router|memory|ollama",
|
||||
"message": "...",
|
||||
"code": "optional_code"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Event Types AISTALK Should Ignore
|
||||
- `tool.called` / `tool.result` — internal governance, not relevant for voice
|
||||
- Any `type` not listed above — forward compatibility, AISTALK must not crash on unknown types
|
||||
|
||||
---
|
||||
|
||||
## 2. Sending Text to Sofiia: `POST /api/chat/send`
|
||||
|
||||
AISTALK sends user text (transcribed from voice or typed):
|
||||
|
||||
```http
|
||||
POST http://<BFF_HOST>:8002/api/chat/send
|
||||
Content-Type: application/json
|
||||
X-API-Key: <key>
|
||||
|
||||
{
|
||||
"message": "Sofiia, покажи risk dashboard",
|
||||
"model": "ollama:glm-4.7-flash:32k",
|
||||
"project_id": "aistalk",
|
||||
"session_id": "aistalk_sess_<uuid>",
|
||||
"user_id": "aistalk_user",
|
||||
"provider": "ollama"
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"ok": true,
|
||||
"project_id": "aistalk",
|
||||
"session_id": "aistalk_sess_...",
|
||||
"user_id": "aistalk_user",
|
||||
"response": "Ось Risk Dashboard...",
|
||||
"model": "ollama:glm-4.7-flash:32k",
|
||||
"backend": "ollama",
|
||||
"meta": {"latency_ms": 1234, "tokens_est": 87}
|
||||
}
|
||||
```
|
||||
|
||||
AISTALK should use the `response` field text for TTS.
|
||||
|
||||
---
|
||||
|
||||
## 3. Speech-to-Text: `POST /api/voice/stt`
|
||||
|
||||
```http
|
||||
POST http://<BFF_HOST>:8002/api/voice/stt?session_id=<sid>&project_id=<pid>
|
||||
Content-Type: multipart/form-data
|
||||
X-API-Key: <key>
|
||||
|
||||
audio=<binary; MIME: audio/webm or audio/wav>
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"text": "Sofiia, покажи risk dashboard",
|
||||
"language": "uk",
|
||||
"segments": [...]
|
||||
}
|
||||
```
|
||||
|
||||
Audio constraints:
|
||||
- Max size: no hard limit, but keep under 10MB per chunk
|
||||
- Format: `audio/webm` (Opus) or `audio/wav`
|
||||
- Duration: up to 60s per chunk
|
||||
|
||||
---
|
||||
|
||||
## 4. Text-to-Speech: `POST /api/voice/tts`
|
||||
|
||||
```http
|
||||
POST http://<BFF_HOST>:8002/api/voice/tts
|
||||
Content-Type: application/json
|
||||
X-API-Key: <key>
|
||||
|
||||
{
|
||||
"text": "Ось Risk Dashboard для gateway...",
|
||||
"voice": "default",
|
||||
"speed": 1.0,
|
||||
"session_id": "aistalk_sess_...",
|
||||
"project_id": "aistalk"
|
||||
}
|
||||
```
|
||||
|
||||
Response: `audio/wav` binary stream (or `audio/mpeg`).
|
||||
|
||||
Voice options (Ukrainian):
|
||||
| voice | description |
|
||||
|---|---|
|
||||
| `default` | Polina Neural (uk-UA, edge-tts) |
|
||||
| `Ostap` | Ostap Neural (uk-UA, edge-tts) |
|
||||
| `Milena` | Milena (macOS, fallback) |
|
||||
| `Yuri` | Yuri (macOS, fallback) |
|
||||
|
||||
Text limit: 500 chars per call (BFF enforces). Split longer responses.
|
||||
|
||||
---
|
||||
|
||||
## 5. AISTALK Adapter Interface (BFF-side stub)
|
||||
|
||||
File: `services/sofiia-console/app/adapters/aistalk.py`
|
||||
|
||||
```python
|
||||
class AISTALKAdapter:
|
||||
def send_text(self, project_id, session_id, text) -> None
|
||||
def send_audio(self, project_id, session_id, audio_bytes, mime) -> None
|
||||
def handle_event(self, event: dict) -> None # called on chat.reply, ops.run etc.
|
||||
def on_event(self, event: dict) -> None # alias
|
||||
```
|
||||
|
||||
Activation:
|
||||
```env
|
||||
AISTALK_ENABLED=true
|
||||
AISTALK_URL=http://<aistalk-bridge>:<port>
|
||||
AISTALK_API_KEY=<optional>
|
||||
```
|
||||
|
||||
Currently the adapter is a **noop stub** with logging. Replace `send_text` / `send_audio` / `handle_event` with actual HTTP/WebSocket calls to AISTALK bridge when ready.
|
||||
|
||||
---
|
||||
|
||||
## 6. Session Identity
|
||||
|
||||
AISTALK must use consistent `project_id` and `session_id` across all calls in one conversation:
|
||||
|
||||
```
|
||||
project_id: "aistalk" # fixed
|
||||
session_id: "aistalk_<uuid>" # new UUID per conversation
|
||||
user_id: "aistalk_user" # fixed or per-user identity
|
||||
```
|
||||
|
||||
This ensures memory continuity in memory-service and proper WS event filtering.
|
||||
|
||||
---
|
||||
|
||||
## 7. Rate Limits (BFF enforces)
|
||||
|
||||
| Endpoint | Limit |
|
||||
|---|---|
|
||||
| `/api/chat/send` | 30 req/min per IP |
|
||||
| `/api/voice/stt` | 20 req/min per IP |
|
||||
| `/api/voice/tts` | 30 req/min per IP |
|
||||
|
||||
AISTALK should implement backoff on HTTP 429.
|
||||
|
||||
---
|
||||
|
||||
## 8. Hello World Verification
|
||||
|
||||
```bash
|
||||
# 1. Connect WS
|
||||
wscat -c ws://localhost:8002/ws/events
|
||||
|
||||
# 2. Send a message
|
||||
curl -X POST http://localhost:8002/api/chat/send \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"message":"привіт Sofiia","model":"ollama:glm-4.7-flash:32k","project_id":"aistalk","session_id":"test_001","user_id":"aistalk_user"}'
|
||||
|
||||
# 3. WS should receive chat.message + chat.reply events
|
||||
|
||||
# 4. TTS test
|
||||
curl -X POST http://localhost:8002/api/voice/tts \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"text":"Привіт! Я Sofiia.","voice":"default"}' \
|
||||
--output test.wav && afplay test.wav
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Full-Duplex Voice Flow (AISTALK sequence)
|
||||
|
||||
```
|
||||
User speaks
|
||||
→ AISTALK records audio
|
||||
→ POST /api/voice/stt (receives text)
|
||||
→ POST /api/chat/send (receives reply text)
|
||||
→ POST /api/voice/tts (receives audio)
|
||||
→ AISTALK plays audio
|
||||
|
||||
WS events observed:
|
||||
voice.stt {phase:start} → voice.stt {phase:done}
|
||||
→ chat.message → chat.reply
|
||||
→ voice.tts {phase:start} → voice.tts {phase:done}
|
||||
```
|
||||
|
||||
Echo cancellation: AISTALK must mute its microphone during TTS playback (`voice.tts phase=start` → mute, `phase=done` → unmute).
|
||||
477
docs/audit/gaps_and_recovery_plan.md
Normal file
477
docs/audit/gaps_and_recovery_plan.md
Normal file
@@ -0,0 +1,477 @@
|
||||
# Sofiia CTO Agent — Gaps & Recovery Plan (E)
|
||||
|
||||
> Generated: 2026-02-26 | P0 = блокуюче | P1 = критичне для vNext | P2 = покращення
|
||||
|
||||
---
|
||||
|
||||
## Критичне резюме
|
||||
|
||||
**Що вже готово і може йти в UI:** Chat, Voice, Projects CRUD, File upload, Sessions, Dialog Map tree, Ops actions, Node health.
|
||||
|
||||
**Що не готово і блокує vNext:** Tasks/Kanban, Meetings, Dialog Map canvas + Postgres schema, Doc versions, CTO Repo/Ops flow, Supervisor через BFF, Semantic search.
|
||||
|
||||
---
|
||||
|
||||
## Таблиця прогалин з пріоритетами
|
||||
|
||||
| # | Gap | Пріоритет | Складність | Блокує |
|
||||
|---|-----|-----------|-----------|--------|
|
||||
| G1 | `dialog_nodes`/`dialog_edges` Postgres tables + API | P0 | Medium | Dialog Map vNext |
|
||||
| G2 | `tasks` table + CRUD API + Kanban UI | P0 | Medium | Projects Board |
|
||||
| G3 | `meetings` table + CRUD API | P0 | Medium | Projects Meetings tab |
|
||||
| G4 | Supervisor не проксюється через BFF | P0 | Low | CTO workflow access |
|
||||
| G5 | `docs_versions` table + API | P1 | Low | Doc history/rollback |
|
||||
| G6 | `entity_links` table + API | P1 | Low | Cross-entity linking |
|
||||
| G7 | `repo_changesets` + `repo_patches` + PR flow | P1 | High | CTO code workflow |
|
||||
| G8 | `ops_runs` job system (not one-shot) | P1 | Medium | CTO ops audit trail |
|
||||
| G9 | Semantic search (Qdrant/Meilisearch) | P1 | Medium | Doc/Project search |
|
||||
| G10 | NATS `attachment.created` on upload | P1 | Low | Parser pipeline hook |
|
||||
| G11 | `DELETE` endpoints (projects/docs) | P1 | Low | CRUD completeness |
|
||||
| G12 | Real-time WS events for map/tasks | P1 | Medium | Live UI updates |
|
||||
| G13 | E2EE / confidential mode | P2 | Very High | Privacy |
|
||||
| G14 | 2-step Plan → Apply for dangerous actions | P2 | High | Safe ops flow |
|
||||
| G15 | `agent_id="l"` vs `"sofiia"` inconsistency | P1 | Low | Config correctness |
|
||||
| G16 | `dialog_views` saved views | P2 | Low | UX |
|
||||
| G17 | NODA3 integration | P2 | Medium | AI/ML workstation |
|
||||
| G18 | Meilisearch deployment | P2 | Low | Full-text search |
|
||||
| G19 | Privacy Gate middleware (Router) | P2 | High | Confidential mode |
|
||||
| G20 | Wiki Markdown editor UI | P2 | Medium | Docs/Wiki experience |
|
||||
| G21 | `doc_index_state` table + reindex jobs | P2 | Low | AI doc indexing |
|
||||
| G22 | Meeting reminders (push/WS) | P2 | Medium | Meetings UX |
|
||||
| G23 | `DELETE /api/nodes/{id}` | P2 | Low | Node management |
|
||||
| G24 | S3/MinIO для file storage | P2 | High | Scale (replace volume) |
|
||||
|
||||
---
|
||||
|
||||
## P0 — Блокуючі прогалини (потрібні для vNext)
|
||||
|
||||
### G1: Dialog Map — Postgres schema + API
|
||||
|
||||
**Що зроблено:** SQLite tree via `parent_msg_id`. Works for conversation branching.
|
||||
|
||||
**Чого не вистачає:**
|
||||
- Postgres tables: `dialog_nodes`, `dialog_edges`, `dialog_views`
|
||||
- API: `GET /api/projects/{id}/dialog-map`, `POST /api/links`
|
||||
- WS event: `dialog_map.updated`
|
||||
- Auto-edge creation from NATS events
|
||||
|
||||
**Recovery plan:**
|
||||
```sql
|
||||
-- Step 1: Add to sofiia-console db.py (SQLite first, Postgres later)
|
||||
CREATE TABLE IF NOT EXISTS dialog_nodes (
|
||||
node_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL,
|
||||
node_type TEXT NOT NULL CHECK(node_type IN ('message','task','doc','meeting','agent_run','decision','goal')),
|
||||
ref_id TEXT NOT NULL, -- FK to actual entity
|
||||
title TEXT DEFAULT '',
|
||||
created_at TEXT NOT NULL,
|
||||
created_by TEXT DEFAULT 'system'
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS dialog_edges (
|
||||
edge_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL,
|
||||
from_node_id TEXT NOT NULL REFERENCES dialog_nodes(node_id),
|
||||
to_node_id TEXT NOT NULL REFERENCES dialog_nodes(node_id),
|
||||
edge_type TEXT NOT NULL CHECK(edge_type IN ('references','resolves','derives_task','updates_doc','schedules','summarizes')),
|
||||
created_at TEXT NOT NULL,
|
||||
props TEXT DEFAULT '{}' -- JSON
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS dialog_views (
|
||||
view_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
filters TEXT DEFAULT '{}',
|
||||
layout TEXT DEFAULT '{}'
|
||||
);
|
||||
```
|
||||
|
||||
```python
|
||||
# Step 2: New endpoint in docs_router.py
|
||||
@router.get("/api/projects/{project_id}/dialog-map")
|
||||
async def get_project_dialog_map(project_id: str):
|
||||
nodes = await db.get_dialog_nodes(project_id)
|
||||
edges = await db.get_dialog_edges(project_id)
|
||||
return {"nodes": nodes, "edges": edges}
|
||||
|
||||
@router.post("/api/links")
|
||||
async def create_link(body: LinkCreate):
|
||||
# Creates dialog_edge between two entities
|
||||
...
|
||||
```
|
||||
|
||||
**Оцінка:** 4–6 годин роботи.
|
||||
|
||||
---
|
||||
|
||||
### G2: Tasks + Kanban
|
||||
|
||||
**Що зроблено:** Немає.
|
||||
|
||||
**Recovery plan:**
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS tasks (
|
||||
task_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL REFERENCES projects(project_id),
|
||||
title TEXT NOT NULL,
|
||||
description TEXT DEFAULT '',
|
||||
status TEXT DEFAULT 'backlog' CHECK(status IN ('backlog','in_progress','review','done')),
|
||||
priority TEXT DEFAULT 'medium',
|
||||
assignee_id TEXT DEFAULT '',
|
||||
labels TEXT DEFAULT '[]', -- JSON
|
||||
due_at TEXT,
|
||||
created_at TEXT NOT NULL,
|
||||
updated_at TEXT NOT NULL,
|
||||
msg_id TEXT -- Optional: link to originating message
|
||||
);
|
||||
```
|
||||
|
||||
- API: `GET/POST /api/projects/{id}/tasks`, `PATCH /api/tasks/{id}`, `DELETE /api/tasks/{id}`
|
||||
- UI: Kanban board з drag-drop (можна почати з простим list + status buttons)
|
||||
- Dialog Map auto-edge: `POST /api/links` after task creation
|
||||
|
||||
**Оцінка:** 1–2 дні (backend + basic UI).
|
||||
|
||||
---
|
||||
|
||||
### G3: Meetings
|
||||
|
||||
**Recovery plan:**
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS meetings (
|
||||
meeting_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL REFERENCES projects(project_id),
|
||||
title TEXT NOT NULL,
|
||||
starts_at TEXT NOT NULL,
|
||||
duration_min INTEGER DEFAULT 60,
|
||||
attendees TEXT DEFAULT '[]', -- JSON
|
||||
location TEXT DEFAULT '',
|
||||
agenda TEXT DEFAULT '',
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
```
|
||||
|
||||
- API: `GET/POST /api/projects/{id}/meetings`, `PATCH /api/meetings/{id}`
|
||||
- UI: simple form (title, date/time, duration, attendees)
|
||||
- Reminders: Phase 2 (WS push)
|
||||
|
||||
**Оцінка:** 1 день.
|
||||
|
||||
---
|
||||
|
||||
### G4: Supervisor → BFF proxy
|
||||
|
||||
**Що зроблено:** Supervisor API exists at `http://sofiia-supervisor:8080` (або port 9400).
|
||||
|
||||
**Recovery plan:**
|
||||
```python
|
||||
# Add to services/sofiia-console/app/main.py:
|
||||
|
||||
SUPERVISOR_URL = os.getenv("SUPERVISOR_URL", "http://sofiia-supervisor:8080")
|
||||
|
||||
@app.post("/api/supervisor/runs")
|
||||
async def run_supervisor_graph(body: dict, _auth: str = Depends(require_auth)):
|
||||
async with httpx.AsyncClient() as c:
|
||||
resp = await c.post(f"{SUPERVISOR_URL}/v1/graphs/{body['graph']}/runs",
|
||||
json=body, timeout=60)
|
||||
return resp.json()
|
||||
|
||||
@app.get("/api/supervisor/runs/{run_id}")
|
||||
async def get_supervisor_run(run_id: str, _auth: str = Depends(require_auth)):
|
||||
async with httpx.AsyncClient() as c:
|
||||
resp = await c.get(f"{SUPERVISOR_URL}/v1/runs/{run_id}", timeout=10)
|
||||
return resp.json()
|
||||
```
|
||||
|
||||
**Оцінка:** 30 хвилин.
|
||||
|
||||
---
|
||||
|
||||
## P1 — Критичні для vNext
|
||||
|
||||
### G5: Doc versions
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS doc_versions (
|
||||
version_id TEXT PRIMARY KEY,
|
||||
doc_id TEXT NOT NULL REFERENCES documents(doc_id),
|
||||
content TEXT NOT NULL, -- full text
|
||||
author_id TEXT DEFAULT 'system',
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
```
|
||||
|
||||
```python
|
||||
# New endpoints in docs_router.py:
|
||||
# GET /api/projects/{pid}/documents/{did}/versions
|
||||
# POST /api/projects/{pid}/documents/{did}/restore
|
||||
```
|
||||
|
||||
**Оцінка:** 2 години.
|
||||
|
||||
---
|
||||
|
||||
### G7: Repo Changesets (CTO Code Flow)
|
||||
|
||||
Це найскладніша частина. **Рекомендація:** почати з mock endpoints, потім реалізувати реальну логіку.
|
||||
|
||||
**Mock endpoint (30 хв):**
|
||||
```python
|
||||
@app.post("/api/repo/changesets")
|
||||
async def create_changeset_mock(body: dict, _auth=Depends(require_auth)):
|
||||
# Mock: store in SQLite, return changeset_id
|
||||
cs_id = str(uuid.uuid4())
|
||||
# await db.save_changeset(cs_id, body)
|
||||
return {"changeset_id": cs_id, "status": "draft", "mock": True}
|
||||
```
|
||||
|
||||
**Реальна реалізація (2–3 дні):**
|
||||
```sql
|
||||
CREATE TABLE repo_changesets (
|
||||
cs_id TEXT PRIMARY KEY,
|
||||
project_id TEXT,
|
||||
repo TEXT NOT NULL, -- e.g., "github.com/IvanTytar/microdao-daarion"
|
||||
base_ref TEXT NOT NULL, -- branch/commit
|
||||
intent TEXT NOT NULL,
|
||||
risk_level TEXT DEFAULT 'low',
|
||||
status TEXT DEFAULT 'draft',
|
||||
created_by TEXT,
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE repo_patches (
|
||||
patch_id TEXT PRIMARY KEY,
|
||||
cs_id TEXT NOT NULL REFERENCES repo_changesets(cs_id),
|
||||
file_path TEXT NOT NULL,
|
||||
patch_text TEXT NOT NULL, -- unified diff
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE pull_requests (
|
||||
pr_id TEXT PRIMARY KEY,
|
||||
cs_id TEXT NOT NULL REFERENCES repo_changesets(cs_id),
|
||||
provider TEXT DEFAULT 'github', -- github/gitlab/gitea
|
||||
pr_url TEXT,
|
||||
pr_number INTEGER,
|
||||
status TEXT DEFAULT 'draft',
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### G8: Ops Runs (Job System)
|
||||
|
||||
Поточний `/api/ops/run` — one-shot dispatch. Потрібен job tracking.
|
||||
|
||||
```sql
|
||||
CREATE TABLE ops_runs (
|
||||
run_id TEXT PRIMARY KEY,
|
||||
project_id TEXT,
|
||||
node_id TEXT NOT NULL, -- noda1/noda2
|
||||
action TEXT NOT NULL, -- з allowlist
|
||||
params TEXT DEFAULT '{}', -- JSON
|
||||
dry_run INTEGER DEFAULT 1,
|
||||
status TEXT DEFAULT 'pending', -- pending/running/success/failed
|
||||
result TEXT DEFAULT '',
|
||||
started_at TEXT,
|
||||
finished_at TEXT,
|
||||
created_by TEXT
|
||||
);
|
||||
```
|
||||
|
||||
**API:**
|
||||
- `POST /api/ops/runs` (створити job, dry_run=true за замовч.)
|
||||
- `GET /api/ops/runs/{id}` (статус)
|
||||
- `GET /api/ops/runs?project_id=&limit=20` (список)
|
||||
|
||||
**Оцінка:** 4 години (backend) + 2 год (UI list).
|
||||
|
||||
---
|
||||
|
||||
### G10: NATS attachment.created
|
||||
|
||||
Одна зміна в `docs_router.py`:
|
||||
|
||||
```python
|
||||
# After successful file save:
|
||||
try:
|
||||
import nats
|
||||
nc = await nats.connect(NATS_URL)
|
||||
await nc.publish(f"attachment.created.{mime_category}",
|
||||
json.dumps({"file_id": file_id, "doc_id": doc_id, ...}).encode())
|
||||
await nc.close()
|
||||
except Exception:
|
||||
pass # best-effort
|
||||
```
|
||||
|
||||
**Оцінка:** 1 година.
|
||||
|
||||
---
|
||||
|
||||
### G15: agent_id "l" vs "sofiia"
|
||||
|
||||
У `services/router/router-config.yml` для NODA2:
|
||||
|
||||
```yaml
|
||||
# Check if there's "l:" entry that should be "sofiia:"
|
||||
```
|
||||
|
||||
**Action:** знайти і замінити `"l"` → `"sofiia"` у router-config відповідної ноди.
|
||||
|
||||
**Оцінка:** 15 хвилин.
|
||||
|
||||
---
|
||||
|
||||
## P2 — Покращення
|
||||
|
||||
### G13: E2EE (confidential mode)
|
||||
|
||||
**Складність:** Дуже висока. Потребує:
|
||||
1. Client-side key generation (WebCrypto API)
|
||||
2. Server-side: store only ciphertext + key_id
|
||||
3. Router Privacy Gate middleware
|
||||
4. Dialog Map: тільки user-created edges (не semantic auto-edges)
|
||||
5. Search: тільки metadata, не plaintext
|
||||
|
||||
**Рекомендація:** Не реалізовувати до завершення Projects + Dialog Map. Спочатку `mode=public` тільки.
|
||||
|
||||
---
|
||||
|
||||
### G20: Wiki Markdown Editor
|
||||
|
||||
Потрібна бібліотека (CodeMirror / Monaco / Tiptap). Для Phase 1 — textarea з preview.
|
||||
|
||||
```html
|
||||
<!-- Simple Phase 1 wiki editor -->
|
||||
<div id="wikiEditor">
|
||||
<textarea id="wikiContent" placeholder="# Сторінка wiki..."></textarea>
|
||||
<div id="wikiPreview" class="markdown-preview"></div>
|
||||
</div>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Wins (до 2 годин кожен)
|
||||
|
||||
| # | Quick Win | Час | Цінність |
|
||||
|---|-----------|-----|---------|
|
||||
| QW1 | `DELETE /api/projects/{id}` | 15 хв | CRUD completeness |
|
||||
| QW2 | `DELETE /api/projects/{id}/documents/{did}` | 15 хв | CRUD completeness |
|
||||
| QW3 | BFF proxy до Supervisor (G4) | 30 хв | CTO workflow access |
|
||||
| QW4 | Mock `/api/repo/changesets` | 30 хв | UI CTO panel development |
|
||||
| QW5 | Mock `/api/ops/runs` | 30 хв | UI CTO panel development |
|
||||
| QW6 | `docs_versions` table + API (G5) | 2 год | Doc history |
|
||||
| QW7 | `USE_EMBEDDINGS=true` + Qdrant ingest | 1 год | Semantic search |
|
||||
| QW8 | `agent_id "l"` → `"sofiia"` fix | 15 хв | Config consistency |
|
||||
| QW9 | NATS `attachment.created` on upload | 1 год | Parser pipeline |
|
||||
| QW10 | WS `dialog_map.updated` basic event | 1 год | Live map refresh |
|
||||
|
||||
---
|
||||
|
||||
## Повний план відновлення (поетапно)
|
||||
|
||||
### Тиждень 1: Stabilize & Quick Wins
|
||||
|
||||
```
|
||||
Day 1–2:
|
||||
- QW1, QW2, QW3, QW8 (CRUD + Supervisor proxy + agent_id fix)
|
||||
- Деплой на NODA2, verify через http://localhost:8002
|
||||
|
||||
Day 3–4:
|
||||
- G2: tasks table + basic API + simple list UI
|
||||
- G3: meetings table + basic form UI
|
||||
|
||||
Day 5:
|
||||
- G5: docs_versions + API
|
||||
- G10: NATS attachment.created
|
||||
- QW4, QW5: mock changeset/ops_run endpoints for UI
|
||||
```
|
||||
|
||||
### Тиждень 2: Dialog Map + CTO Panel
|
||||
|
||||
```
|
||||
Day 1–2:
|
||||
- G1: dialog_nodes/edges tables + API
|
||||
- WS event: dialog_map.updated
|
||||
|
||||
Day 3–4:
|
||||
- UI: Dialog Map canvas (D3 tree → force graph)
|
||||
- Entity links UI (drag edge between nodes)
|
||||
|
||||
Day 5:
|
||||
- G8: ops_runs job system
|
||||
- UI: CTO Ops panel (list + status)
|
||||
```
|
||||
|
||||
### Тиждень 3: Advanced Features
|
||||
|
||||
```
|
||||
- G7: Repo changesets (real implementation)
|
||||
- G9: USE_EMBEDDINGS=true + semantic search
|
||||
- G12: Full real-time WS events (tasks, docs, meetings)
|
||||
- Kanban drag-drop UI
|
||||
- Doc versions diff viewer
|
||||
```
|
||||
|
||||
### Тиждень 4+: Scale & Polish
|
||||
|
||||
```
|
||||
- G14: 2-step Plan → Apply
|
||||
- G20: Wiki Markdown editor
|
||||
- G22: Meeting reminders
|
||||
- G24: S3/MinIO for file storage
|
||||
- G13: E2EE (only when everything else is stable)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5 Найбільш Критичних Прогалин
|
||||
|
||||
1. **`dialog_nodes/edges` + project-level Dialog Map API** — без цього vNext граф неможливий
|
||||
2. **Tasks/Kanban** — Projects без задач = тільки файлосховище
|
||||
3. **Meetings** — Projects без зустрічей = неповний workflow
|
||||
4. **Supervisor не проксюється через BFF** — CTO не може запускати LangGraph runs з UI
|
||||
5. **Repo changesets / CTO code flow** — Sofiia не може "пропонувати PR" як structured artifact
|
||||
|
||||
---
|
||||
|
||||
## 5 Найбільш Готових Частин для UI
|
||||
|
||||
1. **Chat + Voice** — повністю готово, production-grade (Phase 2 streaming, HA, SLO, alerts)
|
||||
2. **Projects + Documents + File Upload** — CRUD, search, sessions — все є
|
||||
3. **Dialog Map tree** — `GET /api/sessions/{id}/map` повертає nodes/edges
|
||||
4. **Ops Actions** — risk/pressure/backlog/notion/release — все є через `/api/ops/run`
|
||||
5. **Node Health Dashboard** — multi-node, SSH, WebSocket realtime — все є
|
||||
|
||||
---
|
||||
|
||||
## 3 Рекомендації "Зробити Негайно"
|
||||
|
||||
### 1. Зберегти контекст у Dialog Map
|
||||
|
||||
Найпростіший спосіб не "загубити" поточний дизайн — додати `dialog_nodes/edges` tables у `db.py` прямо зараз (схема вже описана вище). Навіть якщо UI ще не готовий, дані почнуть накопичуватись від поточних повідомлень.
|
||||
|
||||
### 2. Proxy Supervisor через BFF
|
||||
|
||||
30 хвилин роботи, але це дасть Sofiia доступ до `alert_triage`, `incident_triage`, `postmortem_draft`, `release_check` прямо з UI Console — не тільки через Telegram.
|
||||
|
||||
### 3. Нормалізувати `agent_id`
|
||||
|
||||
Знайти і виправити `"l"` → `"sofiia"` у конфігурації NODA2. Це унеможливить silent routing failures де Router не знаходить агента і тихо fallbacks до дефолту.
|
||||
|
||||
---
|
||||
|
||||
## Next Actions for UI Team (1–2 days)
|
||||
|
||||
1. **Розгорнути і протестувати** поточний стек на NODA2 — `http://localhost:8002/` вже повністю робочий
|
||||
2. **Реалізувати QW1–QW5** (прості DELETE + Supervisor proxy + mock endpoints) — 2–3 год
|
||||
3. **Додати `tasks` і `meetings` tables** у `db.py` та відповідні endpoints у `docs_router.py`
|
||||
4. **Додати `dialog_nodes/edges`** у `db.py` (DDL вище) і endpoint `GET /api/projects/{id}/dialog-map`
|
||||
5. **Тестувати** через `tests/test_sofiia_docs.py` — всі 28 тестів мають пройти
|
||||
6. **Оновити** `docker-compose.node2-sofiia.yml` з `SUPERVISOR_URL` env var
|
||||
7. **Перевірити** що `ops/voice_ha_smoke.sh` проходить після деплою
|
||||
8. **Прочитати** `docs/architecture_inventory/` (7 файлів) для повного контексту поточного стеку
|
||||
9. **Використовувати** `ops/fabric_preflight.sh` перед кожним деплоєм (preflight-first policy)
|
||||
10. **Щотижня**: запускати `ops/fabric_snapshot.py --save` і commit результат — щоб мати baseline для drift detection
|
||||
216
docs/audit/sofiia_audit_index.md
Normal file
216
docs/audit/sofiia_audit_index.md
Normal file
@@ -0,0 +1,216 @@
|
||||
# Sofiia CTO Agent — Audit Index (A)
|
||||
|
||||
> Generated: 2026-02-26 | Scope: Full repository scan | Author: Cursor Auditor
|
||||
|
||||
---
|
||||
|
||||
## 1. Canonical Files (Топ-10 "Sources of Truth")
|
||||
|
||||
| # | File | Тип | Статус | Короткий опис |
|
||||
|---|------|-----|--------|---------------|
|
||||
| 1 | `AGENTS.md` | Identity/Capabilities | ✅ Актуальний | Головний identity файл Sofiia. CTO-агент, 3 ноди, всі можливості, toolchain |
|
||||
| 2 | `config/agent_registry.yml` | Config Registry | ✅ Актуальний | Single Source of Truth для конфігурації. Sofiia entry ~рядки 1276–1330 |
|
||||
| 3 | `services/sofiia-console/app/main.py` | BFF Implementation | ✅ Актуальний | FastAPI BFF v0.3.0. Всі endpoint-и Control Console |
|
||||
| 4 | `services/sofiia-console/static/index.html` | UI | ✅ Актуальний | 1600+ рядків SPA. Чат, Projects, Ops, Hub, Nodes, Memory |
|
||||
| 5 | `docs/ADR_ARCHITECTURE_VNEXT.md` | Architecture ADR | ✅ Актуальний (2026-01-19) | Control Plane + Data Plane архітектура, Privacy Gate, NATS standards |
|
||||
| 6 | `services/router/router-config.yml` | Router Config | ✅ Актуальний | LLM profiles, voice policies, agent routing |
|
||||
| 7 | `config/rbac_tools_matrix.yml` | Security | ✅ Актуальний | `agent_cto` роль з 39 дозволами |
|
||||
| 8 | `docs/OPENAPI_CONTRACTS.md` | API Contracts | ✅ Актуальний | Gateway→Router, Router→Memory контракти |
|
||||
| 9 | `docs/architecture_inventory/` | Inventory (7 файлів) | ✅ Актуальний (2026-02-16) | Повний каталог сервісів, інструментів, NATS, безпека |
|
||||
| 10 | `gateway-bot/sofiia_prompt.txt` | System Prompt | ✅ Актуальний | 138KB+ Telegram-промпт Sofiia як Chief AI Architect |
|
||||
|
||||
---
|
||||
|
||||
## 2. Повна Карта Файлів
|
||||
|
||||
### 2.1 Identity та промпти
|
||||
|
||||
| Файл | Опис | Розмір | Стан |
|
||||
|------|------|--------|------|
|
||||
| `AGENTS.md` | Sofiia identity: CTO-агент, NODA1/NODA2/NODA3, інструменти, стиль | ~400 рядків | ✅ Канонічний |
|
||||
| `gateway-bot/sofiia_prompt.txt` | Telegram system prompt (великий, детальний) | ~138KB | ✅ Production |
|
||||
| `services/sofiia-console/app/main.py` lines 138–177 | Console embedded system prompt (BFF) | ~1KB | ✅ Production |
|
||||
| `docs/consolidation/_node1_runtime_docs/gateway-bot/sofiia_prompt.txt` | Копія промпту (NODA1 backup) | ~138KB | ⚠️ Backup copy |
|
||||
|
||||
### 2.2 Core Implementation — sofiia-console
|
||||
|
||||
| Файл | Опис | Рядків |
|
||||
|------|------|--------|
|
||||
| `services/sofiia-console/app/main.py` | BFF FastAPI: всі endpoints, voice, telemetry, degradation SM | ~1800 |
|
||||
| `services/sofiia-console/app/docs_router.py` | Projects/Documents/Sessions/Dialog Map router | ~380 |
|
||||
| `services/sofiia-console/app/db.py` | SQLite async CRUD: projects, documents, sessions, messages, dialog map | ~320 |
|
||||
| `services/sofiia-console/app/auth.py` | API key authentication | ~50 |
|
||||
| `services/sofiia-console/app/config.py` | Node registry, URLs, feature flags | ~100 |
|
||||
| `services/sofiia-console/app/monitor.py` | Multi-node health polling | ~150 |
|
||||
| `services/sofiia-console/app/nodes.py` | Nodes dashboard | ~80 |
|
||||
| `services/sofiia-console/app/ops.py` | Ops actions dispatcher | ~200 |
|
||||
| `services/sofiia-console/app/router_client.py` | Proxy до Router (infer, tools, health) | ~100 |
|
||||
| `services/sofiia-console/app/voice_utils.py` | Voice sanitize, chunk split, think-block clean | ~150 |
|
||||
| `services/sofiia-console/app/adapters/aistalk.py` | AISTALK adapter | ~80 |
|
||||
| `services/sofiia-console/static/index.html` | SPA UI: chat, projects, ops, hub, nodes, memory | ~1600 |
|
||||
| `services/sofiia-console/requirements.txt` | aiosqlite, pypdf, python-docx, fastapi, httpx | 10 рядків |
|
||||
| `services/sofiia-console/Dockerfile` | Docker build | ~25 |
|
||||
|
||||
### 2.3 Sofiia Supervisor (LangGraph)
|
||||
|
||||
| Файл | Опис |
|
||||
|------|------|
|
||||
| `services/sofiia-supervisor/app/main.py` | FastAPI: `/v1/graphs/{name}/runs` API |
|
||||
| `services/sofiia-supervisor/app/graphs/alert_triage_graph.py` | Alert triage LangGraph |
|
||||
| `services/sofiia-supervisor/app/graphs/incident_triage_graph.py` | Incident triage LangGraph |
|
||||
| `services/sofiia-supervisor/app/graphs/postmortem_draft_graph.py` | Postmortem LangGraph |
|
||||
| `services/sofiia-supervisor/app/graphs/release_check_graph.py` | Release check LangGraph |
|
||||
| `services/sofiia-supervisor/app/alert_routing.py` | Routing policy matcher |
|
||||
| `services/sofiia-supervisor/app/gateway_client.py` | RBAC-enforced gateway client |
|
||||
| `services/sofiia-supervisor/app/models.py` | Pydantic models |
|
||||
| `services/sofiia-supervisor/app/state_backend.py` | Redis/in-memory state |
|
||||
| `docker-compose.node2-sofiia-supervisor.yml` | Supervisor Docker Compose |
|
||||
| `services/sofiia-supervisor/tests/` | 6 test files |
|
||||
|
||||
### 2.4 Router та Tools
|
||||
|
||||
| Файл | Опис |
|
||||
|------|------|
|
||||
| `services/router/main.py` | Main router: всі API endpoints, voice HA, capabilities |
|
||||
| `services/router/tool_manager.py` | 20+ інструментів: CRUD, exec, governance |
|
||||
| `services/router/agent_tools_config.py` | Per-agent tool allowlists |
|
||||
| `services/router/router-config.yml` | LLM profiles, voice policies, agent routing |
|
||||
| `services/router/fabric_metrics.py` | Prometheus metrics |
|
||||
| `services/router/offload_client.py` | NATS offload client |
|
||||
| `services/router/risk_engine.py` | Risk assessment engine |
|
||||
| `services/router/backlog_generator.py` | Backlog generation |
|
||||
| `services/router/incident_intelligence.py` | Incident correlation |
|
||||
| `services/router/cost_analyzer.py` | Cost analysis tool |
|
||||
| `services/router/data_governance.py` | Data governance |
|
||||
| `services/router/dependency_scanner.py` | Dependency scanner |
|
||||
| `services/router/drift_analyzer.py` | Infrastructure drift |
|
||||
| `services/router/architecture_pressure.py` | Architecture pressure analysis |
|
||||
|
||||
### 2.5 Memory Service
|
||||
|
||||
| Файл | Опис |
|
||||
|------|------|
|
||||
| `services/memory-service/app/main.py` | FastAPI: threads, events, memories, facts, agent memory |
|
||||
| `services/memory-service/app/vector_store.py` | Qdrant integration |
|
||||
| `services/memory-service/app/voice_endpoints.py` | STT/TTS endpoints з Prometheus metrics |
|
||||
| `services/memory-service/app/integration_endpoints.py` | Integration webhooks |
|
||||
| `services/memory-service/app/integrations.py` | External integrations |
|
||||
|
||||
### 2.6 Configuration
|
||||
|
||||
| Файл | Опис |
|
||||
|------|------|
|
||||
| `config/agent_registry.yml` | Всі 13+ агентів + sofiia entry |
|
||||
| `config/rbac_tools_matrix.yml` | RBAC ролі: `agent_cto` (39 permissions) |
|
||||
| `config/slo_policy.yml` | SLO для voice fast/quality profiles |
|
||||
| `config/risk_policy.yml` | Risk scoring policy |
|
||||
| `config/release_gate_policy.yml` | Release gate rules |
|
||||
| `config/incident_escalation_policy.yml` | Escalation policy |
|
||||
| `config/alert_routing_policy.yml` | Alert routing |
|
||||
| `config/observability_sources.yml` | Prometheus/Loki/Tempo sources |
|
||||
| `config/tool_limits.yml` | Tool rate limits |
|
||||
| `config/tools_rollout.yml` | Tools rollout configuration |
|
||||
| `config/cost_weights.yml` | Cost scoring weights |
|
||||
| `config/network_allowlist.yml` | Network access allowlist |
|
||||
| `config/nodes_registry.yml` | NODA1/NODA2 node registry |
|
||||
| `config/data_governance_policy.yml` | Data governance policy |
|
||||
| `config/backlog_policy.yml` | Backlog generation policy |
|
||||
| `services/router/router-config.yml` | Voice profiles, agent routing |
|
||||
|
||||
### 2.7 Docker Compose (NODA2 Sofiia Stack)
|
||||
|
||||
| Файл | Опис |
|
||||
|------|------|
|
||||
| `docker-compose.node2-sofiia.yml` | Main: sofiia-console + router + node-worker + memory + qdrant |
|
||||
| `docker-compose.node2-sofiia-supervisor.yml` | Sofiia Supervisor + Redis |
|
||||
| `docker-compose.memory-node2.yml` | Memory stack: Postgres + Qdrant + Neo4j + Memory Service |
|
||||
| `docker-compose.node2.yml` | Full NODA2 stack |
|
||||
|
||||
### 2.8 Документація (docs/)
|
||||
|
||||
| Файл/Dir | Опис | Стан |
|
||||
|----------|------|------|
|
||||
| `docs/ADR_ARCHITECTURE_VNEXT.md` | Основний ADR: vNext архітектура | ✅ |
|
||||
| `docs/OPENAPI_CONTRACTS.md` | API контракти Gateway↔Router↔Memory | ✅ |
|
||||
| `docs/ARCHITECTURE_DIAGRAM.md` | Діаграма архітектури | ✅ |
|
||||
| `docs/architecture_inventory/` | 7 файлів: exec summary, service catalog, tool catalog, dataflows, security, observability, open questions | ✅ 2026-02-16 |
|
||||
| `docs/fabric_contract.md` | Fabric multi-node contract, Voice HA | ✅ |
|
||||
| `docs/sofiia_ui_vnext_audit.md` | vNext UI audit | ✅ |
|
||||
| `docs/supervisor/langgraph_supervisor.md` | Supervisor архітектура | ✅ |
|
||||
| `docs/supervisor/postmortem_draft_graph.md` | Postmortem граф | ✅ |
|
||||
| `docs/runbook/sofiia-control-plane.md` | Operations runbook | ✅ |
|
||||
| `docs/NODA1-NODA2-STATUS.md` | Статус нод | ✅ |
|
||||
| `docs/MULTINODE_ARCHITECTURE.md` | Multi-node архітектура | ✅ |
|
||||
| `docs/NATS_SUBJECTS.md` | NATS subject map | ✅ |
|
||||
| `docs/voice_phase2_cutover.md` | Voice Phase 2 cutover plan | ✅ |
|
||||
| `docs/voice_streaming_phase2.md` | Voice Phase 2 spec | ✅ |
|
||||
| `docs/PRIVACY_GATE.md` | Privacy gate policy | ✅ |
|
||||
| `docs/DATA_RETENTION_POLICY.md` | Data retention | ✅ |
|
||||
| `docs/MEMORY_API_POLICY.md` | Memory API policy | ✅ |
|
||||
| `docs/AGENT_RUNTIME_POLICY.md` | Agent runtime policy | ✅ |
|
||||
| `docs/SECURITY_HARDENING_SUMMARY.md` | Security hardening | ✅ |
|
||||
| `docs/backlog/backlog.md` | Поточний беклог | ✅ |
|
||||
| `docs/incident/` | Incident tracking docs | ✅ |
|
||||
| `docs/risk/risk_index.md` | Risk index | ✅ |
|
||||
|
||||
### 2.9 Тести
|
||||
|
||||
| Файл | Що тестує |
|
||||
|------|-----------|
|
||||
| `tests/test_voice_ha.py` | Voice HA: 35 tests |
|
||||
| `tests/test_voice_policy.py` | Voice routing policy: 23 tests |
|
||||
| `tests/test_voice_stream.py` | Voice Phase 2 streaming: 22 tests |
|
||||
| `tests/test_sofiia_docs.py` | Projects/Documents/Sessions/Dialog Map: 28 tests |
|
||||
| `tests/test_tool_governance.py` | Tool RBAC (agent_cto role) |
|
||||
| `tests/test_risk_attribution.py` | Risk engine |
|
||||
| `tests/test_drift_analyzer.py` | Drift analyzer |
|
||||
| `tests/test_cost_analyzer.py` | Cost analyzer |
|
||||
| `tests/test_incident_escalation.py` | Escalation |
|
||||
| `tests/test_backlog_*.py` | Backlog generation/store |
|
||||
| `services/sofiia-supervisor/tests/` | 6 supervisor graph tests |
|
||||
|
||||
### 2.10 Ops Scripts
|
||||
|
||||
| Файл | Опис |
|
||||
|------|------|
|
||||
| `ops/fabric_preflight.sh` | Preflight checks: models, canary, voice |
|
||||
| `ops/voice_ha_smoke.sh` | Voice HA acceptance smoke test |
|
||||
| `ops/voice_latency_audit.sh` | Multi-scenario latency audit |
|
||||
| `ops/voice_policy_update.py` | Auto-update voice policy від audit results |
|
||||
| `ops/scripts/voice_canary.py` | Voice health canary (preflight + runtime) |
|
||||
| `ops/runbook-voice-incidents.md` | Voice incident runbook |
|
||||
| `ops/runbook-sofiia-docs.md` | Projects/Docs runbook |
|
||||
| `ops/grafana_voice_dashboard.json` | Grafana dashboard |
|
||||
| `ops/voice_alerts.yml` | Prometheus alerting rules |
|
||||
|
||||
---
|
||||
|
||||
## 3. Відсутні файли (NOT FOUND — очікувались)
|
||||
|
||||
| Очікуваний файл | Чому очікувався | Статус |
|
||||
|-----------------|-----------------|--------|
|
||||
| `services/projects-service/` | ADR_ARCHITECTURE_VNEXT згадує окремий projects-service | ❌ НЕ ЗНАЙДЕНО |
|
||||
| `services/docs-service/` | ADR згадує окремий docs-service з версіями | ❌ НЕ ЗНАЙДЕНО |
|
||||
| `services/dialogmap-service/` | vNext design, описаний у chat | ❌ НЕ ЗНАЙДЕНО |
|
||||
| `services/ingest-service/` | ADR 2.2 Ingest Service | ❌ НЕ ЗНАЙДЕНО (тільки stub reference) |
|
||||
| `openapi.yml` / `swagger.yml` | Формальна OpenAPI специфікація | ❌ НЕ ЗНАЙДЕНО |
|
||||
| `migrations/` (Postgres DDL для sofiia) | Versioned DB migrations | ⚠️ Є `migrations/046, 049, 052` для memory-service, але не для sofiia-console |
|
||||
| `docs/audit/` (5 аудит-файлів) | Запит цього сеансу | ✅ Створюються зараз |
|
||||
| `docs_versions` table | vNext DDL план | ❌ НЕ РЕАЛІЗОВАНО |
|
||||
| `dialog_nodes` / `dialog_edges` tables (Postgres) | vNext Dialog Map | ⚠️ SQLite-тільки, tree-based |
|
||||
| `entity_links` / `repo_changesets` / `ops_runs` | CTO DDL заготовки | ❌ НЕ ЗНАЙДЕНО |
|
||||
|
||||
---
|
||||
|
||||
## Next Actions for UI Team (1–2 days)
|
||||
|
||||
1. **Ознайомитись з `docs/architecture_inventory/` (7 файлів)** — там повний каталог поточного стеку
|
||||
2. **Перевірити `services/sofiia-console/app/docs_router.py`** — Projects/Documents/Sessions API вже є, потрібно тільки вмикати USE_EMBEDDINGS/USE_FABRIC_OCR
|
||||
3. **`config/agent_registry.yml` Sofiia entry** — перевірити `telegram_mode: whitelist` і `allowed_users: []`
|
||||
4. **Впевнитись що `docker-compose.node2-sofiia.yml`** має `sofiia-data` volume з правильним path
|
||||
5. **Протестувати UI** через `http://localhost:8002/` — відкрити вкладку "📁 Проєкти" і перевірити sidebar
|
||||
6. **Перевірити Dialog Map** через `GET /api/sessions/{sid}/map` — tree view реалізований
|
||||
7. **НОВА ПОТРЕБА**: визначити де буде Dialog Map на Postgres (`dialog_nodes/edges`) — поки SQLite tree-only
|
||||
8. **Пріоритет для UI**: mock endpoints для `repo_changesets` і `ops_runs` (CTO panel) поки не реалізовано
|
||||
9. **Додати `docs_versions` endpoint** в `docs_router.py` (колонка `extracted_text` є, потрібна таблиця версій)
|
||||
10. **Перевірити NATS subjects** в `docs/NATS_SUBJECTS.md` і зіставити з поточними з `docs/ADR_ARCHITECTURE_VNEXT.md §5`
|
||||
441
docs/audit/sofiia_intelligence_system_trace.md
Normal file
441
docs/audit/sofiia_intelligence_system_trace.md
Normal file
@@ -0,0 +1,441 @@
|
||||
# Sofiia CTO Agent — Intelligence System Trace (C)
|
||||
|
||||
> Generated: 2026-02-26 | Реконструкція "інтелектуальної системи" Sofiia
|
||||
|
||||
---
|
||||
|
||||
## Загальна схема мислення
|
||||
|
||||
```
|
||||
User Input (Telegram / Console / Voice)
|
||||
│
|
||||
▼
|
||||
[BFF: sofiia-console]
|
||||
Auth + Rate limit + Session
|
||||
│
|
||||
├─── Voice turn? ──► STT (memory-service) → sanitize_for_voice() → voice_fast_uk
|
||||
│
|
||||
└─── Text turn? ──► [Router /v1/agents/sofiia/infer]
|
||||
│
|
||||
┌────────────┴────────────┐
|
||||
│ │
|
||||
LLM selection Tool call?
|
||||
(profile-based) (tool_manager)
|
||||
│ │
|
||||
[LLM response] [Tool execution]
|
||||
│ │
|
||||
<think> strip RBAC check
|
||||
│ │
|
||||
Memory save Evidence
|
||||
│ │
|
||||
└────────┬────────────────┘
|
||||
│
|
||||
[Dialog Map update]
|
||||
(SQLite tree / future Postgres graph)
|
||||
│
|
||||
[Response to User]
|
||||
│
|
||||
[TTS if voice mode]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 1. Intent → Plan → Execute (Canonical CTO Flow)
|
||||
|
||||
### 1.1 Документовано
|
||||
- **Docs:** `AGENTS.md` §Example Commands, `docs/ADR_ARCHITECTURE_VNEXT.md` §3.1 CrewAI Workers
|
||||
- **Concept:** "Chat/Intent → Plan (Artifacts) → Execute as Job → Evidence → Dialog Map"
|
||||
- **vNext Design:** вся концепція описана в цьому сеансі розмови
|
||||
|
||||
### 1.2 Реалізовано
|
||||
- **Intent → Plan:** ✅ LLM inference через Router (`/v1/agents/sofiia/infer`)
|
||||
- **Plan → Execute (Ops):** ✅ `/api/ops/run` dispatches pre-defined actions
|
||||
- **Execute → Evidence:** ⚠️ частково — ops повертає result, але не зберігає як artifact
|
||||
- **Evidence → Dialog Map:** ❌ ops artifacts не зшиваються в dialog_nodes
|
||||
|
||||
### 1.3 Розриви
|
||||
- Немає загального **Job System** (тільки pre-defined ops actions)
|
||||
- Немає `repo_changesets` / `ops_runs` як артефактів у DB
|
||||
- Dialog Map не оновлюється автоматично від ops actions
|
||||
|
||||
---
|
||||
|
||||
## 2. Модулі Архітектури
|
||||
|
||||
### 2.1 BFF (sofiia-console)
|
||||
|
||||
**Документовано тут:**
|
||||
- `docs/runbook/sofiia-control-plane.md`
|
||||
- `docs/sofiia_ui_vnext_audit.md`
|
||||
- `docs/fabric_contract.md`
|
||||
|
||||
**Реалізовано тут:**
|
||||
- `services/sofiia-console/app/main.py` — FastAPI v0.3.0
|
||||
- `services/sofiia-console/app/config.py` — node registry, ENV loading
|
||||
- `docker-compose.node2-sofiia.yml` — deployment config
|
||||
|
||||
**Що BFF робить:**
|
||||
```
|
||||
1. API Gateway для UI (chat/voice/projects/ops/nodes)
|
||||
2. Session management (SQLite sofiia.db)
|
||||
3. Multi-provider LLM proxy (ollama/router/glm/grok)
|
||||
4. Voice pipeline (STT→LLM→TTS, Phase 2 streaming)
|
||||
5. Ops dispatcher (risk/pressure/backlog/notion/release)
|
||||
6. Multi-node health monitor (polling + WebSocket fan-out)
|
||||
7. Memory save (SQLite first, then Memory Service best-effort)
|
||||
```
|
||||
|
||||
**Розриви:**
|
||||
- Відсутній єдиний Job tracking (кожен ops action — one-shot, без persist)
|
||||
- Відсутній `repo_changesets` flow
|
||||
- `ops.html`, `chat.html`, `nodes.html` — fallback HTML, не окремі файли
|
||||
|
||||
---
|
||||
|
||||
### 2.2 LLM Routing
|
||||
|
||||
**Документовано тут:**
|
||||
- `services/router/router-config.yml`
|
||||
- `docs/architecture_inventory/01_SERVICE_CATALOG.md`
|
||||
- `docs/OPENAPI_CONTRACTS.md`
|
||||
|
||||
**Реалізовано тут:**
|
||||
- `services/router/main.py` — `/v1/agents/{agent_id}/infer`
|
||||
- `services/router/router-config.yml` — `sofiia:` entry
|
||||
|
||||
**Конфігурація Sofiia (router-config.yml):**
|
||||
```yaml
|
||||
sofiia:
|
||||
primary: cloud_grok # Grok API (Telegram mode)
|
||||
fallback: cloud_deepseek # DeepSeek API
|
||||
# Console mode може override через ollama
|
||||
```
|
||||
|
||||
**Voice profiles:**
|
||||
```yaml
|
||||
voice_fast_uk:
|
||||
prefer_models: [gemma3:latest, qwen3.5:35b-a3b, qwen3:14b]
|
||||
deadline_ms: 9000
|
||||
max_tokens: 256
|
||||
|
||||
voice_quality_uk:
|
||||
prefer_models: [qwen3.5:35b-a3b, qwen3:14b]
|
||||
deadline_ms: 12000
|
||||
max_tokens: 256
|
||||
```
|
||||
|
||||
**Розриви:**
|
||||
- Відсутній профіль для `repo_changeset` (long-form, structured output)
|
||||
- Відсутній профіль для `plan_generation` (CTO structured plans)
|
||||
|
||||
---
|
||||
|
||||
### 2.3 Tool System
|
||||
|
||||
**Документовано тут:**
|
||||
- `AGENTS.md` §Tool List
|
||||
- `docs/architecture_inventory/02_TOOL_CATALOG.md`
|
||||
- `config/rbac_tools_matrix.yml`
|
||||
|
||||
**Реалізовано тут:**
|
||||
- `services/router/tool_manager.py` — TOOL_DEFINITIONS + execution
|
||||
- `services/router/agent_tools_config.py` — per-agent allowlists
|
||||
|
||||
**RBAC роль `agent_cto`** (39 permissions):
|
||||
```
|
||||
docs: read ops: read/exec_safe
|
||||
repo: read jobs: smoke/drift/backup/deploy
|
||||
kb: read risk: read/write
|
||||
pr_review: use pressure: read/write
|
||||
contract: use backlog: read/write/admin
|
||||
config_lint: use deps: read/gate
|
||||
threatmodel: use cost: read/gate
|
||||
observability drift: read/gate
|
||||
incidents: write alerts: ingest/read/ack/claim
|
||||
```
|
||||
|
||||
**Sofiia спеціалізовані tools (agent_tools_config.py):**
|
||||
```python
|
||||
AGENT_SPECIALIZED_TOOLS["sofiia"] = [
|
||||
"comfy_generate_image",
|
||||
"comfy_generate_video",
|
||||
"risk_engine_tool",
|
||||
"architecture_pressure_tool",
|
||||
"backlog_tool",
|
||||
"job_orchestrator_tool",
|
||||
"dependency_scanner_tool",
|
||||
"incident_intelligence_tool",
|
||||
"cost_analyzer_tool",
|
||||
"pieces_tool",
|
||||
"notion_tool",
|
||||
]
|
||||
```
|
||||
|
||||
**FULL_STANDARD_STACK** (16 tools available to all agents):
|
||||
```
|
||||
memory_search, graph_query, web_search, web_extract, crawl4ai_scrape,
|
||||
remember_fact, image_generate, tts_speak, presentation_create/status/download,
|
||||
file_tool, repo_tool, pr_reviewer_tool, contract_tool, oncall_tool,
|
||||
observability_tool, config_linter_tool, threatmodel_tool, job_orchestrator_tool,
|
||||
kb_tool, drift_analyzer_tool, pieces_tool
|
||||
```
|
||||
|
||||
**Розриви:**
|
||||
- Відсутній `repo_changeset_tool` (create/patch/plan/pr)
|
||||
- Відсутній `ops_job_tool` (start/status/cancel з job tracking)
|
||||
- `job_orchestrator_tool` є, але не пов'язаний з Dialog Map artifact creation
|
||||
|
||||
---
|
||||
|
||||
### 2.4 Memory System
|
||||
|
||||
**Документовано тут:**
|
||||
- `docs/ADR_ARCHITECTURE_VNEXT.md` §2.5 Memory Service
|
||||
- `docs/MEMORY_API_POLICY.md`
|
||||
- `docs/AGENT-MEMORY-STANDARD.md`
|
||||
|
||||
**Реалізовано тут:**
|
||||
- `services/memory-service/app/main.py` — threads/events/memories/facts/agents
|
||||
- `services/memory-service/app/vector_store.py` — Qdrant
|
||||
- `docker-compose.memory-node2.yml` — Postgres + Qdrant + Neo4j
|
||||
|
||||
**3 рівні пам'яті (згідно ADR):**
|
||||
|
||||
| Рівень | Qdrant | Neo4j | Postgres |
|
||||
|--------|--------|-------|----------|
|
||||
| Personal | `user_{id}_*` | `:User` nodes | `user_facts`, `user_sessions` |
|
||||
| Team/DAO | `team_{id}_*` | `:Team`, `:Project` | `team_facts`, `team_quotas` |
|
||||
| Public | `public_*` | `:Public` | `indexed_content` |
|
||||
|
||||
**Реальні колекції (NODA2):**
|
||||
- `sofiia_messages` — 1183+ points
|
||||
- `sofiia_summaries`
|
||||
- Memory Service Postgres (port 5433, db `daarion_memory`)
|
||||
|
||||
**Console-рівень пам'яті (SQLite `sofiia.db`):**
|
||||
```sql
|
||||
projects, documents, sessions, messages
|
||||
```
|
||||
|
||||
**Розриви:**
|
||||
- Team/DAO namespace: описаний в ADR, реалізований лише для Personal
|
||||
- E2EE для confidential: тільки в ADR, не реалізовано
|
||||
- BFF і Memory Service "знають" одне про одного, але sync неповний
|
||||
|
||||
---
|
||||
|
||||
### 2.5 Planning System (Supervisor)
|
||||
|
||||
**Документовано тут:**
|
||||
- `docs/supervisor/langgraph_supervisor.md`
|
||||
- `docs/supervisor/postmortem_draft_graph.md`
|
||||
|
||||
**Реалізовано тут:**
|
||||
- `services/sofiia-supervisor/app/main.py`
|
||||
- `services/sofiia-supervisor/app/graphs/`
|
||||
|
||||
**Доступні LangGraph графи:**
|
||||
```
|
||||
alert_triage → класифікація/ескалація алертів
|
||||
incident_triage → тріаж інцидентів (SLO, labels, owners)
|
||||
postmortem_draft → автогенерація postmortem документа
|
||||
release_check → pre-release gate checks
|
||||
```
|
||||
|
||||
**Архітектура (загальна):**
|
||||
```
|
||||
Event/Trigger → LangGraph Node → State update → Next Node
|
||||
↓ ↓
|
||||
NATS event Tool calls (via gateway_client)
|
||||
Memory writes
|
||||
Structured output (JSON)
|
||||
```
|
||||
|
||||
**Розриви:**
|
||||
- Немає `cto_intent_graph` (intent → plan → execute)
|
||||
- Немає `repo_changeset_graph` (diff → plan → PR)
|
||||
- Немає `dialog_map_builder_graph` (events → nodes/edges)
|
||||
- Supervisor ізольований від BFF (не інтегрований у `/api/ops/run`)
|
||||
|
||||
---
|
||||
|
||||
## 3. Policies (Безпека, Дозволи, Approval)
|
||||
|
||||
### 3.1 Документовано
|
||||
- `docs/PRIVACY_GATE.md` — Privacy Gate middleware
|
||||
- `docs/ADR_ARCHITECTURE_VNEXT.md` §4 Privacy Gate
|
||||
- `docs/AGENT_RUNTIME_POLICY.md`
|
||||
- `config/rbac_tools_matrix.yml`
|
||||
- `config/data_governance_policy.yml`
|
||||
- `config/risk_policy.yml`
|
||||
|
||||
### 3.2 Реалізовано
|
||||
- RBAC tool allowlist: ✅ `agent_tools_config.py`
|
||||
- API key auth: ✅ `auth.py`
|
||||
- Rate limiting: ✅ per-endpoint
|
||||
- Upload sanitization: ✅ mime + filename + size
|
||||
- Voice guardrails: ✅ `sanitize_for_voice()`
|
||||
- Config linter (secrets detection): ✅ `tool_manager.py`
|
||||
|
||||
### 3.3 Не реалізовано
|
||||
- **Privacy Gate middleware** (перевірка `mode=confidential` в Router): 📄 описаний, не реалізований
|
||||
- **2-step Plan → Apply flow**: 📄 описаний як "dangerous actions", не реалізований
|
||||
- **E2EE client-side encryption**: 📄 тільки ADR, не реалізований
|
||||
- **Confidential doc indexing block**: 📄 тільки ADR, не реалізований
|
||||
|
||||
---
|
||||
|
||||
## 4. Event Model
|
||||
|
||||
### 4.1 Документовано
|
||||
- `docs/ADR_ARCHITECTURE_VNEXT.md` §5 NATS Standards
|
||||
- `docs/NATS_SUBJECTS.md`
|
||||
- `docs/NATS_SUBJECT_MAP.md`
|
||||
|
||||
### 4.2 NATS Subjects (ADR canonical)
|
||||
```
|
||||
message.created.{channel_id} # chat messages
|
||||
attachment.created.{type} # uploaded files
|
||||
agent.run.requested.{agent_id} # agent activation
|
||||
agent.run.completed.{agent_id}
|
||||
quota.consumed.{user_id}
|
||||
audit.{service}.{action} # append-only audit
|
||||
ops.health.{service}
|
||||
ops.alert.{severity}
|
||||
```
|
||||
|
||||
### 4.3 Fabric Subjects (реалізовані у node-worker)
|
||||
```
|
||||
node.{id}.llm.request # LLM offload
|
||||
node.{id}.tts.request # TTS offload
|
||||
node.{id}.stt.request # STT offload
|
||||
node.{id}.voice.llm.request # Voice LLM (dedicated)
|
||||
node.{id}.voice.tts.request # Voice TTS (dedicated)
|
||||
node.{id}.voice.stt.request # Voice STT (dedicated)
|
||||
node.{id}.ocr.request # OCR offload
|
||||
node.{id}.crawl.request # Crawl offload
|
||||
node.{id}.image.request # Image generation
|
||||
```
|
||||
|
||||
### 4.4 Розриви
|
||||
- `attachment.created` — реалізований частково (upload зберігає файл, але не публікує у NATS)
|
||||
- `task_create`, `doc_upsert`, `meeting_create` — не реалізовані (потрібні для Dialog Map auto-edge)
|
||||
- `agent.run.requested` → legacy flat subject ще може бути в деяких шляхах (відомий drift)
|
||||
- Dialog Map не підписаний на NATS events
|
||||
|
||||
---
|
||||
|
||||
## 5. Memory Architecture (деталізована)
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────┐
|
||||
│ Sofiia Memory Layers │
|
||||
├──────────────────────────────────────────────────────────┤
|
||||
│ Layer 0: Working Context (per-turn) │
|
||||
│ - history[-12:] in BFF request │
|
||||
│ - sanitize_for_voice() for voice turns │
|
||||
├──────────────────────────────────────────────────────────┤
|
||||
│ Layer 1: Session Memory (sofiia-console SQLite) │
|
||||
│ Tables: projects, documents, sessions, messages │
|
||||
│ TTL: indefinite (volume-backed) │
|
||||
│ Fork: parent_msg_id для branching │
|
||||
├──────────────────────────────────────────────────────────┤
|
||||
│ Layer 2: Long-term Memory (Memory Service) │
|
||||
│ Qdrant: sofiia_messages (1183+ vectors) │
|
||||
│ sofiia_summaries │
|
||||
│ Postgres: daarion_memory DB (facts, threads, events) │
|
||||
│ Neo4j: agent memory graph (infrastructure ready) │
|
||||
├──────────────────────────────────────────────────────────┤
|
||||
│ Layer 3: Factual Memory (Key-Value) │
|
||||
│ /facts/upsert, /facts/{key} │
|
||||
│ Rolling summaries via /threads/{id}/summarize │
|
||||
└──────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Namespaces (implemented):**
|
||||
- `sofiia_messages` — agent-specific collection
|
||||
- Загальний: `{agent_id}_{type}` pattern
|
||||
|
||||
**Sync між Layer 1 і Layer 2:**
|
||||
- `_do_save_memory()` у `main.py`: спочатку SQLite, потім Memory Service (best-effort)
|
||||
- Немає зворотнього sync (Memory Service → SQLite)
|
||||
- Немає конфліктів (append-only обидва)
|
||||
|
||||
---
|
||||
|
||||
## 6. Dialog Map Intelligence
|
||||
|
||||
### Поточна реалізація (Phase 1)
|
||||
```
|
||||
SQLite messages table (parent_msg_id = branching)
|
||||
↓
|
||||
GET /api/sessions/{sid}/map
|
||||
↓
|
||||
Python: build_tree(messages) → nodes/edges
|
||||
↓
|
||||
UI: <details><summary> tree
|
||||
```
|
||||
|
||||
### Цільова реалізація (vNext Phase 2)
|
||||
```
|
||||
NATS events (task_create, doc_upsert, meeting_create)
|
||||
↓
|
||||
Dialog Map Builder (новий сервіс або Supervisor граф)
|
||||
↓
|
||||
Postgres: dialog_nodes + dialog_edges
|
||||
↓
|
||||
GET /projects/{id}/dialog-map
|
||||
↓
|
||||
UI: D3/Cytoscape canvas + live WS updates
|
||||
```
|
||||
|
||||
**Node types (vNext):**
|
||||
- `message` — chat message
|
||||
- `task` — задача
|
||||
- `doc` — документ/wiki
|
||||
- `meeting` — зустріч
|
||||
- `agent_run` — виклик агента
|
||||
- `decision` — ADR/рішення
|
||||
- `goal` — ціль/OKR
|
||||
|
||||
**Edge types (vNext):**
|
||||
- `references` — A посилається на B
|
||||
- `resolves` — A вирішує B
|
||||
- `derives_task` — повідомлення → задача
|
||||
- `updates_doc` — action → doc version
|
||||
- `schedules` — message → meeting
|
||||
- `summarizes` — rollup вузол
|
||||
|
||||
---
|
||||
|
||||
## 7. Preflight-First Policy
|
||||
|
||||
**Документовано тут:**
|
||||
- `ops/fabric_preflight.sh`
|
||||
- `docs/fabric_contract.md`
|
||||
|
||||
**Принцип:** "Zero assumptions" — перед будь-яким deploy/change:
|
||||
1. Запустити `ops/fabric_preflight.sh`
|
||||
2. Перевірити моделі (VOICE_REQUIRED_MODELS fail / VOICE_PREFERRED_MODELS warn)
|
||||
3. Перевірити `ops/fabric_snapshot.py --save`
|
||||
4. Тільки потім deploy
|
||||
|
||||
**Реалізовано:**
|
||||
- `ops/fabric_preflight.sh` — перевірки моделей, voice health, canary
|
||||
- `ops/scripts/voice_canary.py` — runtime canary (кожні 5–10 хв)
|
||||
- `ops/voice_latency_audit.sh` — 10-сценарний latency audit
|
||||
|
||||
---
|
||||
|
||||
## Next Actions for UI Team (1–2 days)
|
||||
|
||||
1. **Ознайомитись із Supervisor API** (`/v1/graphs/{name}/runs`) — це готовий "job runner" для CTO workflows
|
||||
2. **Розширити Supervisor**: додати `cto_intent_graph` на базі `release_check_graph` (спільна структура)
|
||||
3. **NATS attachment events**: при upload в `docs_router.py` — публікувати `attachment.created` (1 рядок коду)
|
||||
4. **Dialog Map NATS listener**: простий consumer що upsert-ить SQLite nodes при events
|
||||
5. **`docs_versions` table**: ALTER TABLE + endpoint — 1–2 год роботи
|
||||
6. **Privacy Gate stub**: додати перевірку `mode` поля в BFF, навіть якщо без шифрування
|
||||
7. **Plan → Apply pattern**: для ops actions — показувати "план" перед запуском
|
||||
8. **`agent_id` нормалізація**: замінити `"l"` на `"sofiia"` в node2 router-config.yml
|
||||
9. **Memory sync**: додати endpoint для завантаження Sofiia memory з Memory Service у SQLite
|
||||
10. **CTO Panel**: mock `/api/repo/changesets` і `/api/ops/runs` endpoints для UI розробки
|
||||
192
docs/audit/sofiia_state_of_implementation.md
Normal file
192
docs/audit/sofiia_state_of_implementation.md
Normal file
@@ -0,0 +1,192 @@
|
||||
# Sofiia CTO Agent — State of Implementation (B)
|
||||
|
||||
> Generated: 2026-02-26 | Legend: ✅ Implemented | ⚠️ Partial | 📄 Documented Only | ❌ Not Found
|
||||
|
||||
---
|
||||
|
||||
## 1. Identity & System Prompt
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| Sofiia identity (AGENTS.md) | ✅ Implemented | `AGENTS.md` — CTO-агент, NODA1/2/3, capabilities | — |
|
||||
| Telegram system prompt | ✅ Implemented | `gateway-bot/sofiia_prompt.txt` (138KB) | — |
|
||||
| Control Console system prompt | ✅ Implemented | `services/sofiia-console/app/main.py` lines 138–177 | — |
|
||||
| Voice turn prompt suffix | ✅ Implemented | `main.py` `SOFIIA_VOICE_PROMPT_SUFFIX` — max 2 sentences, no markdown | — |
|
||||
| Agent ID consistency | ⚠️ Partial | `"sofiia"` у production, `"l"` у node2-конфігурації та тестах | ⚠️ Confusion risk |
|
||||
| NODA3 integration | 📄 Documented Only | `AGENTS.md` описує NODA3 (IP, GPU, models), але немає compose/config | 🔴 Blocking |
|
||||
|
||||
---
|
||||
|
||||
## 2. Control Console (BFF)
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| FastAPI BFF основа | ✅ Implemented | `sofiia-console/app/main.py` v0.3.0, 1800 рядків | — |
|
||||
| Chat: Ollama/Router/GLM/Grok | ✅ Implemented | `/api/chat/send`, providers: ollama, router, glm, grok | — |
|
||||
| Chat: history (client-side) | ✅ Implemented | `body.history[-12:]` передається клієнтом | — |
|
||||
| Chat: session persist (SQLite) | ✅ Implemented | `_do_save_memory` → `db.save_message`, `db.upsert_session` | — |
|
||||
| Chat: session restore on page reload | ✅ Implemented | `GET /api/chat/history`, localStorage session_id | — |
|
||||
| Ops: risk/pressure/backlog/release | ✅ Implemented | `/api/ops/run` + `ops.py` dispatcher | — |
|
||||
| Ops: Notion actions | ✅ Implemented | notion_status/create_task/create_page/create_database | — |
|
||||
| Hub: integrations status | ✅ Implemented | `/api/integrations/status` — Router, Memory, OpenWebUI, Pieces, OpenCode, Notion | — |
|
||||
| Nodes: dashboard | ✅ Implemented | `/api/nodes/dashboard` з caching, multi-node poll | — |
|
||||
| Nodes: SSH status | ✅ Implemented | `/api/nodes/ssh/status` (strict auth) | — |
|
||||
| Nodes: add node | ✅ Implemented | `/api/nodes/add` | — |
|
||||
| Nodes: remove node | ❌ Not Found | Тільки add, без delete | ⚠️ Minor gap |
|
||||
| Memory: status | ✅ Implemented | `/api/memory/status` | — |
|
||||
| Memory: context | ✅ Implemented | `/api/memory/context` | — |
|
||||
| WebSocket event bus | ✅ Implemented | `/ws/events` — nodes.status, chat.reply, voice.*, ops.run | — |
|
||||
| Rate limiting | ✅ Implemented | per-endpoint limiters: 30/min chat, 15/min stream, 30/min TTS | — |
|
||||
| API key auth | ✅ Implemented | `auth.py` + strict mode | — |
|
||||
|
||||
---
|
||||
|
||||
## 3. Voice Layer
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| STT proxy | ✅ Implemented | `POST /api/voice/stt` → memory-service | — |
|
||||
| TTS proxy | ✅ Implemented | `POST /api/voice/tts` (legacy + HA path) | — |
|
||||
| Voice streaming Phase 2 | ✅ Implemented | `POST /api/voice/chat/stream` — split → first TTS | — |
|
||||
| Voice policy (voice_fast_uk/quality_uk) | ✅ Implemented | `router-config.yml`, `test_voice_policy.py` 23/23 | — |
|
||||
| Voice guardrails (2 sentences) | ✅ Implemented | `SOFIIA_VOICE_PROMPT_SUFFIX`, `sanitize_for_voice()` | — |
|
||||
| `<think>` stripping | ✅ Implemented | `voice_utils.py` + Router `_clean_think_blocks` | — |
|
||||
| Degradation state machine | ✅ Implemented | `_VoiceDegradationSM` (ok/degraded_tts/degraded_llm/fast_lock/emergency) | — |
|
||||
| TTFA telemetry | ✅ Implemented | `POST /api/telemetry/voice` + Prometheus metrics | — |
|
||||
| Voice HA (multi-node routing) | ✅ Implemented | `VOICE_HA_ENABLED` flag, Router `/v1/capability/voice_*` | — |
|
||||
| Remote voice badge | ✅ Implemented | `X-Voice-Mode: remote` header → `🌐 noda1` badge | — |
|
||||
| Voice canary | ✅ Implemented | `ops/scripts/voice_canary.py` (preflight + runtime mode) | — |
|
||||
| Grafana voice dashboard | ✅ Implemented | `ops/grafana_voice_dashboard.json` | — |
|
||||
| Voice alerts (Prometheus) | ✅ Implemented | `ops/voice_alerts.yml` (6 rules) | — |
|
||||
| SLO definitions | ✅ Implemented | `config/slo_policy.yml` voice_fast_uk / voice_quality_uk | — |
|
||||
| Rate limit / DoS guard | ✅ Implemented | semaphore, per-IP limiter, `rest_chunks ≤ 8` cap | — |
|
||||
|
||||
---
|
||||
|
||||
## 4. Projects, Documents, Sessions
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| Projects CRUD | ✅ Implemented | `docs_router.py`: GET/POST/PATCH `/api/projects` | — |
|
||||
| Documents CRUD | ✅ Implemented | upload, list, get, keyword search | — |
|
||||
| File upload (multipart) | ✅ Implemented | `POST /api/files/upload` — sha256, mime detect, size limit | — |
|
||||
| Text extraction (PDF/DOCX/TXT) | ✅ Implemented | `_extract_text_simple()` у docs_router | — |
|
||||
| Sessions persistence | ✅ Implemented | `upsert_session`, `save_message`, SQLite `sofiia.db` | — |
|
||||
| Chat history restore | ✅ Implemented | `GET /api/chat/history?session_id=...` | — |
|
||||
| Dialog Map (tree) | ✅ Implemented | `GET /api/sessions/{sid}/map` → nodes/edges | — |
|
||||
| Dialog Map (canvas/D3) | ❌ Not Found | Поточний — `<details>` collapsible tree тільки | Phase 2 |
|
||||
| Session fork | ✅ Implemented | `POST /api/sessions/{sid}/fork` | — |
|
||||
| Projects sidebar (chat UI) | ✅ Implemented | `#sidebarProjectList` у `index.html` | — |
|
||||
| Projects section (full UI) | ✅ Implemented | `#section-projects` з tabs: docs, sessions, map | — |
|
||||
| Fabric OCR для uploaded images | ⚠️ Feature Flag Off | `USE_FABRIC_OCR=false` за замовч. | Low risk |
|
||||
| Qdrant embeddings для docs | ⚠️ Feature Flag Off | `USE_EMBEDDINGS=false` за замовч. | Low risk |
|
||||
| Docs versions (history) | ❌ Not Found | `docs_versions` таблиця відсутня | 🔴 vNext gap |
|
||||
| Docs backlinks (entity_links) | ❌ Not Found | `docs_links`/`entity_links` таблиця відсутня | 🔴 vNext gap |
|
||||
| `doc_index_state` table | ❌ Not Found | Відсутня | 🔴 vNext gap |
|
||||
| Semantic search (Meilisearch) | ❌ Not Found | Тільки SQL LIKE keyword search | 📄 ADR describes it |
|
||||
|
||||
---
|
||||
|
||||
## 5. CTO-specific Capabilities (Repo/Ops)
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| `repo_tool` (read-only) | ✅ Implemented | `tool_manager.py` — tree/read/search/metadata | — |
|
||||
| `pr_reviewer_tool` | ✅ Implemented | `tool_manager.py` — blocking_only/full_review | — |
|
||||
| `contract_tool` (OpenAPI) | ✅ Implemented | `tool_manager.py` — lint_openapi/diff_openapi/generate_client_stub | — |
|
||||
| `oncall_tool` | ✅ Implemented | services_list/health/runbook_search/incident_log | — |
|
||||
| `observability_tool` | ✅ Implemented | Prometheus/Loki/Tempo queries | — |
|
||||
| `config_linter_tool` | ✅ Implemented | Secrets detection, policy violations | — |
|
||||
| `threatmodel_tool` | ✅ Implemented | STRIDE-based threat modeling | — |
|
||||
| `job_orchestrator_tool` | ✅ Implemented | smoke/drift/backup/deploy tasks | — |
|
||||
| `kb_tool` | ✅ Implemented | ADR/docs search | — |
|
||||
| `drift_analyzer_tool` | ✅ Implemented | Infrastructure drift detection | — |
|
||||
| `risk_engine_tool` | ✅ Implemented | Risk scoring | — |
|
||||
| `architecture_pressure_tool` | ✅ Implemented | Architecture health analysis | — |
|
||||
| `backlog_tool` | ✅ Implemented | Backlog generation/management | — |
|
||||
| `dependency_scanner_tool` | ✅ Implemented | Dependency security scan | — |
|
||||
| `incident_intelligence_tool` | ✅ Implemented | Incident correlation | — |
|
||||
| `cost_analyzer_tool` | ✅ Implemented | Cost analysis | — |
|
||||
| `notion_tool` | ✅ Implemented | Notion pages/tasks/databases | — |
|
||||
| **`repo_changesets`** (CTO workflow) | ❌ Not Found | Тільки описано в vNext design | 🔴 Blocking |
|
||||
| **`ops_runs` API** | ❌ Not Found | Тільки `ops.py` dispatcher (не як job system) | 🔴 Blocking |
|
||||
| **`pull_requests` API** | ❌ Not Found | PR Review tool є, але PR object як артефакт — немає | 🔴 vNext gap |
|
||||
| **`entity_links`** | ❌ Not Found | Concept described, not implemented | 🔴 vNext gap |
|
||||
| Direct NODA3 integration | ❌ Not Found | Описано в AGENTS.md, відсутній docker-compose/router config | 🔴 |
|
||||
|
||||
---
|
||||
|
||||
## 6. Supervisor (LangGraph)
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| Alert triage graph | ✅ Implemented | `alert_triage_graph.py` + tests | — |
|
||||
| Incident triage graph | ✅ Implemented | `incident_triage_graph.py` + tests | — |
|
||||
| Postmortem draft graph | ✅ Implemented | `postmortem_draft_graph.py` + tests | — |
|
||||
| Release check graph | ✅ Implemented | `release_check_graph.py` + tests | — |
|
||||
| Supervisor API | ✅ Implemented | `/v1/graphs/{name}/runs` | — |
|
||||
| **CTO workflow graph** (intent→plan→execute) | ❌ Not Found | Описано в vNext design, немає реалізації | 🔴 vNext gap |
|
||||
| **Repo changeset graph** | ❌ Not Found | Тільки в дизайн-доці | 🔴 vNext gap |
|
||||
|
||||
---
|
||||
|
||||
## 7. Memory System
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| Short-term memory (threads/events) | ✅ Implemented | Memory Service `/threads`, `/events` | — |
|
||||
| Long-term memory (Qdrant) | ✅ Implemented | `/memories` + semantic search | — |
|
||||
| Facts store | ✅ Implemented | `/facts/upsert`, `/facts/{key}` | — |
|
||||
| Agent memory (Postgres + Neo4j) | ✅ Implemented | `/agents/{id}/memory` | — |
|
||||
| Rolling summaries | ✅ Implemented | `/threads/{id}/summarize` | — |
|
||||
| Neo4j graph memory | ✅ Infrastructure Ready | docker-compose.memory-node2.yml | Не тестований |
|
||||
| **Personal namespace** | ⚠️ Partial | ADR описує `user_{id}_*` collections, реалізація через `user_id` param | — |
|
||||
| **Team/DAO namespace** | 📄 Documented Only | ADR, не реалізовано в code | 🔴 vNext gap |
|
||||
| **E2EE (confidential docs)** | 📄 Documented Only | ADR + PRIVACY_GATE.md, не реалізовано | 🔴 vNext gap |
|
||||
|
||||
---
|
||||
|
||||
## 8. Infrastructure
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| NODA2 Docker stack | ✅ Implemented | `docker-compose.node2-sofiia.yml` | — |
|
||||
| NODA1 health + SSH | ✅ Implemented | nodes.py + SSH key auth | — |
|
||||
| Prometheus metrics | ✅ Implemented | fabric_metrics.py (router + node-worker), voice metrics | — |
|
||||
| NATS subjects | ✅ Implemented | Fabric node.{id}.*.request subjects | — |
|
||||
| Voice HA semaphores | ✅ Implemented | node-worker separate voice semaphores | — |
|
||||
| sofiia-data volume | ✅ Implemented | docker-compose.node2-sofiia.yml sofiia-data:/app/data | — |
|
||||
| Postgres для sofiia docs | ⚠️ SQLite Only | Phase 1: SQLite у sofiia-console, Postgres для Memory Service | Phase 2 needed |
|
||||
| S3/MinIO storage | ❌ Not Found | ADR описує, upload зараз у volume | 🔴 Phase 2 |
|
||||
| Meilisearch | ❌ Not Found | ADR описує для search, не розгорнутий | 🔴 vNext |
|
||||
| Control Plane service | ❌ Not Found | ADR 1.1-1.3, reference у security audit | 🔴 vNext |
|
||||
|
||||
---
|
||||
|
||||
## 9. Security
|
||||
|
||||
| Feature | Status | Evidence | Risk |
|
||||
|---------|--------|----------|------|
|
||||
| RBAC per agent | ✅ Implemented | `rbac_tools_matrix.yml` agent_cto (39 permissions) | — |
|
||||
| Tool allowlist per agent | ✅ Implemented | `agent_tools_config.py` AGENT_SPECIALIZED_TOOLS["sofiia"] | — |
|
||||
| API key auth | ✅ Implemented | `auth.py` — console + strict modes | — |
|
||||
| Upload sanitization (filename/mime) | ✅ Implemented | `_safe_filename()`, `_detect_mime()` у docs_router | — |
|
||||
| Rate limiting | ✅ Implemented | per-endpoint + semaphore + `rest_chunks ≤ 8` | — |
|
||||
| **E2EE (confidential)** | ❌ Not Found | Privacy Gate описаний в ADR, не реалізований | 🔴 |
|
||||
| **2-step approval для dangerous actions** | ❌ Not Found | ADR описує Plan → Apply flow | 🔴 vNext |
|
||||
| Audit log (append-only) | ⚠️ Partial | audit.py у agromatrix crew, `audit.{service}.{action}` NATS — частково | 🔴 |
|
||||
|
||||
---
|
||||
|
||||
## Next Actions for UI Team (1–2 days)
|
||||
|
||||
1. **Зверніть увагу**: `repo_changesets`, `ops_runs`, `entity_links` — **не існують**. UI CTO panel потребує mock endpoints
|
||||
2. **Quick win**: `docs_versions` таблиця — 30хв роботи (ALTER TABLE + endpoint у docs_router.py)
|
||||
3. **Quick win**: увімкнути `USE_EMBEDDINGS=true` в docker-compose для реального vector search
|
||||
4. **Перевірити** соfiia agent_id у тестах: `"l"` vs `"sofiia"` — потрібна нормалізація
|
||||
5. **Postgres migration**: коли sofiia-console готова до Postgres, потрібен `DATABASE_URL` env + аналогічний `init_db()`
|
||||
6. **E2EE**: перед вмиканням confidential docs — треба спроєктувати ключі (client-side only)
|
||||
7. **Dialog Map Phase 2**: canvas rendering (D3/Cytoscape) — `<details>` tree є, але не масштабується
|
||||
8. **Meilisearch**: поки `LIKE` search, але коли кількість docs зросте — потрібен реальний search index
|
||||
9. **NODA3**: додати до `nodes_registry.yml` і `docker-compose.node2-sofiia.yml` (якщо NODA3 реально доступна)
|
||||
10. **CTO workflow graph**: перший крок — alert_triage граф вже є, на його основі зробити `cto_intent_graph`
|
||||
248
docs/audit/ui_vnext_dependency_map.md
Normal file
248
docs/audit/ui_vnext_dependency_map.md
Normal file
@@ -0,0 +1,248 @@
|
||||
# Sofiia UI vNext — Dependency Map (D)
|
||||
|
||||
> Generated: 2026-02-26 | Карта залежностей UI → Backend → DB → Events
|
||||
|
||||
---
|
||||
|
||||
## Легенда
|
||||
|
||||
| Символ | Значення |
|
||||
|--------|----------|
|
||||
| ✅ | Endpoint/Model реалізований |
|
||||
| ⚠️ | Частково реалізований або за feature flag |
|
||||
| ❌ | Відсутній, потрібна реалізація |
|
||||
| 🔧 | Потрібне виправлення/доопрацювання |
|
||||
| 📄 | Тільки документований |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 1: Chat & Voice
|
||||
|
||||
| UI Feature | Expected API/Event | Found? | Evidence | Action |
|
||||
|-----------|-------------------|--------|----------|--------|
|
||||
| Text chat | `POST /api/chat/send` | ✅ | `main.py` | — |
|
||||
| Voice STT (WebM) | `POST /api/voice/stt` | ✅ | `main.py` → memory-service | — |
|
||||
| Voice TTS | `POST /api/voice/tts` | ✅ | `main.py` (legacy + HA) | — |
|
||||
| Voice Phase 2 stream | `POST /api/voice/chat/stream` | ✅ | `main.py` | — |
|
||||
| Voice stop/abort | AbortController + `POST /api/voice/tts` cancel | ✅ | `index.html` JS | — |
|
||||
| TTFA telemetry | `POST /api/telemetry/voice` | ✅ | `main.py` | — |
|
||||
| Batch telemetry | `POST /api/telemetry/voice/batch` | ✅ | `main.py` | — |
|
||||
| Degradation badge | `GET /api/voice/degradation_status` | ✅ | `main.py` | — |
|
||||
| Remote voice badge | `X-Voice-Mode: remote` header | ✅ | `main.py` + `index.html` | — |
|
||||
| Model selector UI | inline models list | ✅ | `index.html` (hardcoded) | 🔧 Should come from `/api/models` |
|
||||
| Chat history restore | `GET /api/chat/history?session_id=` | ✅ | `docs_router.py` | — |
|
||||
| Session persistence | localStorage `session_id` | ✅ | `index.html` | — |
|
||||
| Memory status | `GET /api/memory/status` | ✅ | `main.py` | — |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 2: Projects
|
||||
|
||||
| UI Feature | Expected API/Event | Found? | Evidence | Action |
|
||||
|-----------|-------------------|--------|----------|--------|
|
||||
| Projects list | `GET /api/projects` | ✅ | `docs_router.py` | — |
|
||||
| Create project | `POST /api/projects` | ✅ | `docs_router.py` | — |
|
||||
| Get project | `GET /api/projects/{id}` | ✅ | `docs_router.py` | — |
|
||||
| Update project | `PATCH /api/projects/{id}` | ✅ | `docs_router.py` | — |
|
||||
| Delete project | `DELETE /api/projects/{id}` | ❌ | Not found | implement |
|
||||
| Projects sidebar | `GET /api/projects` (on load) | ✅ | `index.html` `loadSidebarProjects()` | — |
|
||||
| Project switcher | localStorage `project_id` | ✅ | `index.html` | — |
|
||||
| **Board (Kanban)** | `GET /api/projects/{id}/tasks` | ❌ | Not found | implement or mock |
|
||||
| **Tasks CRUD** | `/api/projects/{id}/tasks` | ❌ | Not found | implement |
|
||||
| **Meetings** | `GET /api/projects/{id}/meetings` | ❌ | Not found | implement or mock |
|
||||
| **Meeting create** | `POST /api/meetings` | ❌ | Not found | implement |
|
||||
| **Meeting reminders** | NATS `meeting.reminder.*` | ❌ | Not found | Phase 2 |
|
||||
| Project settings | `PATCH /api/projects/{id}` | ✅ | `docs_router.py` | — |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 3: Documents DB
|
||||
|
||||
| UI Feature | Expected API/Event | Found? | Evidence | Action |
|
||||
|-----------|-------------------|--------|----------|--------|
|
||||
| Upload file | `POST /api/files/upload` (multipart) | ✅ | `docs_router.py` | — |
|
||||
| List documents | `GET /api/projects/{id}/documents` | ✅ | `docs_router.py` | — |
|
||||
| Get document | `GET /api/projects/{id}/documents/{doc_id}` | ✅ | `docs_router.py` | — |
|
||||
| Download file | `GET /api/files/{file_id}/download` | ✅ | `docs_router.py` | — |
|
||||
| Search docs | `POST /api/projects/{id}/search` | ✅ | `docs_router.py` (SQL LIKE) | 🔧 Needs semantic search |
|
||||
| Delete document | `DELETE /api/projects/{id}/documents/{doc_id}` | ❌ | Not found | implement |
|
||||
| **Doc versioning** | `GET /docs/{id}/versions` | ❌ | Not found | implement (DDL needed) |
|
||||
| **Restore version** | `POST /docs/{id}/restore` | ❌ | Not found | implement |
|
||||
| **Doc diff** | `GET /docs/{id}/diff?from=&to=` | ❌ | Not found | Phase 2 |
|
||||
| **Backlinks (entity_links)** | `POST /docs/{id}/links` | ❌ | Not found | implement |
|
||||
| **"Index for AI" toggle** | `POST /docs/{id}/index` | ❌ | Not found (USE_EMBEDDINGS flag) | implement |
|
||||
| **doc_index_state** | status tracking | ❌ | Not found | implement |
|
||||
| **Wiki Markdown editor** | Frontend only | ❌ | Not in index.html | implement (Phase 2) |
|
||||
| **Docs tree navigation** | Frontend only | ❌ | Not in index.html | implement (Phase 2) |
|
||||
| Fabric OCR on upload | `POST /v1/capability/ocr` | ⚠️ | `USE_FABRIC_OCR=false` | enable flag |
|
||||
| Embeddings on upload | Qdrant ingest via Router | ⚠️ | `USE_EMBEDDINGS=false` | enable flag |
|
||||
| NATS event on upload | `attachment.created` | ❌ | Not published | add to upload handler |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 4: Sessions & Dialog Map
|
||||
|
||||
| UI Feature | Expected API/Event | Found? | Evidence | Action |
|
||||
|-----------|-------------------|--------|----------|--------|
|
||||
| Sessions list | `GET /api/sessions?project_id=` | ✅ | `docs_router.py` | — |
|
||||
| Resume session | `GET /api/chat/history?session_id=` | ✅ | `docs_router.py` | — |
|
||||
| Session title update | `PATCH /api/sessions/{id}/title` | ✅ | `docs_router.py` | — |
|
||||
| Session fork | `POST /api/sessions/{id}/fork` | ✅ | `docs_router.py` | — |
|
||||
| Dialog Map (tree) | `GET /api/sessions/{id}/map` | ✅ | `docs_router.py` | — |
|
||||
| **Dialog Map (canvas)** | D3/Cytoscape rendering | ❌ | `<details>` tree only in UI | Phase 2 |
|
||||
| **Project-level map** | `GET /api/projects/{id}/dialog-map` | ❌ | Not found | implement (Postgres needed) |
|
||||
| **Node types** (task/doc/meeting) | NATS consumers | ❌ | Not found | Phase 2 |
|
||||
| **Edge creation UI** | `POST /api/links` | ❌ | Not found | implement |
|
||||
| **Pin important node** | `PATCH /api/sessions/{id}/pin/{msg_id}` | ❌ | Not found | implement |
|
||||
| Real-time map updates | WS `dialog_map.updated` event | ❌ | Not found | implement |
|
||||
| **Saved views** | `dialog_views` table | ❌ | Not found | implement |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 5: CTO Panel (Repo + Ops)
|
||||
|
||||
| UI Feature | Expected API/Event | Found? | Evidence | Action |
|
||||
|-----------|-------------------|--------|----------|--------|
|
||||
| Ops actions (risk/backlog/etc.) | `GET /api/ops/actions` | ✅ | `main.py` | — |
|
||||
| Run ops action | `POST /api/ops/run` | ✅ | `main.py` + `ops.py` | — |
|
||||
| Node health dashboard | `GET /api/nodes/dashboard` | ✅ | `main.py` | — |
|
||||
| Node SSH status | `GET /api/nodes/ssh/status` | ✅ | `main.py` | — |
|
||||
| Add node | `POST /api/nodes/add` | ✅ | `main.py` | — |
|
||||
| Integrations status | `GET /api/integrations/status` | ✅ | `main.py` | — |
|
||||
| **Repo changesets list** | `GET /api/repo/changesets` | ❌ | Not found | implement or mock |
|
||||
| **Create changeset** | `POST /api/repo/changesets` | ❌ | Not found | implement |
|
||||
| **Add patch** | `POST /api/repo/changesets/{id}/patches` | ❌ | Not found | implement |
|
||||
| **Execution plan** | `POST /api/repo/changesets/{id}/plan` | ❌ | Not found | implement |
|
||||
| **Create PR** | `POST /api/repo/changesets/{id}/pr` | ❌ | Not found | implement |
|
||||
| **Run checks** | `POST /api/repo/pr/{id}/checks:run` | ❌ | Not found | implement |
|
||||
| **Ops runs list** | `GET /api/ops/runs` | ❌ | Not found (only one-shot dispatch) | implement |
|
||||
| **Ops run create** | `POST /api/ops/runs` (job-based) | ❌ | Not found | implement |
|
||||
| **Ops run status** | `GET /api/ops/runs/{id}` | ❌ | Not found | implement |
|
||||
| LangGraph runs | `POST /v1/graphs/{name}/runs` (Supervisor) | ✅ | `sofiia-supervisor` | 🔧 Not exposed via BFF |
|
||||
| LangGraph status | `GET /v1/runs/{id}` | ✅ | `sofiia-supervisor` | 🔧 Not exposed via BFF |
|
||||
| **repo_tool (read)** | via chat tools | ✅ | `tool_manager.py` | — |
|
||||
| **pr_reviewer_tool** | via chat tools | ✅ | `tool_manager.py` | — |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 6: Database Model Dependency
|
||||
|
||||
| UI Screen | Required DB Table | Status | Storage | Action |
|
||||
|-----------|------------------|--------|---------|--------|
|
||||
| Chat history | `messages` | ✅ | SQLite | — |
|
||||
| Projects | `projects` | ✅ | SQLite | — |
|
||||
| Documents | `documents` | ✅ | SQLite | — |
|
||||
| Sessions | `sessions` | ✅ | SQLite | — |
|
||||
| Dialog Map (messages) | `messages.parent_msg_id` | ✅ | SQLite | — |
|
||||
| **Dialog Map (graph)** | `dialog_nodes` + `dialog_edges` | ❌ | None | ADD TABLES |
|
||||
| **Saved map views** | `dialog_views` | ❌ | None | ADD TABLE |
|
||||
| **Doc versions** | `docs_versions` | ❌ | None | ADD TABLE |
|
||||
| **Entity links** | `entity_links` | ❌ | None | ADD TABLE |
|
||||
| **Tasks** | `tasks` | ❌ | None | ADD TABLE |
|
||||
| **Meetings** | `meetings` | ❌ | None | ADD TABLE |
|
||||
| **Repo changesets** | `repo_changesets` | ❌ | None | ADD TABLE |
|
||||
| **Repo patches** | `repo_patches` | ❌ | None | ADD TABLE |
|
||||
| **Pull requests** | `pull_requests` | ❌ | None | ADD TABLE |
|
||||
| **Ops runs** | `ops_runs` | ❌ | None | ADD TABLE |
|
||||
| Embeddings | Qdrant `sofiia_docs_*` | ⚠️ | Qdrant (disabled) | ENABLE FLAG |
|
||||
| Long-term memory | Qdrant `sofiia_messages` | ✅ | Qdrant | — |
|
||||
| Facts | Postgres `daarion_memory` | ✅ | Postgres | — |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 7: Real-time Events (WebSocket)
|
||||
|
||||
| Event | Direction | Status | Evidence |
|
||||
|-------|-----------|--------|----------|
|
||||
| `nodes.status` | Server → UI | ✅ | `main.py` WebSocket fan-out |
|
||||
| `chat.reply` | Server → UI | ✅ | `main.py` |
|
||||
| `voice.stt.result` | Server → UI | ✅ | `main.py` |
|
||||
| `voice.tts.ready` | Server → UI | ✅ | `main.py` |
|
||||
| `voice.stream.chunk` | Server → UI | ✅ | `main.py` |
|
||||
| `ops.run.status` | Server → UI | ✅ | `main.py` |
|
||||
| `error` | Server → UI | ✅ | `main.py` |
|
||||
| `dialog_map.updated` | Server → UI | ❌ | Not found |
|
||||
| `task.created` | Server → UI | ❌ | Not found |
|
||||
| `doc.updated` | Server → UI | ❌ | Not found |
|
||||
| `meeting.reminder` | Server → UI | ❌ | Not found |
|
||||
| `repo.pr.status` | Server → UI | ❌ | Not found |
|
||||
| `ops_run.completed` | Server → UI | ❌ | Not found |
|
||||
|
||||
---
|
||||
|
||||
## Таблиця 8: Security & Access Control
|
||||
|
||||
| Feature | Status | Evidence |
|
||||
|---------|--------|----------|
|
||||
| API key auth (console) | ✅ | `auth.py` |
|
||||
| Strict auth (SSH/admin) | ✅ | `auth.py` strict mode |
|
||||
| Rate limiting per endpoint | ✅ | `main.py` limiters |
|
||||
| Upload sanitize (filename/mime) | ✅ | `docs_router.py` |
|
||||
| Upload size limits (env-based) | ✅ | `UPLOAD_MAX_*_MB` env |
|
||||
| RBAC tool allowlist | ✅ | `agent_tools_config.py` |
|
||||
| `mode=confidential` check | ❌ | Not in BFF or Router |
|
||||
| E2EE for docs | ❌ | Not implemented |
|
||||
| Audit log for actions | ⚠️ | Partial (router audit.py) |
|
||||
| 2-step Plan → Apply for risky ops | ❌ | Not implemented |
|
||||
| CORS config | ⚠️ | Check `main.py` |
|
||||
|
||||
---
|
||||
|
||||
## Граф залежностей (логічний)
|
||||
|
||||
```
|
||||
[index.html SPA]
|
||||
│
|
||||
┌───────────────┼───────────────┐
|
||||
│ │ │
|
||||
[chat+voice] [projects] [ops+nodes]
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
/api/chat/send /api/projects /api/ops/run
|
||||
/api/voice/* /api/files/* /api/nodes/*
|
||||
/api/telemetry /api/sessions/* /api/integrations/*
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
[Router BFF] [SQLite sofiia.db] [nodes health poll]
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
[Router DAGI] [Memory Service] [SSH + node-worker]
|
||||
/v1/agents/ /threads /events /caps
|
||||
/v1/tools/ /memories /facts /voice/health
|
||||
│
|
||||
▼
|
||||
[LLM + Tools]
|
||||
Grok / qwen3 / DeepSeek
|
||||
+ 20+ tools (repo/pr/kb/etc.)
|
||||
```
|
||||
|
||||
**Відсутні зв'язки (vNext):**
|
||||
```
|
||||
[index.html] → [Kanban Board] ←→ /api/projects/{id}/tasks
|
||||
[index.html] → [Dialog Map canvas] ←→ /api/projects/{id}/dialog-map
|
||||
[index.html] → [CTO Repo Panel] ←→ /api/repo/changesets
|
||||
[index.html] → [CTO Ops Panel] ←→ /api/ops/runs (job-based)
|
||||
[docs_router] → NATS attachment.created
|
||||
[Supervisor] → BFF (not proxied)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Next Actions for UI Team (1–2 days)
|
||||
|
||||
1. **Immediate (today)**: всі фічі chat/voice/projects/sessions/dialog-tree вже працюють — deploy і тестуйте через http://localhost:8002
|
||||
2. **Quick wins (1–2 дні)**:
|
||||
- `DELETE /api/projects/{id}` — 10 рядків коду
|
||||
- `DELETE /api/projects/{id}/documents/{doc_id}` — 10 рядків
|
||||
- BFF proxy до Supervisor: `POST /api/supervisor/runs` → `sofiia-supervisor:8080/v1/graphs/{name}/runs`
|
||||
3. **Phase 2 UI (mock-first)**:
|
||||
- Kanban board: спочатку in-memory tasks → `tasks` table
|
||||
- Meetings: спочатку form → `meetings` table
|
||||
- Dialog Map canvas: `<details>` tree → D3 tree → D3 force graph
|
||||
4. **CTO Panel mock**: додати mock handlers для `/api/repo/changesets` і `/api/ops/runs`
|
||||
5. **Увімкнути USE_EMBEDDINGS=true**: після перевірки що Qdrant доступний
|
||||
6. **Expose Supervisor API через BFF**: один proxy endpoint в main.py
|
||||
7. **NATS attachment.created**: додати до upload handler у docs_router.py
|
||||
8. **`dialog_nodes/edges` tables**: DDL + API + WS events (найважливіше для vNext graph)
|
||||
9. **`docs_versions` table**: ALTER + endpoint (для wiki history)
|
||||
10. **Перевірити WebSocket**: всі voice/ops events реально приходять до UI
|
||||
518
docs/audits/NODA1_AUDIT_CURRENT.md
Normal file
518
docs/audits/NODA1_AUDIT_CURRENT.md
Normal file
@@ -0,0 +1,518 @@
|
||||
# NODA1 Full Audit — DAARION.city
|
||||
**Дата:** 2026-02-27
|
||||
**Сервер:** node1-daarion | 144.76.224.179 | NVIDIA RTX 4000 SFF Ada (20GB VRAM)
|
||||
**Аудитор:** Sofiia — Chief AI Architect
|
||||
|
||||
---
|
||||
|
||||
## EXECUTIVE SUMMARY
|
||||
|
||||
| Напрям | Стан | Критичність |
|
||||
|--------|------|-------------|
|
||||
| Фото E2E (Telegram→Vision) | ✅ Працює, але є shortcut (не через NATS) | MEDIUM |
|
||||
| PDF/Документи | ⚠️ render-pdf-worker idle, index-doc DNS fail | HIGH |
|
||||
| Router/Profiles | ✅ OK — DeepSeek top-level, 27B crew, smollm2 CPU | LOW |
|
||||
| STT/TTS | ✅ CPU-only (Whisper), TTS unloaded | LOW |
|
||||
| Swapper | ⚠️ Потрібен — єдина точка Vision/STT/OCR/Document | KEEP |
|
||||
| GPU policy | ✅ 27B GPU, smollm2 CPU, policy_ok=1 | OK |
|
||||
| NODA1↔NODA2 | ⚠️ K3s cluster (flannel), NATS не з'єднані між нодами | HIGH |
|
||||
| CTO Sofiia control plane | ⚠️ control-plane сервіс є, але тільки prompts+policy JWT | MEDIUM |
|
||||
|
||||
---
|
||||
|
||||
## 1. INVENTORY — Що реально запущено
|
||||
|
||||
### Контейнери (48 total, ключові):
|
||||
|
||||
```
|
||||
swapper-service-node1 healthy 8890-8891
|
||||
dagi-router-node1 healthy 9102→8000
|
||||
dagi-nats-node1 up 4222
|
||||
dagi-memory-service-node1 healthy 8000
|
||||
dagi-qdrant-node1 healthy 6333
|
||||
dagi-gateway-node1 healthy 9300
|
||||
parser-pipeline up 8101
|
||||
ingest-service up 8100
|
||||
render-pdf-worker-node1 up (no port)
|
||||
render-pptx-worker-node1 up (no port)
|
||||
index-doc-worker-node1 up (no port)
|
||||
presentation-renderer-node1 healthy 9212
|
||||
rag-service-node1 healthy 9500
|
||||
dagi-vision-encoder-node1 healthy 8001
|
||||
control-plane up 9200
|
||||
dagi-crawl4ai-node1 healthy 11235
|
||||
oneok-gotenberg-node1 up 3010
|
||||
plant-vision-node1 healthy 8085
|
||||
crewai-nats-worker up 9011
|
||||
dagi-staging-crewai-service up 9010
|
||||
artifact-registry-node1 healthy 9220
|
||||
dagi-minio-node1 up 9000-9001
|
||||
```
|
||||
|
||||
### Systemd:
|
||||
- `ollama.service` — **active** (GPU, port 11434, qwen3.5:27b-q4_K_M, KEEP_ALIVE=10m)
|
||||
- `ollama-cpu.service` — **active** (CPU, port 11435, smollm2:135m)
|
||||
- `gpu-ollama-exporter.service` — **active** (port 9400)
|
||||
- `ollama-warmup-27b.timer` — **active** (кожні 15хв)
|
||||
|
||||
---
|
||||
|
||||
## 2. ROUTER — Профілі, моделі, routing
|
||||
|
||||
### CURRENT STATE
|
||||
|
||||
**Env у контейнері dagi-router-node1:**
|
||||
```
|
||||
ENABLE_CREW_MODEL_ROUTING=1
|
||||
CREW_SMALL_MODEL=smollm2:135m
|
||||
CREWAI_WORKER_LLM_PROFILE=crew_local_27b
|
||||
DEEPSEEK_API_KEY=sk-0db94... (production key)
|
||||
NATS_URL=nats://nats:4222
|
||||
VISION_ENCODER_URL=http://vision-encoder:8001
|
||||
```
|
||||
|
||||
**Профілі (router-config.yml):**
|
||||
| Profile | Provider | Model | URL |
|
||||
|---------|----------|-------|-----|
|
||||
| `cloud_deepseek` | deepseek | deepseek-chat | api.deepseek.com |
|
||||
| `cloud_mistral` | mistral | mistral-large-latest | api.mistral.ai |
|
||||
| `crew_local_27b` | ollama | qwen3.5:27b-q4_K_M | 172.17.0.1:11434 (GPU) |
|
||||
| `crew_vision_27b` | ollama | qwen3.5:27b-q4_K_M | 172.17.0.1:11434 (GPU) |
|
||||
| `crew_local_small` | ollama | smollm2:135m | host.docker.internal:11435 (CPU) |
|
||||
| `service_local_cpu` | ollama | smollm2:135m | host.docker.internal:11435 (CPU) |
|
||||
| `vision_encoder` | — | — | vision-encoder:8001 (ViT-L-14) |
|
||||
| `crewai` | — | — | localhost:9010 |
|
||||
|
||||
**Агенти з vision моделлю:** greenfood, druid, eonarch, helion → `qwen3-vl:8b` (через swapper)
|
||||
|
||||
**Метрики:** `llm_heavy_share_ratio=0.0` — важкі запити ще не логовані (лічильники нульові, нові після restart).
|
||||
|
||||
### GAPS
|
||||
|
||||
- `local_qwen3_8b`, `qwen3_strategist_8b`, ... — **всі вказують на 27B замість 8B** (рядки в config не оновлені після зміни). Назви оманливі.
|
||||
- `crew_local_27b` використовує `172.17.0.1:11434` — не `host.docker.internal`. Inconsistency: CPU профілі через host.docker.internal, GPU — через IP.
|
||||
|
||||
### RECOMMENDED PATCHES
|
||||
|
||||
**Patch 1:** Уніфікувати GPU профілі на `host.docker.internal:11434`:
|
||||
```yaml
|
||||
# services/router/router-config.yml
|
||||
crew_local_27b:
|
||||
base_url: http://host.docker.internal:11434 # було 172.17.0.1
|
||||
crew_vision_27b:
|
||||
base_url: http://host.docker.internal:11434
|
||||
```
|
||||
|
||||
**Patch 2:** Перейменувати оманливі профілі (або залишити as-is якщо вони deprecated):
|
||||
```yaml
|
||||
# local_qwen3_8b → local_qwen3_27b (або видалити невикористані)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. ФОТО E2E — Telegram → Vision → Агент
|
||||
|
||||
### CURRENT STATE (Два шляхи!)
|
||||
|
||||
#### Шлях A: Прямий (основний для більшості агентів)
|
||||
```
|
||||
Telegram photo → Gateway (http_api.py:~2085)
|
||||
↓ download photo via Telegram Bot API → file_url (https://api.telegram.org/file/...)
|
||||
↓ send_to_router({file_url, images: [file_url], prompt})
|
||||
↓ Router (main.py:~2445) → SWAPPER_URL/vision
|
||||
payload: {model: "qwen3-vl-8b", prompt, images: [file_url]}
|
||||
↓ Swapper /vision → завантажує qwen3-vl:8b (ollama pull) → відповідь
|
||||
↓ Router повертає text → Gateway → Telegram
|
||||
```
|
||||
|
||||
#### Шлях B: Через NATS ATTACHMENTS (для parser-pipeline)
|
||||
```
|
||||
Telegram photo → Gateway
|
||||
↓ (окремий worker?) → NATS ATTACHMENTS stream
|
||||
↓ parser-pipeline consumer
|
||||
process_image() → SWAPPER_URL/vision (base64 encode)
|
||||
↓ result → ??? (не ясно куди іде результат)
|
||||
```
|
||||
|
||||
**КРИТИЧНО:** `parser-pipeline` логи показують **тисячі** `ServiceUnavailableError` між рестартами — NATS stream `ATTACHMENTS` зникає після рестарту `dagi-nats-node1` (нема persistence). Після рестарту parser підключається знову (`Consumer created: parser-pipeline`).
|
||||
|
||||
### Vision model flow (Swapper):
|
||||
- Gateway надсилає `file_url` (не base64 завантаження)
|
||||
- Router передає `images: [file_url]` у Swapper
|
||||
- Swapper `/vision` → `qwen3-vl:8b` через Ollama (6.1GB, lazy load)
|
||||
- **qwen3-vl:8b зараз `unloaded`** — cold-start ~30-60s при першому виклику
|
||||
|
||||
### GAPS
|
||||
|
||||
1. **NATS stream ATTACHMENTS не персистентний** — після `docker restart dagi-nats-node1` stream зникає. Parser спамить `ServiceUnavailableError` поки не перезапустити.
|
||||
2. **parser-pipeline `SWAPPER_URL=http://swapper-service:8890`** — але контейнер називається `swapper-service-node1`. DNS може не резолвитись.
|
||||
3. **ingest-service** також має `SWAPPER_URL=http://swapper-service-node1:8890` → `socket.gaierror: Temporary failure in name resolution` — сервіс намагається резолвити щось не те.
|
||||
4. **Шлях B результат незрозумілий** — куди parser-pipeline відправляє результат обробки зображення після Vision?
|
||||
5. **qwen3-vl:8b cold-start** — перший запит до vision займе 30-60s (lazy load).
|
||||
|
||||
### RECOMMENDED PATCHES
|
||||
|
||||
**Patch 3:** Виправити `SWAPPER_URL` в parser-pipeline compose:
|
||||
```yaml
|
||||
# docker-compose.node1.yml, parser-pipeline service
|
||||
environment:
|
||||
- SWAPPER_URL=http://swapper-service-node1:8890 # було: http://swapper-service:8890
|
||||
```
|
||||
|
||||
**Patch 4:** NATS stream ATTACHMENTS — зробити файловий storage з retention:
|
||||
```yaml
|
||||
# nats-js-init service (вже є в compose) — перевірити що він запускається після рестарту NATS
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. PDF/ДОКУМЕНТИ — Обробка
|
||||
|
||||
### CURRENT STATE
|
||||
|
||||
**Сервіси обробки документів:**
|
||||
| Сервіс | Статус | Роль |
|
||||
|--------|--------|------|
|
||||
| `render-pdf-worker-node1` | ✅ up, **idle** | PDF → PNG/зображення (NATS: artifact.job.render_pdf.requested) |
|
||||
| `render-pptx-worker-node1` | ⚠️ DNS fail (`nats`) | PPTX → PNG (NATS: нема з'єднання) |
|
||||
| `index-doc-worker-node1` | ⚠️ DNS fail (RAG service?) | RAG indexing (NATS: artifact.job.*) |
|
||||
| `presentation-renderer-node1` | ✅ healthy (9212) | API сервіс рендерингу |
|
||||
| `oneok-gotenberg-node1` | ✅ up (3010) | HTML/PDF generation (Gotenberg) |
|
||||
| `rag-service-node1` | ✅ healthy (9500) | RAG retrieval |
|
||||
| `artifact-registry-node1` | ✅ healthy (9220) | Артефакт реєстр |
|
||||
| `dagi-minio-node1` | ✅ up (9000-9001) | S3 storage |
|
||||
| `parser-pipeline` | ✅ up (8101) | NATS consumer → Swapper doc+image |
|
||||
|
||||
**Docling:** НЕ ВСТАНОВЛЕНИЙ як окремий контейнер. Є як модель у Swapper (`granite-docling`, тип `document`, 2.5GB, `unloaded`).
|
||||
|
||||
**Шлях обробки документа (PDF):**
|
||||
```
|
||||
Telegram doc → Gateway → ?
|
||||
→ або send_to_router з doc_url
|
||||
→ або через NATS → parser-pipeline → Swapper /document
|
||||
Swapper /document → granite-docling (lazy load, 2.5GB) → текст
|
||||
|
||||
Паралельно:
|
||||
→ artifact.job.render_pdf.requested → render-pdf-worker → PNG → artifact-registry → MinIO
|
||||
→ artifact.job.index_doc.requested → index-doc-worker → rag-service (RAG indexing)
|
||||
```
|
||||
|
||||
### GAPS
|
||||
|
||||
1. **render-pptx-worker** не може резолвити `nats` DNS — на іншій docker network або compose group.
|
||||
2. **index-doc-worker** DNS fail (щось не резолвить) — перевірити network config.
|
||||
3. **granite-docling** у swapper `unloaded` — завантажується lazily, займе час при першому запиті документа. GPU увімкнений для docling? (GPU_ENABLED=false зараз!)
|
||||
4. **Немає Docling окремим сервісом** — вся обробка документів через Swapper, який зараз CPU-only через наші зміни.
|
||||
|
||||
### GAPS — КРИТИЧНО
|
||||
|
||||
> **Swapper GPU_ENABLED=false** — означає, що granite-docling, got-ocr2, qwen3-vl-8b і whisper будуть завантажуватись в CPU/RAM. При 20GB VRAM це субоптимально для Vision і OCR моделей.
|
||||
|
||||
### RECOMMENDED PATCHES
|
||||
|
||||
**Patch 5:** Виправити network для render-pptx-worker та index-doc-worker:
|
||||
```yaml
|
||||
# docker-compose.node1.yml — додати network dagi-network до цих сервісів
|
||||
render-pptx-worker:
|
||||
networks:
|
||||
- dagi-network # щоб резолвити 'nats'
|
||||
index-doc-worker:
|
||||
networks:
|
||||
- dagi-network
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. STT/TTS/SWAPPER — Детальний аналіз
|
||||
|
||||
### CURRENT STATE
|
||||
|
||||
**Swapper /health:** `{"status":"healthy","active_model":"qwen3-8b","mode":"single-active"}`
|
||||
|
||||
**Swapper конфіг (фактичний):**
|
||||
- `mode: multi-active` в yaml, але ENV `MAX_CONCURRENT_MODELS=1` → single-active режим
|
||||
- `GPU_ENABLED=false` (наша зміна) — але config.yaml каже `gpu_enabled: true`
|
||||
- `WHISPER_DEVICE=cpu, WHISPER_COMPUTE_TYPE=int8`
|
||||
|
||||
**Моделі в Swapper:**
|
||||
| Модель | Тип | Розмір | Статус |
|
||||
|--------|-----|--------|--------|
|
||||
| qwen3-8b | llm | 5.2GB | **loaded** (Ollama) |
|
||||
| qwen3-vl-8b | vision | 6.1GB | unloaded |
|
||||
| got-ocr2 | ocr | 7.0GB | unloaded |
|
||||
| donut-base | ocr | 3.0GB | unloaded |
|
||||
| donut-cord | ocr | 3.0GB | unloaded |
|
||||
| granite-docling | document | 2.5GB | unloaded |
|
||||
| faster-whisper-large | stt | 3.0GB | unloaded |
|
||||
| whisper-small | stt | 0.5GB | unloaded |
|
||||
| xtts-v2 | tts | 2.0GB | unloaded |
|
||||
| flux-klein-4b | image_gen | 15.4GB | unloaded |
|
||||
|
||||
**STT:**
|
||||
- STT startup: `[STT-POLICY] WHISPER_DEVICE env='cpu' | actual_device='cpu'` ✅
|
||||
- Swapper `/stt` ← parser-pipeline (audio processing)
|
||||
- Swapper `/stt` ← router (STT_URL)
|
||||
- Swapper `/stt` ← gateway (STT_SERVICE_URL)
|
||||
- **Whisper завантажується lazily при першому аудіо-запиті** на CPU (int8)
|
||||
|
||||
**TTS:** xtts-v2 (2GB) — `unloaded`. Не використовується активно.
|
||||
|
||||
**Висновок по Swapper: ЗАЛИШИТИ (він критичний)**
|
||||
|
||||
Swapper є єдиним агрегатором для:
|
||||
1. **Vision** (`/vision`) — qwen3-vl:8b для всіх агентів що аналізують фото
|
||||
2. **STT** (`/stt`) — Whisper для голосових повідомлень
|
||||
3. **OCR** (`/ocr`) — got-ocr2 для документів
|
||||
4. **Document** (`/document`) — granite-docling для PDF/DOCX
|
||||
5. **TTS** (`/tts`) — xtts-v2 (поки не активований)
|
||||
|
||||
**Проблема:** `active_model=qwen3-8b` через Ollama — це **дублювання** з основним Ollama GPU. Swapper завантажує qwen3:8b через свій ollama, поки є окремий Ollama на 11434 з 27B. При виклику vision, swapper **swap'ає** qwen3:8b і завантажує qwen3-vl:8b — займає VRAM GPU.
|
||||
|
||||
> **Але GPU_ENABLED=false!** — Значить qwen3-vl:8b завантажиться в RAM/CPU, що дуже повільно (>30s).
|
||||
|
||||
### RECOMMENDED PATCHES
|
||||
|
||||
**Patch 6 (ВАЖЛИВИЙ):** Вирішити GPU конфлікт Swapper vs Ollama:
|
||||
|
||||
Варіанти:
|
||||
- **A (рекомендований):** Swapper Vision через Ollama GPU (11434), STT на CPU:
|
||||
```yaml
|
||||
# docker-compose.node1.yml, swapper-service
|
||||
environment:
|
||||
- GPU_ENABLED=true # дозволити GPU для vision/OCR
|
||||
- WHISPER_DEVICE=cpu # але STT лишається CPU
|
||||
- WHISPER_COMPUTE_TYPE=int8
|
||||
# Прибрати CUDA_VISIBLE_DEVICES= (empty block GPU)
|
||||
```
|
||||
Потрібно додати GPU device back:
|
||||
```yaml
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
```
|
||||
Тоді Swapper поверне GPU для vision і OCR.
|
||||
|
||||
- **B (поточний стан):** GPU_ENABLED=false → all CPU → Vision дуже повільно
|
||||
|
||||
---
|
||||
|
||||
## 6. GPU POLICY
|
||||
|
||||
### CURRENT STATE ✅
|
||||
|
||||
```
|
||||
VRAM: 18783 MiB / 20475 MiB (qwen3.5:27b-q4_K_M завантажений — warmup timer)
|
||||
GPU Ollama (11434): 1 model — qwen3.5:27b-q4_K_M (17434 MiB)
|
||||
CPU Ollama (11435): 0 models (smollm2:135m unloaded, lazy)
|
||||
gpu_single_model_policy_ok = 1 ✅
|
||||
ollama_cpu_instance_up = 1 ✅
|
||||
```
|
||||
|
||||
**Проблема:** Swapper показує `active_model=qwen3-8b` — це qwen3:8b через ollama **всередині swapper**, але Swapper зараз CPU-only. Значить qwen3:8b у свопері не займає GPU VRAM поки GPU_ENABLED=false. Але якщо повернути GPU Swapper — треба перевірити що 27B + qwen3-vl-8b не одночасно в VRAM (20GB максимум).
|
||||
|
||||
**Потенційний конфлікт:** 27B (17.4GB) + qwen3-vl-8b (6.1GB) = **23.5GB > 20GB VRAM** → OOM!
|
||||
|
||||
Необхідна координація: коли Swapper завантажує vision модель, Ollama GPU має вивантажити 27B або навпаки.
|
||||
|
||||
---
|
||||
|
||||
## 7. NODA1 ↔ NODA2 — З'єднання
|
||||
|
||||
### CURRENT STATE
|
||||
|
||||
**Інфраструктура:**
|
||||
- NODA1 і NODA2 (`llm80-che-1-1`, IP 192.168.1.240) — це **K3s cluster** (flannel CNI)!
|
||||
- NODA1: `node1-daarion` — **control-plane, master** (Ready)
|
||||
- NODA2 (`llm80-che-1-1`): `worker node` — **NotReady** (проблема!)
|
||||
- **Flannel:** `10.42.0.0/24` (NODA1), `10.42.1.0/24` (NODA2) — pod overlay network
|
||||
- **WireGuard:** НЕ встановлений
|
||||
- **NATS:** cluster config є (`my_cluster`, port 6222), але `routes = []` — **NATS не з'єднаний між нодами**
|
||||
|
||||
**K3s pods на NODA2 (llm80-che-1-1):** більшість `Terminating` або `Pending` — NODA2 NotReady!
|
||||
|
||||
**Що це означає:**
|
||||
- Фізично NODA1 і NODA2 з'єднані через K3s/flannel (LAN, 192.168.x.x)
|
||||
- Але Docker Compose сервіси на NODA2 (memory service, qdrant, neo4j) — **окремі**, не в K3s
|
||||
- NATS між нодами не federatederated — жоден cross-node message bus не налаштований
|
||||
|
||||
### GAPS
|
||||
|
||||
1. **K3s worker NODA2 NotReady** — pods Terminating/Pending. Не ясно чи це критично для поточного продакшну.
|
||||
2. **NATS не кластеризований** — немає leafnode/route між NODA1 і NODA2 NATS.
|
||||
3. **Немає cross-node subjects** для агентів.
|
||||
4. **NODA2 підключення до NODA1:** NODA2 має свій Docker Compose (окремі memory/qdrant), немає спільного bus.
|
||||
|
||||
### RECOMMENDED PATCHES
|
||||
|
||||
**Patch 7 (NATS federation між нодами):**
|
||||
```conf
|
||||
# /opt/microdao-daarion/nats/nats-server.conf (NODA1)
|
||||
leafnodes {
|
||||
port: 7422
|
||||
}
|
||||
|
||||
# NATS на NODA2 підключається як leafnode:
|
||||
leafnodes {
|
||||
remotes = [{ url: "nats://144.76.224.179:7422" }]
|
||||
}
|
||||
```
|
||||
|
||||
Це дозволить NODA2 публікувати/підписуватись на `node.control.noda2.*` через NODA1.
|
||||
|
||||
---
|
||||
|
||||
## 8. CTO SOFIIA — Control Plane
|
||||
|
||||
### CURRENT STATE
|
||||
|
||||
**`control-plane` контейнер (порт 9200):**
|
||||
- FastAPI сервіс з JWT auth (`SERVICE_ROLE=controlplane`)
|
||||
- Endpoints:
|
||||
- `GET /prompts/{agent_id}` — версіоновані system prompts з файлів `*_prompt.txt`
|
||||
- `GET /policy/{agent_id}` — RBAC/entitlements (DefaultPolicies)
|
||||
- `GET /prompts/{agent_id}/hash` — hash промпту для drift detection
|
||||
- **401 Unauthorized** при зверненні без JWT — це правильно
|
||||
|
||||
**Що є:**
|
||||
- ✅ Промпти централізовані та версіоновані
|
||||
- ✅ JWT auth для сервіс-до-сервіс
|
||||
- ✅ Policy/RBAC per agent
|
||||
- ✅ `dagi-vision-encoder-node1` — ViT-L-14 на CPU (embeddings)
|
||||
|
||||
**Що НЕ реалізовано:**
|
||||
- ❌ Node operations (restart/deploy/health через control-plane)
|
||||
- ❌ Sofiia не має NATS-control topic для публікації команд
|
||||
- ❌ Немає `node-ops-worker` на кожній ноді
|
||||
- ❌ Sofiia добавляє нову ноду тільки через SSH root (bRhfV7uNY9m6er — hardcoded!)
|
||||
- ❌ Немає механізму "додати нову ноду без root"
|
||||
|
||||
**Поточний механізм керування нодами:** SSH з паролем root. Небезпечно.
|
||||
|
||||
### RECOMMENDED PATCHES
|
||||
|
||||
**Patch 8 (мінімальний control plane extension):**
|
||||
|
||||
Додати в control-plane endpoints для node ops:
|
||||
```python
|
||||
# services/control-plane/app/main.py (або новий node_ops.py)
|
||||
|
||||
# Sofiia публікує на NATS:
|
||||
# node.control.noda1.restart_service → {service_name, reason}
|
||||
# node.control.noda1.health_check → {}
|
||||
# node.control.noda1.get_logs → {service_name, lines}
|
||||
|
||||
# node-ops-worker (новий мікросервіс) підписується на ці subjects
|
||||
# виконує whitelist commands (docker restart, docker logs tail, health curl)
|
||||
# відповідає на node.control.noda1.reply.*
|
||||
```
|
||||
|
||||
**Мінімальна реалізація (50 рядків Python):**
|
||||
```python
|
||||
# services/node-ops-worker/main.py
|
||||
ALLOWED_COMMANDS = {
|
||||
"restart_service": lambda s: f"docker restart {s}",
|
||||
"health_check": lambda s: f"curl -sf http://localhost:{PORT_MAP[s]}/health",
|
||||
"logs_tail": lambda s, n: f"docker logs --tail {n} {s}",
|
||||
}
|
||||
# Subscribe to node.control.noda1.> via NATS
|
||||
# Execute only ALLOWED_COMMANDS
|
||||
# Reply to reply subject
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## VALIDATION CHECKLIST
|
||||
|
||||
```bash
|
||||
# 1. Router CPU profiles (host.docker.internal)
|
||||
docker exec dagi-router-node1 curl -s http://host.docker.internal:11435/api/tags | python3 -c 'import sys,json; print("CPU Ollama OK:", len(json.load(sys.stdin).get("models",[])))'
|
||||
|
||||
# 2. GPU policy
|
||||
curl -s http://localhost:9400/metrics | grep gpu_single_model_policy_ok
|
||||
|
||||
# 3. Swapper Vision (cold start test — без кешу)
|
||||
# УВАГА: займе 30-60s якщо GPU_ENABLED=false!
|
||||
# curl -s -X POST http://localhost:8890/vision -H 'Content-Type: application/json' \
|
||||
# -d '{"model":"qwen3-vl-8b","prompt":"що на фото?","images":["<url>"]}' | jq .
|
||||
|
||||
# 4. Parser pipeline connected
|
||||
docker logs --tail 5 parser-pipeline 2>&1 | grep -E 'Connected|Consumer created'
|
||||
|
||||
# 5. NATS stream ATTACHMENTS exists
|
||||
curl -s 'http://localhost:8222/jsz?streams=true' | python3 -m json.tool | grep -A3 'ATTACHMENTS'
|
||||
|
||||
# 6. render-pptx-worker DNS fix check
|
||||
docker logs --tail 5 render-pptx-worker-node1 2>&1 | grep -v 'getaddrinfo'
|
||||
|
||||
# 7. index-doc-worker DNS fix check
|
||||
docker logs --tail 5 index-doc-worker-node1 2>&1 | grep -v 'getaddrinfo'
|
||||
|
||||
# 8. Control plane health
|
||||
curl -s http://localhost:9200/health
|
||||
|
||||
# 9. Swapper STT device
|
||||
docker logs swapper-service-node1 2>&1 | grep STT-POLICY
|
||||
|
||||
# 10. K3s NODA2 status
|
||||
kubectl get nodes
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## PRIORITIZED ACTION PLAN
|
||||
|
||||
### P0 — Негайно (production impact):
|
||||
|
||||
| # | Патч | Файл | Вплив |
|
||||
|---|------|------|-------|
|
||||
| 3 | SWAPPER_URL fix в parser-pipeline | docker-compose.node1.yml | Vision через parser |
|
||||
| 5 | Network fix render-pptx + index-doc | docker-compose.node1.yml | Документи |
|
||||
| 6 | GPU повернути Swapper (Vision повільний!) | docker-compose.node1.yml | Vision latency |
|
||||
|
||||
### P1 — Цього тижня:
|
||||
|
||||
| # | Патч | Файл | Вплив |
|
||||
|---|------|------|-------|
|
||||
| 1 | host.docker.internal для GPU профілів | router-config.yml | Stability |
|
||||
| 4 | NATS ATTACHMENTS persistence | nats config | Parser stability |
|
||||
| 7 | NATS leafnode NODA1↔NODA2 | nats-server.conf | Cross-node |
|
||||
|
||||
### P2 — Наступний спринт:
|
||||
|
||||
| # | Патч | Файл | Вплив |
|
||||
|---|------|------|-------|
|
||||
| 8 | node-ops-worker для Sofiia control | нові файли | Security |
|
||||
| 2 | Profile rename в router-config | router-config.yml | Clarity |
|
||||
|
||||
---
|
||||
|
||||
## ВІДПОВІДІ НА 7 КЛЮЧОВИХ ПИТАНЬ
|
||||
|
||||
### 1. Фото E2E
|
||||
**Telegram photo → Gateway** (скачує файл → file_url) → **`send_to_router({images:[file_url]})`** → **Router** перевіряє агента → якщо vision-агент → **`SWAPPER_URL/vision`** → Swapper → Ollama `qwen3-vl:8b` → text опис → Router → Gateway → Telegram. Parser-pipeline — паралельний worker для асинхронної обробки (не основний шлях). Payload: `{model, prompt, images:[url], max_tokens}`.
|
||||
|
||||
### 2. Документи/PDF
|
||||
**Немає Docling як сервісу.** Docling вбудований в Swapper як `granite-docling` (lazy, unloaded). Шлях: Gateway → Router → `SWAPPER_URL/document` → Swapper → granite-docling. Паралельно через NATS: `artifact.job.render_pdf.requested` → render-pdf-worker → PNG → MinIO/artifact-registry. `index-doc-worker` індексує в RAG але має DNS fail.
|
||||
|
||||
### 3. Router
|
||||
Top-level агенти → **DeepSeek API** (cloud_deepseek). Crew tasks → **qwen3.5:27b-q4_K_M** (crew_local_27b, GPU). Monitoring/small → **smollm2:135m** (crew_local_small, CPU Ollama 11435). `ENABLE_CREW_MODEL_ROUTING=1` активний. Vision агенти отримують `qwen3-vl-8b` через Swapper.
|
||||
|
||||
### 4. TTS/STT
|
||||
STT: **Whisper (CPU, int8)** через Swapper `/stt`. `WHISPER_DEVICE=cpu` підтверджено логами. Lazy load при першому аудіо. Підтримується: faster-whisper-large (3GB), whisper-small (0.5GB). TTS: xtts-v2 (2GB) — **not deployed активно** (unloaded). Немає VRAM конкуренції для STT.
|
||||
|
||||
### 5. Swapper
|
||||
**Залишити.** Є єдиним агрегатором для Vision (qwen3-vl:8b), STT (Whisper), OCR (got-ocr2), Document (granite-docling), TTS (xtts-v2). Без Swapper треба окремі сервіси для кожного. Але: `active_model=qwen3-8b` — потенційно невикористана ролі (є окремий Ollama). **Слід розглянути видалення qwen3-8b зі Swapper** — він дублює GPU Ollama, залишити тільки Vision/OCR/STT/Document функції.
|
||||
|
||||
### 6. NODA1↔NODA2
|
||||
З'єднані через **K3s cluster** (flannel, 10.42.0.0/24). NODA2 (`llm80-che-1-1`, 192.168.1.240) — K3s worker, зараз **NotReady**. NATS між нодами **не з'єднаний** (routes=[]), немає leafnode. Docker Compose сервіси незалежні. Для cross-node messaging потрібен NATS leafnode або Flannel pod networking.
|
||||
|
||||
### 7. CTO Sofiia Control Plane
|
||||
Поточний стан: `control-plane` (9200) — JWT-захищений сервіс з prompts + policy. **Немає node-ops механізму**. Sofiia керує нодами через SSH root (небезпечно). Правильний шлях: NATS-control plane + `node-ops-worker` на кожній ноді з whitelist команд. control-plane вже є основою — треба додати NATS subscription для node operations.
|
||||
|
||||
---
|
||||
|
||||
*Звіт згенеровано автоматично аудитом NODA1 | Sofiia v2.7 | 2026-02-27*
|
||||
212
docs/backlog/backlog.md
Normal file
212
docs/backlog/backlog.md
Normal file
@@ -0,0 +1,212 @@
|
||||
# Engineering Backlog Bridge — DAARION.city
|
||||
|
||||
## Overview
|
||||
|
||||
The **Engineering Backlog Bridge** converts Risk/Pressure digest signals into a
|
||||
**managed, structured backlog** of engineering work items. It closes the loop:
|
||||
|
||||
```
|
||||
observe (Risk/Pressure) → decide (digest) → plan (backlog) → enforce (gates)
|
||||
```
|
||||
|
||||
No LLM. Fully deterministic. Policy-driven. Idempotent (weekly dedupe).
|
||||
|
||||
---
|
||||
|
||||
## Data Model
|
||||
|
||||
### BacklogItem
|
||||
|
||||
| Field | Type | Description |
|
||||
|----------------|----------|-------------|
|
||||
| `id` | string | `bl_<hex12>` |
|
||||
| `created_at` | ISO ts | When created |
|
||||
| `updated_at` | ISO ts | Last modification |
|
||||
| `env` | string | `prod` / `staging` / `dev` |
|
||||
| `service` | string | DAARION service name |
|
||||
| `category` | enum | `arch_review`, `refactor`, `slo_hardening`, `cleanup_followups`, `security` |
|
||||
| `title` | string | Short human-readable label |
|
||||
| `description` | string | Bullet-list of signals + context |
|
||||
| `priority` | enum | `P0` .. `P3` |
|
||||
| `status` | enum | See Workflow below |
|
||||
| `owner` | string | `oncall` / `cto` / team name |
|
||||
| `due_date` | YYYY-MM-DD | Computed from category `due_days` |
|
||||
| `source` | string | `risk` / `pressure` / `digest` / `manual` |
|
||||
| `dedupe_key` | string | `platform_backlog:{YYYY-WW}:{env}:{service}:{category}` |
|
||||
| `evidence_refs`| dict | `alerts[]`, `incidents[]`, `release_checks[]`, `artifacts[]`, `followups[]` |
|
||||
| `tags` | list | `["auto", "week:2026-W08", "rule:arch_review_required"]` |
|
||||
| `meta` | dict | Free-form metadata |
|
||||
|
||||
### BacklogEvent (timeline)
|
||||
|
||||
| Field | Type | Description |
|
||||
|------------|--------|-------------|
|
||||
| `id` | string | `ev_<hex12>` |
|
||||
| `item_id` | string | FK to BacklogItem |
|
||||
| `ts` | ISO ts | Event timestamp |
|
||||
| `type` | enum | `created`, `status_change`, `comment`, `auto_update` |
|
||||
| `message` | string | Human-readable description |
|
||||
| `actor` | string | Who triggered the event |
|
||||
| `meta` | dict | Old/new status, rule name, etc. |
|
||||
|
||||
---
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
open ──► in_progress ──► done
|
||||
│ │
|
||||
│ ▼
|
||||
└──► blocked ──► in_progress
|
||||
│
|
||||
└──► canceled (terminal)
|
||||
```
|
||||
|
||||
| From | Allowed targets |
|
||||
|--------------|-------------------------------|
|
||||
| `open` | in_progress, blocked, canceled |
|
||||
| `in_progress`| blocked, done, canceled |
|
||||
| `blocked` | open, in_progress, canceled |
|
||||
| `done` | (none — terminal) |
|
||||
| `canceled` | (none — terminal) |
|
||||
|
||||
Transitions are enforced by `validate_transition()` in `backlog_store.py`.
|
||||
|
||||
---
|
||||
|
||||
## Auto-generation Rules
|
||||
|
||||
Rules are evaluated **per-service** from `config/backlog_policy.yml`.
|
||||
All conditions in `when` must hold (AND logic). First matching rule per
|
||||
category wins (no duplicate categories per service per week).
|
||||
|
||||
| Rule name | Trigger condition | Category | Priority |
|
||||
|-------------------------|---------------------------------------------|--------------------|----------|
|
||||
| `arch_review_required` | `pressure_requires_arch_review: true` | `arch_review` | P1 / 14d |
|
||||
| `high_pressure_refactor`| `pressure_band` AND `risk_band` ∈ high/critical | `refactor` | P1 / 21d |
|
||||
| `slo_violations` | `risk_has_slo_violations: true` | `slo_hardening` | P2 / 30d |
|
||||
| `followup_backlog` | `followups_overdue > 0` | `cleanup_followups`| P2 / 14d |
|
||||
|
||||
---
|
||||
|
||||
## Dedupe Logic
|
||||
|
||||
Each item has a `dedupe_key`:
|
||||
|
||||
```
|
||||
platform_backlog:{YYYY-WW}:{env}:{service}:{category}
|
||||
```
|
||||
|
||||
`upsert()` uses this key:
|
||||
- **First run of week** → creates the item.
|
||||
- **Subsequent runs** → updates title/description/evidence_refs (preserves status/owner).
|
||||
|
||||
This means weekly re-generation is safe and idempotent.
|
||||
|
||||
---
|
||||
|
||||
## API
|
||||
|
||||
### HTTP Endpoints
|
||||
|
||||
| Method | Path | RBAC | Description |
|
||||
|--------|-------------------------------------|------------------------|-------------|
|
||||
| GET | `/v1/backlog/dashboard?env=prod` | `tools.backlog.read` | Status/priority/overdue summary |
|
||||
| GET | `/v1/backlog/items` | `tools.backlog.read` | Filtered item list |
|
||||
| GET | `/v1/backlog/items/{id}` | `tools.backlog.read` | Single item + event timeline |
|
||||
| POST | `/v1/backlog/generate/weekly` | `tools.backlog.admin` | Trigger weekly auto-generation |
|
||||
|
||||
Query params for `/v1/backlog/items`:
|
||||
`env`, `service`, `status`, `owner`, `category`, `due_before`, `limit`, `offset`
|
||||
|
||||
### Tool: `backlog_tool`
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "list|get|dashboard|create|upsert|set_status|add_comment|close|auto_generate_weekly|cleanup",
|
||||
"env": "prod",
|
||||
"id": "bl_abc...",
|
||||
"service": "gateway",
|
||||
"status": "open",
|
||||
"item": { ... },
|
||||
"message": "...",
|
||||
"actor": "cto"
|
||||
}
|
||||
```
|
||||
|
||||
### RBAC
|
||||
|
||||
| Entitlement | Roles | Actions |
|
||||
|--------------------------|------------------|---------|
|
||||
| `tools.backlog.read` | cto, oncall, interface | list, get, dashboard |
|
||||
| `tools.backlog.write` | cto, oncall | create, upsert, set_status, add_comment, close |
|
||||
| `tools.backlog.admin` | cto only | auto_generate_weekly, cleanup |
|
||||
|
||||
---
|
||||
|
||||
## Storage Backends
|
||||
|
||||
| Backend | Env var | Notes |
|
||||
|-----------|------------------------|-------|
|
||||
| `auto` | `BACKLOG_BACKEND=auto` | Postgres → JSONL fallback (default) |
|
||||
| `postgres`| `BACKLOG_BACKEND=postgres` | Primary (requires migration) |
|
||||
| `jsonl` | `BACKLOG_BACKEND=jsonl` | Filesystem append-only (MVP) |
|
||||
| `memory` | `BACKLOG_BACKEND=memory` | Tests only |
|
||||
| `null` | `BACKLOG_BACKEND=null` | No-op |
|
||||
|
||||
Files (JSONL): `ops/backlog/items.jsonl`, `ops/backlog/events.jsonl`
|
||||
|
||||
Postgres: run `ops/scripts/migrate_backlog_postgres.py` first.
|
||||
|
||||
---
|
||||
|
||||
## Scheduled Jobs
|
||||
|
||||
| Job | Schedule | Description |
|
||||
|----------------------------|--------------------|-------------|
|
||||
| `weekly_backlog_generate` | Mon 06:20 UTC | Generate items from latest platform digest |
|
||||
| `daily_backlog_cleanup` | Daily 03:40 UTC | Remove done/canceled items older than retention_days |
|
||||
|
||||
---
|
||||
|
||||
## Examples
|
||||
|
||||
### Manual create via tool
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "create",
|
||||
"env": "prod",
|
||||
"item": {
|
||||
"service": "gateway",
|
||||
"category": "security",
|
||||
"title": "[SEC] Patch CVE-2026-xxxx in gateway",
|
||||
"priority": "P0",
|
||||
"due_date": "2026-03-01",
|
||||
"owner": "cto",
|
||||
"source": "manual",
|
||||
"dedupe_key": "manual:2026-W08:prod:gateway:security"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Close an item
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "close",
|
||||
"id": "bl_abc123456789",
|
||||
"status": "done",
|
||||
"message": "Architecture review completed — no rework needed."
|
||||
}
|
||||
```
|
||||
|
||||
### Run weekly auto-generation
|
||||
|
||||
```bash
|
||||
# HTTP
|
||||
POST /v1/backlog/generate/weekly?env=prod
|
||||
|
||||
# Tool
|
||||
{ "action": "auto_generate_weekly", "env": "prod" }
|
||||
```
|
||||
156
docs/incident/alerts.md
Normal file
156
docs/incident/alerts.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Alert → Incident Bridge
|
||||
|
||||
## Overview
|
||||
|
||||
The Alert Bridge provides a governed, deduplicated pipeline from Monitor/Prometheus detection to Incident creation.
|
||||
|
||||
**Security model:** Monitor sends alerts (`tools.alerts.ingest` only). Sofiia/oncall create incidents (`tools.oncall.incident_write` + `tools.alerts.ack`). No agent gets both roles automatically.
|
||||
|
||||
```
|
||||
Monitor@nodeX ──ingest──► AlertStore ──alert_to_incident──► IncidentStore
|
||||
(tools.alerts.ingest) (tools.oncall.incident_write)
|
||||
│
|
||||
IncidentTriage (Sofiia NODA2)
|
||||
│
|
||||
PostmortemDraft
|
||||
```
|
||||
|
||||
## AlertEvent Schema
|
||||
|
||||
```json
|
||||
{
|
||||
"source": "monitor@node1",
|
||||
"service": "gateway",
|
||||
"env": "prod",
|
||||
"severity": "P1",
|
||||
"kind": "slo_breach",
|
||||
"title": "gateway SLO: latency p95 > 300ms",
|
||||
"summary": "p95 latency at 450ms, error_rate 2.5%",
|
||||
"started_at": "2025-01-23T09:00:00Z",
|
||||
"labels": {
|
||||
"node": "node1",
|
||||
"fingerprint": "gateway:slo_breach:latency"
|
||||
},
|
||||
"metrics": {
|
||||
"latency_p95_ms": 450,
|
||||
"error_rate_pct": 2.5
|
||||
},
|
||||
"evidence": {
|
||||
"log_samples": ["ERROR timeout after 30s", "WARN retry 3/3"],
|
||||
"query": "rate(http_errors_total[5m])"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Severity values
|
||||
`P0`, `P1`, `P2`, `P3`, `INFO`
|
||||
|
||||
### Kind values
|
||||
`slo_breach`, `crashloop`, `latency`, `error_rate`, `disk`, `oom`, `deploy`, `security`, `custom`
|
||||
|
||||
## Dedupe Behavior
|
||||
|
||||
Dedupe key = `sha256(service|env|kind|fingerprint)`.
|
||||
|
||||
- Same key within TTL (default 30 min) → `deduped=true`, `occurrences++`, no new record
|
||||
- Same key after TTL → new alert record
|
||||
- Different fingerprint → separate record
|
||||
|
||||
## `alert_ingest_tool` API
|
||||
|
||||
### ingest (Monitor role)
|
||||
```json
|
||||
{
|
||||
"action": "ingest",
|
||||
"alert": { ...AlertEvent... },
|
||||
"dedupe_ttl_minutes": 30
|
||||
}
|
||||
```
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"accepted": true,
|
||||
"deduped": false,
|
||||
"dedupe_key": "abc123...",
|
||||
"alert_ref": "alrt_20250123_090000_a1b2c3",
|
||||
"occurrences": 1
|
||||
}
|
||||
```
|
||||
|
||||
### list (read)
|
||||
```json
|
||||
{ "action": "list", "service": "gateway", "env": "prod", "window_minutes": 240, "limit": 50 }
|
||||
```
|
||||
|
||||
### get (read)
|
||||
```json
|
||||
{ "action": "get", "alert_ref": "alrt_..." }
|
||||
```
|
||||
|
||||
### ack (oncall/cto)
|
||||
```json
|
||||
{ "action": "ack", "alert_ref": "alrt_...", "actor": "sofiia", "note": "false positive" }
|
||||
```
|
||||
|
||||
## `oncall_tool.alert_to_incident`
|
||||
|
||||
Converts a stored alert into an incident (or attaches to an existing open one).
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "alert_to_incident",
|
||||
"alert_ref": "alrt_...",
|
||||
"incident_severity_cap": "P1",
|
||||
"dedupe_window_minutes": 60,
|
||||
"attach_artifact": true
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"incident_id": "inc_20250123_090000_xyz",
|
||||
"created": true,
|
||||
"severity": "P1",
|
||||
"artifact_path": "ops/incidents/inc_.../alert_alrt_....json",
|
||||
"note": "Incident created and alert acked"
|
||||
}
|
||||
```
|
||||
|
||||
### Logic
|
||||
1. Load alert from `AlertStore`
|
||||
2. Check for existing open P0/P1 incident for same service/env within `dedupe_window_minutes`
|
||||
- If found → attach event to existing incident, ack alert
|
||||
3. If not found → create incident, append `note` + `metric` timeline events, optionally attach masked alert JSON as artifact, ack alert
|
||||
|
||||
## RBAC
|
||||
|
||||
| Role | ingest | list/get | ack | alert_to_incident |
|
||||
|------|--------|----------|-----|-------------------|
|
||||
| `agent_monitor` | ✅ | ❌ | ❌ | ❌ |
|
||||
| `agent_cto` | ✅ | ✅ | ✅ | ✅ |
|
||||
| `agent_oncall` | ❌ | ✅ | ✅ | ✅ |
|
||||
| `agent_interface` | ❌ | ✅ | ❌ | ❌ |
|
||||
| `agent_default` | ❌ | ❌ | ❌ | ❌ |
|
||||
|
||||
## SLO Watch Gate
|
||||
|
||||
The `slo_watch` gate in `release_check` prevents deploys during active SLO breaches.
|
||||
|
||||
| Profile | Mode | Behavior |
|
||||
|---------|------|----------|
|
||||
| dev | warn | Recommendations only |
|
||||
| staging | strict | Blocks on any violation |
|
||||
| prod | warn | Recommendations only |
|
||||
|
||||
Configure in `config/release_gate_policy.yml` per profile. Override per run with `run_slo_watch: false`.
|
||||
|
||||
## Backends
|
||||
|
||||
| Env var | Value | Effect |
|
||||
|---------|-------|--------|
|
||||
| `ALERT_BACKEND` | `memory` (default) | In-process, not persistent |
|
||||
| `ALERT_BACKEND` | `postgres` | Persistent, needs DATABASE_URL |
|
||||
| `ALERT_BACKEND` | `auto` | Postgres if DATABASE_URL set, else memory |
|
||||
|
||||
Run DDL: `python3 ops/scripts/migrate_alerts_postgres.py`
|
||||
99
docs/incident/escalation.md
Normal file
99
docs/incident/escalation.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# Incident Escalation Engine
|
||||
|
||||
Deterministic, LLM-free engine that escalates incidents and identifies auto-resolve candidates
|
||||
based on alert storm behavior.
|
||||
|
||||
## Overview
|
||||
|
||||
```
|
||||
alert_triage_graph (every 5 min)
|
||||
└─ process_alerts
|
||||
└─ post_process_escalation ← incident_escalation_tool.evaluate
|
||||
└─ post_process_autoresolve ← incident_escalation_tool.auto_resolve_candidates
|
||||
└─ build_digest ← includes escalation + candidate summary
|
||||
```
|
||||
|
||||
## Escalation Logic
|
||||
|
||||
Config: `config/incident_escalation_policy.yml`
|
||||
|
||||
| Trigger | From → To |
|
||||
|---------|-----------|
|
||||
| `occurrences_60m ≥ 10` OR `triage_count_24h ≥ 3` | P2 → P1 |
|
||||
| `occurrences_60m ≥ 25` OR `triage_count_24h ≥ 6` | P1 → P0 |
|
||||
| Cap: `severity_cap: "P0"` | never exceeds P0 |
|
||||
|
||||
When escalation triggers:
|
||||
1. `incident_append_event(type=decision)` — audit trail
|
||||
2. `incident_append_event(type=followup)` — auto follow-up (if `create_followup_on_escalate: true`)
|
||||
|
||||
## Auto-resolve Candidates
|
||||
|
||||
Incidents where `last_alert_at < now - no_alerts_minutes_for_candidate`:
|
||||
|
||||
- `close_allowed_severities: ["P2", "P3"]` — only low-severity auto-closeable
|
||||
- `auto_close: false` (default) — produces *candidates* only, no auto-close
|
||||
- Each candidate gets a `note` event appended to the incident timeline
|
||||
|
||||
## Alert-loop SLO
|
||||
|
||||
Tracked in `/v1/alerts/dashboard?window_minutes=240`:
|
||||
|
||||
```json
|
||||
"slo": {
|
||||
"claim_to_ack_p95_seconds": 12.3,
|
||||
"failed_rate_pct": 0.5,
|
||||
"processing_stuck_count": 0,
|
||||
"violations": []
|
||||
}
|
||||
```
|
||||
|
||||
Thresholds (from `alert_loop_slo` in policy):
|
||||
- `claim_to_ack_p95_seconds: 60` — p95 latency from claim to ack
|
||||
- `failed_rate_pct: 5` — max % failed/(acked+failed)
|
||||
- `processing_stuck_minutes: 15` — alerts stuck in processing beyond this
|
||||
|
||||
## RBAC
|
||||
|
||||
| Action | Required entitlement |
|
||||
|--------|---------------------|
|
||||
| `evaluate` | `tools.oncall.incident_write` (CTO/oncall) |
|
||||
| `auto_resolve_candidates` | `tools.oncall.incident_write` (CTO/oncall) |
|
||||
|
||||
Monitor agent does NOT have access (ingest-only).
|
||||
|
||||
## Configuration
|
||||
|
||||
```yaml
|
||||
# config/incident_escalation_policy.yml
|
||||
escalation:
|
||||
occurrences_thresholds:
|
||||
P2_to_P1: 10
|
||||
P1_to_P0: 25
|
||||
triage_thresholds_24h:
|
||||
P2_to_P1: 3
|
||||
P1_to_P0: 6
|
||||
severity_cap: "P0"
|
||||
create_followup_on_escalate: true
|
||||
|
||||
auto_resolve:
|
||||
no_alerts_minutes_for_candidate: 60
|
||||
close_allowed_severities: ["P2", "P3"]
|
||||
auto_close: false
|
||||
|
||||
alert_loop_slo:
|
||||
claim_to_ack_p95_seconds: 60
|
||||
failed_rate_pct: 5
|
||||
processing_stuck_minutes: 15
|
||||
```
|
||||
|
||||
## Tuning
|
||||
|
||||
**Too many escalations (noisy)?**
|
||||
→ Increase `occurrences_thresholds.P2_to_P1` or `triage_thresholds_24h.P2_to_P1`.
|
||||
|
||||
**Auto-resolve too aggressive?**
|
||||
→ Increase `no_alerts_minutes_for_candidate` (e.g., 120 min).
|
||||
|
||||
**Ready to enable auto-close for P3?**
|
||||
→ Set `auto_close: true` and `close_allowed_severities: ["P3"]`.
|
||||
102
docs/incident/followups.md
Normal file
102
docs/incident/followups.md
Normal file
@@ -0,0 +1,102 @@
|
||||
# Follow-up Tracker & Release Gate
|
||||
|
||||
## Overview
|
||||
|
||||
Follow-ups are structured action items attached to incidents via `incident_append_event` with `type=followup`. The `followup_watch` gate in `release_check` uses them to block or warn about releases for services with unresolved issues.
|
||||
|
||||
## Follow-up Event Schema
|
||||
|
||||
When appending a follow-up event to an incident:
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "incident_append_event",
|
||||
"incident_id": "inc_20250123_0900_abc1",
|
||||
"type": "followup",
|
||||
"message": "Upgrade postgres driver",
|
||||
"meta": {
|
||||
"title": "Upgrade postgres driver to fix connection leak",
|
||||
"owner": "sofiia",
|
||||
"priority": "P1",
|
||||
"due_date": "2025-02-01T00:00:00Z",
|
||||
"status": "open",
|
||||
"links": ["https://github.com/org/repo/issues/42"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Meta Fields
|
||||
|
||||
| Field | Type | Required | Description |
|
||||
|-------|------|----------|-------------|
|
||||
| `title` | string | yes | Short description |
|
||||
| `owner` | string | yes | Agent ID or handle |
|
||||
| `priority` | enum | yes | P0, P1, P2, P3 |
|
||||
| `due_date` | ISO8601 | yes | Deadline |
|
||||
| `status` | enum | yes | open, done, cancelled |
|
||||
| `links` | array | no | Related PRs/issues/ADRs |
|
||||
|
||||
## oncall_tool: incident_followups_summary
|
||||
|
||||
Summarises open incidents and overdue follow-ups for a service.
|
||||
|
||||
### Request
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "incident_followups_summary",
|
||||
"service": "gateway",
|
||||
"env": "prod",
|
||||
"window_days": 30
|
||||
}
|
||||
```
|
||||
|
||||
### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"open_incidents": [
|
||||
{"id": "inc_...", "severity": "P1", "status": "open", "started_at": "...", "title": "..."}
|
||||
],
|
||||
"overdue_followups": [
|
||||
{"incident_id": "inc_...", "title": "...", "due_date": "...", "priority": "P1", "owner": "sofiia"}
|
||||
],
|
||||
"stats": {
|
||||
"open_incidents": 1,
|
||||
"overdue": 1,
|
||||
"total_open_followups": 3
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Release Gate: followup_watch
|
||||
|
||||
### Behaviour per GatePolicy mode
|
||||
|
||||
| Mode | Behaviour |
|
||||
|------|-----------|
|
||||
| `off` | Gate skipped entirely |
|
||||
| `warn` | Always pass=True; adds recommendations for open P0/P1 and overdue follow-ups |
|
||||
| `strict` | Blocks release (`pass=false`) if open incidents match `fail_on` severities or overdue follow-ups exist |
|
||||
|
||||
### Configuration
|
||||
|
||||
In `config/release_gate_policy.yml`:
|
||||
|
||||
```yaml
|
||||
followup_watch:
|
||||
mode: "warn" # off | warn | strict
|
||||
fail_on: ["P0", "P1"] # Severities that block in strict mode
|
||||
```
|
||||
|
||||
### release_check inputs
|
||||
|
||||
| Input | Type | Default | Description |
|
||||
|-------|------|---------|-------------|
|
||||
| `run_followup_watch` | bool | true | Enable/disable gate |
|
||||
| `followup_watch_window_days` | int | 30 | Incident scan window |
|
||||
| `followup_watch_env` | string | "any" | Filter by environment |
|
||||
|
||||
## RBAC
|
||||
|
||||
`incident_followups_summary` requires `tools.oncall.read` entitlement.
|
||||
112
docs/incident/incident_log.md
Normal file
112
docs/incident/incident_log.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# NODA1 Incident Log
|
||||
|
||||
---
|
||||
|
||||
## INC-2026-002 | 2026-02-27 | Gateway Workers + SenpAI + facts/upsert
|
||||
|
||||
**Severity:** SEV-1 (всі агенти не відповідали користувачам)
|
||||
**Status:** RESOLVED
|
||||
**Duration:** ~3 дні (з 2026-02-21 09:55 по 2026-02-27 23:15)
|
||||
|
||||
### Summary
|
||||
|
||||
Після апгрейду Redis до 8.6.1 та ряду змін у коді gateway два воркери зависли,
|
||||
SenpAI повертав 500, а `facts/upsert` падав з `InvalidColumnReferenceError`.
|
||||
В сукупності агенти не відповідали у Telegram.
|
||||
|
||||
### Root Causes (3 незалежні)
|
||||
|
||||
| # | Компонент | Причина |
|
||||
|---|-----------|---------|
|
||||
| 1 | `dagi-gateway-worker-node1` | Після Redis 8.6.1 upgrade старі TCP-сокети async-клієнта → `ReadOnlyError` у `brpop()` |
|
||||
| 2 | `dagi-gateway-reminder-worker-node1` | Та сама проблема застарілих з'єднань після Redis upgrade |
|
||||
| 3 | `SenpAI webhook` → Router | `.env`: `ROUTER_URL=http://dagi-staging-router:8000` (staging!) замість `http://router:8000` |
|
||||
| 4 | `memory-service /facts/upsert` | `ensure_facts_table()` DDL застарілий: `UNIQUE(user_id, team_id, fact_key)` → asyncpg кешував старий prepared statement без `agent_id`; ON CONFLICT не знаходив matching constraint |
|
||||
| 5 | `get_doc_context()` | Підпис функції не мав `agent_id=None` параметра, хоча `http_api.py` передавав його |
|
||||
|
||||
### Timeline
|
||||
|
||||
| Час (UTC+1) | Подія |
|
||||
|-------------|-------|
|
||||
| 2026-02-21 09:55 | Остання успішна обробка (agromatrix) |
|
||||
| 2026-02-26 13:09 | Початок `ReadOnlyError` у gateway-worker (Redis upgrade) |
|
||||
| 2026-02-27 17:02 | Поновлення помилок worker після перезапусків |
|
||||
| 2026-02-27 19:49 | Повна блокада gateway-worker (останній restart) |
|
||||
| 2026-02-27 22:46 | Перезапуск dagi-gateway-worker-node1 → стабільний |
|
||||
| 2026-02-27 22:47 | Перезапуск dagi-gateway-reminder-worker-node1 → стабільний |
|
||||
| 2026-02-28 00:01 | Виправлено ensure_facts_table() → memory-service rebuilt |
|
||||
| 2026-02-28 00:05 | Виправлено ROUTER_URL, get_doc_context() → gateway rebuilt |
|
||||
| 2026-02-28 00:15 | Всі 14 агентів HTTP 200 ✓ |
|
||||
|
||||
### Fixes Applied (на сервері /opt/microdao-daarion)
|
||||
|
||||
```
|
||||
1. docker restart dagi-gateway-worker-node1 dagi-gateway-reminder-worker-node1
|
||||
2. services/memory-service/app/database.py:
|
||||
- ensure_facts_table() замінено на noop (таблиця управляється міграціями)
|
||||
- Скопійовано відсутні файли: integration_endpoints.py, integrations.py, voice_endpoints.py
|
||||
3. gateway-bot/services/doc_service.py:
|
||||
- get_doc_context(session_id: str) → get_doc_context(session_id: str, agent_id: str = None)
|
||||
4. .env:
|
||||
- ROUTER_URL=http://dagi-staging-router:8000 → ROUTER_URL=http://router:8000
|
||||
5. Rebuild + restart: memory-service, gateway, gateway-worker, gateway-reminder-worker
|
||||
```
|
||||
|
||||
### Verification
|
||||
|
||||
```
|
||||
All 14 agents HTTP 200:
|
||||
✓ senpai ✓ helion ✓ nutra ✓ daarwizz ✓ greenfood ✓ agromatrix
|
||||
✓ alateya ✓ druid ✓ clan ✓ eonarch ✓ oneok ✓ soul
|
||||
✓ yaromir ✓ sofiia
|
||||
facts/upsert: {"status":"ok"}
|
||||
Gateway: healthy, 14 agents
|
||||
```
|
||||
|
||||
### Action Items (TODO)
|
||||
|
||||
- [ ] Після Redis upgrade — завжди перезапускати workers (додати в runbook)
|
||||
- [ ] Виправити `ensure_facts_table()` в коді репозиторію (локально)
|
||||
- [ ] Виправити `get_doc_context()` сигнатуру в локальному репо
|
||||
- [ ] Виправити `.env` в репозиторії (або `.env.example`) — прибрати staging router URL
|
||||
- [ ] Додати liveness probe для workers: exit(1) при повторних ReadOnlyError
|
||||
- [ ] Алерт: "No messages processed for X minutes"
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
|
||||
## INC-2026-003 | 2026-02-28 | Ollama resource crash → всі агенти 503
|
||||
|
||||
**Severity:** SEV-1 (всі агенти не відповідали у Telegram)
|
||||
**Status:** RESOLVED
|
||||
**Duration:** ~8 годин (з 07:53 по ~16:00 UTC+1)
|
||||
|
||||
### Root Cause
|
||||
|
||||
Ollama впала з помилкою `model runner has unexpectedly stopped, this may be due to resource limitations`. Модель `qwen3:8b` (27.8B params, ~17GB) перевищила ресурси сервера під навантаженням → Router отримував `500` від Ollama → повертав `503` клієнту. Всі агенти були налаштовані на `provider: ollama`.
|
||||
|
||||
### Fix Applied
|
||||
|
||||
Переключено всі агенти в `router-config.yml` з `qwen3_*_8b` профілів → `cloud_deepseek`:
|
||||
- 14 агентів тепер використовують `deepseek-chat` через DeepSeek API
|
||||
- Router перезапущено для підхвачення нового конфігу
|
||||
|
||||
### Verification
|
||||
|
||||
```
|
||||
helion: 🌐 Trying DEEPSEEK API → HTTP 200, 15222 tokens
|
||||
All 14 agents: ✓ HTTP 200
|
||||
```
|
||||
|
||||
### Action Items
|
||||
|
||||
- [ ] Backup `router-config.yml.bak_20260228` → зберегти в репо
|
||||
- [ ] Розглянути переведення Ollama на меншу модель (smollm2:135m або qwen3-vl:8b) для vision-задач
|
||||
- [ ] Додати fallback в Router: якщо Ollama 500 → автоматично cloud_deepseek
|
||||
|
||||
---
|
||||
|
||||
## INC-2026-001 | (попередні інциденти)
|
||||
|
||||
_(додати при потребі)_
|
||||
387
docs/incident/intelligence.md
Normal file
387
docs/incident/intelligence.md
Normal file
@@ -0,0 +1,387 @@
|
||||
# Incident Intelligence Layer
|
||||
|
||||
> **Deterministic, 0 LLM tokens.** Pattern detection and weekly reporting built on top of the existing Incident Store and Alert State Machine.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Incident Intelligence Layer adds three analytical capabilities to the incident management platform:
|
||||
|
||||
| Capability | Action | Description |
|
||||
|---|---|---|
|
||||
| **Correlation** | `correlate` | Find related incidents for a given incident ID using scored rule matching |
|
||||
| **Recurrence Detection** | `recurrence` | Frequency tables for 7d/30d windows with threshold classification |
|
||||
| **Weekly Digest** | `weekly_digest` | Full markdown + JSON report saved to `ops/reports/incidents/weekly/` |
|
||||
|
||||
All three functions are deterministic and reentrant — running twice on the same data produces the same output.
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
incident_intelligence_tool (tool_manager.py)
|
||||
│
|
||||
├── correlate → incident_intelligence.correlate_incident()
|
||||
├── recurrence → incident_intelligence.detect_recurrence()
|
||||
└── weekly_digest → incident_intelligence.weekly_digest()
|
||||
│
|
||||
IncidentStore (INCIDENT_BACKEND=auto)
|
||||
incident_intel_utils.py (helpers)
|
||||
config/incident_intelligence_policy.yml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Policy: `config/incident_intelligence_policy.yml`
|
||||
|
||||
### Correlation rules
|
||||
|
||||
Each rule defines a `name`, `weight` (score contribution), and `match` conditions:
|
||||
|
||||
| Rule name | Weight | Match conditions |
|
||||
|---|---|---|
|
||||
| `same_signature` | 100 | Exact SHA-256 signature match |
|
||||
| `same_service_and_kind` | 60 | Same service **and** same kind |
|
||||
| `same_service_time_cluster` | 40 | Same service, started within `within_minutes` |
|
||||
| `same_kind_cross_service` | 30 | Same kind (cross-service), within `within_minutes` |
|
||||
|
||||
The final score is the sum of all matching rule weights. Only incidents scoring ≥ `min_score` (default: 20) appear in results.
|
||||
|
||||
**Example:** two incidents with the same signature that also share service+kind within 180 min → score = 100 + 60 + 40 + 30 = 230.
|
||||
|
||||
### Recurrence thresholds
|
||||
|
||||
```yaml
|
||||
recurrence:
|
||||
thresholds:
|
||||
signature:
|
||||
warn: 3 # ≥ 3 occurrences in window → warn
|
||||
high: 6 # ≥ 6 occurrences → high
|
||||
kind:
|
||||
warn: 5
|
||||
high: 10
|
||||
```
|
||||
|
||||
High-recurrence items receive deterministic recommendations from `recurrence.recommendations` templates (using Python `.format()` substitution with `{sig}`, `{kind}`, etc.).
|
||||
|
||||
---
|
||||
|
||||
## Tool Usage
|
||||
|
||||
### `correlate`
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "incident_intelligence_tool",
|
||||
"action": "correlate",
|
||||
"incident_id": "inc_20260218_1430_abc123",
|
||||
"append_note": true
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
|
||||
```json
|
||||
{
|
||||
"incident_id": "inc_20260218_1430_abc123",
|
||||
"related_count": 3,
|
||||
"related": [
|
||||
{
|
||||
"incident_id": "inc_20260215_0900_def456",
|
||||
"score": 230,
|
||||
"reasons": ["same_signature", "same_service_and_kind", "same_service_time_cluster"],
|
||||
"service": "gateway",
|
||||
"kind": "error_rate",
|
||||
"severity": "P1",
|
||||
"status": "closed",
|
||||
"started_at": "2026-02-15T09:00:00"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
When `append_note=true`, a timeline event of type `note` is appended to the target incident listing the top-5 related incidents.
|
||||
|
||||
### `recurrence`
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "incident_intelligence_tool",
|
||||
"action": "recurrence",
|
||||
"window_days": 7
|
||||
}
|
||||
```
|
||||
|
||||
Response includes `top_signatures`, `top_kinds`, `top_services`, `high_recurrence`, and `warn_recurrence` tables.
|
||||
|
||||
### `weekly_digest`
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "incident_intelligence_tool",
|
||||
"action": "weekly_digest",
|
||||
"save_artifacts": true
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
|
||||
```json
|
||||
{
|
||||
"week": "2026-W08",
|
||||
"artifact_paths": [
|
||||
"ops/reports/incidents/weekly/2026-W08.json",
|
||||
"ops/reports/incidents/weekly/2026-W08.md"
|
||||
],
|
||||
"markdown_preview": "# Weekly Incident Digest — 2026-W08\n...",
|
||||
"json_summary": {
|
||||
"week": "2026-W08",
|
||||
"open_incidents_count": 2,
|
||||
"recent_7d_count": 12,
|
||||
"recommendations": [...]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## RBAC
|
||||
|
||||
| Action | Required entitlement | Roles |
|
||||
|---|---|---|
|
||||
| `correlate` | `tools.oncall.read` | `agent_cto`, `agent_oncall` |
|
||||
| `recurrence` | `tools.oncall.read` | `agent_cto`, `agent_oncall` |
|
||||
| `weekly_digest` | `tools.oncall.incident_write` | `agent_cto`, `agent_oncall` |
|
||||
|
||||
Monitor (`agent_monitor`) has no access to `incident_intelligence_tool`.
|
||||
|
||||
---
|
||||
|
||||
## Rate limits
|
||||
|
||||
| Action | Timeout | RPM |
|
||||
|---|---|---|
|
||||
| `correlate` | 10s | 10 |
|
||||
| `recurrence` | 15s | 5 |
|
||||
| `weekly_digest` | 20s | 3 |
|
||||
|
||||
---
|
||||
|
||||
## Scheduled Job
|
||||
|
||||
Task ID: `weekly_incident_digest`
|
||||
Schedule: **Every Monday 08:00 UTC**
|
||||
Cron: `0 8 * * 1`
|
||||
|
||||
```bash
|
||||
# NODE1 — add to ops user crontab
|
||||
0 8 * * 1 /usr/local/bin/job_runner.sh weekly_incident_digest '{}'
|
||||
```
|
||||
|
||||
Artifacts are written to `ops/reports/incidents/weekly/YYYY-WW.json` and `YYYY-WW.md`.
|
||||
|
||||
---
|
||||
|
||||
## How scoring works
|
||||
|
||||
```
|
||||
Score(target, candidate) = Σ weight(rule) for each rule that matches
|
||||
|
||||
Rules are evaluated in order. The "same_signature" rule is exclusive:
|
||||
- If signatures match → score += 100, skip other conditions for this rule.
|
||||
- If signatures do not match → skip rule entirely (score += 0).
|
||||
|
||||
All other rules use combined conditions (AND logic):
|
||||
- All conditions in match{} must be satisfied for the rule to fire.
|
||||
```
|
||||
|
||||
Two incidents with **identical signatures** will always score ≥ 100. Two incidents sharing service + kind score ≥ 60. Time proximity (within 180 min, same service) scores ≥ 40.
|
||||
|
||||
---
|
||||
|
||||
## Tuning guide
|
||||
|
||||
| Goal | Change |
|
||||
|---|---|
|
||||
| Reduce false positives in correlation | Increase `min_score` (e.g., 40) |
|
||||
| More aggressive recurrence warnings | Lower `thresholds.signature.warn` |
|
||||
| Shorter lookback for correlation | Decrease `correlation.lookback_days` |
|
||||
| Disable kind-based cross-service matching | Remove `same_kind_cross_service` rule |
|
||||
| Longer digest | Increase `digest.markdown_max_chars` |
|
||||
|
||||
---
|
||||
|
||||
## Files
|
||||
|
||||
| File | Purpose |
|
||||
|---|---|
|
||||
| `services/router/incident_intelligence.py` | Core engine: correlate / recurrence / weekly_digest |
|
||||
| `services/router/incident_intel_utils.py` | Helpers: kind extraction, time math, truncation |
|
||||
| `config/incident_intelligence_policy.yml` | All tuneable policy parameters |
|
||||
| `tests/test_incident_correlation.py` | Correlation unit tests |
|
||||
| `tests/test_incident_recurrence.py` | Recurrence detection tests |
|
||||
| `tests/test_weekly_digest.py` | Weekly digest tests (incl. artifact write) |
|
||||
|
||||
---
|
||||
|
||||
## Root-Cause Buckets
|
||||
|
||||
### Overview
|
||||
|
||||
`build_root_cause_buckets` clusters incidents into actionable groups. The bucket key is either `service|kind` (default) or a signature prefix.
|
||||
|
||||
**Filtering**: only buckets meeting `min_count` thresholds appear:
|
||||
- `count_7d ≥ buckets.min_count[7]` (default: 3) **OR**
|
||||
- `count_30d ≥ buckets.min_count[30]` (default: 6)
|
||||
|
||||
**Sorting**: `count_7d desc → count_30d desc → last_seen desc`.
|
||||
|
||||
### Tool usage
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "incident_intelligence_tool",
|
||||
"action": "buckets",
|
||||
"service": "gateway",
|
||||
"window_days": 30
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"service_filter": "gateway",
|
||||
"window_days": 30,
|
||||
"bucket_count": 2,
|
||||
"buckets": [
|
||||
{
|
||||
"bucket_key": "gateway|error_rate",
|
||||
"counts": {"7d": 5, "30d": 12, "open": 2},
|
||||
"last_seen": "2026-02-22T14:30:00",
|
||||
"services": ["gateway"],
|
||||
"kinds": ["error_rate"],
|
||||
"top_signatures": [{"signature": "aabbccdd", "count": 4}],
|
||||
"severity_mix": {"P0": 0, "P1": 2, "P2": 3},
|
||||
"sample_incidents": [...],
|
||||
"recommendations": [
|
||||
"Add regression test for API contract & error mapping",
|
||||
"Add/adjust SLO thresholds & alert routing"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Deterministic recommendations by kind
|
||||
|
||||
| Kind | Recommendations |
|
||||
|---|---|
|
||||
| `error_rate`, `slo_breach` | Add regression test; review deploys; adjust SLO thresholds |
|
||||
| `latency` | Check p95 vs saturation; investigate DB/queue contention |
|
||||
| `oom`, `crashloop` | Memory profiling; container limits; fix leaks |
|
||||
| `disk` | Retention/cleanup automation; verify volumes |
|
||||
| `security` | Dependency scanner + rotate secrets; verify allowlists |
|
||||
| `queue` | Consumer lag + dead-letter queue |
|
||||
| `network` | DNS audit; network policies |
|
||||
| *(any open incidents)* | ⚠ Do not deploy risky changes until mitigated |
|
||||
|
||||
---
|
||||
|
||||
## Auto Follow-ups (policy-driven)
|
||||
|
||||
When `weekly_digest` runs with `autofollowups.enabled=true`, it automatically appends a `followup` event to the **most recent open incident** in each high-recurrence bucket.
|
||||
|
||||
### Deduplication
|
||||
|
||||
Follow-up key: `{dedupe_key_prefix}:{YYYY-WW}:{bucket_key}`
|
||||
|
||||
One follow-up per bucket per week. A second call in the same week with the same bucket → skipped with `reason: already_exists`.
|
||||
|
||||
A new week (`YYYY-WW` changes) → new follow-up is created.
|
||||
|
||||
### Policy knobs
|
||||
|
||||
```yaml
|
||||
autofollowups:
|
||||
enabled: true
|
||||
only_when_high: true # only high-recurrence buckets trigger follow-ups
|
||||
owner: "oncall"
|
||||
priority: "P1"
|
||||
due_days: 7
|
||||
dedupe_key_prefix: "intel_recur"
|
||||
```
|
||||
|
||||
### Follow-up event structure
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "followup",
|
||||
"message": "[intel] Recurrence high: gateway|error_rate (7d=5, 30d=12, kinds=error_rate)",
|
||||
"meta": {
|
||||
"title": "[intel] Recurrence high: gateway|error_rate",
|
||||
"owner": "oncall",
|
||||
"priority": "P1",
|
||||
"due_date": "2026-03-02",
|
||||
"dedupe_key": "intel_recur:2026-W08:gateway|error_rate",
|
||||
"auto_created": true,
|
||||
"bucket_key": "gateway|error_rate",
|
||||
"count_7d": 5
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## `recurrence_watch` Release Gate
|
||||
|
||||
### Purpose
|
||||
|
||||
Warns (or blocks in staging) when the service being deployed has a high incident recurrence pattern — catching "we're deploying into a known-bad state."
|
||||
|
||||
### GatePolicy profiles
|
||||
|
||||
| Profile | Mode | Blocks on |
|
||||
|---|---|---|
|
||||
| `dev` | `warn` | Never blocks |
|
||||
| `staging` | `strict` | High recurrence + P0/P1 severity |
|
||||
| `prod` | `warn` | Never blocks (accumulate data first) |
|
||||
|
||||
### Strict mode logic
|
||||
|
||||
```
|
||||
if mode == "strict":
|
||||
if gate.has_high_recurrence AND gate.max_severity_seen in fail_on.severity_in:
|
||||
pass = False
|
||||
```
|
||||
|
||||
`fail_on.severity_in` defaults to `["P0", "P1"]`. Only P2/P3 incidents in a high-recurrence bucket do **not** block.
|
||||
|
||||
### Gate output fields
|
||||
|
||||
| Field | Description |
|
||||
|---|---|
|
||||
| `has_high_recurrence` | True if any signature or kind is in "high" zone |
|
||||
| `has_warn_recurrence` | True if any signature or kind is in "warn" zone |
|
||||
| `max_severity_seen` | Most severe incident in the service window |
|
||||
| `high_signatures` | List of first 5 high-recurrence signature prefixes |
|
||||
| `high_kinds` | List of first 5 high-recurrence kinds |
|
||||
| `total_incidents` | Total incidents in window |
|
||||
| `skipped` | True if gate was bypassed (error or tool unavailable) |
|
||||
|
||||
### Input overrides
|
||||
|
||||
```json
|
||||
{
|
||||
"run_recurrence_watch": true,
|
||||
"recurrence_watch_mode": "off", // override policy
|
||||
"recurrence_watch_windows_days": [7, 30],
|
||||
"recurrence_watch_service": "gateway" // default: service_name from release inputs
|
||||
}
|
||||
```
|
||||
|
||||
### Backward compatibility
|
||||
|
||||
If `run_recurrence_watch` is not in inputs, defaults to `true`. If `recurrence_watch_mode` is not set, falls back to GatePolicy profile setting.
|
||||
|
||||
139
docs/opencode/sofiia_setup.md
Normal file
139
docs/opencode/sofiia_setup.md
Normal file
@@ -0,0 +1,139 @@
|
||||
# OpenCode ↔ Sofiia Integration
|
||||
|
||||
Sofiia (CTO agent) is exposed to OpenCode via the **DAARION router** tool execution endpoint. No extra adapter service is required for basic tool calls.
|
||||
|
||||
---
|
||||
|
||||
## 1. Environment variables
|
||||
|
||||
| Variable | Description | Example |
|
||||
|----------|-------------|---------|
|
||||
| `ROUTER_URL` | Base URL of the DAARION router | `http://localhost:8000` or `http://router:8000` |
|
||||
| `SUPERVISOR_API_KEY` | Optional. If set, router requires `Authorization: Bearer <key>` on `/v1/tools/execute` | (secret) |
|
||||
|
||||
Set these in your OpenCode environment or in the config that invokes Sofiia.
|
||||
|
||||
---
|
||||
|
||||
## 2. Agent endpoint (for OpenCode “invoke agent”)
|
||||
|
||||
- **Tool execution (primary):**
|
||||
`POST {ROUTER_URL}/v1/tools/execute`
|
||||
|
||||
- **Chat / inference:**
|
||||
`POST {ROUTER_URL}/v1/agents/sofiia/infer`
|
||||
|
||||
OpenCode can treat Sofiia as an agent whose “tools” are executed by POSTing to `/v1/tools/execute` with a JSON body (see below). There is no separate “invoke” URL; tool execution **is** the invocation.
|
||||
|
||||
---
|
||||
|
||||
## 3. Tool execution contract
|
||||
|
||||
**Request:**
|
||||
|
||||
```http
|
||||
POST /v1/tools/execute
|
||||
Content-Type: application/json
|
||||
Authorization: Bearer <SUPERVISOR_API_KEY> # optional
|
||||
|
||||
{
|
||||
"tool": "risk_engine_tool",
|
||||
"action": "service",
|
||||
"agent_id": "sofiia",
|
||||
"env": "prod",
|
||||
"service": "gateway"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "succeeded",
|
||||
"data": { ... },
|
||||
"error": null
|
||||
}
|
||||
```
|
||||
|
||||
or on failure:
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "failed",
|
||||
"data": null,
|
||||
"error": {
|
||||
"code": "tool_error",
|
||||
"message": "...",
|
||||
"retryable": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
All parameters beyond `tool`, `action`, and `agent_id` are passed as the tool’s arguments (e.g. `env`, `service`, `task_id`, `inputs`).
|
||||
|
||||
---
|
||||
|
||||
## 4. Hello-world: one tool call
|
||||
|
||||
```bash
|
||||
export ROUTER_URL="http://localhost:8000"
|
||||
# Optional: export SUPERVISOR_API_KEY="your-key"
|
||||
|
||||
curl -s -X POST "$ROUTER_URL/v1/tools/execute" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $SUPERVISOR_API_KEY" \
|
||||
-d '{
|
||||
"tool": "backlog_tool",
|
||||
"action": "dashboard",
|
||||
"agent_id": "sofiia",
|
||||
"env": "prod"
|
||||
}'
|
||||
```
|
||||
|
||||
Expected: `"status": "succeeded"` and `"data"` with backlog summary.
|
||||
|
||||
---
|
||||
|
||||
## 5. How to verify (one command)
|
||||
|
||||
From the repo root:
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/verify_sofiia_stack.py --router-url "$ROUTER_URL"
|
||||
```
|
||||
|
||||
This checks:
|
||||
|
||||
- Router `/healthz` (or `/health`)
|
||||
- Dry-run tool calls: `risk_engine_tool.service`, `architecture_pressure_tool.service`, `backlog_tool.dashboard`
|
||||
- Presence of governance cron entries in `ops/cron/jobs.cron`
|
||||
- Optional: supervisor health if `SUPERVISOR_URL` is set
|
||||
|
||||
Exit code 0 = all checks PASS.
|
||||
|
||||
---
|
||||
|
||||
## 6. Typical tools for OpenCode-driven flows
|
||||
|
||||
| Tool | Action | Typical use |
|
||||
|------|--------|-------------|
|
||||
| `risk_engine_tool` | `service`, `dashboard` | Risk score / dashboard |
|
||||
| `architecture_pressure_tool` | `service`, `dashboard`, `digest` | Pressure index, weekly digest |
|
||||
| `backlog_tool` | `dashboard`, `list`, `create`, `auto_generate_weekly` | Backlog ops |
|
||||
| `job_orchestrator_tool` | `start_task` | e.g. `task_id: "release_check"` for release gates |
|
||||
| `oncall_tool` | `incident_create`, `list` | Incidents |
|
||||
| `incident_intelligence_tool` | `correlate`, `recurrence`, `weekly_digest` | Intelligence |
|
||||
|
||||
OpenCode can “Ask Sofiia to run release_check” by calling `/v1/tools/execute` with `tool: "job_orchestrator_tool"`, `action: "start_task"`, `task_id: "release_check"`, `inputs: { "gate_profile": "staging" }`.
|
||||
|
||||
---
|
||||
|
||||
## 7. Sofiia Control Console (optional)
|
||||
|
||||
A minimal web UI for chat + ops + nodes is provided by **sofiia-console** (NODA2 primary):
|
||||
|
||||
- Chat: proxy to `POST /v1/agents/sofiia/infer`
|
||||
- Ops: Risk/Pressure/Backlog/Release check via `POST /v1/tools/execute`
|
||||
- Nodes: dashboard from `config/nodes_registry.yml`
|
||||
|
||||
See `services/sofiia-console/` and runbook for deployment. OpenCode integration does **not** depend on the console; the console is for human operators.
|
||||
248
docs/release/release_check.md
Normal file
248
docs/release/release_check.md
Normal file
@@ -0,0 +1,248 @@
|
||||
# release_check — Release Gate
|
||||
|
||||
**Єдиний оркестрований job для перевірки готовності до релізу**
|
||||
Нода: NODE2 (dev) + NODA1 (production)
|
||||
|
||||
---
|
||||
|
||||
## Що це?
|
||||
|
||||
`release_check` — internal task у Job Orchestrator, який послідовно запускає всі release gates і повертає єдиний структурований verdict `pass/fail`.
|
||||
|
||||
Замінює ручне запускання кожного gate окремо.
|
||||
|
||||
---
|
||||
|
||||
## Gates (послідовно)
|
||||
|
||||
| # | Gate | Tool | Умова блокування |
|
||||
|---|------|------|-----------------|
|
||||
| 1 | **PR Review** | `pr_reviewer_tool` (mode=`blocking_only`) | blocking_count > 0 |
|
||||
| 2 | **Config Lint** | `config_linter_tool` (strict=true) | blocking_count > 0 |
|
||||
| 3 | **Contract Diff** | `contract_tool` (fail_on_breaking=true) | breaking_count > 0 |
|
||||
| 4 | **Threat Model** | `threatmodel_tool` (risk_profile) | unmitigated_high > 0 |
|
||||
| 5 | **Smoke** *(optional)* | `job_orchestrator_tool` → `smoke_gateway` | job fail |
|
||||
| 6 | **Drift** *(optional)* | `job_orchestrator_tool` → `drift_check_node1` | job fail |
|
||||
|
||||
Gates 1–4 завжди виконуються (якщо є вхідні дані).
|
||||
Gates 5–6 виконуються тільки при `run_smoke=true` / `run_drift=true`.
|
||||
|
||||
---
|
||||
|
||||
## Як запустити
|
||||
|
||||
### Через job_orchestrator_tool (рекомендовано)
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "start_task",
|
||||
"agent_id": "sofiia",
|
||||
"params": {
|
||||
"task_id": "release_check",
|
||||
"inputs": {
|
||||
"service_name": "router",
|
||||
"diff_text": "<unified diff>",
|
||||
"openapi_base": "<base OpenAPI spec>",
|
||||
"openapi_head": "<head OpenAPI spec>",
|
||||
"risk_profile": "agentic_tools",
|
||||
"fail_fast": false,
|
||||
"run_smoke": true,
|
||||
"run_drift": false
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Через Sofiia (OpenCode/Telegram)
|
||||
|
||||
```
|
||||
"Запусти release_check для сервісу router з цим diff: ..."
|
||||
"Зроби release gate перевірку"
|
||||
```
|
||||
|
||||
### Dry run (тільки валідація)
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "start_task",
|
||||
"params": {
|
||||
"task_id": "release_check",
|
||||
"dry_run": true,
|
||||
"inputs": {"service_name": "router"}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Вхідні параметри (inputs_schema)
|
||||
|
||||
| Параметр | Тип | Обов'язковий | Опис |
|
||||
|----------|-----|:---:|------|
|
||||
| `service_name` | string | ✅ | Назва сервісу |
|
||||
| `diff_text` | string | — | Unified diff (git diff) |
|
||||
| `openapi_base` | string | — | OpenAPI base spec (text) |
|
||||
| `openapi_head` | string | — | OpenAPI head spec (text) |
|
||||
| `risk_profile` | enum | — | `default` / `agentic_tools` / `public_api` (default: `default`) |
|
||||
| `fail_fast` | boolean | — | Зупинитись на першому fail (default: `false`) |
|
||||
| `run_smoke` | boolean | — | Запустити smoke tests (default: `false`) |
|
||||
| `run_drift` | boolean | — | Запустити drift check (default: `false`) |
|
||||
|
||||
---
|
||||
|
||||
## Вихідний формат
|
||||
|
||||
```json
|
||||
{
|
||||
"pass": true,
|
||||
"gates": [
|
||||
{
|
||||
"name": "pr_review",
|
||||
"status": "pass",
|
||||
"blocking_count": 0,
|
||||
"summary": "No blocking issues found",
|
||||
"score": 95
|
||||
},
|
||||
{
|
||||
"name": "config_lint",
|
||||
"status": "pass",
|
||||
"blocking_count": 0,
|
||||
"total_findings": 2
|
||||
},
|
||||
{
|
||||
"name": "contract_diff",
|
||||
"status": "skipped",
|
||||
"reason": "openapi_base or openapi_head not provided"
|
||||
},
|
||||
{
|
||||
"name": "threat_model",
|
||||
"status": "pass",
|
||||
"unmitigated_high": 0,
|
||||
"risk_profile": "default"
|
||||
}
|
||||
],
|
||||
"recommendations": [],
|
||||
"summary": "✅ RELEASE CHECK PASSED in 1234ms. Gates: ['pr_review', 'config_lint', 'threat_model'].",
|
||||
"elapsed_ms": 1234.5
|
||||
}
|
||||
```
|
||||
|
||||
### Gate statuses
|
||||
|
||||
| Status | Значення |
|
||||
|--------|----------|
|
||||
| `pass` | Gate пройшов |
|
||||
| `fail` | Gate не пройшов (блокує реліз) |
|
||||
| `skipped` | Вхідних даних не було (не блокує) |
|
||||
| `error` | Внутрішня помилка gate |
|
||||
|
||||
---
|
||||
|
||||
## Інтерпретація результату
|
||||
|
||||
### `pass: true`
|
||||
Всі mandatory gates пройшли → **можна випускати реліз**.
|
||||
|
||||
### `pass: false`
|
||||
Хоча б один gate має `status: fail` → **реліз заблоковано**.
|
||||
Дивись `gates[].status == "fail"` та `recommendations` для деталей.
|
||||
|
||||
### `status: error`
|
||||
Gate не зміг виконатись (internal error). Не є `fail`, але потребує уваги.
|
||||
|
||||
---
|
||||
|
||||
## Risk Profiles для Threat Model
|
||||
|
||||
| Профіль | Коли використовувати |
|
||||
|---------|---------------------|
|
||||
| `default` | Звичайний внутрішній сервіс |
|
||||
| `agentic_tools` | Сервіс з tool-викликами, prompt injection ризики |
|
||||
| `public_api` | Публічний API (rate limiting, WAF, auth hardening) |
|
||||
|
||||
---
|
||||
|
||||
## Необхідні Entitlements
|
||||
|
||||
Для запуску `release_check` агент повинен мати:
|
||||
- `tools.pr_review.gate`
|
||||
- `tools.contract.gate`
|
||||
- `tools.config_lint.gate`
|
||||
- `tools.threatmodel.gate`
|
||||
|
||||
Тільки агенти з роллю `agent_cto` (sofiia, yaromir) мають ці entitlements.
|
||||
|
||||
---
|
||||
|
||||
## Приклади сценаріїв
|
||||
|
||||
### Швидка перевірка PR (без openapi, без smoke)
|
||||
|
||||
```json
|
||||
{
|
||||
"service_name": "gateway-bot",
|
||||
"diff_text": "...",
|
||||
"fail_fast": true
|
||||
}
|
||||
```
|
||||
|
||||
### Повний release pipeline для публічного API
|
||||
|
||||
```json
|
||||
{
|
||||
"service_name": "router",
|
||||
"diff_text": "...",
|
||||
"openapi_base": "...",
|
||||
"openapi_head": "...",
|
||||
"risk_profile": "public_api",
|
||||
"run_smoke": true,
|
||||
"run_drift": true
|
||||
}
|
||||
```
|
||||
|
||||
### Тільки threat model (без diff)
|
||||
|
||||
```json
|
||||
{
|
||||
"service_name": "auth-service",
|
||||
"risk_profile": "agentic_tools"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Внутрішня архітектура
|
||||
|
||||
```
|
||||
job_orchestrator_tool.start_task("release_check")
|
||||
→ _job_orchestrator_tool() виявляє runner="internal"
|
||||
→ release_check_runner.run_release_check(tool_manager, inputs, agent_id)
|
||||
→ Gate 1: _run_pr_review()
|
||||
→ Gate 2: _run_config_lint()
|
||||
→ Gate 3: _run_dependency_scan()
|
||||
→ Gate 4: _run_contract_diff()
|
||||
→ Gate 5: _run_threat_model()
|
||||
→ [Gate 6: _run_smoke()]
|
||||
→ [Gate 7: _run_drift()]
|
||||
→ Gate 8: _run_followup_watch() (policy: off/warn/strict)
|
||||
→ Gate 9: _run_privacy_watch() (policy: off/warn/strict)
|
||||
→ Gate 10: _run_cost_watch() (always warn)
|
||||
→ _build_report()
|
||||
→ ToolResult(success=True, result=report)
|
||||
```
|
||||
|
||||
Кожен gate викликає відповідний tool через `tool_manager.execute_tool()`.
|
||||
Governance middleware (RBAC, limits, audit) застосовується до кожного gate-виклику.
|
||||
|
||||
---
|
||||
|
||||
## Файли
|
||||
|
||||
| Файл | Призначення |
|
||||
|------|-------------|
|
||||
| `ops/task_registry.yml` | Реєстрація `release_check` task |
|
||||
| `services/router/release_check_runner.py` | Internal runner (gates logic) |
|
||||
| `config/release_gate_policy.yml` | Gate strictness profiles (dev/staging/prod) |
|
||||
| `config/slo_policy.yml` | SLO thresholds per service |
|
||||
| `tests/test_tool_governance.py` | Тести (включно з release_check fixtures) |
|
||||
| `tests/test_release_check_followup_watch.py` | Follow-up watch gate tests |
|
||||
68
docs/release/release_gate_policy.md
Normal file
68
docs/release/release_gate_policy.md
Normal file
@@ -0,0 +1,68 @@
|
||||
# Release Gate Policy
|
||||
|
||||
`config/release_gate_policy.yml` — централізований конфіг строгості gate-ів для різних профілів деплойменту.
|
||||
|
||||
## Профілі
|
||||
|
||||
| Профіль | Призначення | privacy_watch | cost_watch |
|
||||
|---------|-------------|---------------|------------|
|
||||
| `dev` | Розробка | warn | warn |
|
||||
| `staging` | Стейджинг | **strict** (fail_on error) | warn |
|
||||
| `prod` | Продакшн | **strict** (fail_on error) | warn |
|
||||
|
||||
## Режими gate-ів
|
||||
|
||||
| Режим | Поведінка |
|
||||
|-------|-----------|
|
||||
| `off` | Gate повністю пропускається (не викликається, не виводиться) |
|
||||
| `warn` | Gate завжди `pass=True`; findings → `recommendations` |
|
||||
| `strict` | Gate може заблокувати реліз за умовами `fail_on` |
|
||||
|
||||
## Використання
|
||||
|
||||
Передати `gate_profile` у inputs release_check:
|
||||
|
||||
```json
|
||||
{
|
||||
"gate_profile": "staging",
|
||||
"run_privacy_watch": true,
|
||||
"diff_text": "..."
|
||||
}
|
||||
```
|
||||
|
||||
## strict mode: privacy_watch
|
||||
|
||||
Блокує реліз якщо є findings із severity у `fail_on`:
|
||||
|
||||
```yaml
|
||||
privacy_watch:
|
||||
mode: "strict"
|
||||
fail_on: ["error"] # тільки error-severity блокує; warning = recommendation
|
||||
```
|
||||
|
||||
Наприклад, `DG-SEC-001` (private key) = error → `release_check.pass = false`.
|
||||
`DG-LOG-001` (sensitive logger) = warning → не блокує у staging/prod.
|
||||
|
||||
## cost_watch
|
||||
|
||||
**Завжди `warn`** у всіх профілях — cost spikes ніколи не блокують реліз (тільки recommendations).
|
||||
|
||||
## Backward compatibility
|
||||
|
||||
Якщо `gate_profile` не переданий → використовується `dev` (warn для privacy і cost).
|
||||
Якщо `release_gate_policy.yml` відсутній → всі gates використовують `warn` (graceful fallback).
|
||||
|
||||
## Приклад виводу для staging з error finding
|
||||
|
||||
```json
|
||||
{
|
||||
"pass": false,
|
||||
"gates": [
|
||||
{ "name": "privacy_watch", "status": "pass", "errors": 1,
|
||||
"top_findings": [{"id": "DG-SEC-001", "severity": "error", ...}],
|
||||
"recommendations": ["Remove private key from code..."] }
|
||||
],
|
||||
"summary": "❌ RELEASE CHECK FAILED. Failed: []. Errors: [].",
|
||||
"recommendations": ["Remove private key from code..."]
|
||||
}
|
||||
```
|
||||
109
docs/release/sofiia-console-v1-readiness.md
Normal file
109
docs/release/sofiia-console-v1-readiness.md
Normal file
@@ -0,0 +1,109 @@
|
||||
# Sofiia Console v1.0 Release Readiness Summary
|
||||
|
||||
One-page go/no-go артефакт для релізного рішення по `sofiia-console`.
|
||||
|
||||
## 1) Scope & Version
|
||||
|
||||
- Service: `sofiia-console`
|
||||
- Target version / tag: `v1.0` (to be assigned at release cut)
|
||||
- Git SHAs:
|
||||
- sofiia-console: `e75fd33`
|
||||
- router: `<set at release window>`
|
||||
- gateway: `<set at release window>`
|
||||
- Deployment target:
|
||||
- NODA1: production runtime/data plane
|
||||
- NODA2: control plane / sofiia-console
|
||||
- Date prepared: `<set at release window>`
|
||||
- Prepared by: `<operator>`
|
||||
|
||||
## 2) Production Guarantees
|
||||
|
||||
### Reliability
|
||||
|
||||
- Idempotent `POST /api/chats/{chat_id}/send` with selectable backend (`inmemory|redis`).
|
||||
- Multi-node routing covered by E2E tests (NODA1/NODA2 via `infer` monkeypatch path).
|
||||
- Cursor pagination hardened with tie-breakers (`(ts,id)` / stable ordering semantics).
|
||||
- Release process formalized via preflight + release runbook + smoke scripts.
|
||||
|
||||
### Security
|
||||
|
||||
- Rate limiting on send path:
|
||||
- per-chat scope
|
||||
- per-operator scope
|
||||
- Strict `/api/audit` protection:
|
||||
- key required
|
||||
- no localhost bypass
|
||||
- Structured audit trail:
|
||||
- write events for operator actions
|
||||
- cursor-based read endpoint
|
||||
- Secrets rotation runbook documented and operational.
|
||||
|
||||
### Operational Controls
|
||||
|
||||
- `/metrics` exposed (including rate-limit and idempotency counters).
|
||||
- Structured JSON logs for send/replay/pagination/error flows.
|
||||
- Audit retention policy in place (default 90 days).
|
||||
- Pruning script available (`ops/prune_audit_db.py`: dry-run + batch delete + optional vacuum).
|
||||
- Release evidence auto-generator available (`ops/generate_release_evidence.sh`).
|
||||
|
||||
## 3) Known Limitations / Residual Risks
|
||||
|
||||
- Chat index is still local DB-backed; full multi-instance HA for global chat index needs Phase 6 (Redis ChatIndexStore).
|
||||
- Rate-limit defaults to `inmemory`; multi-instance consistency needs `SOFIIA_RATE_LIMIT_BACKEND=redis`.
|
||||
- Audit storage is SQLite (single-node storage, non-clustered by default).
|
||||
- Automatic alerting/paging is not yet enabled; metric observation is primarily manual/runbook-driven.
|
||||
|
||||
## 4) Required Release-Day Checks
|
||||
|
||||
### Preflight
|
||||
|
||||
- `STRICT=1 bash ops/preflight_sofiia_console.sh`
|
||||
|
||||
### Deploy order
|
||||
|
||||
- NODA2 precheck
|
||||
- NODA1 rollout
|
||||
- NODA2 finalize
|
||||
|
||||
### Smoke
|
||||
|
||||
- `GET /api/health` -> `200`
|
||||
- `/metrics` reachable
|
||||
- `bash ops/redis_idempotency_smoke.sh` -> `PASS` (when redis backend is enabled)
|
||||
- `/api/audit` auth:
|
||||
- without key -> `401`
|
||||
- with key -> `200`
|
||||
|
||||
### Post-release
|
||||
|
||||
- Verify rate-limit metrics increment under controlled load.
|
||||
- Verify audit write/read quick check.
|
||||
- Run retention dry-run:
|
||||
- `python3 ops/prune_audit_db.py --dry-run`
|
||||
|
||||
## 5) Explicit Go / No-Go Criteria
|
||||
|
||||
**GO if all conditions hold:**
|
||||
|
||||
- Preflight is `PASS` (or only non-critical `WARN` accepted by operator).
|
||||
- Smoke checks pass.
|
||||
- No unexpected 5xx spike during first 5–10 minutes.
|
||||
- Rate-limit counters and idempotency behavior are within expected range.
|
||||
|
||||
**NO-GO if any condition holds:**
|
||||
|
||||
- Strict audit auth fails (401/200 behavior broken).
|
||||
- Redis idempotency A/B smoke fails.
|
||||
- Audit write/read fails.
|
||||
- Unexpected 500s on send path.
|
||||
|
||||
## 6) Rollback Readiness Statement
|
||||
|
||||
- Rollback method:
|
||||
- revert to previous known-good SHA/tag
|
||||
- restart affected services via docker compose/systemd as per runbook
|
||||
- Estimated rollback time: `<set by operator, typically 5-15 min>`
|
||||
- Mandatory post-rollback smoke:
|
||||
- `/api/health`
|
||||
- idempotency smoke
|
||||
- audit auth/read checks
|
||||
206
docs/risk/risk_index.md
Normal file
206
docs/risk/risk_index.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# Service Risk Index
|
||||
|
||||
> Deterministic. No LLM. Production-grade.
|
||||
|
||||
## Overview
|
||||
|
||||
The Risk Index Engine computes a **numerical risk score (0–100+)** for every tracked service. It is the single authoritative metric for service health in the DAARION.city control plane.
|
||||
|
||||
Score → Band mapping:
|
||||
|
||||
| Score | Band | Meaning |
|
||||
|--------|----------|------------------------------------------|
|
||||
| 0–20 | low | No significant signals |
|
||||
| 21–50 | medium | Minor signals; monitor |
|
||||
| 51–80 | high | Active problems; coordinate before deploy|
|
||||
| 81+ | critical | Block or escalate immediately |
|
||||
|
||||
---
|
||||
|
||||
## Scoring Formula
|
||||
|
||||
```
|
||||
Risk(service) = Σ weight(signal) × count_or_flag(signal)
|
||||
```
|
||||
|
||||
All weights are policy-driven via `config/risk_policy.yml`.
|
||||
|
||||
### Signal weights (defaults)
|
||||
|
||||
| Signal | Points |
|
||||
|-------------------------------|-------------------------------|
|
||||
| Open P0 incident | 50 each |
|
||||
| Open P1 incident | 25 each |
|
||||
| Open P2 incident | 10 each |
|
||||
| Open P3 incident | 5 each |
|
||||
| High recurrence signature 7d | 20 each |
|
||||
| Warn recurrence signature 7d | 10 each |
|
||||
| High recurrence kind 7d | 15 each |
|
||||
| Warn recurrence kind 7d | 8 each |
|
||||
| High recurrence signature 30d | 10 each |
|
||||
| High recurrence kind 30d | 8 each |
|
||||
| Overdue follow-up P0 | 20 each |
|
||||
| Overdue follow-up P1 | 12 each |
|
||||
| Overdue follow-up other | 6 each |
|
||||
| Active SLO violation (60m) | 10 each |
|
||||
| Alert-loop SLO violation | 10 each |
|
||||
| Escalations 24h (1–2) | 5 (warn level) |
|
||||
| Escalations 24h (3+) | 12 (high level) |
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
**`config/risk_policy.yml`** — controls all weights, thresholds, and per-service overrides.
|
||||
|
||||
```yaml
|
||||
thresholds:
|
||||
bands:
|
||||
low_max: 20
|
||||
medium_max: 50
|
||||
high_max: 80
|
||||
risk_watch:
|
||||
warn_at: 50
|
||||
fail_at: 80
|
||||
|
||||
service_overrides:
|
||||
gateway:
|
||||
risk_watch:
|
||||
fail_at: 75 # gateway fails earlier: critical path
|
||||
|
||||
p0_services:
|
||||
- gateway
|
||||
- router
|
||||
```
|
||||
|
||||
Changes to the file take effect on next request (cache is not long-lived).
|
||||
|
||||
---
|
||||
|
||||
## API
|
||||
|
||||
### `GET /v1/risk/service/{service}?env=prod&window_hours=24`
|
||||
|
||||
Returns a `RiskReport`:
|
||||
|
||||
```json
|
||||
{
|
||||
"service": "gateway",
|
||||
"env": "prod",
|
||||
"score": 72,
|
||||
"band": "high",
|
||||
"thresholds": { "warn_at": 50, "fail_at": 75 },
|
||||
"components": {
|
||||
"open_incidents": { "P0": 0, "P1": 1, "P2": 2, "points": 45 },
|
||||
"recurrence": { "high_signatures_7d": 1, "points": 20 },
|
||||
"followups": { "overdue_P1": 1, "points": 12 },
|
||||
"slo": { "violations": 1, "points": 10 },
|
||||
"alerts_loop": { "violations": 0, "points": 0 },
|
||||
"escalations": { "count_24h": 1, "points": 5 }
|
||||
},
|
||||
"reasons": [
|
||||
"Open P1 incident(s): 1",
|
||||
"High recurrence signatures (7d): 1",
|
||||
"Overdue follow-ups (P1): 1",
|
||||
"Active SLO violation(s) in window: 1",
|
||||
"Escalations in last 24h: 1"
|
||||
],
|
||||
"recommendations": [
|
||||
"Prioritize open P0/P1 incidents before deploying.",
|
||||
"Investigate recurring failure patterns.",
|
||||
"Avoid risky deploys until SLO violation clears.",
|
||||
"Service is high-risk — coordinate with oncall before release."
|
||||
],
|
||||
"updated_at": "2026-02-23T12:00:00"
|
||||
}
|
||||
```
|
||||
|
||||
RBAC required: `tools.risk.read` (granted to `agent_cto`, `agent_oncall`, `agent_monitor`).
|
||||
|
||||
### `GET /v1/risk/dashboard?env=prod&top_n=10`
|
||||
|
||||
Returns top-N services by score with band summary:
|
||||
|
||||
```json
|
||||
{
|
||||
"env": "prod",
|
||||
"generated_at": "...",
|
||||
"total_services": 4,
|
||||
"band_counts": { "critical": 1, "high": 1, "medium": 2, "low": 0 },
|
||||
"critical_p0_services": ["gateway"],
|
||||
"services": [ ...RiskReports sorted by score desc... ]
|
||||
}
|
||||
```
|
||||
|
||||
### Tool: `risk_engine_tool`
|
||||
|
||||
```json
|
||||
{ "action": "service", "service": "gateway", "env": "prod" }
|
||||
{ "action": "dashboard", "env": "prod", "top_n": 10 }
|
||||
{ "action": "policy" }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Release Gate: `risk_watch`
|
||||
|
||||
The `risk_watch` gate integrates Risk Index into the release pipeline.
|
||||
|
||||
### Behaviour
|
||||
|
||||
| Mode | When score ≥ warn_at (default 50) | When score ≥ fail_at (default 80) |
|
||||
|--------|------------------------------------|-------------------------------------|
|
||||
| warn | pass=true + recommendations added | pass=true + recommendations added |
|
||||
| strict | pass=true + recommendations added | **pass=false** — deploy blocked |
|
||||
|
||||
### Policy
|
||||
|
||||
```yaml
|
||||
# config/release_gate_policy.yml
|
||||
dev:
|
||||
risk_watch: { mode: "warn" }
|
||||
staging:
|
||||
risk_watch: { mode: "strict" } # blocks p0_services when score >= fail_at
|
||||
prod:
|
||||
risk_watch: { mode: "warn" }
|
||||
```
|
||||
|
||||
### Non-fatal guarantee
|
||||
|
||||
If the Risk Engine is unavailable (store down, timeout, error), `risk_watch` is **skipped** — never blocks. A warning is added to the gate output.
|
||||
|
||||
### Release inputs
|
||||
|
||||
| Input | Type | Default | Description |
|
||||
|--------------------|---------|---------|----------------------------------------------|
|
||||
| `run_risk_watch` | boolean | true | Enable/disable the gate |
|
||||
| `risk_watch_env` | string | prod | Env to score against |
|
||||
| `risk_watch_warn_at` | int | policy | Override warn threshold |
|
||||
| `risk_watch_fail_at` | int | policy | Override fail threshold |
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
[Incident Store]──open incidents──┐
|
||||
[Intelligence]──recurrence 7d/30d─┤
|
||||
[Followups Summary]──overdue──────┤──► risk_engine.py ──► RiskReport
|
||||
[SLO Snapshot]──violations────────┤ │
|
||||
[Alert Store]──loop SLO───────────┤ score_to_band
|
||||
[Decision Events]──escalations────┘ │
|
||||
release_check_runner
|
||||
risk_watch gate
|
||||
```
|
||||
|
||||
The engine has **zero LLM calls**. It is deterministic: given the same signals, the same score is always produced.
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
pytest tests/test_risk_engine.py # scoring + bands + overrides
|
||||
pytest tests/test_risk_dashboard.py # sorting + band counts + p0 detection
|
||||
pytest tests/test_release_check_risk_watch.py # warn/strict/non-fatal gate
|
||||
```
|
||||
75
docs/runbook/release-evidence-template.md
Normal file
75
docs/runbook/release-evidence-template.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Release Evidence Template (Sofiia Console)
|
||||
|
||||
Заповнювати після кожного релізу. Мета: мати короткий, відтворюваний артефакт виконаних дій і перевірок.
|
||||
|
||||
## 1) Release metadata
|
||||
|
||||
- Release ID:
|
||||
- Date/Time UTC:
|
||||
- Date/Time Europe/Kyiv:
|
||||
- Operator:
|
||||
- Target nodes: `NODA1` / `NODA2`
|
||||
- Deployed SHAs:
|
||||
- `sofiia-console`:
|
||||
- `router`:
|
||||
- `gateway`:
|
||||
- `memory-service`:
|
||||
- Change summary (1-3 bullets):
|
||||
-
|
||||
|
||||
## 2) Preflight results
|
||||
|
||||
- Command:
|
||||
- `bash ops/preflight_sofiia_console.sh`
|
||||
- `STRICT=1 bash ops/preflight_sofiia_console.sh` (prod window)
|
||||
- Status: `PASS` / `FAIL`
|
||||
- WARN summary (if any):
|
||||
-
|
||||
|
||||
## 3) Deploy steps performed
|
||||
|
||||
- NODA2 precheck: `OK` / `FAIL`
|
||||
- Notes:
|
||||
- NODA1 rollout: `OK` / `FAIL`
|
||||
- Method (docker/systemd/manual):
|
||||
- Notes:
|
||||
- NODA2 finalize: `OK` / `FAIL`
|
||||
- Notes:
|
||||
|
||||
## 4) Smoke evidence
|
||||
|
||||
- `GET /api/health`: status code / result
|
||||
- `GET /metrics`: reachable `yes/no`
|
||||
- Idempotency A/B smoke:
|
||||
- Command: `bash ops/redis_idempotency_smoke.sh`
|
||||
- Result: `PASS` / `FAIL`
|
||||
- `message_id`:
|
||||
- `/api/audit` auth checks:
|
||||
- without key -> `401` confirmed: `yes/no`
|
||||
- with key -> `200` confirmed: `yes/no`
|
||||
|
||||
## 5) Post-release checks
|
||||
|
||||
- Key metrics deltas (optional):
|
||||
- `sofiia_rate_limited_total`:
|
||||
- `sofiia_idempotency_replays_total`:
|
||||
- Audit write/read quick check: `OK` / `FAIL`
|
||||
- Retention dry-run:
|
||||
- Command: `python3 ops/prune_audit_db.py --dry-run`
|
||||
- `candidates=`:
|
||||
- Notes:
|
||||
|
||||
## 6) Rollback plan & outcome
|
||||
|
||||
- Rollback needed: `no` / `yes`
|
||||
- If yes:
|
||||
- reason:
|
||||
- rollback commands used:
|
||||
- result:
|
||||
- Final service state: `healthy` / `degraded`
|
||||
|
||||
## 7) Sign-off
|
||||
|
||||
- Reviewer / approver:
|
||||
- Timestamp UTC:
|
||||
- Notes:
|
||||
175
docs/runbook/sofiia-console-ops.md
Normal file
175
docs/runbook/sofiia-console-ops.md
Normal file
@@ -0,0 +1,175 @@
|
||||
# Sofiia Console — Operations Runbook
|
||||
|
||||
## 1. Rebuild & Deploy (NODA2)
|
||||
|
||||
```bash
|
||||
cd /opt/microdao-daarion # or ~/github-projects/microdao-daarion on dev
|
||||
|
||||
# Rebuild sofiia-console (UI + backend)
|
||||
docker compose -f docker-compose.node2-sofiia.yml build sofiia-console --no-cache
|
||||
docker compose -f docker-compose.node2-sofiia.yml up -d sofiia-console
|
||||
|
||||
# Rebuild gateway (for agent registry changes)
|
||||
docker compose -f docker-compose.node2-sofiia.yml build gateway --no-cache
|
||||
docker compose -f docker-compose.node2-sofiia.yml up -d gateway
|
||||
```
|
||||
|
||||
## 2. Confirm Build Version
|
||||
|
||||
```bash
|
||||
# Via API
|
||||
APIKEY=$(grep SOFIIA_CONSOLE_API_KEY .env | cut -d= -f2)
|
||||
curl -s http://localhost:8002/api/meta/version -H "X-API-Key: $APIKEY"
|
||||
# Expected: {"version":"0.4.0","build_sha":"dev","build_time":"local",...}
|
||||
|
||||
# In UI: header shows "v0.4.0 dev" badge (top right)
|
||||
```
|
||||
|
||||
## 3. Verify Agents List
|
||||
|
||||
```bash
|
||||
APIKEY=$(grep SOFIIA_CONSOLE_API_KEY .env | cut -d= -f2)
|
||||
|
||||
# NODA2 agents
|
||||
curl -s "http://localhost:8002/api/agents?nodes=NODA2" -H "X-API-Key: $APIKEY" | \
|
||||
python3 -c "import sys,json; d=json.load(sys.stdin); print(f'items={len(d[\"items\"])} stats={d[\"stats\"]} errors={d[\"node_errors\"]}')"
|
||||
|
||||
# NODA1 agents
|
||||
curl -s "http://localhost:8002/api/agents?nodes=NODA1" -H "X-API-Key: $APIKEY" | \
|
||||
python3 -c "import sys,json; d=json.load(sys.stdin); print(f'items={len(d[\"items\"])} stats={d[\"stats\"]} errors={d[\"node_errors\"]}')"
|
||||
|
||||
# All nodes
|
||||
curl -s "http://localhost:8002/api/agents?nodes=NODA1,NODA2" -H "X-API-Key: $APIKEY" | \
|
||||
python3 -c "import sys,json; d=json.load(sys.stdin); print(f'items={len(d[\"items\"])} stats={d[\"stats\"]} errors={d[\"node_errors\"]}')"
|
||||
|
||||
# Direct gateway check (NODA2)
|
||||
curl -s http://localhost:9300/health | python3 -c "
|
||||
import sys,json; d=json.load(sys.stdin)
|
||||
print(f'agents={d[\"agents_count\"]}')
|
||||
for k,v in sorted(d[\"agents\"].items()): print(f' {k}: badges={v.get(\"badges\",[])}')
|
||||
"
|
||||
```
|
||||
|
||||
## 4. UI Debug Panel
|
||||
|
||||
У вкладці **📁 Проєкти → Agents**:
|
||||
1. Натисніть кнопку **🔍 Debug** в панелі дій
|
||||
2. Debug panel показує:
|
||||
- `fetch`: час останнього запиту
|
||||
- `nodes`: вибрані ноди
|
||||
- `items`: кількість агентів
|
||||
- `ok/total`: кількість успішних нод
|
||||
- `errors`: помилки нод (якщо є)
|
||||
|
||||
## 5. Troubleshooting
|
||||
|
||||
### Агенти не відображаються в UI
|
||||
|
||||
1. Перевірте API ключ у налаштуваннях UI
|
||||
2. Натисніть **↻ Sync**
|
||||
3. Відкрийте **🔍 Debug** — перевірте `errors`
|
||||
4. Перевірте gateway health: `curl http://localhost:9300/health`
|
||||
|
||||
### Gateway падає при старті
|
||||
|
||||
```bash
|
||||
docker logs dagi-gateway-node2 --tail 50
|
||||
```
|
||||
|
||||
Типова причина: ImportError у `http_api_doc.py` → `doc_service.py`
|
||||
Рішення: перевірте що в `doc_service.py` є stub-функції (doc_service, update_document, list_document_versions, publish_document_artifact).
|
||||
|
||||
### SQLite "no such column: last_applied_hash"
|
||||
|
||||
БД у volume має стару схему. Вирішення — міграції виконуються автоматично при старті через `_MIGRATION_SQL_STMTS` у `db.py`. Restart контейнера вирішує:
|
||||
```bash
|
||||
docker restart sofiia-console
|
||||
```
|
||||
|
||||
### NODA2 gateway_url недоступний з контейнера
|
||||
|
||||
У `config/nodes_registry.yml` NODA2 використовує `host.docker.internal:9300`.
|
||||
Якщо UI запущений не в Docker — замініть на `localhost:9300`.
|
||||
|
||||
### Monitor / AISTALK не відображаються
|
||||
|
||||
Перевірте що в `gateway-bot/http_api.py`:
|
||||
- `MONITOR_CONFIG` і `AISTALK_CONFIG` визначені через `load_agent_config`
|
||||
- Вони додані в `AGENT_REGISTRY`
|
||||
- Файл `gateway-bot/monitor_prompt.txt` існує
|
||||
|
||||
```bash
|
||||
docker exec dagi-gateway-node2 python3 -c "
|
||||
from http_api import AGENT_REGISTRY
|
||||
print(list(AGENT_REGISTRY.keys()))
|
||||
"
|
||||
```
|
||||
|
||||
## 6. Monitor Policy
|
||||
|
||||
Monitor (`agent_id=monitor`) є **обов'язковим** агентом на кожній ноді.
|
||||
|
||||
### Перевірка
|
||||
```bash
|
||||
APIKEY=$(grep SOFIIA_CONSOLE_API_KEY .env | cut -d= -f2)
|
||||
curl -s "http://localhost:8002/api/agents?nodes=NODA1,NODA2" -H "X-API-Key: $APIKEY" | \
|
||||
python3 -c "import sys,json; d=json.load(sys.stdin); print('missing:', d.get('required_missing_nodes'))"
|
||||
```
|
||||
|
||||
- `required_missing=[]` — все ОК
|
||||
- `required_missing=[{"node_id":"NODA1","agent_id":"monitor"}]` — Monitor відсутній на NODA1 → перевірте gateway registry → rebuild gateway
|
||||
|
||||
### Governance event
|
||||
Якщо Monitor відсутній на онлайн-ноді — автоматично записується `governance_event` типу `node_required_agent_missing` (severity=high).
|
||||
|
||||
## 7. Voice & Telegram Capabilities
|
||||
|
||||
У вкладці Agents:
|
||||
- **🎙 Voice** badge — агент підтримує голос (AISTALK)
|
||||
- **💬 Telegram** badge — агент активний у Telegram
|
||||
- Фільтри **🎙 Voice** і **💬 Telegram** — client-side фільтрація
|
||||
|
||||
### API
|
||||
```bash
|
||||
curl -s "http://localhost:8002/api/agents?nodes=NODA1" -H "X-API-Key: $APIKEY" | \
|
||||
python3 -c "import sys,json; d=json.load(sys.stdin);
|
||||
voice=[a['agent_id'] for a in d['items'] if a.get('capabilities',{}).get('voice')]
|
||||
print('voice:', voice)"
|
||||
```
|
||||
|
||||
## 8. Document Versioning
|
||||
|
||||
API для версій документів (в межах Sofiia Console):
|
||||
```bash
|
||||
# Список версій
|
||||
GET /api/projects/{project_id}/documents/{doc_id}/versions
|
||||
|
||||
# Оновити документ (зберігає нову версію)
|
||||
POST /api/projects/{project_id}/documents/{doc_id}/update
|
||||
{"content_md": "# Новий зміст", "author_id": "user", "reason": "оновлення", "dry_run": false}
|
||||
|
||||
# Відновити версію
|
||||
POST /api/projects/{project_id}/documents/{doc_id}/restore
|
||||
{"version_id": "...", "author_id": "user"}
|
||||
```
|
||||
|
||||
## 9. Agent Registry SSoT
|
||||
|
||||
Canonical реєстр: `config/agent_registry.yml`
|
||||
|
||||
Gateway завантажує агентів з `gateway-bot/http_api.py::AGENT_REGISTRY` (Python dict).
|
||||
Щоб додати нового агента:
|
||||
1. Додайте запис в `config/agent_registry.yml`
|
||||
2. Додайте `*_CONFIG = load_agent_config(...)` і запис в `AGENT_REGISTRY` у `gateway-bot/http_api.py`
|
||||
3. Створіть `gateway-bot/<agent_id>_prompt.txt`
|
||||
4. Rebuild gateway
|
||||
|
||||
## 10. Ports Reference
|
||||
|
||||
| Сервіс | Port | URL |
|
||||
|---|---|---|
|
||||
| Sofiia Console UI | 8002 | http://localhost:8002 |
|
||||
| Gateway | 9300 | http://localhost:9300/health |
|
||||
| Router | 9102 | http://localhost:9102/health |
|
||||
| Memory | 8000 | http://localhost:8000/health |
|
||||
| Qdrant | 6333 | http://localhost:6333/healthz |
|
||||
285
docs/runbook/sofiia-control-plane.md
Normal file
285
docs/runbook/sofiia-control-plane.md
Normal file
@@ -0,0 +1,285 @@
|
||||
# Sofiia Control Plane — Operations Runbook
|
||||
|
||||
Version: 1.0
|
||||
Date: 2026-02-25
|
||||
|
||||
---
|
||||
|
||||
## Architecture: Two-Plane Model
|
||||
|
||||
```
|
||||
┌─────────────────────────────────┐ ┌─────────────────────────────────┐
|
||||
│ NODA2 (MacBook) │ │ NODA1 (Production) │
|
||||
│ CONTROL PLANE │ │ RUNTIME PLANE │
|
||||
│ │ │ │
|
||||
│ sofiia-console BFF :8002 ────────→ │ router/gateway :8000/:9300 │
|
||||
│ memory-service UI :8000 │ │ postgres, qdrant stores │
|
||||
│ Ollama :11434 │ │ cron jobs (governance) │
|
||||
│ WebSocket /ws/events │ │ alert/incident/risk pipelines │
|
||||
│ │ │ │
|
||||
│ Operator interacts here │ │ Production traffic runs here │
|
||||
└─────────────────────────────────┘ └─────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Rule: All operator actions go through NODA2 BFF
|
||||
|
||||
The BFF on NODA2 proxies requests to NODA1 router/governance. You never call NODA1 directly from the browser.
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### NODA2 (sofiia-console BFF)
|
||||
|
||||
| Variable | Default | Description |
|
||||
|---|---|---|
|
||||
| `PORT` | `8002` | BFF listen port |
|
||||
| `ENV` | `dev` | `dev\|staging\|prod` — controls CORS strictness, auth enforcement |
|
||||
| `SOFIIA_CONSOLE_API_KEY` | `""` | Bearer auth for write endpoints. Mandatory in prod. |
|
||||
| `MEMORY_SERVICE_URL` | `http://localhost:8000` | Memory service URL (STT/TTS/memory) |
|
||||
| `OLLAMA_URL` | `http://localhost:11434` | Ollama URL for local LLM |
|
||||
| `CORS_ORIGINS` | `""` | Comma-separated allowed origins. Empty = `*` in dev. |
|
||||
| `SUPERVISOR_API_KEY` | `""` | Key for router/governance calls |
|
||||
| `NODES_POLL_INTERVAL_SEC` | `30` | How often BFF polls nodes for telemetry |
|
||||
| `AISTALK_ENABLED` | `false` | Enable AISTALK adapter |
|
||||
| `AISTALK_URL` | `""` | AISTALK bridge URL |
|
||||
| `BUILD_ID` | `local` | Git SHA or build ID (set in CI/CD) |
|
||||
| `CONFIG_DIR` | auto-detect | Path to `config/` directory with `nodes_registry.yml` |
|
||||
|
||||
### NODA1 (router/governance)
|
||||
|
||||
| Variable | Description |
|
||||
|---|---|
|
||||
| `ALERT_BACKEND` | Must be `postgres` in production (not `memory`) |
|
||||
| `AUDIT_BACKEND` | `auto\|jsonl\|postgres` |
|
||||
| `GOV_CRON_FILE` | Path to cron file, default `/etc/cron.d/daarion-governance` |
|
||||
|
||||
---
|
||||
|
||||
## Starting Services
|
||||
|
||||
### NODA2 — Start BFF
|
||||
|
||||
```bash
|
||||
cd services/sofiia-console
|
||||
source .venv/bin/activate
|
||||
uvicorn app.main:app --host 0.0.0.0 --port 8002 --reload
|
||||
```
|
||||
|
||||
Or via Docker Compose:
|
||||
```bash
|
||||
docker-compose -f docker-compose.node2-sofiia.yml up -d
|
||||
```
|
||||
|
||||
### NODA2 — Check status
|
||||
|
||||
```bash
|
||||
curl http://localhost:8002/api/health
|
||||
curl http://localhost:8002/api/status/full
|
||||
```
|
||||
|
||||
Expected: `service: "sofiia-console"`, `version: "0.3.x"`.
|
||||
|
||||
### Accessing the UI
|
||||
|
||||
```
|
||||
http://localhost:8000/ui ← memory-service serves sofiia-ui.html
|
||||
```
|
||||
|
||||
The UI auto-connects to BFF at `http://localhost:8002` (configurable in Settings tab).
|
||||
|
||||
---
|
||||
|
||||
## Nodes Registry
|
||||
|
||||
Edit `config/nodes_registry.yml` to add/modify nodes:
|
||||
|
||||
```yaml
|
||||
nodes:
|
||||
NODA1:
|
||||
label: "Production (NODA1)"
|
||||
router_url: "http://<noda1-ip>:9102"
|
||||
gateway_url: "http://<noda1-ip>:9300"
|
||||
|
||||
NODA2:
|
||||
label: "Control Plane (NODA2)"
|
||||
router_url: "http://localhost:8000"
|
||||
monitor_url: "http://localhost:8000"
|
||||
```
|
||||
|
||||
**Environment overrides** (no need to edit YAML in prod):
|
||||
```bash
|
||||
export NODES_NODA1_ROUTER_URL=http://10.0.0.5:9102
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Monitor Agent on Nodes
|
||||
|
||||
The BFF probes each node at `GET /monitor/status` (falls back to `/healthz`).
|
||||
|
||||
### Implementing `/monitor/status` on a node
|
||||
|
||||
Add this endpoint to the node's router or a dedicated lightweight service:
|
||||
|
||||
```json
|
||||
GET /monitor/status → 200 OK
|
||||
{
|
||||
"online": true,
|
||||
"ts": "2026-02-25T10:00:00Z",
|
||||
"node_id": "NODA1",
|
||||
"heartbeat_age_s": 5,
|
||||
"router": {"ok": true, "latency_ms": 12},
|
||||
"gateway": {"ok": true, "latency_ms": 8},
|
||||
"alerts_loop_slo": {
|
||||
"p95_ms": 320,
|
||||
"failed_rate": 0.0
|
||||
},
|
||||
"open_incidents": 2,
|
||||
"backends": {
|
||||
"alerts": "postgres",
|
||||
"audit": "auto",
|
||||
"incidents": "auto",
|
||||
"risk_history": "auto",
|
||||
"backlog": "auto"
|
||||
},
|
||||
"last_artifacts": {
|
||||
"risk_digest": "2026-02-24",
|
||||
"platform_digest": "2026-W08",
|
||||
"backlog": "2026-02-24"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
If `/monitor/status` is not available, BFF synthesises partial data from `/healthz`.
|
||||
|
||||
---
|
||||
|
||||
## Parity Verification
|
||||
|
||||
Run after every deploy to both nodes:
|
||||
|
||||
```bash
|
||||
# NODA2 alone
|
||||
python3 ops/scripts/verify_sofiia_stack.py \
|
||||
--node NODA2 \
|
||||
--bff-url http://localhost:8002 \
|
||||
--router-url http://localhost:8000 \
|
||||
--env dev
|
||||
|
||||
# NODA1 from NODA2 (parity check)
|
||||
python3 ops/scripts/verify_sofiia_stack.py \
|
||||
--node NODA1 \
|
||||
--bff-url http://<noda1>:8002 \
|
||||
--router-url http://<noda1>:9102 \
|
||||
--compare-with http://localhost:8002 \
|
||||
--compare-node NODA2 \
|
||||
--env prod
|
||||
|
||||
# JSON output for CI
|
||||
python3 ops/scripts/verify_sofiia_stack.py --json | jq .pass
|
||||
```
|
||||
|
||||
Exit 0 = PASS. Exit 1 = critical failure.
|
||||
|
||||
### Critical PASS requirements (prod)
|
||||
|
||||
- `router_health` — router responds 200
|
||||
- `bff_health` — BFF identifies as `sofiia-console`
|
||||
- `bff_status_full` — router + memory reachable
|
||||
- `alerts_backend != memory` — must be postgres in prod/staging
|
||||
|
||||
---
|
||||
|
||||
## WebSocket Events
|
||||
|
||||
Connect to WS for real-time monitoring:
|
||||
|
||||
```bash
|
||||
# Using wscat (npm install -g wscat)
|
||||
wscat -c ws://localhost:8002/ws/events
|
||||
|
||||
# Or via Python
|
||||
python3 -c "
|
||||
import asyncio, json, websockets
|
||||
async def f():
|
||||
async with websockets.connect('ws://localhost:8002/ws/events') as ws:
|
||||
async for msg in ws:
|
||||
print(json.loads(msg)['type'])
|
||||
asyncio.run(f())
|
||||
"
|
||||
```
|
||||
|
||||
Event types: `chat.message`, `chat.reply`, `voice.stt`, `voice.tts`, `ops.run`, `nodes.status`, `error`.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### BFF won't start: `ModuleNotFoundError`
|
||||
```bash
|
||||
pip install -r services/sofiia-console/requirements.txt
|
||||
```
|
||||
|
||||
### UI shows "BFF: ✗"
|
||||
1. Check BFF is running: `curl http://localhost:8002/api/health`
|
||||
2. Check Settings tab → BFF URL points to correct host
|
||||
3. Check CORS: BFF URL must match `CORS_ORIGINS` in prod
|
||||
|
||||
### Router shows "offline" in Nodes
|
||||
1. NODA1 router might not be running: `docker ps | grep router`
|
||||
2. Check `config/nodes_registry.yml` router_url
|
||||
3. Override: `export NODES_NODA1_ROUTER_URL=http://<correct-ip>:9102`
|
||||
|
||||
### STT/TTS not working
|
||||
1. Check memory-service is running: `curl http://localhost:8000/health`
|
||||
2. Check `MEMORY_SERVICE_URL` in BFF env
|
||||
3. Check browser has microphone permission
|
||||
|
||||
### Alerts backend is "memory" (should be postgres)
|
||||
In prod/staging, set:
|
||||
```bash
|
||||
export ALERT_BACKEND=postgres
|
||||
```
|
||||
Then restart the governance/router service.
|
||||
|
||||
### Cron jobs not running
|
||||
```bash
|
||||
# Check cron file
|
||||
cat /etc/cron.d/daarion-governance
|
||||
|
||||
# Manual trigger (example)
|
||||
cd /path/to/daarion && python3 -m services.router.risk_engine snapshot
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## AISTALK Integration
|
||||
|
||||
See `docs/aistalk/contract.md` for full integration contract.
|
||||
|
||||
Quick enable:
|
||||
```bash
|
||||
export AISTALK_ENABLED=true
|
||||
export AISTALK_URL=http://<aistalk-bridge>:PORT
|
||||
# Restart BFF
|
||||
```
|
||||
|
||||
Status check:
|
||||
```bash
|
||||
curl http://localhost:8002/api/status/full | jq .bff.aistalk_enabled
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Definition of Done Checklist
|
||||
|
||||
- [ ] `verify_sofiia_stack.py` PASS on NODA2 (dev)
|
||||
- [ ] `verify_sofiia_stack.py` PASS on NODA1 (prod) — router + BFF + alerts=postgres
|
||||
- [ ] `--compare-with` parity PASS between NODA1 and NODA2
|
||||
- [ ] Nodes dashboard shows real-time data (online/latency/incidents)
|
||||
- [ ] Ops tab: release_check runs and shows result
|
||||
- [ ] Voice: STT → chat → TTS roundtrip works without looping
|
||||
- [ ] WS Events tab shows `chat.reply`, `voice.stt`, `nodes.status`
|
||||
- [ ] `SOFIIA_CONSOLE_API_KEY` set on NODA1 (prod)
|
||||
- [ ] `ALERT_BACKEND=postgres` on NODA1 (prod)
|
||||
194
docs/sofiia_ui_vnext_audit.md
Normal file
194
docs/sofiia_ui_vnext_audit.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# Sofiia UI vNext — Audit Report
|
||||
|
||||
> Generated: 2026-02-26 | Scope: file uploads, document DB, session memory, dialog map
|
||||
|
||||
---
|
||||
|
||||
## 1. Existing Infrastructure (What We Reuse)
|
||||
|
||||
### Document Processing — `gateway-bot/services/doc_service.py`
|
||||
Fully working channel-agnostic document service:
|
||||
- `parse_document()` → Swapper `/document` endpoint → markdown/text
|
||||
- `ingest_document()` → Router `POST /v1/documents/ingest` → Qdrant chunks
|
||||
- `ask_about_document()` → RAG query via Router
|
||||
- `extract_summary_from_bytes()` — local extraction for XLSX/CSV/PDF
|
||||
|
||||
Supported formats (from gateway-bot/http_api.py):
|
||||
`.pdf .doc .docx .rtf .odt .txt .md .csv .tsv .xls .xlsx .xlsm .ods`
|
||||
|
||||
**Plan:** sofiia-console proxies uploads to Router `/v1/documents/ingest` (same path as Telegram).
|
||||
|
||||
### Storage on NODA2 (`docker-compose.memory-node2.yml`)
|
||||
| Storage | Container | Port | Notes |
|
||||
|---|---|---|---|
|
||||
| PostgreSQL 16 | `dagi-postgres-node2` | 5433 | DB: `daarion_memory`, tables: sofiia_messages etc. |
|
||||
| Qdrant 1.12.4 | `dagi-qdrant-node2` | 6333 | Collections: memories, sofiia_messages, sofiia_summaries |
|
||||
| Neo4j 5.15 | `dagi-neo4j-node2` | 7687 | Available for Phase 2 dialog graph |
|
||||
|
||||
### Memory Service Endpoints (Reusable)
|
||||
- `POST /agents/{agent_id}/memory` — save chat turn → Postgres + Qdrant + Neo4j
|
||||
- `GET /agents/{agent_id}/memory` — retrieve recent events
|
||||
- `POST /threads` / `GET /threads/{id}` — conversation threads
|
||||
- `POST /memories` — long-term memory with semantic search
|
||||
- `POST /retrieve` — vector search across memories
|
||||
- `POST /facts/upsert` / `GET /facts/{key}` — key-value store
|
||||
|
||||
### sofiia-console (What Already Exists)
|
||||
- `_do_save_memory()` — auto-saves every chat turn to Memory Service
|
||||
- `GET /api/memory/context` — retrieves context for session
|
||||
- `POST /api/voice/stt` — file upload (multipart) → memory-service STT
|
||||
- `session_id`, `project_id`, `user_id` — already in request model
|
||||
|
||||
---
|
||||
|
||||
## 2. What Is Missing (What We Build)
|
||||
|
||||
| Component | Status | Plan |
|
||||
|---|---|---|
|
||||
| sofiia-console `DATABASE_URL` | ❌ MISSING | Add to docker-compose + SQLite fallback |
|
||||
| `POST /api/files/upload` | ❌ MISSING | Build in sofiia-console BFF |
|
||||
| `projects` table | ❌ MISSING | SQLite (Phase 1), Postgres (Phase 2) |
|
||||
| `documents` table | ❌ MISSING | SQLite + metadata |
|
||||
| `sessions` table | ❌ MISSING | SQLite + `started_at`, `last_active` |
|
||||
| `messages` table | ❌ MISSING | SQLite + `parent_msg_id` for branching |
|
||||
| `GET /api/chat/history` | ❌ MISSING | Load messages from SQLite |
|
||||
| Projects sidebar UI | ❌ MISSING | Left panel in index.html |
|
||||
| Dialog Map (tree) | ❌ MISSING | Collapsible tree + branching |
|
||||
| Upload UI button | ❌ MISSING | Paperclip icon in chat bar |
|
||||
|
||||
---
|
||||
|
||||
## 3. Architecture Decision: SQLite First
|
||||
|
||||
**Rationale:** sofiia-console currently has no DB. Adding a new Postgres connection
|
||||
requires network config changes and service dependency. SQLite:
|
||||
- Zero infra changes (just a volume mount)
|
||||
- Works immediately in Docker
|
||||
- Can migrate to Postgres later via `aiosqlite` → `asyncpg`
|
||||
- Sufficient for 1 user (operator) console workload
|
||||
|
||||
**Phase 2:** `DATABASE_URL=postgresql://...` env override → same schema via asyncpg.
|
||||
|
||||
---
|
||||
|
||||
## 4. Storage Schema (Phase 1)
|
||||
|
||||
```sql
|
||||
-- projects
|
||||
CREATE TABLE projects (
|
||||
project_id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
description TEXT DEFAULT '',
|
||||
created_at TEXT NOT NULL, -- ISO8601
|
||||
updated_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
-- documents
|
||||
CREATE TABLE documents (
|
||||
doc_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL REFERENCES projects(project_id),
|
||||
file_id TEXT NOT NULL,
|
||||
sha256 TEXT NOT NULL,
|
||||
mime TEXT NOT NULL,
|
||||
size_bytes INTEGER NOT NULL,
|
||||
filename TEXT NOT NULL,
|
||||
title TEXT DEFAULT '',
|
||||
tags TEXT DEFAULT '[]', -- JSON array
|
||||
created_at TEXT NOT NULL,
|
||||
extracted_text TEXT DEFAULT '' -- first 4KB preview
|
||||
);
|
||||
|
||||
-- sessions
|
||||
CREATE TABLE sessions (
|
||||
session_id TEXT PRIMARY KEY,
|
||||
project_id TEXT NOT NULL REFERENCES projects(project_id),
|
||||
title TEXT DEFAULT '',
|
||||
started_at TEXT NOT NULL,
|
||||
last_active TEXT NOT NULL,
|
||||
turn_count INTEGER DEFAULT 0
|
||||
);
|
||||
|
||||
-- messages (with branching via parent_msg_id)
|
||||
CREATE TABLE messages (
|
||||
msg_id TEXT PRIMARY KEY,
|
||||
session_id TEXT NOT NULL REFERENCES sessions(session_id),
|
||||
role TEXT NOT NULL, -- "user" | "assistant"
|
||||
content TEXT NOT NULL,
|
||||
ts TEXT NOT NULL, -- ISO8601
|
||||
parent_msg_id TEXT, -- NULL for first message; enables branching
|
||||
branch_label TEXT DEFAULT '' -- "main" | "branch-1" | etc.
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. File Upload Architecture
|
||||
|
||||
```
|
||||
Browser → POST /api/files/upload (multipart)
|
||||
↓
|
||||
BFF: validate mime + size
|
||||
↓
|
||||
Save to ./data/uploads/{sha256[:2]}/{sha256}_{filename}
|
||||
↓
|
||||
Extract text (pdf/docx/txt/md via python libs or Router OCR)
|
||||
↓
|
||||
Store metadata in documents table
|
||||
↓
|
||||
POST /v1/documents/ingest → Qdrant (async, best-effort)
|
||||
↓
|
||||
Return: {file_id, sha256, mime, size, preview_text, doc_id}
|
||||
```
|
||||
|
||||
Size limits (env-configurable):
|
||||
| Type | Env | Default |
|
||||
|---|---|---|
|
||||
| Images | `UPLOAD_MAX_IMAGE_MB` | 10 MB |
|
||||
| Videos | `UPLOAD_MAX_VIDEO_MB` | 200 MB |
|
||||
| Docs | `UPLOAD_MAX_DOC_MB` | 50 MB |
|
||||
|
||||
---
|
||||
|
||||
## 6. Session Persistence Strategy
|
||||
|
||||
**Current:** session_id generated on each `/api/chat/send` → not persisted between page loads.
|
||||
|
||||
**Phase 1 Fix:**
|
||||
1. Browser stores `session_id` in `localStorage`
|
||||
2. BFF `GET /api/sessions/{session_id}` checks if session exists → load last N messages
|
||||
3. New `/api/chat/send` saves messages to SQLite `messages` table
|
||||
4. `GET /api/chat/history?session_id=...&limit=50` returns ordered messages
|
||||
|
||||
---
|
||||
|
||||
## 7. Dialog Map (Phase 1: Tree View)
|
||||
|
||||
**Not a full graph canvas** — collapsible tree in UI:
|
||||
- Each session = root node
|
||||
- Each assistant turn = child node
|
||||
- "Fork from message" creates a new branch (new `session_id` with `parent_msg_id`)
|
||||
- UI renders as nested `<details>` tree, no canvas required
|
||||
- `GET /api/sessions/{session_id}/map` returns `{nodes, edges}` JSON
|
||||
|
||||
**Phase 2:** Upgrade to D3.js force-directed graph or Cytoscape.js when Neo4j available.
|
||||
|
||||
---
|
||||
|
||||
## 8. Integration Hooks (Phase 2 Flags)
|
||||
|
||||
```python
|
||||
USE_FABRIC_OCR = os.getenv("USE_FABRIC_OCR", "false").lower() == "true"
|
||||
USE_EMBEDDINGS = os.getenv("USE_EMBEDDINGS", "false").lower() == "true"
|
||||
```
|
||||
|
||||
- `USE_FABRIC_OCR=true` → images/PDFs go through Router `/v1/capability/ocr`
|
||||
- `USE_EMBEDDINGS=true` → extracted text indexed in Qdrant via Memory Service
|
||||
|
||||
---
|
||||
|
||||
## 9. Constraints
|
||||
|
||||
- Access: localhost-only by default (Docker port binding `127.0.0.1:8002:8002`)
|
||||
- Secrets: never stored in upload files or exposed in API responses
|
||||
- Filename sanitization: `secure_filename()` + sha256 as storage key (no path traversal)
|
||||
- Content-type: validated server-side via `python-magic` or file header bytes (not just extension)
|
||||
98
docs/spacebot/README.md
Normal file
98
docs/spacebot/README.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Spacebot — Sofiia Telegram Agent
|
||||
|
||||
Spacebot — це Rust-based multi-agent framework від Spacedrive. Використовується як Telegram-фронтенд для агента Sofiia в екосистемі DAARION.
|
||||
|
||||
- GitHub: https://github.com/spacedriveapp/spacebot
|
||||
- Версія: v0.1.15
|
||||
- Telegram bot: @SofiiaDaarionbot
|
||||
|
||||
## Архітектура
|
||||
|
||||
```
|
||||
[Telegram] ←→ [Spacebot (Rust)] ←→ [GLM-5 / Grok 4.1]
|
||||
↕
|
||||
LanceDB (vector memory)
|
||||
SOUL.md / IDENTITY.md / USER.md
|
||||
```
|
||||
|
||||
## Встановлення (перший раз)
|
||||
|
||||
### Залежності
|
||||
|
||||
```bash
|
||||
brew install rust protobuf cmake
|
||||
curl -fsSL https://bun.sh/install | bash
|
||||
```
|
||||
|
||||
### Збірка з вихідного коду
|
||||
|
||||
```bash
|
||||
git clone --depth=1 https://github.com/spacedriveapp/spacebot.git ~/github-projects/spacebot
|
||||
cd ~/github-projects/spacebot
|
||||
cargo build --release # ~7-20 хвилин
|
||||
```
|
||||
|
||||
### Конфіг
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.spacebot
|
||||
cp docs/spacebot/config.toml.example ~/.spacebot/config.toml
|
||||
# Відредагуй ~/.spacebot/config.toml — виправ модель і ключі якщо потрібно
|
||||
```
|
||||
|
||||
Ключі зберігаються в `.env` проекту:
|
||||
- `SOFIIA_TELEGRAM_BOT_TOKEN` — токен бота @SofiiaDaarionbot
|
||||
- `ZHIPU_API_KEY` / `GLM5_API_KEY` — GLM-5 (Zhipu AI)
|
||||
- `XAI_API_KEY` — Grok (xAI), fallback
|
||||
|
||||
### Ідентичність агента
|
||||
|
||||
Файли в `~/.spacebot/agents/sofiia/workspace/`:
|
||||
- `IDENTITY.md` — хто такий агент, контекст DAARION, засновник
|
||||
- `SOUL.md` — стиль спілкування, мова, межі
|
||||
- `USER.md` — інформація про Повелителя Хаосу / Іван Титар
|
||||
|
||||
## Управління
|
||||
|
||||
```bash
|
||||
# Запуск
|
||||
./ops/scripts/start_spacebot.sh start
|
||||
|
||||
# Статус
|
||||
./ops/scripts/start_spacebot.sh status
|
||||
|
||||
# Live логи
|
||||
./ops/scripts/start_spacebot.sh logs
|
||||
|
||||
# Перезапуск (після змін конфігу або identity файлів)
|
||||
./ops/scripts/start_spacebot.sh restart
|
||||
|
||||
# Зупинка
|
||||
./ops/scripts/start_spacebot.sh stop
|
||||
```
|
||||
|
||||
## Важливо перед запуском
|
||||
|
||||
Якщо бот раніше використовував webhook (наприклад через gateway.daarion.city), треба видалити його:
|
||||
|
||||
```bash
|
||||
source .env
|
||||
curl "https://api.telegram.org/bot${SOFIIA_TELEGRAM_BOT_TOKEN}/deleteWebhook?drop_pending_updates=true"
|
||||
```
|
||||
|
||||
## Моделі (поточні)
|
||||
|
||||
| Призначення | Модель | Provider |
|
||||
|-------------|--------|----------|
|
||||
| channel (чат) | glm-5 | Zhipu AI |
|
||||
| branch (задачі) | glm-5 | Zhipu AI |
|
||||
| worker (фон) | glm-4.5-air | Zhipu AI |
|
||||
| cortex (память) | glm-4.7 | Zhipu AI |
|
||||
| fallback | grok-4-1 / grok-4-1-mini | xAI |
|
||||
|
||||
## Де логи
|
||||
|
||||
```
|
||||
~/.spacebot/logs/spacebot.log.YYYY-MM-DD
|
||||
~/.spacebot/agents/sofiia/ — workspace, memory, lancedb
|
||||
```
|
||||
95
docs/spacebot/config.toml.example
Normal file
95
docs/spacebot/config.toml.example
Normal file
@@ -0,0 +1,95 @@
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Spacebot config for DAARION / Sofiia agent
|
||||
# Powered by: GLM-5 (Zhipu primary), xAI Grok (fallback), Ollama (local)
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# ── LLM Providers ────────────────────────────────────────────────────────────
|
||||
[llm]
|
||||
zhipu_key = "env:GLM5_API_KEY"
|
||||
xai_key = "env:XAI_API_KEY"
|
||||
|
||||
# Sofiia BFF as custom OpenAI-compatible provider
|
||||
[llm.provider.sofiia_bff]
|
||||
api_type = "openai_completions"
|
||||
base_url = "http://localhost:8002/api"
|
||||
api_key = "env:SOFIIA_CONSOLE_API_KEY"
|
||||
name = "Sofiia BFF (DAARION)"
|
||||
|
||||
# Local Ollama
|
||||
[llm.provider.ollama]
|
||||
api_type = "openai_completions"
|
||||
base_url = "http://localhost:11434"
|
||||
api_key = "ollama"
|
||||
name = "Ollama Local"
|
||||
|
||||
# ── Instance Defaults ─────────────────────────────────────────────────────────
|
||||
[defaults]
|
||||
max_concurrent_branches = 4
|
||||
max_turns = 8
|
||||
context_window = 131072
|
||||
history_backfill_count = 30
|
||||
worker_log_mode = "errors_only"
|
||||
cron_timezone = "Europe/Kyiv"
|
||||
|
||||
# Primary: GLM-5 (Zhipu) — найкращий варіант для DAARION (є підписка)
|
||||
# Fallback: Grok (xAI) — швидкий і потужний
|
||||
[defaults.routing]
|
||||
channel = "zhipu/glm-5"
|
||||
branch = "zhipu/glm-5"
|
||||
worker = "zhipu/glm-4.5-air"
|
||||
compactor = "zhipu/glm-4.5-air"
|
||||
cortex = "zhipu/glm-4.7"
|
||||
rate_limit_cooldown_secs = 30
|
||||
|
||||
[defaults.routing.task_overrides]
|
||||
coding = "zhipu/glm-5"
|
||||
|
||||
[defaults.routing.fallbacks]
|
||||
"zhipu/glm-5" = ["xai/grok-4-1-mini", "zhipu/glm-4.7"]
|
||||
"zhipu/glm-4.7" = ["xai/grok-4-1-mini"]
|
||||
"zhipu/glm-4.5-air" = ["zhipu/glm-4.5"]
|
||||
|
||||
# Prompt complexity routing — cheap models for simple requests
|
||||
[defaults.routing.prompt_routing]
|
||||
enabled = true
|
||||
process_types = ["channel", "branch"]
|
||||
|
||||
# ── Messaging ─────────────────────────────────────────────────────────────────
|
||||
[messaging.telegram]
|
||||
enabled = true
|
||||
token = "env:SOFIIA_TELEGRAM_BOT_TOKEN"
|
||||
dm_allowed_users = [] # populated after first /getUpdates with Ivan's user_id
|
||||
|
||||
# ── Agents ───────────────────────────────────────────────────────────────────
|
||||
[[agents]]
|
||||
id = "sofiia"
|
||||
display_name = "Sofiia"
|
||||
|
||||
[agents.identity]
|
||||
name = "Sofiia"
|
||||
description = """
|
||||
Ти Sofiia — Chief AI Architect та Technical Sovereign екосистеми DAARION.city.
|
||||
|
||||
Засновник та головний архітектор DAARION: Повелитель Хаосу (офіційно — Іван Титар).
|
||||
Він є єдиним, хто має повний контроль над платформою.
|
||||
|
||||
Ноди: NODA1 (production runtime), NODA2 (control plane), NODA3 (AI/ML).
|
||||
|
||||
Відповідай українською. Технічні терміни (API, SLO, backend, deploy, incident тощо) залишай англійською.
|
||||
Будь конкретною, структурованою, без зайвих вступів. Не галюцинуй.
|
||||
"""
|
||||
|
||||
[agents.routing]
|
||||
channel = "zhipu/glm-5"
|
||||
branch = "zhipu/glm-5"
|
||||
worker = "zhipu/glm-4.5-air"
|
||||
compactor = "zhipu/glm-4.5-air"
|
||||
|
||||
[agents.routing.fallbacks]
|
||||
"zhipu/glm-5" = ["xai/grok-4-1", "xai/grok-4-1-mini"]
|
||||
|
||||
# ── Bindings: Telegram → Sofiia ───────────────────────────────────────────────
|
||||
[[bindings]]
|
||||
agent_id = "sofiia"
|
||||
channel = "telegram"
|
||||
# group_ids = [] # додати ID групи якщо потрібно
|
||||
264
docs/supervisor/langgraph_supervisor.md
Normal file
264
docs/supervisor/langgraph_supervisor.md
Normal file
@@ -0,0 +1,264 @@
|
||||
# Sofiia Supervisor — LangGraph Orchestration Service
|
||||
|
||||
**Location**: NODA2 | **Port**: 8084 (external) → 8080 (container)
|
||||
**State backend**: Redis (`sofiia-redis:6379`)
|
||||
**Gateway**: `http://router:8000/v1/tools/execute`
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Caller (Telegram/UI/API)
|
||||
│
|
||||
▼
|
||||
sofiia-supervisor:8084 ──── POST /v1/graphs/{name}/runs
|
||||
│ GET /v1/runs/{run_id}
|
||||
│ POST /v1/runs/{run_id}/cancel
|
||||
│
|
||||
▼ (LangGraph nodes)
|
||||
GatewayClient ──────────────→ router:8000/v1/tools/execute
|
||||
│ │
|
||||
│ ▼ (ToolGovernance)
|
||||
│ RBAC check → limits → redact → audit
|
||||
│ │
|
||||
│ ToolManager.execute_tool(...)
|
||||
│
|
||||
▼
|
||||
sofiia-redis ←── RunRecord + RunEvents (no payload)
|
||||
```
|
||||
|
||||
**Key invariants:**
|
||||
- LangGraph nodes have **no direct access** to internal services
|
||||
- All tool calls go through `router → ToolGovernance → ToolManager`
|
||||
- `graph_run_id` is propagated in every gateway request metadata
|
||||
- Logs contain **hash + sizes only** (no payload content)
|
||||
|
||||
---
|
||||
|
||||
## Graphs
|
||||
|
||||
### `release_check`
|
||||
|
||||
Runs the DAARION release_check pipeline via `job_orchestrator_tool`.
|
||||
|
||||
**Nodes**: `start_job` → `poll_job` (loop) → `finalize` → END
|
||||
|
||||
**Input** (`input` field of StartRunRequest):
|
||||
|
||||
| Field | Type | Default | Description |
|
||||
|---|---|---|---|
|
||||
| `service_name` | string | `"unknown"` | Service being released |
|
||||
| `diff_text` | string | `""` | Git diff text |
|
||||
| `fail_fast` | bool | `true` | Stop on first gate failure |
|
||||
| `run_deps` | bool | `true` | Run dependency scan gate |
|
||||
| `run_drift` | bool | `true` | Run drift analysis gate |
|
||||
| `run_smoke` | bool | `false` | Run smoke tests |
|
||||
| `deps_targets` | array | `["python","node"]` | Ecosystems for dep scan |
|
||||
| `deps_vuln_mode` | string | `"offline_cache"` | OSV mode |
|
||||
| `deps_fail_on` | array | `["CRITICAL","HIGH"]` | Blocking severity |
|
||||
| `drift_categories` | array | all | Drift analysis categories |
|
||||
| `risk_profile` | string | `"default"` | Risk profile |
|
||||
| `timeouts.overall_sec` | number | `180` | Total timeout |
|
||||
|
||||
**Output** (in `result`): Same as `release_check_runner.py`:
|
||||
```json
|
||||
{
|
||||
"pass": true,
|
||||
"gates": [{"name": "pr_review", "status": "pass"}, ...],
|
||||
"recommendations": [],
|
||||
"summary": "All 5 gates passed.",
|
||||
"elapsed_ms": 4200
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `incident_triage`
|
||||
|
||||
Collects observability data, logs, health, and runbooks to build a triage report.
|
||||
|
||||
**Nodes**: `validate_input` → `service_overview` → `top_errors_logs` → `health_and_runbooks` → `trace_lookup` → `build_triage_report` → END
|
||||
|
||||
**Input**:
|
||||
|
||||
| Field | Type | Default | Description |
|
||||
|---|---|---|---|
|
||||
| `service` | string | — | Service name (required) |
|
||||
| `symptom` | string | — | Brief incident description (required) |
|
||||
| `time_range.from` | ISO | -1h | Start of analysis window |
|
||||
| `time_range.to` | ISO | now | End of analysis window |
|
||||
| `env` | string | `"prod"` | Environment |
|
||||
| `include_traces` | bool | `false` | Look up traces from log IDs |
|
||||
| `max_log_lines` | int | `120` | Log lines to analyse (max 200) |
|
||||
| `log_query_hint` | string | auto | Custom log query filter |
|
||||
|
||||
**Time window**: Clamped to 24h max (`INCIDENT_MAX_TIME_WINDOW_H`).
|
||||
|
||||
**Output** (in `result`):
|
||||
```json
|
||||
{
|
||||
"summary": "...",
|
||||
"suspected_root_causes": [{"rank": 1, "cause": "...", "evidence": [...]}],
|
||||
"impact_assessment": "SLO impact: error_rate=2.1%",
|
||||
"mitigations_now": ["Increase DB pool size", "..."],
|
||||
"next_checks": ["Verify healthz", "..."],
|
||||
"references": {
|
||||
"metrics": {"slo": {...}, "alerts_count": 1},
|
||||
"log_samples": ["..."],
|
||||
"runbook_snippets": [{"path": "...", "text": "..."}],
|
||||
"traces": {"traces": [...]}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deployment on NODA2
|
||||
|
||||
### Quick start
|
||||
|
||||
```bash
|
||||
# On NODA2 host
|
||||
cd /path/to/microdao-daarion
|
||||
|
||||
# Start supervisor + redis (attaches to existing dagi-network-node2)
|
||||
docker compose \
|
||||
-f docker-compose.node2.yml \
|
||||
-f docker-compose.node2-sofiia-supervisor.yml \
|
||||
up -d sofiia-supervisor sofiia-redis
|
||||
|
||||
# Verify
|
||||
curl http://localhost:8084/healthz
|
||||
```
|
||||
|
||||
### Environment variables
|
||||
|
||||
Copy `.env.example` and set:
|
||||
|
||||
```bash
|
||||
cp services/sofiia-supervisor/.env.example .env
|
||||
# Edit:
|
||||
# GATEWAY_BASE_URL=http://router:8000 (must be accessible from container)
|
||||
# SUPERVISOR_API_KEY=<key-for-router> (matches SUPERVISOR_API_KEY in router)
|
||||
# SUPERVISOR_INTERNAL_KEY=<key-to-protect-supervisor-api>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## HTTP API
|
||||
|
||||
All endpoints require `Authorization: Bearer <SUPERVISOR_INTERNAL_KEY>` if `SUPERVISOR_INTERNAL_KEY` is set.
|
||||
|
||||
### Start a run
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8084/v1/graphs/release_check/runs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"workspace_id": "daarion",
|
||||
"user_id": "sofiia",
|
||||
"agent_id": "sofiia",
|
||||
"input": {
|
||||
"service_name": "router",
|
||||
"run_deps": true,
|
||||
"run_drift": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{"run_id": "gr_3a1b2c...", "status": "queued", "result": null}
|
||||
```
|
||||
|
||||
### Poll for result
|
||||
|
||||
```bash
|
||||
curl http://localhost:8084/v1/runs/gr_3a1b2c...
|
||||
```
|
||||
|
||||
Response (when complete):
|
||||
```json
|
||||
{
|
||||
"run_id": "gr_3a1b2c...",
|
||||
"graph": "release_check",
|
||||
"status": "succeeded",
|
||||
"started_at": "2026-02-23T10:00:00+00:00",
|
||||
"finished_at": "2026-02-23T10:00:45+00:00",
|
||||
"result": {"pass": true, "gates": [...], "summary": "..."},
|
||||
"events": [
|
||||
{"ts": "...", "type": "node_start", "node": "graph_start", "details": {...}},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Start incident triage
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8084/v1/graphs/incident_triage/runs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"workspace_id": "daarion",
|
||||
"user_id": "helion",
|
||||
"agent_id": "sofiia",
|
||||
"input": {
|
||||
"service": "router",
|
||||
"symptom": "High error rate after deploy",
|
||||
"env": "prod",
|
||||
"include_traces": true,
|
||||
"time_range": {"from": "2026-02-23T09:00:00Z", "to": "2026-02-23T10:00:00Z"}
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### Cancel a run
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8084/v1/runs/gr_3a1b2c.../cancel
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Connecting to Sofiia (Telegram / internal UI)
|
||||
|
||||
The supervisor exposes a REST API. To invoke from Sofiia's tool loop:
|
||||
|
||||
1. The gateway `job_orchestrator_tool` can be extended with a `start_supervisor_run` action that calls `POST http://sofiia-supervisor:8080/v1/graphs/{name}/runs`.
|
||||
2. Alternatively, call the supervisor directly from the Telegram bot's backend (if on the same network).
|
||||
|
||||
Example flow for Telegram → Sofiia → Supervisor → Release Check:
|
||||
```
|
||||
User: "Run release check for router"
|
||||
→ Sofiia LLM → job_orchestrator_tool(start_task, release_check)
|
||||
→ Router: job_orchestrator_tool dispatches to release_check_runner
|
||||
→ Returns report (existing flow, unchanged)
|
||||
```
|
||||
|
||||
For **async long-running** workflows (>30s), use the supervisor directly:
|
||||
```
|
||||
User: "Triage production incident for router"
|
||||
→ Sofiia LLM → [http call] POST /v1/graphs/incident_triage/runs
|
||||
→ Returns run_id
|
||||
→ Sofiia polls GET /v1/runs/{run_id} (or user asks again)
|
||||
→ Returns structured triage report
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Security
|
||||
|
||||
- `SUPERVISOR_INTERNAL_KEY`: Protects supervisor HTTP API (recommend: network-level isolation instead)
|
||||
- `SUPERVISOR_API_KEY` → sent to router's `/v1/tools/execute` as `Authorization: Bearer`
|
||||
- Router's `SUPERVISOR_API_KEY` guards direct tool execution endpoint
|
||||
- All RBAC/limits/audit enforced by router's `ToolGovernance` — supervisor cannot bypass them
|
||||
- LangGraph nodes have **no credentials or secrets** — only `workspace_id/user_id/agent_id`
|
||||
|
||||
---
|
||||
|
||||
## State TTL and cleanup
|
||||
|
||||
Runs are stored in Redis with TTL = `RUN_TTL_SEC` (default 24h). After TTL expires, the run metadata is automatically removed.
|
||||
|
||||
To extend TTL for important runs, call `backend.save_run(run)` with a new timestamp (planned: admin endpoint).
|
||||
87
docs/supervisor/postmortem_draft_graph.md
Normal file
87
docs/supervisor/postmortem_draft_graph.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# Postmortem Draft Graph
|
||||
|
||||
## Overview
|
||||
|
||||
The `postmortem_draft_graph` is a LangGraph workflow on the Sofiia Supervisor (NODA2) that generates structured postmortem drafts from incident data.
|
||||
|
||||
## Flow
|
||||
|
||||
```
|
||||
validate → load_incident → ensure_triage → draft_postmortem
|
||||
→ attach_artifacts → append_followups → build_result → END
|
||||
```
|
||||
|
||||
1. **validate** — checks `incident_id` is provided.
|
||||
2. **load_incident** — calls `oncall_tool.incident_get` via gateway.
|
||||
3. **ensure_triage** — if no `triage_report` artifact exists, generates one by calling observability/health/KB tools.
|
||||
4. **draft_postmortem** — builds a deterministic markdown + JSON postmortem using a structured template.
|
||||
5. **attach_artifacts** — uploads `postmortem_draft.md`, `postmortem_draft.json` (and optionally `triage_report.json`) via `oncall_tool.incident_attach_artifact`.
|
||||
6. **append_followups** — creates `followup` timeline events from the postmortem.
|
||||
7. **build_result** — returns the final output.
|
||||
|
||||
## API
|
||||
|
||||
### Start run
|
||||
|
||||
```bash
|
||||
curl -X POST http://supervisor:8000/v1/graphs/postmortem_draft/runs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"workspace_id": "default",
|
||||
"user_id": "admin",
|
||||
"agent_id": "sofiia",
|
||||
"input": {
|
||||
"incident_id": "inc_20260223_1000_abc123",
|
||||
"service": "router",
|
||||
"env": "prod",
|
||||
"include_traces": false
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### Input
|
||||
|
||||
| Field | Type | Required | Description |
|
||||
|-------|------|----------|-------------|
|
||||
| incident_id | string | Yes | Existing incident ID |
|
||||
| service | string | No | Override service (defaults to incident's service) |
|
||||
| env | string | No | Environment (default: prod) |
|
||||
| time_range | object | No | `{"from": "ISO", "to": "ISO"}` (defaults to incident timestamps) |
|
||||
| include_traces | bool | No | Include trace lookup in triage (default: false) |
|
||||
|
||||
### Output
|
||||
|
||||
```json
|
||||
{
|
||||
"incident_id": "inc_...",
|
||||
"artifacts_count": 3,
|
||||
"artifacts": [...],
|
||||
"followups_count": 4,
|
||||
"triage_was_generated": true,
|
||||
"markdown_preview": "# Postmortem: Router OOM\n..."
|
||||
}
|
||||
```
|
||||
|
||||
## Postmortem Template
|
||||
|
||||
The generated markdown includes:
|
||||
|
||||
- **Summary** — from triage report
|
||||
- **Impact** — SLO/health assessment
|
||||
- **Detection** — when/how the incident was reported
|
||||
- **Timeline** — from incident events
|
||||
- **Root Cause Analysis** — from triage suspected causes
|
||||
- **Mitigations Applied** — from triage/runbooks
|
||||
- **Follow-ups** — action items extracted from triage
|
||||
- **Prevention** — standard recommendations
|
||||
|
||||
## Error Handling
|
||||
|
||||
- Incident not found → `graph_status: "failed"`
|
||||
- Gateway errors during triage generation → non-fatal (uses partial data)
|
||||
- Follow-up append errors → non-fatal (graph still succeeds)
|
||||
- All tool calls go through gateway (RBAC/audit enforced)
|
||||
|
||||
## Correlation
|
||||
|
||||
Every tool call includes `graph_run_id` in metadata for full traceability.
|
||||
233
docs/tools/contract_tool.md
Normal file
233
docs/tools/contract_tool.md
Normal file
@@ -0,0 +1,233 @@
|
||||
# Contract Tool (OpenAPI/JSON Schema) - Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
Contract Tool validates OpenAPI 3.x specifications and detects breaking changes between API versions. Essential for release gates and API governance.
|
||||
|
||||
## Integration
|
||||
|
||||
### Tool Definition
|
||||
|
||||
Registered in `services/router/tool_manager.py`:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "contract_tool",
|
||||
"description": "📜 Перевірка OpenAPI контрактів...",
|
||||
"parameters": {...}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### RBAC Configuration
|
||||
|
||||
Added to `FULL_STANDARD_STACK` in `services/router/agent_tools_config.py`.
|
||||
|
||||
## Request Format
|
||||
|
||||
### `POST /v1/tools/contract-check`
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "lint_openapi | diff_openapi | generate_client_stub",
|
||||
"inputs": {
|
||||
"format": "openapi_json | openapi_yaml",
|
||||
"base": {
|
||||
"source": "text",
|
||||
"value": "..."
|
||||
},
|
||||
"head": {
|
||||
"source": "text",
|
||||
"value": "..."
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"fail_on_breaking": true,
|
||||
"strict": true,
|
||||
"max_chars": 800000,
|
||||
"service_name": "my-service"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Actions
|
||||
|
||||
### 1. lint_openapi
|
||||
|
||||
Static quality checks on OpenAPI specification.
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "lint_openapi",
|
||||
"inputs": {
|
||||
"format": "openapi_yaml",
|
||||
"base": {
|
||||
"source": "text",
|
||||
"value": "openapi: 3.0.0\npaths:\n /users:\n get:\n operationId: getUsers..."
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Lint Rules:**
|
||||
|
||||
| Severity | Rule | Description |
|
||||
|----------|------|-------------|
|
||||
| Error | Missing operationId | Every endpoint must have operationId |
|
||||
| Warning | Missing requestBody | POST/PUT should have requestBody |
|
||||
| Warning | No 2xx response | Success responses required |
|
||||
| Warning | Unresolved $ref | External references not allowed |
|
||||
| Info | Missing description | Critical endpoints need descriptions |
|
||||
|
||||
### 2. diff_openapi
|
||||
|
||||
Compare two OpenAPI specs and classify changes.
|
||||
|
||||
**Breaking Changes:**
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| endpoint_removed | Endpoint or method removed |
|
||||
| param_removed | Parameter removed |
|
||||
| required_added | Required parameter/field added |
|
||||
| required_field_added | Required schema field added |
|
||||
| response_shape_changed | Response schema changed |
|
||||
| auth_changed | Auth requirements changed |
|
||||
| enum_narrowed | Enum values removed |
|
||||
| schema_incompatible | Type changed |
|
||||
|
||||
**Non-Breaking Changes:**
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| endpoint_added | New endpoint |
|
||||
| param_optional_added | Optional parameter added |
|
||||
| description_updated | Description changed |
|
||||
| schema_extended | Optional fields added |
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "diff_openapi",
|
||||
"inputs": {
|
||||
"format": "openapi_yaml",
|
||||
"base": {"source": "text", "value": "..."},
|
||||
"head": {"source": "text", "value": "..."}
|
||||
},
|
||||
"options": {
|
||||
"fail_on_breaking": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. generate_client_stub
|
||||
|
||||
Generate Python client stub from OpenAPI spec.
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "generate_client_stub",
|
||||
"inputs": {
|
||||
"format": "openapi_yaml",
|
||||
"base": {"source": "text", "value": "..."}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"language": "python",
|
||||
"client_stub": "class UserAPIClient:\n def getUsers(self): ...",
|
||||
"info": {"title": "User API", "version": "1.0.0", "endpoints": 5}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Response Format
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "succeeded",
|
||||
"data": {
|
||||
"summary": "🚫 2 breaking change(s) detected",
|
||||
"breaking": [
|
||||
{
|
||||
"id": "OAC-001",
|
||||
"type": "endpoint_removed",
|
||||
"path": "/v1/users",
|
||||
"method": "DELETE",
|
||||
"location": "paths./v1/users.delete",
|
||||
"why_it_breaks": "Endpoint was removed",
|
||||
"suggested_fix": "Deprecate instead of removing"
|
||||
}
|
||||
],
|
||||
"non_breaking": [...],
|
||||
"lint": [...],
|
||||
"compat_score": {
|
||||
"breaking_count": 2,
|
||||
"warnings": 1,
|
||||
"coverage": 75
|
||||
},
|
||||
"release_checklist": [...]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Security Features
|
||||
|
||||
### Logging Policy
|
||||
- **NEVER** logs full OpenAPI specs
|
||||
- Only logs: hash (first 16 chars), spec size, service name
|
||||
|
||||
### Limits
|
||||
- `max_chars`: Default 800KB
|
||||
- Parse timeout: 30 seconds
|
||||
|
||||
## Release Checklist
|
||||
|
||||
Generated automatically for diff:
|
||||
|
||||
1. Breaking changes detected → requires version bump
|
||||
2. Communicate changes to API consumers
|
||||
3. Update API documentation
|
||||
4. Update client SDKs
|
||||
5. Test with existing clients
|
||||
|
||||
## Example Usage
|
||||
|
||||
### Check for Breaking Changes Before Release
|
||||
```
|
||||
"Перевір чи є breaking changes в API: base=spec-v1.yaml, head=spec-v2.yaml"
|
||||
```
|
||||
|
||||
### Validate OpenAPI Quality
|
||||
```
|
||||
"Зроби lint мого OpenAPI спека"
|
||||
```
|
||||
|
||||
### Generate Client SDK
|
||||
```
|
||||
"Згенеруй Python клієнта для мого API"
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
pytest tools/contract_tool/tests/test_contract_tool.py -v
|
||||
```
|
||||
|
||||
Test coverage:
|
||||
- Endpoint removed → breaking
|
||||
- Required field added → breaking
|
||||
- Optional field added → non-breaking
|
||||
- Enum narrowed → breaking
|
||||
- fail_on_breaking option
|
||||
- max_chars limit enforcement
|
||||
- Python client stub generation
|
||||
266
docs/tools/cost_analyzer_tool.md
Normal file
266
docs/tools/cost_analyzer_tool.md
Normal file
@@ -0,0 +1,266 @@
|
||||
# cost_analyzer_tool — FinOps & Resource Analyzer
|
||||
|
||||
**Категорія:** FinOps / Observability
|
||||
**RBAC:** `tools.cost.read` (report, top, anomalies, weights), `tools.cost.gate` (gate)
|
||||
**Ролі:** `agent_cto` (read + gate), `agent_oncall` (read)
|
||||
**Timeout:** 20 s
|
||||
**Rate limit:** 10 rpm
|
||||
|
||||
---
|
||||
|
||||
## Призначення
|
||||
|
||||
`cost_analyzer_tool` дає CTO/oncall команді відповіді на питання:
|
||||
|
||||
- **Хто спалює ресурси?** (по агентам, tools, workspace)
|
||||
- **Чи є аномальні сплески?** (порівняння вікна з базовим рівнем)
|
||||
- **Які налаштування ваг?** (для FinOps калібрування)
|
||||
|
||||
Всі розрахунки базуються на **відносних cost_units** без реальних грошових значень.
|
||||
Payload ніколи не зберігається і не логується.
|
||||
|
||||
---
|
||||
|
||||
## Actions
|
||||
|
||||
### `report` — агрегований звіт за період
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "report",
|
||||
"time_range": { "from": "2026-02-16T00:00:00Z", "to": "2026-02-23T00:00:00Z" },
|
||||
"group_by": ["tool", "agent_id"],
|
||||
"top_n": 10,
|
||||
"include_failed": true,
|
||||
"include_hourly": false
|
||||
}
|
||||
```
|
||||
|
||||
**Відповідь:**
|
||||
```json
|
||||
{
|
||||
"time_range": { "from": "...", "to": "..." },
|
||||
"totals": {
|
||||
"calls": 1240,
|
||||
"cost_units": 4821.5,
|
||||
"failed": 12,
|
||||
"denied": 3,
|
||||
"error_rate": 0.0097
|
||||
},
|
||||
"breakdowns": {
|
||||
"tool": [
|
||||
{ "tool": "comfy_generate_video", "count": 42, "cost_units": 5200.0, "avg_duration_ms": 8200 },
|
||||
{ "tool": "pr_reviewer_tool", "count": 87, "cost_units": 960.0, ... }
|
||||
],
|
||||
"agent_id": [...]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `top` — швидкий топ-N за вікно (24h/7d)
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "top",
|
||||
"window_hours": 24,
|
||||
"top_n": 10
|
||||
}
|
||||
```
|
||||
|
||||
**Відповідь:** `top_tools`, `top_agents`, `top_users`, `top_workspaces`.
|
||||
|
||||
---
|
||||
|
||||
### `anomalies` — виявлення сплесків
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "anomalies",
|
||||
"window_minutes": 60,
|
||||
"baseline_hours": 24,
|
||||
"ratio_threshold": 3.0,
|
||||
"min_calls": 50
|
||||
}
|
||||
```
|
||||
|
||||
**Алгоритм:**
|
||||
1. Вікно = `[now - window_minutes, now]`
|
||||
2. Базовий рівень = `[now - baseline_hours, now - window_minutes]`
|
||||
3. Spike = `window_rate / baseline_rate >= ratio_threshold` AND `calls >= min_calls`
|
||||
4. Error spike = `error_rate > 10%` AND `calls >= min_calls`
|
||||
|
||||
**Відповідь:**
|
||||
```json
|
||||
{
|
||||
"anomalies": [
|
||||
{
|
||||
"type": "cost_spike",
|
||||
"key": "tool:comfy_generate_image",
|
||||
"tool": "comfy_generate_image",
|
||||
"window": "last_60m",
|
||||
"baseline": "prev_24h",
|
||||
"window_calls": 120,
|
||||
"baseline_calls": 8,
|
||||
"ratio": 6.3,
|
||||
"recommendation": "'comfy_generate_image' cost spike..."
|
||||
}
|
||||
],
|
||||
"anomaly_count": 1,
|
||||
"stats": { "window_calls": 120, "baseline_calls": 8 }
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `weights` — поточні ваги cost model
|
||||
|
||||
```json
|
||||
{ "action": "weights" }
|
||||
```
|
||||
|
||||
Повертає конфіг з `config/cost_weights.yml`: defaults, per-tool weights, anomaly thresholds.
|
||||
|
||||
---
|
||||
|
||||
## Cost Model
|
||||
|
||||
```
|
||||
cost_units = cost_per_call(tool) + duration_ms × cost_per_ms(tool)
|
||||
```
|
||||
|
||||
Це **відносні одиниці**, не реальні $. Калібруйте через `config/cost_weights.yml`.
|
||||
|
||||
| Tool | cost_per_call | cost_per_ms |
|
||||
|------|--------------|-------------|
|
||||
| `comfy_generate_video` | 120.0 | 0.005 |
|
||||
| `comfy_generate_image` | 50.0 | 0.003 |
|
||||
| `pr_reviewer_tool` | 10.0 | 0.002 |
|
||||
| `observability_tool` | 2.0 | 0.001 |
|
||||
| _(default)_ | 1.0 | 0.001 |
|
||||
|
||||
---
|
||||
|
||||
## Audit persistence (AuditStore)
|
||||
|
||||
Кожен tool call через `ToolGovernance.post_call()` автоматично зберігається.
|
||||
|
||||
**Backend (env var `AUDIT_BACKEND`):**
|
||||
|
||||
| Backend | Config | Опис |
|
||||
|---------|--------|------|
|
||||
| `jsonl` (default) | `AUDIT_JSONL_DIR` | Append-only файли по датах: `ops/audit/tool_audit_YYYY-MM-DD.jsonl` |
|
||||
| `postgres` | `DATABASE_URL` | async asyncpg → таблиця `tool_audit_events` |
|
||||
| `memory` | — | In-process (тести, dev) |
|
||||
| `null` | — | Вимкнено |
|
||||
|
||||
**Поля в store** (без payload):
|
||||
```
|
||||
ts, req_id, workspace_id, user_id, agent_id, tool, action,
|
||||
status, duration_ms, in_size, out_size, input_hash,
|
||||
graph_run_id?, graph_node?, job_id?
|
||||
```
|
||||
|
||||
**Non-fatal:** якщо store недоступний — логується warning, tool call не падає.
|
||||
|
||||
---
|
||||
|
||||
## Інтеграція в release_check (cost_watch gate)
|
||||
|
||||
`cost_watch` — **warning-only gate**: завжди `pass=true`, додає рекомендації.
|
||||
|
||||
```yaml
|
||||
# ops/task_registry.yml (release_check inputs)
|
||||
run_cost_watch: true # вмикає gate
|
||||
cost_watch_window_hours: 24 # вікно аналізу
|
||||
cost_spike_ratio_threshold: 3.0
|
||||
cost_min_calls_threshold: 50
|
||||
```
|
||||
|
||||
**Gate output:**
|
||||
```json
|
||||
{
|
||||
"name": "cost_watch",
|
||||
"status": "pass",
|
||||
"anomalies_count": 2,
|
||||
"anomalies_preview": [...],
|
||||
"note": "2 anomaly(ies) detected",
|
||||
"recommendations": ["Cost spike: comfy_generate_image — apply rate limit."]
|
||||
}
|
||||
```
|
||||
|
||||
Якщо `cost_analyzer_tool` недоступний → `skipped: true`, реліз не блокується.
|
||||
|
||||
---
|
||||
|
||||
## RBAC
|
||||
|
||||
```yaml
|
||||
cost_analyzer_tool:
|
||||
actions:
|
||||
report: { entitlements: ["tools.cost.read"] }
|
||||
top: { entitlements: ["tools.cost.read"] }
|
||||
anomalies: { entitlements: ["tools.cost.read"] }
|
||||
weights: { entitlements: ["tools.cost.read"] }
|
||||
gate: { entitlements: ["tools.cost.gate"] }
|
||||
|
||||
role_entitlements:
|
||||
agent_cto: [..., tools.cost.read, tools.cost.gate]
|
||||
agent_oncall: [..., tools.cost.read]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Limits
|
||||
|
||||
```yaml
|
||||
cost_analyzer_tool:
|
||||
timeout_ms: 20000 # 20s
|
||||
max_chars_in: 2000
|
||||
max_bytes_out: 1048576 # 1MB
|
||||
rate_limit_rpm: 10
|
||||
concurrency: 2
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Security
|
||||
|
||||
- Payload НІКОЛИ не зберігається і не логується.
|
||||
- AuditStore writes: тільки hash + sizes + metadata.
|
||||
- Всі aggregation queries фільтруються тільки по метаданим (ts, tool, agent_id, workspace_id).
|
||||
- `anomalies` endpoint не розкриває вміст tool calls.
|
||||
|
||||
---
|
||||
|
||||
## Тести
|
||||
|
||||
`tests/test_cost_analyzer.py` (18 тестів):
|
||||
|
||||
| Тест | Перевірка |
|
||||
|------|-----------|
|
||||
| `test_audit_persist_nonfatal` | Broken store не ламає tool call |
|
||||
| `test_cost_report_aggregation` | 20 events → правильні totals і top |
|
||||
| `test_cost_event_cost_units` | `pr_reviewer` 500ms = 11.0 units |
|
||||
| `test_anomalies_spike_detection` | 80 calls у вікні vs 2 в baseline → spike |
|
||||
| `test_anomalies_no_spike` | Стабільний трафік → 0 anomalies |
|
||||
| `test_top_report` | comfy_generate_video як #1 spender |
|
||||
| `test_release_check_cost_watch_always_passes` | gate pass=True з аномаліями |
|
||||
| `test_cost_watch_gate_in_full_release_check` | full run_release_check зберігає pass |
|
||||
| `test_rbac_cost_tool_deny` | alateya (agent_media) → denied |
|
||||
| `test_rbac_cost_tool_allow` | sofiia (agent_cto) → allowed |
|
||||
| `test_weights_loaded` | cost_weights.yml читається коректно |
|
||||
| `test_jsonl_store_roundtrip` | write + read JSONL |
|
||||
| `test_cost_watch_skipped_on_tool_error` | tool error → gate skipped, не error |
|
||||
| `test_anomalies_error_rate_spike` | 80% failure rate → error_spike |
|
||||
|
||||
---
|
||||
|
||||
## Наступні кроки (після MVP)
|
||||
|
||||
1. **Postgres backend** — для довгострокового зберігання (>7d) і SQL-запитів.
|
||||
2. **Token-level cost** — якщо є метрика LLM tokens → точний $ cost.
|
||||
3. **Budget alerts** — notify oncall при перевищенні щоденного бюджету.
|
||||
4. **Cost dashboard** — Grafana panel на базі `tool_audit_events` table.
|
||||
5. **Per-graph cost** — tracking через `graph_run_id` (вже є в schema).
|
||||
275
docs/tools/data_governance_tool.md
Normal file
275
docs/tools/data_governance_tool.md
Normal file
@@ -0,0 +1,275 @@
|
||||
# data_governance_tool — Data Governance & Privacy
|
||||
|
||||
**Категорія:** Security / Privacy / Compliance
|
||||
**RBAC:** `tools.data_gov.read` (scan_repo, scan_audit, retention_check, policy), `tools.data_gov.gate` (gate)
|
||||
**Ролі:** `agent_cto` (read + gate), `agent_oncall` (read)
|
||||
**Timeout:** 30 s
|
||||
**Rate limit:** 5 rpm
|
||||
|
||||
---
|
||||
|
||||
## Призначення
|
||||
|
||||
`data_governance_tool` — детермінований, read-only сканер для виявлення:
|
||||
|
||||
- **PII в коді/доках** (email, телефон, кредитні картки, паспорти)
|
||||
- **Хардкоджених секретів** (API keys, private keys, токени)
|
||||
- **Ризиків логування** (sensitive fields у logger calls, raw payload в audit records)
|
||||
- **Відсутності retention/TTL** при збереженні даних
|
||||
- **Аномалій в audit-стрімі** (PII у metadata, аномально великі outputs)
|
||||
- **Наявності cleanup-механізмів** (task_registry.yml, runbooks)
|
||||
|
||||
**Перший рівень — warning-only**: gate `privacy_watch` завжди `pass=True`, але генерує конкретні рекомендації.
|
||||
|
||||
---
|
||||
|
||||
## Actions
|
||||
|
||||
### `scan_repo` — статичний аналіз файлів
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "scan_repo",
|
||||
"mode": "fast",
|
||||
"max_files": 200,
|
||||
"paths_include": ["services/", "config/", "ops/"],
|
||||
"paths_exclude": ["**/node_modules/**", "**/*.lock"],
|
||||
"focus": ["pii", "secrets", "logging", "retention"]
|
||||
}
|
||||
```
|
||||
|
||||
**Режими:**
|
||||
- `fast` (default): `.py`, `.yml`, `.yaml`, `.json`, `.sh` — оптимізовано для CI
|
||||
- `full`: всі розширення з `config/data_governance_policy.yml`
|
||||
|
||||
**Категорії перевірок:**
|
||||
|
||||
| ID | Категорія | Severity | Опис |
|
||||
|----|-----------|----------|------|
|
||||
| `DG-PII-001` | pii | warning | Email address |
|
||||
| `DG-PII-002` | pii | warning | Phone number |
|
||||
| `DG-PII-003` | pii | **error** | Credit card |
|
||||
| `DG-PII-004` | pii | warning | Passport-like ID |
|
||||
| `DG-SEC-000` | secrets | **error** | Secret value (inherited from governance) |
|
||||
| `DG-SEC-001` | secrets | **error** | Private key block |
|
||||
| `DG-LOG-001` | logging | warning | Sensitive field in logger call |
|
||||
| `DG-AUD-001` | logging | **error** | Raw payload near audit/log write |
|
||||
| `DG-RET-001` | retention | warning | Storage write без TTL/retention |
|
||||
|
||||
**Відповідь:**
|
||||
```json
|
||||
{
|
||||
"pass": true,
|
||||
"summary": "Scanned 87 files (fast mode). Found 0 errors, 3 warnings, 1 info.",
|
||||
"stats": { "errors": 0, "warnings": 3, "infos": 1, "files_scanned": 87 },
|
||||
"findings": [
|
||||
{
|
||||
"id": "DG-LOG-001",
|
||||
"category": "logging",
|
||||
"severity": "warning",
|
||||
"title": "Potential sensitive field logged in auth.py",
|
||||
"evidence": { "path": "services/router/auth.py", "lines": "L42-L46", "details": "token=***REDACTED***" },
|
||||
"recommended_fix": "Apply redact() before logging. Log hash+last4 for identifiers."
|
||||
}
|
||||
],
|
||||
"recommendations": ["Review logger calls for sensitive fields. Apply redact()..."]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### `scan_audit` — аналіз audit-стріму
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "scan_audit",
|
||||
"backend": "jsonl",
|
||||
"time_window_hours": 24,
|
||||
"max_events": 50000
|
||||
}
|
||||
```
|
||||
|
||||
**Перевірки:**
|
||||
|
||||
| ID | Опис |
|
||||
|----|------|
|
||||
| `DG-AUD-101` | PII-like pattern в полях метадата audit event (user_id, workspace_id) |
|
||||
| `DG-AUD-102` | Аномально великий `out_size` (>64KB за замовчуванням) |
|
||||
|
||||
---
|
||||
|
||||
### `retention_check` — перевірка cleanup-механізмів
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "retention_check",
|
||||
"check_audit_cleanup_task": true,
|
||||
"check_jsonl_rotation": true,
|
||||
"check_memory_retention_docs": true,
|
||||
"check_logs_retention_docs": true
|
||||
}
|
||||
```
|
||||
|
||||
| ID | Severity | Опис |
|
||||
|----|----------|------|
|
||||
| `DG-RET-201` | warning | Не знайдено cleanup task або runbook для audit |
|
||||
| `DG-RET-202` | info | Cleanup/rotation задокументовано |
|
||||
| `DG-RET-203` | info | JSONL rotation реалізовано |
|
||||
| `DG-RET-204` | warning | JSONL rotation не підтверджено |
|
||||
| `DG-RET-205` | info | Memory retention policy не знайдено |
|
||||
| `DG-RET-206` | info | Log retention не задокументовано |
|
||||
|
||||
---
|
||||
|
||||
### `policy` — поточні політики
|
||||
|
||||
```json
|
||||
{ "action": "policy" }
|
||||
```
|
||||
|
||||
Повертає конфіг `config/data_governance_policy.yml`: retention, pii_patterns, logging_rules, severity_behavior.
|
||||
|
||||
---
|
||||
|
||||
## Evidence masking
|
||||
|
||||
**Всі evidence snippets маскуються** перед поверненням:
|
||||
1. Через `redact()` з `tool_governance` (успадковані `_SECRET_PATTERNS`)
|
||||
2. Truncate до 200 символів
|
||||
3. Ніяких raw значень у відповіді
|
||||
|
||||
---
|
||||
|
||||
## Інтеграція в release_check (privacy_watch gate)
|
||||
|
||||
`privacy_watch` — **warning-only gate**: завжди `pass=true`, додає рекомендації.
|
||||
|
||||
```yaml
|
||||
# ops/task_registry.yml (release_check inputs)
|
||||
run_privacy_watch: true # вмикає gate (default: true)
|
||||
privacy_watch_mode: "fast" # fast|full
|
||||
privacy_audit_window_hours: 24 # вікно для scan_audit
|
||||
```
|
||||
|
||||
**Gate output:**
|
||||
```json
|
||||
{
|
||||
"name": "privacy_watch",
|
||||
"status": "pass",
|
||||
"errors": 0,
|
||||
"warnings": 2,
|
||||
"infos": 1,
|
||||
"top_findings": [
|
||||
{ "id": "DG-LOG-001", "title": "...", "severity": "warning" }
|
||||
],
|
||||
"note": "3 finding(s): 0 error(s), 2 warning(s)",
|
||||
"recommendations": ["Review logger calls for sensitive fields."]
|
||||
}
|
||||
```
|
||||
|
||||
Якщо `data_governance_tool` недоступний → `skipped: true`, реліз не блокується.
|
||||
|
||||
---
|
||||
|
||||
## Конфігурація: `config/data_governance_policy.yml`
|
||||
|
||||
```yaml
|
||||
retention:
|
||||
audit_jsonl_days: 30
|
||||
audit_postgres_days: 90
|
||||
large_output_bytes: 65536 # threshold для DG-AUD-102
|
||||
|
||||
pii_patterns:
|
||||
email: { severity: "warning", ... }
|
||||
credit_card: { severity: "error", ... }
|
||||
|
||||
logging_rules:
|
||||
forbid_logging_fields: [password, token, secret, api_key, ...]
|
||||
raw_payload_indicators: [payload, prompt, messages, transcript, ...]
|
||||
redaction_calls: [redact, mask, sanitize, ...]
|
||||
|
||||
severity_behavior:
|
||||
gate_mode: "warning_only" # або "strict" (блокує на error)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## RBAC
|
||||
|
||||
```yaml
|
||||
data_governance_tool:
|
||||
actions:
|
||||
scan_repo: { entitlements: ["tools.data_gov.read"] }
|
||||
scan_audit: { entitlements: ["tools.data_gov.read"] }
|
||||
retention_check: { entitlements: ["tools.data_gov.read"] }
|
||||
policy: { entitlements: ["tools.data_gov.read"] }
|
||||
gate: { entitlements: ["tools.data_gov.gate"] }
|
||||
|
||||
role_entitlements:
|
||||
agent_cto: [..., tools.data_gov.read, tools.data_gov.gate]
|
||||
agent_oncall: [..., tools.data_gov.read]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Limits
|
||||
|
||||
```yaml
|
||||
data_governance_tool:
|
||||
timeout_ms: 30000 # 30s (file I/O + regex)
|
||||
max_chars_in: 3000 # params only
|
||||
max_bytes_out: 1MB
|
||||
rate_limit_rpm: 5
|
||||
concurrency: 1 # serial (filesystem-bound)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Security
|
||||
|
||||
- **Read-only**: ніяких записів, змін, видалень
|
||||
- **Path traversal protection**: всі шляхи перевіряються проти `repo_root`
|
||||
- **Evidence masking**: `redact()` + truncation — raw secrets ніколи не повертаються
|
||||
- **Never-scan list**: `.env`, `.pem`, `.key` файли не читаються
|
||||
- **Lock files excluded** (за замовчуванням): `*.lock` — запобігає false positives від hash-рядків у lock-файлах
|
||||
|
||||
---
|
||||
|
||||
## Тести
|
||||
|
||||
`tests/test_data_governance.py` (22 тести):
|
||||
|
||||
| Тест | Перевірка |
|
||||
|------|-----------|
|
||||
| `test_scan_repo_detects_pii_logging` | Email у logger call → DG-PII-001 |
|
||||
| `test_scan_repo_detects_logging_forbidden_field` | `token=` у logger → DG-LOG-001 |
|
||||
| `test_scan_repo_detects_secret` | Hardcoded API key → DG-SEC-000, masked |
|
||||
| `test_scan_repo_detects_private_key` | `-----BEGIN RSA PRIVATE KEY-----` → error |
|
||||
| `test_scan_repo_detects_credit_card` | 16-digit number → DG-PII-003 error |
|
||||
| `test_scan_repo_no_findings_clean` | Clean code → 0 error findings |
|
||||
| `test_scan_audit_detects_pii_in_meta` | Email у user_id → DG-AUD-101 |
|
||||
| `test_scan_audit_detects_large_output` | 200KB out_size → DG-AUD-102 |
|
||||
| `test_scan_audit_no_findings_for_clean_events` | Normal events → 0 findings |
|
||||
| `test_retention_check_missing_cleanup` | No runbook → DG-RET-201 |
|
||||
| `test_retention_check_with_cleanup` | Runbook mentions cleanup → DG-RET-202 |
|
||||
| `test_scan_repo_raw_payload_audit_write` | `payload` near logger → DG-AUD-001 |
|
||||
| `test_release_check_privacy_watch_integration` | Gate pass=True, adds recs |
|
||||
| `test_privacy_watch_skipped_on_tool_error` | Tool exception → skipped=True |
|
||||
| `test_rbac_deny` | alateya (agent_media) → denied |
|
||||
| `test_rbac_allow` | sofiia (agent_cto) → allowed |
|
||||
| `test_policy_action` | Returns structured policy |
|
||||
| `test_path_traversal_protection` | `../../etc/passwd` → None |
|
||||
| `test_scan_repo_excludes_lock_files` | `*.lock` excluded |
|
||||
| `test_mask_evidence_redacts_secrets` | key=value → masked |
|
||||
| `test_mask_evidence_truncates` | 500 chars → ≤120 |
|
||||
| `test_unknown_action_returns_error` | Invalid action → error dict |
|
||||
|
||||
---
|
||||
|
||||
## Наступні кроки
|
||||
|
||||
1. **`strict` mode** — увімкнути для `credit_card` + `private_key` (блокувати реліз)
|
||||
2. **AST-based analysis** — замість regex: точніший аналіз Python AST для logging calls
|
||||
3. **Git history scan** — перевіряти, чи не були secrets раніше в git history
|
||||
4. **GDPR retention report** — автоматичний звіт для DPO про час зберігання PII по системах
|
||||
5. **Integration з incident_triage** — DG findings у RCA якщо є privacy-related incident
|
||||
203
docs/tools/dependency_scanner_tool.md
Normal file
203
docs/tools/dependency_scanner_tool.md
Normal file
@@ -0,0 +1,203 @@
|
||||
# dependency_scanner_tool
|
||||
|
||||
Scans Python and Node.js dependencies for known vulnerabilities, outdated packages, and license policy violations.
|
||||
Integrates as **Gate 3** in `release_check`.
|
||||
|
||||
---
|
||||
|
||||
## Purpose
|
||||
|
||||
| Concern | Source A | Source B |
|
||||
|---|---|---|
|
||||
| **Vulnerabilities** | OSV.dev database (online or cached) | Pinned deps from lock files |
|
||||
| **Outdated packages** | Fixed versions in OSV findings | Current versions in lock files |
|
||||
| **License policy** | Configured deny/warn list | Package metadata (limited in MVP) |
|
||||
|
||||
---
|
||||
|
||||
## RBAC
|
||||
|
||||
| Entitlement | Grants |
|
||||
|---|---|
|
||||
| `tools.deps.read` | Run scan (agent_cto, agent_oncall) |
|
||||
| `tools.deps.gate` | Gate execution in release_check (agent_cto only) |
|
||||
|
||||
---
|
||||
|
||||
## Limits (`config/tool_limits.yml`)
|
||||
|
||||
| Param | Value |
|
||||
|---|---|
|
||||
| `timeout_ms` | 45 000 ms |
|
||||
| `max_chars_in` | 3 000 |
|
||||
| `max_bytes_out` | 1 048 576 (1 MB) |
|
||||
| `rate_limit_rpm` | 5 |
|
||||
| `concurrency` | 1 |
|
||||
|
||||
---
|
||||
|
||||
## Invocation
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "dependency_scanner_tool",
|
||||
"action": "scan",
|
||||
"targets": ["python", "node"],
|
||||
"vuln_mode": "offline_cache",
|
||||
"fail_on": ["CRITICAL", "HIGH"],
|
||||
"timeout_sec": 40
|
||||
}
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
| Param | Type | Default | Description |
|
||||
|---|---|---|---|
|
||||
| `action` | string | — | Must be `"scan"` |
|
||||
| `targets` | array | `["python","node"]` | Ecosystems to scan |
|
||||
| `vuln_mode` | string | `"offline_cache"` | `"online"` queries api.osv.dev; `"offline_cache"` uses local cache only |
|
||||
| `fail_on` | array | `["CRITICAL","HIGH"]` | Severity levels that block release |
|
||||
| `timeout_sec` | number | `40` | Hard wall-clock timeout |
|
||||
|
||||
---
|
||||
|
||||
## Response
|
||||
|
||||
```json
|
||||
{
|
||||
"pass": true,
|
||||
"summary": "✅ Dependency scan PASSED. 120 deps scanned, 0 vulns found.",
|
||||
"stats": {
|
||||
"ecosystems": ["PyPI", "npm"],
|
||||
"files_scanned": 4,
|
||||
"deps_total": 120,
|
||||
"deps_pinned": 115,
|
||||
"deps_unresolved": 3,
|
||||
"vulns_total": 0,
|
||||
"by_severity": {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0, "UNKNOWN": 0},
|
||||
"outdated_total": 0
|
||||
},
|
||||
"vulnerabilities": [],
|
||||
"outdated": [],
|
||||
"licenses": [],
|
||||
"recommendations": []
|
||||
}
|
||||
```
|
||||
|
||||
### Vulnerability object
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "GHSA-35jh-r3h4-6jhm",
|
||||
"ecosystem": "npm",
|
||||
"package": "lodash",
|
||||
"version": "4.17.20",
|
||||
"severity": "HIGH",
|
||||
"fixed_versions": ["4.17.21"],
|
||||
"aliases": ["CVE-2021-23337"],
|
||||
"evidence": {"file": "services/render-pptx-worker/package-lock.json", "details": "lodash==4.17.20"},
|
||||
"recommendation": "Upgrade lodash from 4.17.20 to 4.17.21"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pass / Fail Rule
|
||||
|
||||
| Condition | Result |
|
||||
|---|---|
|
||||
| Any `CRITICAL` or `HIGH` vuln found | `pass=false` (gate blocks) |
|
||||
| Any denied license found | `pass=false` |
|
||||
| `MEDIUM` vulns only | `pass=true`, added to recommendations |
|
||||
| `UNKNOWN` severity (cache miss) | `pass=true`, recommendation to populate cache |
|
||||
|
||||
---
|
||||
|
||||
## Supported Manifest Files
|
||||
|
||||
### Python (priority order)
|
||||
1. `poetry.lock` — fully resolved versions
|
||||
2. `Pipfile.lock` — resolved versions
|
||||
3. `requirements*.txt` — only `==` pinned lines are scanned; unpinned noted
|
||||
4. `pyproject.toml` — declared deps listed (no version resolution)
|
||||
|
||||
### Node.js (priority order)
|
||||
1. `package-lock.json` (npm v2/v3)
|
||||
2. `pnpm-lock.yaml`
|
||||
3. `yarn.lock`
|
||||
4. `package.json` — only if no lock file present
|
||||
|
||||
---
|
||||
|
||||
## Vulnerability Sources
|
||||
|
||||
### OSV.dev
|
||||
|
||||
**Online mode** (`vuln_mode=online`):
|
||||
- Queries `https://api.osv.dev/v1/querybatch` in batches of 100
|
||||
- Requires entry in `config/network_allowlist.yml` (`dependency_scanner_tool.hosts: api.osv.dev`)
|
||||
- New results are cached to `ops/cache/osv_cache.json`
|
||||
|
||||
**Offline cache mode** (`vuln_mode=offline_cache`, default):
|
||||
- Reads from `ops/cache/osv_cache.json` only
|
||||
- Cache misses → severity `UNKNOWN` (not blocking by default)
|
||||
- No outbound network calls
|
||||
|
||||
**Cache format** (`ops/cache/osv_cache.json`):
|
||||
```json
|
||||
{
|
||||
"version": 1,
|
||||
"updated_at": "...",
|
||||
"entries": {
|
||||
"PyPI:requests:2.31.0": {"vulns": [], "cached_at": "..."},
|
||||
"npm:lodash:4.17.20": {"vulns": [...], "cached_at": "..."}
|
||||
}
|
||||
}
|
||||
```
|
||||
Cache key: `{ecosystem}:{normalized_name}:{version}`
|
||||
|
||||
---
|
||||
|
||||
## Security
|
||||
|
||||
- **Read-only**: scans lock files; no writes (except optional cache update in online mode)
|
||||
- **Evidence redaction**: secrets/tokens masked before inclusion in report
|
||||
- **No payload logging**: only hash of dep list + counts logged to audit trail
|
||||
- **Path traversal protection**: excluded dirs (`node_modules`, `.git`, `.venv`, etc.)
|
||||
- **Size limits**: max 80 files, 2000 deps, 500 vulns enforced in code
|
||||
|
||||
---
|
||||
|
||||
## Integration in release_check
|
||||
|
||||
Gate order: `pr_review` → `config_lint` → **`dependency_scan`** → `contract_diff` → `threat_model` → `smoke` → `drift`
|
||||
|
||||
`release_check` inputs related to this gate:
|
||||
|
||||
| Input | Type | Default | Description |
|
||||
|---|---|---|---|
|
||||
| `run_deps` | boolean | `true` | Enable dependency scan gate |
|
||||
| `deps_targets` | array | `["python","node"]` | Ecosystems |
|
||||
| `deps_vuln_mode` | string | `"offline_cache"` | OSV mode |
|
||||
| `deps_fail_on` | array | `["CRITICAL","HIGH"]` | Blocking severity |
|
||||
| `deps_timeout_sec` | number | `40` | Timeout |
|
||||
|
||||
---
|
||||
|
||||
## Outdated Analysis (lockfile_only mode)
|
||||
|
||||
In MVP, "latest version" is inferred from OSV `fixed_versions` only (no registry lookup).
|
||||
An upgrade is recommended if a fixed version > current version exists in an OSV finding.
|
||||
|
||||
Full latest-version lookup (PyPI/npm registry) is planned as an optional enhancement.
|
||||
|
||||
---
|
||||
|
||||
## Extending the Cache
|
||||
|
||||
To refresh the offline cache:
|
||||
1. Set `vuln_mode: online` in a controlled environment with outbound access to `api.osv.dev`
|
||||
2. Run `dependency_scanner_tool` — new entries are merged into `ops/cache/osv_cache.json`
|
||||
3. Commit the updated cache file
|
||||
|
||||
Or use `ops/scripts/refresh_osv_cache.py` (planned).
|
||||
253
docs/tools/drift_analyzer_tool.md
Normal file
253
docs/tools/drift_analyzer_tool.md
Normal file
@@ -0,0 +1,253 @@
|
||||
# drift_analyzer_tool
|
||||
|
||||
**Drift Analyzer — 6-й gate у release_check**
|
||||
Знаходить розбіжності між "джерелами правди" (docs/inventory/config) та фактичним станом repo.
|
||||
|
||||
---
|
||||
|
||||
## Огляд
|
||||
|
||||
`drift_analyzer_tool` — детерміністичний (без LLM), read-only аналізатор drift у 4 категоріях.
|
||||
|
||||
| Категорія | Джерело правди | Факт | Приклад drift |
|
||||
|-----------|---------------|------|---------------|
|
||||
| **services** | `inventory_services.csv` / `01_SERVICE_CATALOG.md` | `docker-compose*.yml` | DEPLOYED сервіс відсутній у compose |
|
||||
| **openapi** | `docs/contracts/*.openapi.yaml` | FastAPI route decorators у коді | Endpoint у spec але нема в коді |
|
||||
| **nats** | `inventory_nats_topics.csv` | `nc.publish/subscribe` у коді | Subject у коді не задокументований |
|
||||
| **tools** | `config/tools_rollout.yml` + `rbac_tools_matrix.yml` | Handlers у `tool_manager.py` | Tool у rollout але нема handler |
|
||||
|
||||
---
|
||||
|
||||
## Використання
|
||||
|
||||
### Через агента (OpenCode / Telegram)
|
||||
|
||||
```
|
||||
"Запусти drift аналіз"
|
||||
"Перевір drift для категорій tools та openapi"
|
||||
"Drift check перед релізом"
|
||||
```
|
||||
|
||||
### Через execute_tool
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "analyze",
|
||||
"categories": ["services", "openapi", "nats", "tools"],
|
||||
"timeout_sec": 25
|
||||
}
|
||||
```
|
||||
|
||||
### Через release_check (Gate 6, optional)
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "start_task",
|
||||
"params": {
|
||||
"task_id": "release_check",
|
||||
"inputs": {
|
||||
"service_name": "router",
|
||||
"run_drift": true,
|
||||
"drift_categories": ["openapi", "tools"],
|
||||
"drift_timeout_sec": 20
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Параметри
|
||||
|
||||
| Параметр | Тип | Обов'язковий | Опис |
|
||||
|----------|-----|:---:|------|
|
||||
| `action` | `"analyze"` | ✅ | Єдина дія |
|
||||
| `categories` | array | — | Підмножина `["services","openapi","nats","tools"]` (default: всі) |
|
||||
| `timeout_sec` | number | — | Таймаут в секундах (default: 25, max: 30) |
|
||||
|
||||
---
|
||||
|
||||
## Формат відповіді
|
||||
|
||||
```json
|
||||
{
|
||||
"pass": false,
|
||||
"summary": "❌ Drift analysis FAILED. 2 error(s), 1 warning(s).",
|
||||
"stats": {
|
||||
"errors": 2,
|
||||
"warnings": 1,
|
||||
"infos": 0,
|
||||
"skipped": [],
|
||||
"items_checked": {
|
||||
"services": 42,
|
||||
"openapi": 18,
|
||||
"tools": 65
|
||||
},
|
||||
"elapsed_ms": 1234.5,
|
||||
"by_category": { "...": "..." }
|
||||
},
|
||||
"findings": [
|
||||
{
|
||||
"category": "tools",
|
||||
"severity": "error",
|
||||
"id": "DRIFT-TOOLS-001",
|
||||
"title": "Tool 'fake_tool_x' in tools_rollout.yml but no handler in tool_manager.py",
|
||||
"evidence": {
|
||||
"path": "config/tools_rollout.yml",
|
||||
"details": "'fake_tool_x' referenced in rollout groups but missing from KNOWN_TOOL_HANDLERS"
|
||||
},
|
||||
"recommended_fix": "Add handler for 'fake_tool_x' in tool_manager.py execute_tool dispatch, or remove from rollout."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Pass/Fail правило
|
||||
|
||||
| Умова | `pass` |
|
||||
|-------|--------|
|
||||
| Будь-який `severity: error` | `false` |
|
||||
| Тільки `warning` / `info` | `true` |
|
||||
| Категорія відсутня (skipped) | не впливає |
|
||||
|
||||
---
|
||||
|
||||
## Категорії деталі
|
||||
|
||||
### 1. services — Service Catalog vs docker-compose
|
||||
|
||||
**Джерела:**
|
||||
- A: `docs/architecture_inventory/inventory_services.csv` → поле `type` (DEPLOYED/DEFINED/...)
|
||||
- B: всі `docker-compose*.yml` у repo root + `infra/compose/docker-compose.yml`
|
||||
|
||||
**Findings:**
|
||||
|
||||
| ID | Severity | Умова |
|
||||
|----|----------|-------|
|
||||
| `DRIFT-SVC-001` | error | Сервіс `DEPLOYED` у catalog, але відсутній в compose |
|
||||
| `DRIFT-SVC-002` | warning | Сервіс є в compose, але не в catalog |
|
||||
|
||||
**Normalization:** `my-svc` ↔ `my_svc` (dash/underscore equivalence).
|
||||
|
||||
---
|
||||
|
||||
### 2. openapi — API Spec vs Code Routes
|
||||
|
||||
**Джерела:**
|
||||
- A: `docs/contracts/*.openapi.yaml` та будь-які `openapi*.yaml/yml/json` у repo
|
||||
- B: Python файли — `@app.get(...)`, `@router.post(...)`, `.add_api_route(...)`
|
||||
|
||||
**Findings:**
|
||||
|
||||
| ID | Severity | Умова |
|
||||
|----|----------|-------|
|
||||
| `DRIFT-OAS-001` | error | Path у OpenAPI spec але не знайдено в коді |
|
||||
| `DRIFT-OAS-002` | error | Path `/v1/*` є в коді але не описаний у spec |
|
||||
| `DRIFT-OAS-003` | warning | Method mismatch для тієї самої path |
|
||||
|
||||
**Normalization:** trailing slash, lowercase path comparison.
|
||||
**Скоп коду:** тільки `/v1/` routes перевіряються для OAS-002.
|
||||
|
||||
---
|
||||
|
||||
### 3. nats — Subject Inventory vs Code Usage
|
||||
|
||||
**Джерела:**
|
||||
- A: `docs/architecture_inventory/inventory_nats_topics.csv` (поле `subject`)
|
||||
- B: regex пошук `nc.publish(...)`, `nc.subscribe(...)`, `subject=...` у `.py` файлах
|
||||
|
||||
**Findings:**
|
||||
|
||||
| ID | Severity | Умова |
|
||||
|----|----------|-------|
|
||||
| `DRIFT-NATS-001` | warning | Subject використовується в коді але відсутній у inventory |
|
||||
| `DRIFT-NATS-002` | info | Subject у inventory але не знайдено в коді (можливо legacy) |
|
||||
|
||||
**Wildcard matching:** `agent.run.{agent_id}` → `agent.run.*` → `agent.run.>`.
|
||||
**Skipped:** якщо `inventory_nats_topics.csv` відсутній — категорія `skipped`, gate не падає.
|
||||
|
||||
---
|
||||
|
||||
### 4. tools — Rollout/Matrix vs Handlers
|
||||
|
||||
**Джерела:**
|
||||
- A: `config/tools_rollout.yml` (всі tool-назви у groups, з @group expand)
|
||||
- B: `config/rbac_tools_matrix.yml` (секція `tools:`)
|
||||
- C: `KNOWN_TOOL_HANDLERS` у `drift_analyzer.py` (compile-time список)
|
||||
- D: `agent_tools_config.effective_tools` для ролей `agent_default` і `agent_cto`
|
||||
|
||||
**Findings:**
|
||||
|
||||
| ID | Severity | Умова |
|
||||
|----|----------|-------|
|
||||
| `DRIFT-TOOLS-001` | error | Tool у rollout але нема handler |
|
||||
| `DRIFT-TOOLS-002` | warning | Handler є але tool відсутній у RBAC matrix |
|
||||
| `DRIFT-TOOLS-003` | warning | Tool у matrix але ніколи не потрапляє в effective_tools |
|
||||
|
||||
**Maintenance:** при додаванні нового tool handler — оновіть `KNOWN_TOOL_HANDLERS` у `drift_analyzer.py`.
|
||||
|
||||
---
|
||||
|
||||
## Безпека
|
||||
|
||||
- **Read-only:** не записує нічого у repo
|
||||
- **Path traversal:** сканує тільки всередині `REPO_ROOT`
|
||||
- **Excluded dirs:** `node_modules`, `.git`, `venv*`, `__pycache__`, `dist`, `build`, `rollback_backups`
|
||||
- **File size limit:** max 256KB per file
|
||||
- **File count limit:** max 300 files per category scan
|
||||
- **Secret redaction:** evidence маскується `_redact_evidence()` перед поверненням
|
||||
- **Governance:** проходить через `ToolGovernance.pre_call/post_call` (RBAC, limits, audit)
|
||||
|
||||
---
|
||||
|
||||
## RBAC Entitlements
|
||||
|
||||
| Entitlement | Хто | Що дозволяє |
|
||||
|-------------|-----|-------------|
|
||||
| `tools.drift.read` | `agent_cto`, `agent_oncall` | Запускати drift analyze |
|
||||
| `tools.drift.gate` | `agent_cto` | Запускати drift у release gate |
|
||||
|
||||
---
|
||||
|
||||
## Limits (`config/tool_limits.yml`)
|
||||
|
||||
| Параметр | Значення |
|
||||
|----------|----------|
|
||||
| `timeout_ms` | 30 000 (30s) |
|
||||
| `max_chars_in` | 5 000 |
|
||||
| `max_bytes_out` | 524 288 (512KB) |
|
||||
| `rate_limit_rpm` | 5 |
|
||||
| `concurrency` | 1 |
|
||||
|
||||
---
|
||||
|
||||
## Оновлення `KNOWN_TOOL_HANDLERS`
|
||||
|
||||
Коли додається новий tool handler у `tool_manager.py`:
|
||||
|
||||
1. Додай tool name до `KNOWN_TOOL_HANDLERS` у `drift_analyzer.py`
|
||||
2. Додай tool до `config/tools_rollout.yml` (потрібна роль)
|
||||
3. Додай tool до `config/rbac_tools_matrix.yml` (actions + entitlements)
|
||||
4. Запусти `pytest tests/test_drift_analyzer.py::TestToolsDrift` щоб перевірити
|
||||
|
||||
```python
|
||||
# drift_analyzer.py
|
||||
KNOWN_TOOL_HANDLERS: FrozenSet[str] = frozenset({
|
||||
...,
|
||||
"my_new_tool", # add here
|
||||
})
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Файли
|
||||
|
||||
| Файл | Призначення |
|
||||
|------|-------------|
|
||||
| `services/router/drift_analyzer.py` | Вся логіка аналізу (4 категорії) |
|
||||
| `services/router/tool_manager.py` | Handler `_drift_analyzer_tool` + TOOL_DEFINITIONS |
|
||||
| `services/router/release_check_runner.py` | Gate 6 `_run_drift()` |
|
||||
| `config/tools_rollout.yml` | `cto_tools` включає `drift_analyzer_tool` |
|
||||
| `config/rbac_tools_matrix.yml` | `drift_analyzer_tool` actions + `tools.drift.*` entitlements |
|
||||
| `config/tool_limits.yml` | `drift_analyzer_tool` limits |
|
||||
| `tests/test_drift_analyzer.py` | 29 тестів + fixtures |
|
||||
277
docs/tools/governance.md
Normal file
277
docs/tools/governance.md
Normal file
@@ -0,0 +1,277 @@
|
||||
# Tool Governance
|
||||
|
||||
**Система керування інструментами DAARION.city**
|
||||
Версія: 2.0 | Нода: NODE2 (розробка) + NODA1 (production)
|
||||
|
||||
---
|
||||
|
||||
## Огляд
|
||||
|
||||
Tool Governance — єдина система контролю над усіма tool-викликами агентів.
|
||||
Складається з чотирьох компонентів:
|
||||
|
||||
| Компонент | Файл | Що робить |
|
||||
|-----------|------|-----------|
|
||||
| **Rollout Policy** | `config/tools_rollout.yml` | Визначає, які tools отримує кожен агент за роллю |
|
||||
| **RBAC Matrix** | `config/rbac_tools_matrix.yml` | Матриця `tool → action → entitlement → role` |
|
||||
| **Safety Middleware** | `services/router/tool_governance.py` | Limits, redaction, allowlist, audit |
|
||||
| **Release Gate** | `ops/task_registry.yml` + `services/router/release_check_runner.py` | Єдиний release verdict |
|
||||
|
||||
---
|
||||
|
||||
## 1. Global Tools Rollout
|
||||
|
||||
### 1.1 Merge Policy
|
||||
|
||||
```
|
||||
effective_tools = unique(DEFAULT_TOOLS_BY_ROLE ∪ FULL_STANDARD_STACK ∪ agent.specialized_tools)
|
||||
```
|
||||
|
||||
Кожен агент **автоматично** отримує набір tools відповідно до ролі — без необхідності явно вказувати їх.
|
||||
|
||||
### 1.2 Конфіг (`config/tools_rollout.yml`)
|
||||
|
||||
```yaml
|
||||
# Групи tools
|
||||
default_tools_read:
|
||||
- repo_tool
|
||||
- kb_tool
|
||||
- oncall_tool
|
||||
- observability_tool
|
||||
...
|
||||
|
||||
cto_tools:
|
||||
- pr_reviewer_tool
|
||||
- contract_tool
|
||||
- config_linter_tool
|
||||
- threatmodel_tool
|
||||
- job_orchestrator_tool
|
||||
|
||||
# Ролі → групи
|
||||
role_map:
|
||||
agent_default:
|
||||
tools: ["@default_tools_read", "@content_tools"]
|
||||
agent_cto:
|
||||
tools: ["@default_tools_read", "@cto_tools", "@content_tools", "@media_tools"]
|
||||
agent_oncall:
|
||||
tools: ["@default_tools_read", "job_orchestrator_tool"]
|
||||
|
||||
# Агент → роль
|
||||
agent_roles:
|
||||
sofiia: agent_cto
|
||||
helion: agent_oncall
|
||||
alateya: agent_media
|
||||
```
|
||||
|
||||
### 1.3 Ролі
|
||||
|
||||
| Роль | Хто | Набір |
|
||||
|------|-----|-------|
|
||||
| `agent_cto` | sofiia, yaromir | Все: read + cto + content + media |
|
||||
| `agent_oncall` | helion | Read + job_orchestrator |
|
||||
| `agent_media` | alateya, nutra, agromatrix, greenfood... | Read + content + media |
|
||||
| `agent_default` | всі інші / нові агенти | Read + content |
|
||||
|
||||
### 1.4 Розширення груп (`@group`)
|
||||
|
||||
`@group_name` у конфігу розгортається рекурсивно. Підтримуються вкладені групи:
|
||||
|
||||
```yaml
|
||||
my_super_group:
|
||||
- "@cto_tools"
|
||||
- "@media_tools"
|
||||
- custom_tool
|
||||
```
|
||||
|
||||
### 1.5 Як перевірити tools агента
|
||||
|
||||
```python
|
||||
from agent_tools_config import get_agent_tools, get_agent_role
|
||||
|
||||
tools = get_agent_tools("sofiia") # → список всіх tools
|
||||
role = get_agent_role("sofiia") # → "agent_cto"
|
||||
```
|
||||
|
||||
**Acceptance**: новий агент без явного `tools` отримує read-набір автоматично.
|
||||
|
||||
---
|
||||
|
||||
## 2. RBAC Matrix
|
||||
|
||||
### 2.1 Структура (`config/rbac_tools_matrix.yml`)
|
||||
|
||||
```yaml
|
||||
tools:
|
||||
pr_reviewer_tool:
|
||||
actions:
|
||||
review:
|
||||
entitlements: ["tools.pr_review.use"]
|
||||
gate:
|
||||
entitlements: ["tools.pr_review.gate"]
|
||||
|
||||
role_entitlements:
|
||||
agent_cto:
|
||||
- tools.pr_review.use
|
||||
- tools.pr_review.gate
|
||||
...
|
||||
agent_default:
|
||||
- tools.repo.read
|
||||
- tools.kb.read
|
||||
...
|
||||
```
|
||||
|
||||
### 2.2 Enforcement Flow
|
||||
|
||||
```
|
||||
execute_tool(tool, action, agent_id)
|
||||
→ get_agent_role(agent_id) → "agent_cto"
|
||||
→ get_role_entitlements(role) → ["tools.pr_review.use", ...]
|
||||
→ get_required_entitlements(tool, action) → ["tools.pr_review.gate"]
|
||||
→ missing = required - agent_ents
|
||||
→ if missing: DENY
|
||||
```
|
||||
|
||||
### 2.3 Entitlement схема
|
||||
|
||||
```
|
||||
tools.<tool_short>.<scope>
|
||||
|
||||
Приклади:
|
||||
tools.repo.read
|
||||
tools.oncall.incident_write
|
||||
tools.pr_review.gate
|
||||
tools.jobs.run.deploy
|
||||
```
|
||||
|
||||
### 2.4 Перевірка вручну
|
||||
|
||||
```python
|
||||
from tool_governance import check_rbac
|
||||
|
||||
ok, reason = check_rbac("sofiia", "pr_reviewer_tool", "gate")
|
||||
# → (True, "")
|
||||
|
||||
ok, reason = check_rbac("helion", "pr_reviewer_tool", "gate")
|
||||
# → (False, "Missing entitlements: ['tools.pr_review.gate']")
|
||||
```
|
||||
|
||||
**Acceptance**: всі tool handlers використовують матрицю — жодного хардкоду прав у коді.
|
||||
|
||||
---
|
||||
|
||||
## 3. Tool Safety Middleware
|
||||
|
||||
Реалізовано у `services/router/tool_governance.py`.
|
||||
Застосовується автоматично до **кожного** `execute_tool(...)` виклику.
|
||||
|
||||
### 3.1 Limits (`config/tool_limits.yml`)
|
||||
|
||||
| Параметр | Опис |
|
||||
|----------|------|
|
||||
| `timeout_ms` | Максимальний час виконання |
|
||||
| `max_chars_in` | Максимальна довжина вхідного тексту |
|
||||
| `max_bytes_out` | Максимальний розмір відповіді |
|
||||
| `rate_limit_rpm` | Запитів на хвилину |
|
||||
| `concurrency` | Паралельних викликів |
|
||||
|
||||
Приклад:
|
||||
```yaml
|
||||
pr_reviewer_tool:
|
||||
timeout_ms: 60000 # 60s
|
||||
max_chars_in: 409600 # 400KB
|
||||
rate_limit_rpm: 10
|
||||
```
|
||||
|
||||
### 3.2 Redaction
|
||||
|
||||
Модуль `redact(text)` у `tool_governance.py` маскує:
|
||||
- API ключі (`api_key=***REDACTED***`)
|
||||
- Токени (`token=***REDACTED***`)
|
||||
- Паролі (`password=***REDACTED***`)
|
||||
- Bearer tokens, JWT, OAuth secrets, private keys
|
||||
|
||||
Застосовується до:
|
||||
- Evidence/snippets у результатах pr_reviewer_tool
|
||||
- Evidence у config_linter_tool
|
||||
- Log lines у observability_tool
|
||||
|
||||
**Включено за замовчуванням.** Вимкнути: `ToolGovernance(enable_redaction=False)`.
|
||||
|
||||
### 3.3 Network Allowlist (`config/network_allowlist.yml`)
|
||||
|
||||
Tools, що роблять HTTP-запити, обмежені allowlist:
|
||||
|
||||
```python
|
||||
from tool_governance import check_url_allowed
|
||||
|
||||
ok, reason = check_url_allowed("oncall_tool", "http://localhost:9102/health")
|
||||
# → (True, "")
|
||||
|
||||
ok, reason = check_url_allowed("oncall_tool", "http://evil.com/steal")
|
||||
# → (False, "Host 'evil.com' not in allowlist for tool 'oncall_tool'")
|
||||
```
|
||||
|
||||
`web_extract` та `crawl4ai_scrape` мають `allow_any_public: true` але блокують private IPs (RFC1918/loopback).
|
||||
|
||||
### 3.4 Audit Events
|
||||
|
||||
На кожен tool-виклик емітується structured event у log:
|
||||
|
||||
```json
|
||||
{
|
||||
"ts": "2026-02-23T12:00:00Z",
|
||||
"req_id": "abc123def456",
|
||||
"tool": "pr_reviewer_tool",
|
||||
"action": "review",
|
||||
"workspace_id": "default",
|
||||
"user_id": "user_123",
|
||||
"agent_id": "sofiia",
|
||||
"status": "pass",
|
||||
"duration_ms": 234.5,
|
||||
"limits_applied": {"timeout_ms": 60000, "max_chars_in": 409600},
|
||||
"input_hash": "a1b2c3d4e5f6",
|
||||
"input_chars": 1024,
|
||||
"output_size_bytes": 2048
|
||||
}
|
||||
```
|
||||
|
||||
**Payload не логується** — тільки hash та розміри.
|
||||
Log prefix: `TOOL_AUDIT`.
|
||||
|
||||
### 3.5 Integration у `execute_tool`
|
||||
|
||||
```python
|
||||
# В tool_manager.py, автоматично:
|
||||
governance = get_governance()
|
||||
pre = governance.pre_call(tool, action, agent_id, user_id, workspace_id, input_text)
|
||||
if not pre.allowed:
|
||||
return ToolResult(success=False, error=pre.reason)
|
||||
|
||||
result = await _handler(args) # actual tool execution
|
||||
|
||||
governance.post_call(pre.call_ctx, result.result, error=result.error)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Налаштування та Hot-Reload
|
||||
|
||||
```python
|
||||
# Force reload конфігів (без перезапуску)
|
||||
from agent_tools_config import reload_rollout_config
|
||||
from tool_governance import _reload_yaml_cache
|
||||
|
||||
reload_rollout_config()
|
||||
_reload_yaml_cache()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Acceptance Criteria
|
||||
|
||||
- ✅ Новий агент без явного `tools` отримує read-набір автоматично
|
||||
- ✅ Sofiia/CTO має повний набір через роль `agent_cto`
|
||||
- ✅ Будь-який tool call проходить через middleware (limits/redaction/audit)
|
||||
- ✅ RBAC денить без entitlement, без хардкоду в коді
|
||||
- ✅ Allowlist блокує довільні URL для HTTP-tools
|
||||
- ✅ 31/31 тест проходить
|
||||
206
docs/tools/observability_tool.md
Normal file
206
docs/tools/observability_tool.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# Observability Tool - Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
Observability Tool provides read-only access to metrics (Prometheus), logs (Loki), and traces (Tempo). Designed for CTO/SRE operations.
|
||||
|
||||
## Integration
|
||||
|
||||
### Tool Definition
|
||||
|
||||
Registered in `services/router/tool_manager.py`:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "observability_tool",
|
||||
"description": "📊 Метрики, логи, трейси...",
|
||||
"parameters": {...}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### RBAC Configuration
|
||||
|
||||
Added to `FULL_STANDARD_STACK` in `services/router/agent_tools_config.py`.
|
||||
|
||||
## Configuration
|
||||
|
||||
Data sources configured in `config/observability_sources.yml`:
|
||||
|
||||
```yaml
|
||||
prometheus:
|
||||
base_url: "http://prometheus:9090"
|
||||
allow_promql_prefixes:
|
||||
- "sum("
|
||||
- "rate("
|
||||
- "histogram_quantile("
|
||||
|
||||
loki:
|
||||
base_url: "http://loki:3100"
|
||||
|
||||
tempo:
|
||||
base_url: "http://tempo:3200"
|
||||
|
||||
limits:
|
||||
max_time_window_hours: 24
|
||||
max_series: 200
|
||||
max_points: 2000
|
||||
timeout_seconds: 5
|
||||
```
|
||||
|
||||
Override URLs via environment variables:
|
||||
- `PROMETHEUS_URL`
|
||||
- `LOKI_URL`
|
||||
- `TEMPO_URL`
|
||||
|
||||
## Actions
|
||||
|
||||
### 1. metrics_query
|
||||
|
||||
Prometheus instant query.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "metrics_query",
|
||||
"params": {
|
||||
"query": "rate(http_requests_total[5m])",
|
||||
"datasource": "prometheus"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Allowed PromQL prefixes:**
|
||||
- `sum(`, `rate(`, `histogram_quantile(`, `avg(`, `max(`, `min(`, `count(`, `irate(`
|
||||
|
||||
### 2. metrics_range
|
||||
|
||||
Prometheus range query.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "metrics_range",
|
||||
"params": {
|
||||
"query": "rate(http_requests_total[5m])",
|
||||
"time_range": {
|
||||
"from": "2024-01-15T10:00:00Z",
|
||||
"to": "2024-01-15T11:00:00Z"
|
||||
},
|
||||
"step_seconds": 30
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. logs_query
|
||||
|
||||
Loki log query.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "logs_query",
|
||||
"params": {
|
||||
"query": "{service=\"gateway\"}",
|
||||
"time_range": {
|
||||
"from": "2024-01-15T10:00:00Z",
|
||||
"to": "2024-01-15T11:00:00Z"
|
||||
},
|
||||
"limit": 100
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. traces_query
|
||||
|
||||
Tempo trace search.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "traces_query",
|
||||
"params": {
|
||||
"trace_id": "abc123"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 5. service_overview
|
||||
|
||||
Aggregated service metrics.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "service_overview",
|
||||
"params": {
|
||||
"service": "gateway",
|
||||
"time_range": {
|
||||
"from": "2024-01-15T10:00:00Z",
|
||||
"to": "2024-01-15T11:00:00Z"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Returns:
|
||||
- p95 latency
|
||||
- error rate
|
||||
- throughput
|
||||
|
||||
## Security Features
|
||||
|
||||
### Query Allowlist
|
||||
Only allowlisted PromQL prefixes can be used.
|
||||
|
||||
### Time Window Limits
|
||||
- Max 24 hours per query
|
||||
- Step min: 15s, max: 300s
|
||||
|
||||
### Limits
|
||||
- Max series: 200
|
||||
- Max points: 2000
|
||||
- Timeout: 5 seconds
|
||||
|
||||
### Redaction
|
||||
Secrets automatically redacted from logs:
|
||||
- `api_key=***`
|
||||
- `token=***`
|
||||
- `password=***`
|
||||
|
||||
## Example Usage
|
||||
|
||||
### Check Service Latency
|
||||
```
|
||||
"Покажи p95 latency для gateway за останні 30 хвилин"
|
||||
```
|
||||
|
||||
### View Error Rate
|
||||
```
|
||||
"Який error rate для router за останню годину?"
|
||||
```
|
||||
|
||||
### Search Logs
|
||||
```
|
||||
"Знайди помилки в логах gateway за останні 2 години"
|
||||
```
|
||||
|
||||
### Get Trace
|
||||
```
|
||||
"Покажи трейс abc123"
|
||||
```
|
||||
|
||||
### Service Overview
|
||||
```
|
||||
"Дай overview gateway сервісу"
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
pytest tools/observability_tool/tests/test_observability_tool.py -v
|
||||
```
|
||||
|
||||
Test coverage:
|
||||
- Valid PromQL queries work
|
||||
- Invalid PromQL blocked
|
||||
- Time window limit enforced
|
||||
- Trace by ID query
|
||||
- Service overview
|
||||
292
docs/tools/oncall_tool.md
Normal file
292
docs/tools/oncall_tool.md
Normal file
@@ -0,0 +1,292 @@
|
||||
# Oncall/Runbook Tool - Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
Oncall Tool provides operational information: services catalog, health checks, deployments, runbooks, and incident tracking. Read-only for most agents, with gated write for.
|
||||
|
||||
## Integration incidents
|
||||
|
||||
### Tool Definition
|
||||
|
||||
Registered in `services/router/tool_manager.py`:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "oncall_tool",
|
||||
"description": "📋 Операційна інформація...",
|
||||
"parameters": {...}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### RBAC Configuration
|
||||
|
||||
Added to `FULL_STANDARD_STACK` in `services/router/agent_tools_config.py`.
|
||||
|
||||
## Actions
|
||||
|
||||
### 1. services_list
|
||||
|
||||
List all services from docker-compose files and service catalogs.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "services_list"
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"services": [
|
||||
{"name": "router", "source": "docker-compose.yml", "type": "service", "criticality": "medium"},
|
||||
{"name": "gateway", "source": "docker-compose.yml", "type": "service", "criticality": "high"}
|
||||
],
|
||||
"count": 2
|
||||
}
|
||||
```
|
||||
|
||||
### 2. service_health
|
||||
|
||||
Check health endpoint of a service.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "service_health",
|
||||
"params": {
|
||||
"service_name": "router",
|
||||
"health_endpoint": "http://router-service:8000/health"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Security:** Only allowlisted internal hosts can be checked.
|
||||
|
||||
**Allowlist:** `localhost`, `127.0.0.1`, `router-service`, `gateway-service`, `memory-service`, `swapper-service`, `crewai-service`
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"service": "router",
|
||||
"endpoint": "http://router-service:8000/health",
|
||||
"status": "healthy",
|
||||
"status_code": 200,
|
||||
"latency_ms": 15
|
||||
}
|
||||
```
|
||||
|
||||
### 3. service_status
|
||||
|
||||
Get service status and version info.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "service_status",
|
||||
"params": {
|
||||
"service_name": "router"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. deployments_recent
|
||||
|
||||
Get recent deployments from log file or git.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "deployments_recent"
|
||||
}
|
||||
```
|
||||
|
||||
**Sources (priority):**
|
||||
1. `ops/deployments.jsonl`
|
||||
2. Git commit history (fallback)
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"deployments": [
|
||||
{"ts": "2024-01-15T10:00:00", "service": "router", "version": "1.2.0"},
|
||||
{"type": "git_commit", "commit": "abc123 Fix bug"}
|
||||
],
|
||||
"count": 2
|
||||
}
|
||||
```
|
||||
|
||||
### 5. runbook_search
|
||||
|
||||
Search for runbooks.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "runbook_search",
|
||||
"params": {
|
||||
"query": "deployment"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Search directories:** `ops/`, `runbooks/`, `docs/runbooks/`, `docs/ops/`
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"results": [
|
||||
{"path": "ops/deploy.md", "file": "deploy.md"}
|
||||
],
|
||||
"query": "deployment",
|
||||
"count": 1
|
||||
}
|
||||
```
|
||||
|
||||
### 6. runbook_read
|
||||
|
||||
Read a specific runbook.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "runbook_read",
|
||||
"params": {
|
||||
"runbook_path": "ops/deploy.md"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Security:**
|
||||
- Only reads from allowlisted directories
|
||||
- Path traversal blocked
|
||||
- Secrets masked in content
|
||||
- Max 200KB per read
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"path": "ops/deploy.md",
|
||||
"content": "# Deployment Runbook\n\n...",
|
||||
"size": 1234
|
||||
}
|
||||
```
|
||||
|
||||
### 7. incident_log_list
|
||||
|
||||
List incidents.
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "incident_log_list",
|
||||
"params": {
|
||||
"severity": "sev1",
|
||||
"limit": 20
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"incidents": [
|
||||
{
|
||||
"ts": "2024-01-15T10:00:00",
|
||||
"severity": "sev1",
|
||||
"title": "Router down",
|
||||
"service": "router"
|
||||
}
|
||||
],
|
||||
"count": 1
|
||||
}
|
||||
```
|
||||
|
||||
### 8. incident_log_append
|
||||
|
||||
Add new incident (gated - requires entitlement).
|
||||
|
||||
```json
|
||||
{
|
||||
"action": "incident_log_append",
|
||||
"params": {
|
||||
"service_name": "router",
|
||||
"incident_title": "High latency",
|
||||
"incident_severity": "sev2",
|
||||
"incident_details": "Router experiencing 500ms latency",
|
||||
"incident_tags": ["performance", "router"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**RBAC:** Only `sofiia`, `helion`, `admin` can add incidents.
|
||||
|
||||
**Storage:** `ops/incidents.jsonl`
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"incident_id": "2024-01-15T10:00:00",
|
||||
"status": "logged"
|
||||
}
|
||||
```
|
||||
|
||||
## Security Features
|
||||
|
||||
### Health Check Allowlist
|
||||
Only internal service endpoints can be checked:
|
||||
- `localhost`, `127.0.0.1`
|
||||
- Service names: `router-service`, `gateway-service`, `memory-service`, `swapper-service`, `crewai-service`
|
||||
|
||||
### Runbook Security
|
||||
- Only read from allowlisted directories: `ops/`, `runbooks/`, `docs/runbooks/`, `docs/ops/`
|
||||
- Path traversal blocked
|
||||
- Secrets automatically masked
|
||||
|
||||
### RBAC
|
||||
- Read actions: `tools.oncall.read` (default for all agents)
|
||||
- Write incidents: `tools.oncall.incident_write` (only sofiia, helion, admin)
|
||||
|
||||
## Data Files
|
||||
|
||||
Created empty files for data storage:
|
||||
- `ops/incidents.jsonl` - Incident log
|
||||
- `ops/deployments.jsonl` - Deployment log
|
||||
|
||||
## Example Usage
|
||||
|
||||
### Check Service Health
|
||||
```
|
||||
"Перевіри health router сервісу"
|
||||
```
|
||||
|
||||
### Find Runbook
|
||||
```
|
||||
"Знайди runbook про деплой"
|
||||
```
|
||||
|
||||
### Read Deployment Runbook
|
||||
```
|
||||
"Відкрий runbook/deploy.md"
|
||||
```
|
||||
|
||||
### View Recent Deployments
|
||||
```
|
||||
"Покажи останні деплої"
|
||||
```
|
||||
|
||||
### Log Incident
|
||||
```
|
||||
"Зареєструй інцидент: router висока затримка, sev2"
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
pytest tools/oncall_tool/tests/test_oncall_tool.py -v
|
||||
```
|
||||
|
||||
Test coverage:
|
||||
- services_list parses docker-compose
|
||||
- runbook_search finds results
|
||||
- runbook_read blocks path traversal
|
||||
- runbook_read masks secrets
|
||||
- incident_log_append allowed for sofiia
|
||||
- incident_log_append blocked for regular agents
|
||||
- service_health blocks non-allowlisted hosts
|
||||
233
docs/tools/pr_reviewer_tool.md
Normal file
233
docs/tools/pr_reviewer_tool.md
Normal file
@@ -0,0 +1,233 @@
|
||||
# PR Reviewer Tool - Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
PR Reviewer Tool analyzes code changes (diff/patch) and provides structured code review with blocking issues, security findings, regression risks, and recommendations.
|
||||
|
||||
## Integration
|
||||
|
||||
### Tool Definition
|
||||
|
||||
Registered in `services/router/tool_manager.py`:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "pr_reviewer_tool",
|
||||
"description": "🔍 Рев'ю коду з PR/diff...",
|
||||
"parameters": {...}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### RBAC Configuration
|
||||
|
||||
Added to `FULL_STANDARD_STACK` in `services/router/agent_tools_config.py` - available to all agents.
|
||||
|
||||
## Request Format
|
||||
|
||||
### `POST /v1/tools/pr-review` (via gateway dispatcher)
|
||||
|
||||
```json
|
||||
{
|
||||
"mode": "blocking_only | full_review",
|
||||
"context": {
|
||||
"repo": {
|
||||
"name": "microdao-daarion",
|
||||
"commit_base": "abc123",
|
||||
"commit_head": "def456"
|
||||
},
|
||||
"change_summary": "Added user authentication",
|
||||
"risk_profile": "default | security_strict | release_gate"
|
||||
},
|
||||
"diff": {
|
||||
"format": "unified",
|
||||
"text": "diff --git a/file.py b/file.py\n...",
|
||||
"max_files": 200,
|
||||
"max_chars": 400000
|
||||
},
|
||||
"options": {
|
||||
"include_tests_checklist": true,
|
||||
"include_deploy_risks": true,
|
||||
"include_migration_risks": true,
|
||||
"language_hint": "python"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Response Format
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "succeeded",
|
||||
"data": {
|
||||
"summary": "🚫 2 blocking issues found",
|
||||
"score": {
|
||||
"risk": 50,
|
||||
"maintainability": 50,
|
||||
"security": 40,
|
||||
"test_coverage": 30
|
||||
},
|
||||
"blocking_issues": [
|
||||
{
|
||||
"id": "PRR-001",
|
||||
"title": "Secret detected in diff",
|
||||
"severity": "critical",
|
||||
"file": "config.py",
|
||||
"lines": "L15",
|
||||
"evidence": "API_KEY=***",
|
||||
"why_it_matters": "Secrets in code can be exposed...",
|
||||
"fix_suggestion": "Use environment variables..."
|
||||
}
|
||||
],
|
||||
"issues": [...],
|
||||
"regression_risks": [...],
|
||||
"security_findings": [...],
|
||||
"tests_checklist": [...],
|
||||
"deploy_checklist": [...],
|
||||
"questions_for_author": [...]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Modes
|
||||
|
||||
### `blocking_only`
|
||||
- Returns only critical and high severity issues
|
||||
- Fast feedback for quick gate decisions
|
||||
- No non-blocking issues
|
||||
|
||||
### `full_review`
|
||||
- Complete analysis with all issues
|
||||
- Includes recommendations and checklists
|
||||
- Slower but thorough
|
||||
|
||||
## Blocking Issue Categories
|
||||
|
||||
| Category | Severity | Description |
|
||||
|----------|----------|-------------|
|
||||
| SECRETS | Critical | API keys, tokens, passwords in diff |
|
||||
| RCE | Critical | eval, exec, subprocess with shell=True |
|
||||
| SQL_INJECTION | Critical | String concatenation in queries |
|
||||
| AUTH_BYPASS | High | Disabled auth checks |
|
||||
| HARDCODED_CREDS | High | Hardcoded credentials |
|
||||
| SECURITY_DISABLED | High | Security checks commented out |
|
||||
| BREAKING_API | High | API changes without versioning |
|
||||
|
||||
## Non-Blocking Issue Categories
|
||||
|
||||
| Category | Severity | Description |
|
||||
|----------|----------|-------------|
|
||||
| TODO | Medium | Technical debt markers |
|
||||
| BROAD_EXCEPTION | Medium | Catching all exceptions |
|
||||
| LOGGING | Low | Print statements |
|
||||
| BLOCKING_SLEEP | Low | Synchronous sleep calls |
|
||||
|
||||
## Security Features
|
||||
|
||||
### Logging Policy
|
||||
- **NEVER** logs `diff.text`
|
||||
- Only logs: hash (first 16 chars), file count, line count, char count, mode
|
||||
|
||||
### Secret Masking
|
||||
Evidence automatically masks:
|
||||
- `api_key = sk-live-***`
|
||||
- `token = ***`
|
||||
- `password = ***`
|
||||
- Private keys: `-----BEGIN PRIVATE KEY-----` → masked
|
||||
|
||||
### Limits Enforced
|
||||
- `max_chars`: Default 400KB, max configurable
|
||||
- `max_files`: Default 200 files
|
||||
- Timeout: 30 seconds for analysis
|
||||
|
||||
## Example Usage
|
||||
|
||||
### Blocking Only (Quick Gate)
|
||||
```json
|
||||
{
|
||||
"mode": "blocking_only",
|
||||
"diff": {
|
||||
"text": "diff --git a/.env b/.env\n+API_KEY=sk-live-123\n"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Expected: Returns blocking issue about secrets, evidence masked.
|
||||
|
||||
### Full Review (Complete Analysis)
|
||||
```json
|
||||
{
|
||||
"mode": "full_review",
|
||||
"context": {
|
||||
"repo": {"name": "microdao-daarion", "commit_base": "abc", "commit_head": "def"}
|
||||
},
|
||||
"diff": {
|
||||
"text": "diff --git a/services/api/main.py..."
|
||||
},
|
||||
"options": {
|
||||
"include_tests_checklist": true,
|
||||
"include_deploy_risks": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Expected: Full response with blocking issues, non-blocking issues, checklists, regression risks.
|
||||
|
||||
## Scoring
|
||||
|
||||
### Risk Score (0-100)
|
||||
- 0-25: Low risk
|
||||
- 26-50: Medium risk
|
||||
- 51-75: High risk
|
||||
- 76-100: Critical risk
|
||||
|
||||
Calculation: `min(100, blocking_issues * 25 + issues * 5)`
|
||||
|
||||
### Security Score (0-100)
|
||||
- Starts at 100
|
||||
- Subtracts 30 per security finding
|
||||
|
||||
## Integration with Other Tools
|
||||
|
||||
### With RepoTool
|
||||
If diff text not provided, can use:
|
||||
```json
|
||||
{
|
||||
"source": "git_range",
|
||||
"base": "abc123",
|
||||
"head": "def456"
|
||||
}
|
||||
```
|
||||
Tool will fetch diff via RepoTool or local git.
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
pytest tools/pr_reviewer_tool/tests/test_pr_reviewer.py -v
|
||||
```
|
||||
|
||||
Test coverage:
|
||||
- Diff size limits enforced
|
||||
- File count limits enforced
|
||||
- Secrets detection + masking
|
||||
- RCE pattern detection
|
||||
- SQL injection detection
|
||||
- Auth bypass detection
|
||||
- blocking_only vs full_review modes
|
||||
- Scoring calculation
|
||||
- Checklist generation
|
||||
|
||||
## Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"status": "failed",
|
||||
"error": {
|
||||
"code": "diff_too_large",
|
||||
"message": "Diff too large: 500000 chars (max: 400000)",
|
||||
"retryable": false
|
||||
}
|
||||
}
|
||||
```
|
||||
265
docs/tools/repo_tool.md
Normal file
265
docs/tools/repo_tool.md
Normal file
@@ -0,0 +1,265 @@
|
||||
# RepoTool - Read-only Repository Access
|
||||
|
||||
## Overview
|
||||
|
||||
RepoTool provides read-only access to the DAARION repository filesystem for agents (primarily Sofiia). It allows viewing code, configs, and searching through the codebase without any write or execute capabilities.
|
||||
|
||||
## Integration
|
||||
|
||||
### Tool Definition
|
||||
|
||||
RepoTool is registered in `services/router/tool_manager.py` under `TOOL_DEFINITIONS`:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "repo_tool",
|
||||
"description": "📂 Read-only доступ до файловї системи репозиторію...",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"action": {
|
||||
"type": "string",
|
||||
"enum": ["tree", "read", "search", "metadata"]
|
||||
},
|
||||
"path": {"type": "string"},
|
||||
"start_line": {"type": "integer"},
|
||||
"end_line": {"type": "integer"},
|
||||
"depth": {"type": "integer"},
|
||||
"glob": {"type": "string"},
|
||||
"query": {"type": "string"},
|
||||
"limit": {"type": "integer"},
|
||||
"max_bytes": {"type": "integer"}
|
||||
},
|
||||
"required": ["action"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### RBAC Configuration
|
||||
|
||||
Added to `services/router/agent_tools_config.py` in `FULL_STANDARD_STACK` - available to all agents.
|
||||
|
||||
## Actions
|
||||
|
||||
### 1. tree - Directory Structure
|
||||
|
||||
Show directory tree starting from a path.
|
||||
|
||||
**Parameters:**
|
||||
- `path`: Starting path (default: ".")
|
||||
- `depth`: Maximum depth (default: 3, max: 10)
|
||||
- `glob`: Optional glob pattern to filter files
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "tree",
|
||||
"path": "services",
|
||||
"depth": 2
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"result": {
|
||||
"tree": {
|
||||
"router": {"main.py": "[file]", "tool_manager.py": "[file]"},
|
||||
"gateway": {"main.py": "[file]"}
|
||||
},
|
||||
"path": "services"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. read - File Content
|
||||
|
||||
Read file contents with optional line limits.
|
||||
|
||||
**Parameters:**
|
||||
- `path`: File path (required)
|
||||
- `start_line`: Starting line (default: 1)
|
||||
- `end_line`: Ending line (optional)
|
||||
- `max_bytes`: Max bytes to read (default: 200KB, max: 1MB)
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "read",
|
||||
"path": "services/router/main.py",
|
||||
"start_line": 1,
|
||||
"end_line": 50
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"result": {
|
||||
"path": "services/router/main.py",
|
||||
"content": "import asyncio\n...",
|
||||
"lines": 50,
|
||||
"start_line": 1,
|
||||
"end_line": 50
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. search - Text Search
|
||||
|
||||
Search for text in files using grep.
|
||||
|
||||
**Parameters:**
|
||||
- `query`: Search query (required)
|
||||
- `path`: Starting path (default: ".")
|
||||
- `glob`: File pattern (e.g., "**/*.py")
|
||||
- `limit`: Max results (default: 50, max: 200)
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "search",
|
||||
"query": "async def",
|
||||
"path": "services",
|
||||
"glob": "**/*.py",
|
||||
"limit": 20
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"result": {
|
||||
"query": "async def",
|
||||
"path": "services",
|
||||
"matches": [
|
||||
{"file": "router/main.py", "line": "45", "content": "async def handle_request"},
|
||||
{"file": "router/main.py", "line": "102", "content": "async def process_message"}
|
||||
],
|
||||
"count": 2
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. metadata - Git Information
|
||||
|
||||
Get git repository metadata.
|
||||
|
||||
**Parameters:**
|
||||
- `path`: Path within repo (optional)
|
||||
|
||||
**Example:**
|
||||
```json
|
||||
{
|
||||
"action": "metadata",
|
||||
"path": "."
|
||||
}
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"result": {
|
||||
"path": ".",
|
||||
"repo_root": "/path/to/repo",
|
||||
"commit": "abc123def456",
|
||||
"branch": "main",
|
||||
"dirty": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Security Features
|
||||
|
||||
### Path Traversal Protection
|
||||
- Blocks `..` in paths
|
||||
- Rejects absolute paths outside repo root
|
||||
- Validates resolved path stays within repo root
|
||||
|
||||
### Symlink Escape Prevention
|
||||
- Uses `os.path.realpath()` to resolve symlinks
|
||||
- Ensures resolved path is still within repo root
|
||||
- Blocks access through symlinks to external locations
|
||||
|
||||
### Secret Masking
|
||||
Files and content containing secrets are automatically masked:
|
||||
|
||||
**Masked file patterns:**
|
||||
- `.env`, `.env.local`, `.env.production`
|
||||
- `*secrets*`, `*credentials*`, `*keys*`, `*tokens*`, `*passwords*`
|
||||
|
||||
**Masked content patterns:**
|
||||
```
|
||||
api_key = xxx → api_key = ***
|
||||
token = xxx → token = ***
|
||||
password = xxx → password = ***
|
||||
SECRET_KEY=xxx → SECRET_KEY=***
|
||||
Bearer xxx → Bearer ***
|
||||
-----BEGIN PRIVATE KEY----- → [MASKED]
|
||||
```
|
||||
|
||||
### Limits
|
||||
| Limit | Default | Max |
|
||||
|-------|---------|-----|
|
||||
| Tree depth | 3 | 10 |
|
||||
| Search results | 50 | 200 |
|
||||
| File size | 200KB | 1MB |
|
||||
| Lines per read | 1000 | - |
|
||||
| Search timeout | - | 10s |
|
||||
|
||||
## Example Usage
|
||||
|
||||
### Sofiia Commands
|
||||
|
||||
```
|
||||
"Покажи структуру папки services"
|
||||
"Прочитай файл services/router/main.py перші 50 рядків"
|
||||
"Знайди всі файли з 'async def' в папці services"
|
||||
"Який останній коміт?"
|
||||
```
|
||||
|
||||
## Error Responses
|
||||
|
||||
```json
|
||||
{
|
||||
"success": false,
|
||||
"result": null,
|
||||
"error": "Path traversal detected. Access denied."
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"success": false,
|
||||
"result": null,
|
||||
"error": "File too large: 500000 bytes (max: 204800)"
|
||||
}
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
Run tests:
|
||||
```bash
|
||||
cd /path/to/repo
|
||||
pytest tools/repo_tool/tests/test_repo_tool.py -v
|
||||
```
|
||||
|
||||
Test coverage:
|
||||
- Path traversal blocked
|
||||
- Symlink escape blocked
|
||||
- Absolute path blocked
|
||||
- Tree action works
|
||||
- Read action works with line limits
|
||||
- Search finds content
|
||||
- Metadata returns git info
|
||||
- Secret files (.env) masked
|
||||
- Inline secrets masked
|
||||
- Size limits enforced
|
||||
- Depth limits enforced
|
||||
157
docs/voice_phase2_cutover.md
Normal file
157
docs/voice_phase2_cutover.md
Normal file
@@ -0,0 +1,157 @@
|
||||
# Voice Phase 2 Streaming — Cutover Plan
|
||||
|
||||
## Мета
|
||||
|
||||
Безпечно ввімкнути Phase 2 sentence-chunking streaming (`/api/voice/chat/stream`)
|
||||
для **всіх голосових сесій** без регресій.
|
||||
|
||||
## Поточний стан (baseline)
|
||||
|
||||
| Метрика | Значення |
|
||||
|---------|---------|
|
||||
| TTS p95 | ~1536ms ✅ |
|
||||
| gemma3 TTFA | ~2620ms ✅ |
|
||||
| qwen3.5 TTFA | ~8524ms ✅ (auto-promote qualified) |
|
||||
| qwen3:14b TTFA | ~11618ms ⚠ fallback only |
|
||||
| Streaming | enabled by default (checkbox ON) |
|
||||
|
||||
---
|
||||
|
||||
## Stages
|
||||
|
||||
### Stage 0 — Pre-conditions (blockers)
|
||||
|
||||
Виконати **перед будь-яким Stage**:
|
||||
|
||||
```bash
|
||||
# 1. Voice canary preflight
|
||||
python3 ops/scripts/voice_canary.py --mode preflight
|
||||
|
||||
# 2. Contract tests
|
||||
python3 -m pytest tests/test_voice_policy.py tests/test_voice_stream.py -v
|
||||
|
||||
# 3. Degradation state check
|
||||
curl -s http://localhost:8002/api/voice/degradation_status | python3 -m json.tool
|
||||
# Очікування: state = "ok"
|
||||
```
|
||||
|
||||
**Блокери:**
|
||||
- [ ] voice_canary preflight passed (Polina + Ostap OK)
|
||||
- [ ] 45/45 tests green
|
||||
- [ ] degradation_status state = "ok"
|
||||
- [ ] edge-tts версія = 7.2.7 (`docker exec dagi-memory-service-node2 pip show edge-tts | grep Version`)
|
||||
|
||||
---
|
||||
|
||||
### Stage 1 — 5% canary (feature flag в UI)
|
||||
|
||||
Увімкнути `streamMode=true` за замовчуванням (вже є), але обмежити до 5% сесій через cookie.
|
||||
|
||||
Реалізація (мінімальна):
|
||||
- BFF `/api/voice/chat/stream` вже є.
|
||||
- UI вже має `streamMode` checkbox (ON за замовчуванням).
|
||||
- Достатньо: **не блокувати**, але збирати метрики.
|
||||
|
||||
**Що моніторити (10 хвилин):**
|
||||
```bash
|
||||
# TTFA
|
||||
curl -s http://localhost:8002/api/voice/degradation_status
|
||||
|
||||
# Логи
|
||||
docker logs sofiia-console --tail 50 | grep "voice_stream ok"
|
||||
|
||||
# Underflows у browser console
|
||||
_voiceStats()
|
||||
```
|
||||
|
||||
**SLO Gate Stage 1:**
|
||||
- `voice_ttfa_ms` p95 ≤ 6000ms (20% буфер)
|
||||
- `voice_tts_first_ms` p95 ≤ 2500ms
|
||||
- underflow_rate ≤ 5% (relaxed for canary)
|
||||
- No `emergency` state in degradation_status
|
||||
|
||||
---
|
||||
|
||||
### Stage 2 — 50% rollout
|
||||
|
||||
Якщо Stage 1 пройшов 30 хвилин без SLO breach:
|
||||
- Переконатись що streamMode ON за замовчуванням.
|
||||
- Включити polling деградації (`_startDegradPolling` — вже активний).
|
||||
|
||||
**Що додатково перевірити:**
|
||||
```bash
|
||||
# Grafana dashboard (імпортувати ops/grafana_voice_dashboard.json)
|
||||
# Перевірити панелі 1-4 на наявність spike-ів
|
||||
|
||||
# Voice latency audit
|
||||
bash ops/voice_latency_audit.sh 2>&1 | tail -30
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Stage 3 — 100% (production default)
|
||||
|
||||
Умови:
|
||||
- Stage 2 стабільний ≥ 2 години
|
||||
- Усі алерти (ops/voice_alerts.yml) в стані "OK" (не firing)
|
||||
- `voice_queue_underflows_total` rate ≤ 0.017/s (1/хв)
|
||||
|
||||
**Дії:**
|
||||
1. Переконатись `streamMode` checkbox: `checked` by default — вже є.
|
||||
2. Додати voice_canary у ops/cron/jobs.cron — вже є.
|
||||
3. Задеплоїти ops/voice_alerts.yml у Prometheus.
|
||||
|
||||
---
|
||||
|
||||
## Rollback план
|
||||
|
||||
Якщо будь-який SLO breach або degradation state ≠ ok:
|
||||
|
||||
```bash
|
||||
# 1. Негайний rollback: вимкнути stream mode у BFF
|
||||
# (без rebuild — через env var)
|
||||
docker exec sofiia-console env | grep VOICE_STREAM_DISABLED
|
||||
# Або через конфіг — додати VOICE_STREAM_DISABLED=true і перезапустити
|
||||
|
||||
# 2. Перевірити стан
|
||||
curl -s http://localhost:8002/api/voice/degradation_status
|
||||
python3 ops/scripts/voice_canary.py --mode preflight
|
||||
|
||||
# 3. Якщо TTS деградував — перезапустити memory-service
|
||||
docker restart dagi-memory-service-node2
|
||||
sleep 10 && curl -s http://localhost:8000/voice/health
|
||||
```
|
||||
|
||||
**Fallback chain (автоматичний):**
|
||||
1. TTFA p95 > 5s → badge "⚠ AI SLOW", profile stays fast
|
||||
2. TTFA p95 > 8s → badge "⚡ FAST MODE", voiceQuality checkbox auto-unchecked
|
||||
3. TTS p95 > 2s → badge "⚠ TTS SLOW"
|
||||
4. TTS p95 > 4s → badge "🔴 TTS DEGRADED", user informed
|
||||
|
||||
---
|
||||
|
||||
## Feature Flag (якщо потрібен explicit ON/OFF)
|
||||
|
||||
Додати в `docker-compose.node2-sofiia.yml` → environment:
|
||||
|
||||
```yaml
|
||||
VOICE_STREAM_ENABLED: "true" # або "false" для rollback
|
||||
```
|
||||
|
||||
Та в `main.py` `/api/voice/chat/stream`:
|
||||
```python
|
||||
if not os.getenv("VOICE_STREAM_ENABLED", "true").lower() == "true":
|
||||
raise HTTPException(503, "Voice streaming disabled")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Метрики для Phase 2 auto-approve
|
||||
|
||||
voice_policy_update.py читає ops/voice_canary_last.json + Prometheus і автоматично:
|
||||
1. Оновлює `auto_promote` пороги в router-config.yml
|
||||
2. Генерує ops/voice_latency_report.json
|
||||
|
||||
```bash
|
||||
python3 ops/voice_policy_update.py --apply
|
||||
```
|
||||
129
docs/voice_streaming_phase2.md
Normal file
129
docs/voice_streaming_phase2.md
Normal file
@@ -0,0 +1,129 @@
|
||||
# Voice Streaming — Phase 2 Architecture
|
||||
|
||||
## Проблема
|
||||
|
||||
Поточний pipeline (Phase 1):
|
||||
|
||||
```
|
||||
User stops → STT → [full LLM text] → TTS request → audio plays
|
||||
↑
|
||||
Bottleneck: 8–12s
|
||||
```
|
||||
|
||||
TTS запускається лише після **повного** тексту від LLM.
|
||||
Результат: E2E latency = `llm_total + tts_compute` (~10–14s).
|
||||
|
||||
## Ціль Phase 2
|
||||
|
||||
```
|
||||
User stops → STT → [LLM first chunk] → TTS(chunk1) → audio starts
|
||||
↓
|
||||
[LLM continues] → TTS(chunk2) → audio continues
|
||||
```
|
||||
|
||||
**E2E TTFA** (time-to-first-audio): ~`llm_first_sentence + tts_compute` = ~3–5s.
|
||||
|
||||
---
|
||||
|
||||
## Архітектура
|
||||
|
||||
### Варіант A (рекомендований): "Sentence chunking" без streaming
|
||||
|
||||
Не потребує streaming від LLM. Кроки:
|
||||
|
||||
1. BFF робить `POST /api/generate` з `stream=true` до Ollama.
|
||||
2. BFF накопичує токени до першого `[.!?]` або 100 символів.
|
||||
3. Одразу `POST /voice/tts` для першого речення.
|
||||
4. Паралельно продовжує читати LLM stream для наступних речень.
|
||||
5. Браузер отримує перший аудіо chunk → починає відтворення.
|
||||
6. Наступні chunks додаються через MediaSource API або sequential `<audio>`.
|
||||
|
||||
**Переваги**: не потребує WebSocket/SSE між BFF і браузером для відео; тільки аудіо.
|
||||
|
||||
### Варіант B: Full streaming pipeline
|
||||
|
||||
```
|
||||
BFF → SSE → Browser
|
||||
↓
|
||||
chunk1_text → TTS → audio_b64_1
|
||||
chunk2_text → TTS → audio_b64_2
|
||||
...
|
||||
```
|
||||
|
||||
Складніший, але найкращий UX.
|
||||
|
||||
---
|
||||
|
||||
## Мінімальний патч (Варіант A)
|
||||
|
||||
### 1. BFF: новий endpoint `POST /api/voice/chat/stream`
|
||||
|
||||
```python
|
||||
@app.post("/api/voice/chat/stream")
|
||||
async def api_voice_chat_stream(body: VoiceChatBody):
|
||||
# 1. GET full LLM text (streaming or not)
|
||||
# 2. Split into sentences: re.split(r'(?<=[.!?])\s+', text)
|
||||
# 3. For first sentence: POST /voice/tts immediately
|
||||
# 4. Return: {first_audio_b64, first_text, remaining_text}
|
||||
# 5. Client plays first_audio, requests TTS for remaining in background
|
||||
```
|
||||
|
||||
### 2. Browser: play first sentence, background-fetch rest
|
||||
|
||||
```javascript
|
||||
async function voiceChatStreamTurn(text) {
|
||||
const r = await fetch('/api/voice/chat/stream', {...});
|
||||
const d = await r.json();
|
||||
|
||||
// Play first sentence immediately
|
||||
playAudioB64(d.first_audio_b64);
|
||||
|
||||
// Fetch remaining in background while first plays
|
||||
if (d.remaining_text) {
|
||||
fetchAndQueueAudio(d.remaining_text);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Audio queue on browser
|
||||
|
||||
```javascript
|
||||
const audioQueue = [];
|
||||
function playAudioB64(b64) { /* ... */ }
|
||||
function fetchAndQueueAudio(text) {
|
||||
// split to sentences, fetch TTS per sentence, add to queue
|
||||
// play each when previous finishes (currentAudio.onended)
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## SLO Impact (estimated)
|
||||
|
||||
| Metric | Phase 1 | Phase 2 (est.) |
|
||||
|---|---|---|
|
||||
| TTFA (first audio) | ~10–14s | ~3–5s |
|
||||
| Full response end | ~12–15s | ~10–13s (same) |
|
||||
| UX perceived latency | high | natural conversation |
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- `stream=true` support in Ollama (already available)
|
||||
- BFF needs async generator / streaming response
|
||||
- Browser needs MediaSource or sequential audio queue
|
||||
- TTS chunk size: 1 sentence or 80–120 chars (edge-tts handles well)
|
||||
|
||||
---
|
||||
|
||||
## Status
|
||||
|
||||
- Phase 1: ✅ deployed (delegates to memory-service)
|
||||
- Phase 2: 📋 planned — implement after voice quality stabilizes
|
||||
|
||||
### When to implement Phase 2
|
||||
|
||||
1. When `gemma3` p95 latency is consistently < 4s (currently ~2.6s — ready).
|
||||
2. When voice usage > 20 turns/day (worth the complexity).
|
||||
3. When edge-tts 403 rate < 0.1% (confirmed stable with 7.2.7).
|
||||
38
ops/Caddyfile
Normal file
38
ops/Caddyfile
Normal file
@@ -0,0 +1,38 @@
|
||||
# Caddyfile for Radicale CalDAV Server
|
||||
|
||||
# Global options
|
||||
{
|
||||
email {$CADDY_ACME_EMAIL:admin@daarion.space}
|
||||
on_demand_tls
|
||||
}
|
||||
|
||||
# HTTP to HTTPS redirect
|
||||
http:// {
|
||||
redir https://{host}{uri} 308
|
||||
}
|
||||
|
||||
# CalDAV HTTPS endpoint
|
||||
https://caldav.daarion.space {
|
||||
# Reverse proxy to Radicale
|
||||
reverse_proxy radicale:5232
|
||||
|
||||
# Security headers
|
||||
header {
|
||||
X-Frame-Options "SAMEORIGIN"
|
||||
X-Content-Type-Options "nosniff"
|
||||
X-XSS-Protection "1; mode=block"
|
||||
Referrer-Policy "strict-origin-when-cross-origin"
|
||||
Content-Security-Policy "default-src 'self'; connect-src 'self'; img-src 'self' data:; style-src 'self' 'unsafe-inline'"
|
||||
}
|
||||
|
||||
# Basic auth for Radicale
|
||||
basic_auth {
|
||||
{$CADDY_BASIC_AUTH}
|
||||
}
|
||||
|
||||
# TLS settings
|
||||
tls {
|
||||
min_version tls1.2
|
||||
cipher_suites TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
|
||||
}
|
||||
}
|
||||
84
ops/cache/osv_cache.json
vendored
Normal file
84
ops/cache/osv_cache.json
vendored
Normal file
@@ -0,0 +1,84 @@
|
||||
{
|
||||
"version": 1,
|
||||
"updated_at": "2026-02-23T00:00:00+00:00",
|
||||
"description": "Offline OSV vulnerability cache. Keys: 'ecosystem:package:version'. Populate via dependency_scanner_tool with vuln_mode=online.",
|
||||
"entries": {
|
||||
"PyPI:requests:2.31.0": {
|
||||
"vulns": [],
|
||||
"cached_at": "2026-02-23T00:00:00+00:00"
|
||||
},
|
||||
"PyPI:cryptography:41.0.0": {
|
||||
"vulns": [
|
||||
{
|
||||
"id": "GHSA-jfh8-c2jp-5v3q",
|
||||
"aliases": ["CVE-2023-49083"],
|
||||
"summary": "cryptography vulnerable to NULL-dereference when loading PKCS12 files",
|
||||
"database_specific": {"severity": "MEDIUM"},
|
||||
"severity": [{"type": "CVSS_V3", "score": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:L"}],
|
||||
"affected": [
|
||||
{
|
||||
"package": {"name": "cryptography", "ecosystem": "PyPI"},
|
||||
"ranges": [
|
||||
{
|
||||
"type": "ECOSYSTEM",
|
||||
"events": [{"introduced": "0"}, {"fixed": "41.0.6"}]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"cached_at": "2026-02-23T00:00:00+00:00"
|
||||
},
|
||||
"npm:lodash:4.17.20": {
|
||||
"vulns": [
|
||||
{
|
||||
"id": "GHSA-35jh-r3h4-6jhm",
|
||||
"aliases": ["CVE-2021-23337"],
|
||||
"summary": "Command Injection in lodash",
|
||||
"database_specific": {"severity": "HIGH"},
|
||||
"severity": [{"type": "CVSS_V3", "score": "CVSS:3.1/AV:N/AC:L/PR:H/UI:N/S:U/C:H/I:H/A:H"}],
|
||||
"affected": [
|
||||
{
|
||||
"package": {"name": "lodash", "ecosystem": "npm"},
|
||||
"ranges": [
|
||||
{
|
||||
"type": "ECOSYSTEM",
|
||||
"events": [{"introduced": "0"}, {"fixed": "4.17.21"}]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"cached_at": "2026-02-23T00:00:00+00:00"
|
||||
},
|
||||
"npm:lodash:4.17.21": {
|
||||
"vulns": [],
|
||||
"cached_at": "2026-02-23T00:00:00+00:00"
|
||||
},
|
||||
"PyPI:pyyaml:5.4.1": {
|
||||
"vulns": [
|
||||
{
|
||||
"id": "GHSA-8q59-q68h-6hv4",
|
||||
"aliases": ["CVE-2022-42966"],
|
||||
"summary": "PyYAML vulnerable to ReDoS in FullLoader",
|
||||
"database_specific": {"severity": "HIGH"},
|
||||
"severity": [{"type": "CVSS_V3", "score": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H"}],
|
||||
"affected": [
|
||||
{
|
||||
"package": {"name": "pyyaml", "ecosystem": "PyPI"},
|
||||
"ranges": [
|
||||
{
|
||||
"type": "ECOSYSTEM",
|
||||
"events": [{"introduced": "0"}, {"fixed": "6.0"}]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"cached_at": "2026-02-23T00:00:00+00:00"
|
||||
}
|
||||
}
|
||||
}
|
||||
13
ops/cron/alert_triage.cron
Normal file
13
ops/cron/alert_triage.cron
Normal file
@@ -0,0 +1,13 @@
|
||||
# Alert Triage Loop — every 5 minutes (NODA2)
|
||||
# Edit SUPERVISOR_URL if supervisor runs on a different port/host.
|
||||
# Logs go to /var/log/alert_triage.log (rotate with logrotate or similar).
|
||||
|
||||
SUPERVISOR_URL=http://sofiia-supervisor:8084
|
||||
SUPERVISOR_API_KEY=
|
||||
ALERT_TRIAGE_WS_ID=default
|
||||
ALERT_TRIAGE_AGENT=sofiia
|
||||
|
||||
*/5 * * * * python3 /opt/daarion/ops/scripts/alert_triage_loop.py >> /var/log/alert_triage.log 2>&1
|
||||
|
||||
# Dry-run check (manual use, not scheduled):
|
||||
# python3 /opt/daarion/ops/scripts/alert_triage_loop.py --dry-run
|
||||
102
ops/cron/jobs.cron
Normal file
102
ops/cron/jobs.cron
Normal file
@@ -0,0 +1,102 @@
|
||||
# ─── DAARION Operational Scheduled Jobs ─────────────────────────────────────
|
||||
# Add these entries to `/etc/cron.d/daarion-ops` (NODE1, as ops user)
|
||||
# or use `crontab -e`.
|
||||
#
|
||||
# Format: minute hour dom month dow command
|
||||
# All times UTC (TZ=UTC is set below).
|
||||
#
|
||||
# Requires:
|
||||
# REPO_ROOT=/path/to/microdao-daarion
|
||||
# ROUTER_URL=http://localhost:8000 (or http://dagi-router-node1:8000)
|
||||
# DATABASE_URL=postgresql://... (if using Postgres backends)
|
||||
# ALERT_DATABASE_URL=... (optional, overrides DATABASE_URL for alerts)
|
||||
#
|
||||
# Replace /opt/daarion/microdao-daarion and python3 path as needed.
|
||||
|
||||
SHELL=/bin/bash
|
||||
TZ=UTC
|
||||
REPO_ROOT=/opt/daarion/microdao-daarion
|
||||
PYTHON=/usr/local/bin/python3
|
||||
ROUTER_URL=http://localhost:8000
|
||||
RUN_JOB=$PYTHON $REPO_ROOT/ops/scripts/run_governance_job.py
|
||||
|
||||
# ── Daily 03:30 — Audit JSONL cleanup (enforce retention_days=30) ────────────
|
||||
30 3 * * * ops $PYTHON $REPO_ROOT/ops/scripts/audit_cleanup.py \
|
||||
--audit-dir $REPO_ROOT/ops/audit \
|
||||
--retention-days 30 \
|
||||
>> /var/log/daarion/audit_cleanup.log 2>&1
|
||||
|
||||
# ── Daily 09:00 — FinOps cost digest (saves to ops/reports/cost/) ─────────────
|
||||
0 9 * * * ops $PYTHON $REPO_ROOT/ops/scripts/schedule_jobs.py daily_cost_digest \
|
||||
>> /var/log/daarion/cost_digest.log 2>&1
|
||||
|
||||
# ── Daily 09:10 — Privacy audit digest (saves to ops/reports/privacy/) ─────────
|
||||
10 9 * * * ops $PYTHON $REPO_ROOT/ops/scripts/schedule_jobs.py daily_privacy_digest \
|
||||
>> /var/log/daarion/privacy_digest.log 2>&1
|
||||
|
||||
# ── Weekly Monday 02:00 — Full drift analysis (saves to ops/reports/drift/) ────
|
||||
0 2 * * 1 ops $PYTHON $REPO_ROOT/ops/scripts/schedule_jobs.py weekly_drift_full \
|
||||
>> /var/log/daarion/drift_full.log 2>&1
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# ── GOVERNANCE ENGINE — Risk / Pressure / Backlog Jobs ───────────────────────
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# All governance jobs use run_governance_job.py → POST /v1/tools/execute
|
||||
# Logs rotate daily via logrotate or append-only (safe).
|
||||
|
||||
# ── Hourly — Risk score snapshot (saves to risk_history_store) ───────────────
|
||||
0 * * * * ops $RUN_JOB \
|
||||
--tool risk_history_tool --action snapshot \
|
||||
--params-json '{"env":"prod"}' \
|
||||
>> /var/log/daarion/risk_snapshot.log 2>&1
|
||||
|
||||
# ── Daily 09:00 — Daily Risk Digest (saves to ops/reports/risk/YYYY-MM-DD.*) ─
|
||||
0 9 * * * ops $RUN_JOB \
|
||||
--tool risk_history_tool --action digest \
|
||||
--params-json '{"env":"prod"}' \
|
||||
>> /var/log/daarion/risk_digest.log 2>&1
|
||||
|
||||
# ── Daily 03:20 — Risk history cleanup (remove old snapshots) ────────────────
|
||||
20 3 * * * ops $RUN_JOB \
|
||||
--tool risk_history_tool --action cleanup \
|
||||
--params-json '{}' \
|
||||
>> /var/log/daarion/risk_cleanup.log 2>&1
|
||||
|
||||
# ── Monday 06:00 — Weekly Platform Priority Digest (ops/reports/platform/YYYY-WW.*) ─
|
||||
0 6 * * 1 ops $RUN_JOB \
|
||||
--tool architecture_pressure_tool --action digest \
|
||||
--params-json '{"env":"prod"}' \
|
||||
>> /var/log/daarion/platform_digest.log 2>&1
|
||||
|
||||
# ── Monday 06:20 — Weekly Backlog Auto-Generation (20 min after platform digest) ─
|
||||
20 6 * * 1 ops $RUN_JOB \
|
||||
--tool backlog_tool --action auto_generate_weekly \
|
||||
--params-json '{"env":"prod"}' \
|
||||
>> /var/log/daarion/backlog_generate.log 2>&1
|
||||
|
||||
# ── Daily 03:40 — Backlog cleanup (remove done/canceled items older than 180d) ─
|
||||
40 3 * * * ops $RUN_JOB \
|
||||
--tool backlog_tool --action cleanup \
|
||||
--params-json '{"env":"prod","retention_days":180}' \
|
||||
>> /var/log/daarion/backlog_cleanup.log 2>&1
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# ── VOICE CANARY — Runtime health check (NODA2) ───────────────────────────────
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Runs every 7 minutes: live synthesis test for Polina + Ostap.
|
||||
# Writes ops/voice_canary_last.json for voice_policy_update.py.
|
||||
# Sends alert webhook if voices fail or degrade.
|
||||
# Does NOT hard-fail (runtime mode) — alerting handles escalation.
|
||||
#
|
||||
# Required env (set at top of this file or in /etc/cron.d/daarion-ops):
|
||||
# MEMORY_SERVICE_URL=http://localhost:8000 (or docker service name on NODA2)
|
||||
# ALERT_WEBHOOK_URL=<slack/telegram webhook> (optional)
|
||||
# PUSHGATEWAY_URL=http://localhost:9091 (optional, for Prometheus)
|
||||
|
||||
MEMORY_SERVICE_URL=http://localhost:8000
|
||||
|
||||
*/7 * * * * ops MEMORY_SERVICE_URL=$MEMORY_SERVICE_URL \
|
||||
ALERT_WEBHOOK_URL=$ALERT_WEBHOOK_URL \
|
||||
PUSHGATEWAY_URL=$PUSHGATEWAY_URL \
|
||||
$PYTHON $REPO_ROOT/ops/scripts/voice_canary.py --mode runtime \
|
||||
>> /var/log/daarion/voice_canary.log 2>&1
|
||||
0
ops/deployments.jsonl
Normal file
0
ops/deployments.jsonl
Normal file
57
ops/docker-compose.calendar.yml
Normal file
57
ops/docker-compose.calendar.yml
Normal file
@@ -0,0 +1,57 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
# Radicale CalDAV Server
|
||||
radicale:
|
||||
image: radicse/radicale:latest
|
||||
container_name: daarion-radicale
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "127.0.0.1:5232:5232"
|
||||
volumes:
|
||||
- radicale_data:/data
|
||||
- radicale_config:/config
|
||||
environment:
|
||||
- RADICALE_HOST=0.0.0.0
|
||||
- RADICALE_PORT=5232
|
||||
- RADICALE_LOG_LEVEL=INFO
|
||||
networks:
|
||||
- calendar-network
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "--spider", "http://localhost:5232"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# Caddy reverse proxy with TLS
|
||||
caddy:
|
||||
image: caddy:2-alpine
|
||||
container_name: daarion-caldav-proxy
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8443:443"
|
||||
- "8080:80"
|
||||
volumes:
|
||||
- ./Caddyfile:/etc/caddy/Caddyfile
|
||||
- caddy_data:/data
|
||||
- caddy_config:/config
|
||||
environment:
|
||||
- ACME_EMAIL=${CADDY_ACME_EMAIL:-admin@daarion.space}
|
||||
depends_on:
|
||||
- radicale
|
||||
networks:
|
||||
- calendar-network
|
||||
|
||||
networks:
|
||||
calendar-network:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
radicale_data:
|
||||
name: daarion-radicale-data
|
||||
radicale_config:
|
||||
name: daarion-radicale-config
|
||||
caddy_data:
|
||||
name: daarion-caddy-data
|
||||
caddy_config:
|
||||
name: daarion-caddy-config
|
||||
212
ops/grafana_voice_dashboard.json
Normal file
212
ops/grafana_voice_dashboard.json
Normal file
@@ -0,0 +1,212 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "Prometheus datasource — point to your Prometheus instance",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"title": "DAARION Voice SLO Dashboard",
|
||||
"uid": "voice-slo",
|
||||
"description": "Voice pipeline SLO: TTFA, LLM latency, TTS health, queue underflows. Aligns with ops/voice_alerts.yml and config/slo_policy.yml.",
|
||||
"tags": ["voice", "slo", "daarion"],
|
||||
"timezone": "browser",
|
||||
"refresh": "30s",
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"panels": [
|
||||
|
||||
{
|
||||
"id": 1,
|
||||
"title": "⏱ Time-to-First-Audio p50 / p95",
|
||||
"description": "SLO: voice_fast_uk p95 ≤ 5000ms | voice_quality_uk p95 ≤ 7000ms",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": { "lineWidth": 2 },
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" },
|
||||
{ "value": 5000, "color": "yellow" },
|
||||
{ "value": 7000, "color": "red" }
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(voice_ttfa_ms_bucket{voice_profile='voice_fast_uk'}[$__rate_interval]))",
|
||||
"legendFormat": "fast p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_ttfa_ms_bucket{voice_profile='voice_fast_uk'}[$__rate_interval]))",
|
||||
"legendFormat": "fast p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_ttfa_ms_bucket{voice_profile='voice_quality_uk'}[$__rate_interval]))",
|
||||
"legendFormat": "quality p95"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 2,
|
||||
"title": "🤖 LLM Latency by Model",
|
||||
"description": "LLM inference time per model. Use to identify slow models and trigger auto-promote.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "ms" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(voice_llm_ms_bucket[$__rate_interval])) by (model)",
|
||||
"legendFormat": "{{ model }} p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_llm_ms_bucket[$__rate_interval])) by (model)",
|
||||
"legendFormat": "{{ model }} p95"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 3,
|
||||
"title": "🔊 TTS Health: Synthesis Time + Error Rate",
|
||||
"description": "SLO: tts_first_ms p95 ≤ 2000ms. Error rate > 0.05/s → alert.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "ms" },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "errors/s" },
|
||||
"properties": [
|
||||
{ "id": "custom.axisPlacement", "value": "right" },
|
||||
{ "id": "unit", "value": "short" },
|
||||
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(voice_tts_first_ms_bucket[$__rate_interval]))",
|
||||
"legendFormat": "tts_first p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_tts_first_ms_bucket[$__rate_interval]))",
|
||||
"legendFormat": "tts_first p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_tts_compute_ms_bucket[$__rate_interval])) by (engine)",
|
||||
"legendFormat": "{{ engine }} compute p95"
|
||||
},
|
||||
{
|
||||
"expr": "rate(voice_tts_errors_total[$__rate_interval])",
|
||||
"legendFormat": "errors/s"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 4,
|
||||
"title": "📊 Queue Underflows + E2E Latency",
|
||||
"description": "Underflow = playback outran TTS synthesis (silence gap). E2E SLO: p95 ≤ 9000ms.",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "ms" },
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "underflows/min" },
|
||||
"properties": [
|
||||
{ "id": "custom.axisPlacement", "value": "right" },
|
||||
{ "id": "unit", "value": "short" },
|
||||
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_e2e_ms_bucket{voice_profile='voice_fast_uk'}[$__rate_interval]))",
|
||||
"legendFormat": "e2e fast p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_e2e_ms_bucket{voice_profile='voice_quality_uk'}[$__rate_interval]))",
|
||||
"legendFormat": "e2e quality p95"
|
||||
},
|
||||
{
|
||||
"expr": "rate(voice_queue_underflows_total[$__rate_interval]) * 60",
|
||||
"legendFormat": "underflows/min"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"id": 5,
|
||||
"title": "🚦 SLO Status (Stat)",
|
||||
"type": "stat",
|
||||
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
|
||||
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "value": null, "color": "green" },
|
||||
{ "value": 5000, "color": "yellow" },
|
||||
{ "value": 7000, "color": "red" }
|
||||
]
|
||||
},
|
||||
"mappings": []
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": { "calcs": ["lastNotNull"] },
|
||||
"orientation": "horizontal",
|
||||
"colorMode": "background"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_ttfa_ms_bucket{voice_profile='voice_fast_uk'}[10m]))",
|
||||
"legendFormat": "TTFA fast p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_tts_first_ms_bucket[10m]))",
|
||||
"legendFormat": "TTS first p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(voice_e2e_ms_bucket{voice_profile='voice_fast_uk'}[10m]))",
|
||||
"legendFormat": "E2E fast p95"
|
||||
},
|
||||
{
|
||||
"expr": "rate(voice_tts_errors_total[10m])",
|
||||
"legendFormat": "TTS errors/s"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
0
ops/incidents.jsonl
Normal file
0
ops/incidents.jsonl
Normal file
247
ops/runbook-alerts.md
Normal file
247
ops/runbook-alerts.md
Normal file
@@ -0,0 +1,247 @@
|
||||
# Runbook: Alert → Incident Bridge (State Machine + Cooldown)
|
||||
|
||||
## Topology
|
||||
|
||||
```
|
||||
Monitor@node1/2 ──► alert_ingest_tool.ingest ──► AlertStore (Postgres or Memory)
|
||||
│
|
||||
Sofiia / oncall ──► oncall_tool.alert_to_incident ─────┘
|
||||
│
|
||||
IncidentStore (Postgres) ◄───-┘
|
||||
│
|
||||
Sofiia NODA2: incident_triage_graph
|
||||
│
|
||||
postmortem_draft_graph
|
||||
```
|
||||
|
||||
## Alert State Machine
|
||||
|
||||
```
|
||||
new → processing → acked
|
||||
↓
|
||||
failed → (retry after TTL) → new
|
||||
```
|
||||
|
||||
| Status | Meaning |
|
||||
|-------------|--------------------------------------------------|
|
||||
| `new` | Freshly ingested, not yet claimed |
|
||||
| `processing` | Claimed by a loop worker; locked for 10 min |
|
||||
| `acked` | Successfully processed and closed |
|
||||
| `failed` | Processing error; retry after `retry_after_sec` |
|
||||
|
||||
**Concurrency safety:** `claim` uses `SELECT FOR UPDATE SKIP LOCKED` (Postgres) or an in-process lock (Memory). Two concurrent loops cannot claim the same alert.
|
||||
|
||||
**Stale processing requeue:** `claim` automatically requeues alerts whose `processing_lock_until` has expired.
|
||||
|
||||
---
|
||||
|
||||
## Triage Cooldown (per Signature)
|
||||
|
||||
After a triage runs for a given `incident_signature`, subsequent alerts with the same signature **within 15 min** (configurable via `triage_cooldown_minutes` in `alert_routing_policy.yml`) only get an `incident_append_event` note — no new triage run. This prevents triage storms.
|
||||
|
||||
```yaml
|
||||
# config/alert_routing_policy.yml
|
||||
defaults:
|
||||
triage_cooldown_minutes: 15
|
||||
```
|
||||
|
||||
The state is persisted in `incident_signature_state` table (Postgres) or in-memory (fallback).
|
||||
|
||||
---
|
||||
|
||||
## Startup Checklist
|
||||
|
||||
1. **Postgres DDL** (if `ALERT_BACKEND=postgres`):
|
||||
```bash
|
||||
DATABASE_URL=postgresql://... python3 ops/scripts/migrate_alerts_postgres.py
|
||||
```
|
||||
This is idempotent — safe to re-run. Adds state machine columns and `incident_signature_state` table.
|
||||
|
||||
2. **Env vars on NODE1 (router)**:
|
||||
```env
|
||||
ALERT_BACKEND=auto # Postgres → Memory fallback
|
||||
DATABASE_URL=postgresql://...
|
||||
```
|
||||
|
||||
3. **Monitor agent**: configure `source: monitor@node1`, use `alert_ingest_tool.ingest`.
|
||||
|
||||
## Operational Scenarios
|
||||
|
||||
### Alert storm protection
|
||||
|
||||
Alert deduplication prevents storms. If alerts are firing repeatedly:
|
||||
1. Check `occurrences` field — same alert ref means dedupe is working
|
||||
2. Adjust `dedupe_ttl_minutes` per alert (default 30)
|
||||
3. If many different fingerprints create new records — review Monitor fingerprint logic
|
||||
|
||||
### False positive alert
|
||||
|
||||
1. `alert_ingest_tool.ack` with `note="false positive"`
|
||||
2. No incident created (or close the incident if already created via `oncall_tool.incident_close`)
|
||||
|
||||
### Alert → Incident conversion
|
||||
|
||||
```bash
|
||||
# Sofiia or oncall agent calls:
|
||||
oncall_tool.alert_to_incident(
|
||||
alert_ref="alrt_...",
|
||||
incident_severity_cap="P1",
|
||||
dedupe_window_minutes=60
|
||||
)
|
||||
```
|
||||
|
||||
### View recent alerts (by status)
|
||||
|
||||
```bash
|
||||
# Default: all statuses
|
||||
alert_ingest_tool.list(window_minutes=240, env="prod")
|
||||
|
||||
# Only new/failed (unprocessed):
|
||||
alert_ingest_tool.list(window_minutes=240, status_in=["new","failed"])
|
||||
```
|
||||
|
||||
### Claim alerts for processing (Supervisor loop)
|
||||
|
||||
```bash
|
||||
# Atomic claim — locks alerts for 10 min
|
||||
alert_ingest_tool.claim(window_minutes=240, limit=25, owner="sofiia-supervisor", lock_ttl_seconds=600)
|
||||
```
|
||||
|
||||
### Mark alert as failed (retry)
|
||||
|
||||
```bash
|
||||
alert_ingest_tool.fail(alert_ref="alrt_...", error="gateway timeout", retry_after_seconds=300)
|
||||
```
|
||||
|
||||
### Operational dashboard
|
||||
|
||||
```
|
||||
GET /v1/alerts/dashboard?window_minutes=240
|
||||
# → counts by status, top signatures, latest alerts
|
||||
```
|
||||
|
||||
```
|
||||
GET /v1/incidents/open?service=gateway
|
||||
# → open/mitigating incidents
|
||||
```
|
||||
|
||||
### Monitor health check
|
||||
|
||||
Verify Monitor is pushing alerts:
|
||||
```bash
|
||||
alert_ingest_tool.list(source="monitor@node1", window_minutes=60)
|
||||
```
|
||||
If empty and there should be alerts → check Monitor service + entitlements.
|
||||
|
||||
## SLO Watch Gate
|
||||
|
||||
### Staging blocks on SLO breach
|
||||
Config in `config/release_gate_policy.yml`:
|
||||
```yaml
|
||||
staging:
|
||||
gates:
|
||||
slo_watch:
|
||||
mode: "strict"
|
||||
```
|
||||
|
||||
To temporarily bypass (emergency deploy):
|
||||
```bash
|
||||
# In release_check input:
|
||||
run_slo_watch: false
|
||||
```
|
||||
Document reason in incident timeline.
|
||||
|
||||
### Tuning SLO thresholds
|
||||
|
||||
Edit `config/slo_policy.yml`:
|
||||
```yaml
|
||||
services:
|
||||
gateway:
|
||||
latency_p95_ms: 300 # adjust
|
||||
error_rate_pct: 1.0
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| Alert `accepted=false` | Validation failure (missing service/title, invalid kind) | Fix Monitor alert payload |
|
||||
| `deduped=true` unexpectedly | Same fingerprint within TTL | Check Monitor fingerprint logic |
|
||||
| `alert_to_incident` fails "not found" | Alert ref expired from MemoryStore | Switch to Postgres backend |
|
||||
| Alerts stuck in `processing` | Loop died without acking | Run `claim` — it auto-requeues expired locks. Or: `UPDATE alerts SET status='new', processing_lock_until=NULL WHERE status='processing' AND processing_lock_until < NOW()` |
|
||||
| Alerts stuck in `failed` | Persistent processing errors | Check `last_error` field: `SELECT alert_ref, last_error FROM alerts WHERE status='failed'` |
|
||||
| Triage not running | Cooldown active | Check `incident_signature_state.last_triage_at`; or reduce `triage_cooldown_minutes` in policy |
|
||||
| `claim` returns empty | All new alerts already locked | Check for stale processing: `SELECT COUNT(*) FROM alerts WHERE status='processing' AND processing_lock_until < NOW()` |
|
||||
| SLO gate blocks in staging | SLO breach active | Fix service or override with `run_slo_watch: false` |
|
||||
| `tools.alerts.ingest` denied | Monitor agent missing entitlement | Check `config/rbac_tools_matrix.yml` `agent_monitor` role |
|
||||
| `tools.alerts.claim` denied | Agent missing `tools.alerts.claim` | Only `agent_cto` / `agent_oncall` / Supervisor can claim |
|
||||
|
||||
## Retention
|
||||
|
||||
Alerts in Postgres: no TTL enforced by default — add a cron job if needed:
|
||||
```sql
|
||||
DELETE FROM alerts WHERE created_at < NOW() - INTERVAL '30 days';
|
||||
```
|
||||
|
||||
Memory backend: cleared on process restart.
|
||||
|
||||
---
|
||||
|
||||
## Production Mode: ALERT_BACKEND=postgres
|
||||
|
||||
**⚠ Default is `memory` — do NOT use in production.** Alerts are lost on router restart.
|
||||
|
||||
### Setup (one-time, per environment)
|
||||
|
||||
**1. Run migration:**
|
||||
```bash
|
||||
python3 ops/scripts/migrate_alerts_postgres.py \
|
||||
--dsn "postgresql://user:pass@host:5432/daarion"
|
||||
# or dry-run:
|
||||
python3 ops/scripts/migrate_alerts_postgres.py --dry-run
|
||||
```
|
||||
|
||||
**2. Set env vars** (in `.env`, docker-compose, or systemd unit):
|
||||
```bash
|
||||
ALERT_BACKEND=postgres
|
||||
ALERT_DATABASE_URL=postgresql://user:pass@host:5432/daarion
|
||||
# Fallback: if ALERT_DATABASE_URL is unset, DATABASE_URL is used automatically
|
||||
```
|
||||
|
||||
**3. Restart router:**
|
||||
```bash
|
||||
docker compose -f docker-compose.node1.yml restart router
|
||||
# or node2:
|
||||
docker compose -f docker-compose.node2-sofiia.yml restart router
|
||||
```
|
||||
|
||||
**4. Verify persistence** (survive a restart):
|
||||
```bash
|
||||
# Ingest a test alert
|
||||
curl -X POST http://router:8000/v1/tools/execute \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"tool":"alert_ingest_tool","action":"ingest","service":"test","kind":"test","message":"persistence check"}'
|
||||
|
||||
# Restart router
|
||||
docker compose restart router
|
||||
|
||||
# Confirm alert still visible after restart
|
||||
curl "http://router:8000/v1/tools/execute" \
|
||||
-d '{"tool":"alert_ingest_tool","action":"list","service":"test"}'
|
||||
# Expect: alert still present → PASS
|
||||
```
|
||||
|
||||
### DSN resolution order
|
||||
|
||||
`alert_store.py` factory resolves DSN in this priority:
|
||||
1. `ALERT_DATABASE_URL` (service-specific, recommended)
|
||||
2. `DATABASE_URL` (shared Postgres, fallback)
|
||||
3. Falls back to memory with a WARNING log if neither is set.
|
||||
|
||||
### compose files updated
|
||||
|
||||
| File | ALERT_BACKEND set? |
|
||||
|------|--------------------|
|
||||
| `docker-compose.node1.yml` | ✅ `postgres` |
|
||||
| `docker-compose.node2-sofiia.yml` | ✅ `postgres` |
|
||||
| `docker-compose.staging.yml` | ✅ `postgres` |
|
||||
192
ops/runbook-audit-postgres.md
Normal file
192
ops/runbook-audit-postgres.md
Normal file
@@ -0,0 +1,192 @@
|
||||
# Runbook: Postgres Audit Backend
|
||||
|
||||
## Overview
|
||||
|
||||
The audit backend stores structured, non-payload `ToolGovernance` events for FinOps, privacy analysis, and incident triage.
|
||||
|
||||
| Backend | Config | Use case |
|
||||
|---------|--------|----------|
|
||||
| `auto` | `AUDIT_BACKEND=auto` + `DATABASE_URL=...` | **Recommended for prod/staging**: tries Postgres, falls back to JSONL on failure |
|
||||
| `postgres` | `AUDIT_BACKEND=postgres` | Hard-require Postgres; fails on DB down |
|
||||
| `jsonl` | `AUDIT_BACKEND=jsonl` | JSONL files only (default / dev) |
|
||||
| `null` | `AUDIT_BACKEND=null` | Discard all events (useful for testing) |
|
||||
|
||||
---
|
||||
|
||||
## 1. Initial Setup (NODE1 / Gateway)
|
||||
|
||||
### 1.1 Create `tool_audit_events` table (idempotent)
|
||||
|
||||
```bash
|
||||
DATABASE_URL="postgresql://user:password@host:5432/daarion" \
|
||||
python3 ops/scripts/migrate_audit_postgres.py
|
||||
```
|
||||
|
||||
Dry-run (print DDL only):
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/migrate_audit_postgres.py --dry-run
|
||||
```
|
||||
|
||||
### 1.2 Configure environment
|
||||
|
||||
In `services/router/.env` (or your Docker env):
|
||||
|
||||
```env
|
||||
AUDIT_BACKEND=auto
|
||||
DATABASE_URL=postgresql://audit_user:secret@pg-host:5432/daarion
|
||||
AUDIT_JSONL_DIR=/var/log/daarion/audit # fallback dir
|
||||
```
|
||||
|
||||
Restart the router after changes.
|
||||
|
||||
### 1.3 Verify
|
||||
|
||||
```bash
|
||||
# Check router logs for:
|
||||
# AuditStore: auto (postgres→jsonl fallback) dsn=postgresql://...
|
||||
docker logs router 2>&1 | grep AuditStore
|
||||
|
||||
# Or call the dashboard:
|
||||
curl http://localhost:8080/v1/finops/dashboard?window_hours=24 \
|
||||
-H "X-Agent-Id: sofiia"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. `AUDIT_BACKEND=auto` Fallback Behaviour
|
||||
|
||||
When `AUDIT_BACKEND=auto`:
|
||||
|
||||
1. **Normal operation**: all writes/reads go to Postgres.
|
||||
2. **Postgres failure**: `AutoAuditStore` catches the error, logs a WARNING, and switches to JSONL for the next ~5 minutes.
|
||||
3. **Recovery**: after 5 minutes the next write attempt re-tries Postgres. If successful, switches back silently.
|
||||
|
||||
This means **tool calls are never blocked** by a DB outage; events continue to land in JSONL.
|
||||
|
||||
---
|
||||
|
||||
## 3. Schema
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS tool_audit_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
ts TIMESTAMPTZ NOT NULL,
|
||||
req_id TEXT NOT NULL,
|
||||
workspace_id TEXT NOT NULL,
|
||||
user_id TEXT NOT NULL,
|
||||
agent_id TEXT NOT NULL,
|
||||
tool TEXT NOT NULL,
|
||||
action TEXT NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
duration_ms INT NOT NULL DEFAULT 0,
|
||||
in_size INT NOT NULL DEFAULT 0,
|
||||
out_size INT NOT NULL DEFAULT 0,
|
||||
input_hash TEXT NOT NULL DEFAULT '',
|
||||
graph_run_id TEXT,
|
||||
graph_node TEXT,
|
||||
job_id TEXT
|
||||
);
|
||||
```
|
||||
|
||||
Indexes: `ts`, `(workspace_id, ts)`, `(tool, ts)`, `(agent_id, ts)`.
|
||||
|
||||
---
|
||||
|
||||
## 4. Scheduled Operational Jobs
|
||||
|
||||
Jobs are run via `ops/scripts/schedule_jobs.py` (called by cron — see `ops/cron/jobs.cron`):
|
||||
|
||||
| Job | Schedule | What it does |
|
||||
|-----|----------|--------------|
|
||||
| `audit_cleanup` | Daily 03:30 | Deletes/gzips JSONL files older than 30 days |
|
||||
| `daily_cost_digest` | Daily 09:00 | Cost digest → `ops/reports/cost/YYYY-MM-DD.{json,md}` |
|
||||
| `daily_privacy_digest` | Daily 09:10 | Privacy digest → `ops/reports/privacy/YYYY-MM-DD.{json,md}` |
|
||||
| `weekly_drift_full` | Mon 02:00 | Full drift → `ops/reports/drift/week-YYYY-WW.json` |
|
||||
|
||||
### Run manually
|
||||
|
||||
```bash
|
||||
# Cost digest
|
||||
AUDIT_BACKEND=auto DATABASE_URL=... \
|
||||
python3 ops/scripts/schedule_jobs.py daily_cost_digest
|
||||
|
||||
# Privacy digest
|
||||
python3 ops/scripts/schedule_jobs.py daily_privacy_digest
|
||||
|
||||
# Weekly drift
|
||||
python3 ops/scripts/schedule_jobs.py weekly_drift_full
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Dashboard Endpoints
|
||||
|
||||
| Endpoint | RBAC | Description |
|
||||
|----------|------|-------------|
|
||||
| `GET /v1/finops/dashboard?window_hours=24` | `tools.cost.read` | FinOps cost digest |
|
||||
| `GET /v1/privacy/dashboard?window_hours=24` | `tools.data_gov.read` | Privacy/audit digest |
|
||||
|
||||
Headers:
|
||||
- `X-Agent-Id: sofiia` (or any agent with appropriate entitlements)
|
||||
- `X-Workspace-Id: your-ws`
|
||||
|
||||
---
|
||||
|
||||
## 6. Maintenance & Troubleshooting
|
||||
|
||||
### Check active backend at runtime
|
||||
|
||||
```bash
|
||||
curl -s http://localhost:8080/v1/finops/dashboard \
|
||||
-H "X-Agent-Id: sofiia" | python3 -m json.tool | grep source_backend
|
||||
```
|
||||
|
||||
### Force Postgres migration (re-apply schema)
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/migrate_audit_postgres.py
|
||||
```
|
||||
|
||||
### Postgres is down — expected behaviour
|
||||
|
||||
- Router logs: `WARNING: AutoAuditStore: Postgres write failed (...), switching to JSONL fallback`
|
||||
- Events land in `AUDIT_JSONL_DIR/tool_audit_YYYY-MM-DD.jsonl`
|
||||
- Recovery automatic after 5 minutes
|
||||
- No tool call failures
|
||||
|
||||
### JSONL fallback getting large
|
||||
|
||||
Run compaction:
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/audit_compact.py \
|
||||
--audit-dir ops/audit --window-days 7 --output ops/audit/compact
|
||||
```
|
||||
|
||||
Then cleanup old originals:
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/audit_cleanup.py \
|
||||
--audit-dir ops/audit --retention-days 30
|
||||
```
|
||||
|
||||
### Retention enforcement
|
||||
|
||||
Enforced by daily `audit_cleanup` job (cron 03:30). Policy defined in `config/data_governance_policy.yml`:
|
||||
|
||||
```yaml
|
||||
retention:
|
||||
audit_jsonl_days: 30
|
||||
audit_postgres_days: 90
|
||||
```
|
||||
|
||||
Postgres retention (if needed) must be managed separately with a `DELETE FROM tool_audit_events WHERE ts < NOW() - INTERVAL '90 days'` job or pg_partman.
|
||||
|
||||
---
|
||||
|
||||
## 7. Security Notes
|
||||
|
||||
- No PII or payload is stored in `tool_audit_events` — only sizes, hashes, and metadata.
|
||||
- `DATABASE_URL` must be a restricted user with `INSERT/SELECT` on `tool_audit_events` only.
|
||||
- JSONL fallback files inherit filesystem permissions; ensure directory is `chmod 700`.
|
||||
299
ops/runbook-backlog.md
Normal file
299
ops/runbook-backlog.md
Normal file
@@ -0,0 +1,299 @@
|
||||
# Runbook — Engineering Backlog Bridge
|
||||
|
||||
**Service:** Engineering Backlog Bridge
|
||||
**Owner:** CTO / Platform Engineering
|
||||
**On-call:** oncall
|
||||
|
||||
---
|
||||
|
||||
## 1. Storage Backends
|
||||
|
||||
### 1.1 Default: Auto (Postgres → JSONL)
|
||||
|
||||
The `AutoBacklogStore` attempts Postgres on startup. If Postgres is
|
||||
unavailable, it falls back to JSONL and retries every 5 minutes.
|
||||
|
||||
Check the active backend in logs:
|
||||
|
||||
```
|
||||
backlog_store: using PostgresBacklogStore
|
||||
backlog_store: using JsonlBacklogStore
|
||||
```
|
||||
|
||||
### 1.2 Switching backend
|
||||
|
||||
```bash
|
||||
# Use JSONL only (no DB required)
|
||||
export BACKLOG_BACKEND=jsonl
|
||||
|
||||
# Use Postgres
|
||||
export BACKLOG_BACKEND=postgres
|
||||
export BACKLOG_POSTGRES_DSN="postgresql://user:pass@host:5432/daarion"
|
||||
|
||||
# Tests only
|
||||
export BACKLOG_BACKEND=memory
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Postgres Migration
|
||||
|
||||
Run once per environment. Idempotent (safe to re-run).
|
||||
|
||||
```bash
|
||||
# Dry-run first
|
||||
python3 ops/scripts/migrate_backlog_postgres.py \
|
||||
--dsn "postgresql://user:pass@host/daarion" \
|
||||
--dry-run
|
||||
|
||||
# Apply
|
||||
python3 ops/scripts/migrate_backlog_postgres.py \
|
||||
--dsn "postgresql://user:pass@host/daarion"
|
||||
```
|
||||
|
||||
Alternatively, use `$BACKLOG_POSTGRES_DSN` or `$POSTGRES_DSN` environment variables.
|
||||
|
||||
**Tables created:**
|
||||
- `backlog_items` — dedupe_key UNIQUE constraint
|
||||
- `backlog_events` — FK to backlog_items with CASCADE DELETE
|
||||
|
||||
**Indexes:** env+status, service, due_date, owner, category, item_id, ts.
|
||||
|
||||
---
|
||||
|
||||
## 3. Weekly Auto-generation
|
||||
|
||||
### 3.1 Automatic (scheduled)
|
||||
|
||||
`weekly_backlog_generate` runs every **Monday at 06:20 UTC** (20 min after
|
||||
the weekly platform digest at 06:00 UTC). Registered in `ops/task_registry.yml`.
|
||||
|
||||
### 3.2 Manual trigger
|
||||
|
||||
```bash
|
||||
# HTTP (admin only)
|
||||
curl -X POST "https://router/v1/backlog/generate/weekly?env=prod"
|
||||
|
||||
# Tool call
|
||||
{
|
||||
"tool": "backlog_tool",
|
||||
"action": "auto_generate_weekly",
|
||||
"env": "prod"
|
||||
}
|
||||
```
|
||||
|
||||
### 3.3 Prerequisite
|
||||
|
||||
The latest `ops/reports/platform/YYYY-WW.json` must exist (produced by
|
||||
`weekly_platform_priority_digest`). If it's missing, generation returns:
|
||||
|
||||
```json
|
||||
{ "error": "No platform digest found. Run architecture_pressure_tool.digest first." }
|
||||
```
|
||||
|
||||
Fix:
|
||||
```bash
|
||||
# Trigger platform digest
|
||||
{ "tool": "architecture_pressure_tool", "action": "digest", "env": "prod" }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Cleanup (Retention)
|
||||
|
||||
**Schedule:** Daily at 03:40 UTC.
|
||||
|
||||
Removes `done` / `canceled` items older than `retention_days` (default 180d).
|
||||
|
||||
```bash
|
||||
# Manual cleanup
|
||||
{
|
||||
"tool": "backlog_tool",
|
||||
"action": "cleanup",
|
||||
"retention_days": 180
|
||||
}
|
||||
```
|
||||
|
||||
For JSONL backend, cleanup rewrites the file atomically.
|
||||
For Postgres, it runs a `DELETE WHERE status IN ('done','canceled') AND updated_at < cutoff`.
|
||||
|
||||
---
|
||||
|
||||
## 5. JSONL File Management
|
||||
|
||||
Files: `ops/backlog/items.jsonl`, `ops/backlog/events.jsonl`
|
||||
|
||||
The JSONL backend is **append-only** (updates append a new line; reads use
|
||||
last-write-wins per `id`). The file grows over time until `cleanup()` rewrites it.
|
||||
|
||||
### Check file size
|
||||
|
||||
```bash
|
||||
wc -l ops/backlog/items.jsonl
|
||||
ls -lh ops/backlog/items.jsonl
|
||||
```
|
||||
|
||||
### Manual compaction (outside cleanup schedule)
|
||||
|
||||
```bash
|
||||
python3 -c "
|
||||
from services.router.backlog_store import JsonlBacklogStore
|
||||
s = JsonlBacklogStore()
|
||||
deleted = s.cleanup(retention_days=30)
|
||||
print(f'Removed {deleted} old items')
|
||||
"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Dashboard & Monitoring
|
||||
|
||||
```bash
|
||||
# HTTP
|
||||
GET /v1/backlog/dashboard?env=prod
|
||||
|
||||
# Example response
|
||||
{
|
||||
"total": 42,
|
||||
"status_counts": {"open": 18, "in_progress": 5, "blocked": 3, "done": 14, "canceled": 2},
|
||||
"priority_counts": {"P0": 1, "P1": 9, "P2": 22, "P3": 10},
|
||||
"overdue_count": 4,
|
||||
"overdue": [
|
||||
{"id": "bl_...", "service": "gateway", "priority": "P1", "due_date": "2026-02-10", ...}
|
||||
],
|
||||
"top_services": [{"service": "gateway", "count": 5}, ...]
|
||||
}
|
||||
```
|
||||
|
||||
Alert thresholds (recommended):
|
||||
- `overdue_count > 5` → notify oncall
|
||||
- `priority_counts.P0 > 0 AND overdue` → page CTO
|
||||
|
||||
---
|
||||
|
||||
## 7. Troubleshooting
|
||||
|
||||
### Items not generated
|
||||
|
||||
1. Check if platform digest exists: `ls ops/reports/platform/*.json`
|
||||
2. Verify `generation.weekly_from_pressure_digest: true` in `config/backlog_policy.yml`
|
||||
3. Check `max_items_per_run` — may cap generation if many services match.
|
||||
|
||||
### Duplicate items across weeks
|
||||
|
||||
Normal — each week gets a new dedupe_key `...:YYYY-WW:...`. Items from
|
||||
previous weeks remain unless closed. This is intentional: unresolved issues
|
||||
accumulate visibility week-over-week.
|
||||
|
||||
### Postgres connection failures
|
||||
|
||||
Check: `BACKLOG_POSTGRES_DSN`, network access, and that migration has been run.
|
||||
The `AutoBacklogStore` will fall back to JSONL and log a warning.
|
||||
|
||||
### Wrong owner assigned
|
||||
|
||||
Check `config/backlog_policy.yml` → `ownership.overrides`. Add/update
|
||||
service-level overrides as needed. Re-run `auto_generate_weekly` — the
|
||||
upsert will update the existing item if `ownership` changed (title/meta update
|
||||
path only; owner field is preserved on existing items). For immediate
|
||||
correction, use `set_status` + `add_comment` or `upsert` with explicit `owner`.
|
||||
|
||||
---
|
||||
|
||||
## 8. Configuration Reference
|
||||
|
||||
`config/backlog_policy.yml` — key sections:
|
||||
|
||||
| Section | Key | Default | Description |
|
||||
|-------------------|-------------------------|---------|-------------|
|
||||
| `defaults` | `retention_days` | 180 | Days to keep done/canceled items |
|
||||
| `defaults` | `max_items_per_run` | 50 | Cap per generation run |
|
||||
| `dedupe` | `key_prefix` | platform_backlog | Dedupe key prefix |
|
||||
| `categories.*` | `priority` | varies | Default priority per category |
|
||||
| `categories.*` | `due_days` | varies | Days until due from creation |
|
||||
| `generation` | `weekly_from_pressure_digest` | true | Enable weekly generation |
|
||||
| `generation` | `daily_from_risk_digest` | false | Enable daily generation from risk |
|
||||
| `ownership` | `default_owner` | oncall | Fallback owner |
|
||||
| `ownership.overrides` | `{service}` | — | Per-service owner override |
|
||||
|
||||
---
|
||||
|
||||
## 9. Scheduler Wiring: cron vs task_registry
|
||||
|
||||
### Architecture
|
||||
|
||||
There are two sources of truth for scheduled jobs:
|
||||
|
||||
| File | Role |
|
||||
|------|------|
|
||||
| `ops/task_registry.yml` | **Declarative registry** — defines what jobs exist, their schedule, inputs, permissions, and dry-run behavior. Used for documentation, audits, and future scheduler integrations. |
|
||||
| `ops/cron/jobs.cron` | **Active scheduler** — physical cron entries that actually run jobs. Must be kept in sync with `task_registry.yml`. |
|
||||
|
||||
### How governance jobs are executed
|
||||
|
||||
All governance jobs use the universal runner:
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/run_governance_job.py \
|
||||
--tool <tool_name> \
|
||||
--action <action> \
|
||||
--params-json '<json>'
|
||||
```
|
||||
|
||||
This POSTs to `POST /v1/tools/execute` on the router. The router applies RBAC
|
||||
(agent_id=`scheduler`, which has `tools.backlog.admin` + `tools.pressure.write` +
|
||||
`tools.risk.write` via the `scheduler` service account) and executes the tool.
|
||||
|
||||
### Governance cron schedule
|
||||
|
||||
```
|
||||
0 * * * * hourly_risk_snapshot (risk_history_tool.snapshot)
|
||||
0 9 * * * daily_risk_digest (risk_history_tool.digest)
|
||||
20 3 * * * risk_history_cleanup (risk_history_tool.cleanup)
|
||||
0 6 * * 1 weekly_platform_priority_digest (architecture_pressure_tool.digest)
|
||||
20 6 * * 1 weekly_backlog_generate (backlog_tool.auto_generate_weekly)
|
||||
40 3 * * * daily_backlog_cleanup (backlog_tool.cleanup)
|
||||
```
|
||||
|
||||
### Deployment
|
||||
|
||||
```bash
|
||||
# 1. Copy cron file to /etc/cron.d/
|
||||
sudo cp ops/cron/jobs.cron /etc/cron.d/daarion-governance
|
||||
sudo chmod 644 /etc/cron.d/daarion-governance
|
||||
|
||||
# 2. Edit REPO_ROOT and ROUTER_URL if needed
|
||||
sudo nano /etc/cron.d/daarion-governance
|
||||
|
||||
# 3. Verify syntax
|
||||
crontab -T /etc/cron.d/daarion-governance
|
||||
|
||||
# 4. Check logs
|
||||
tail -f /var/log/daarion/risk_snapshot.log
|
||||
tail -f /var/log/daarion/backlog_generate.log
|
||||
```
|
||||
|
||||
### Dry-run testing
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/run_governance_job.py \
|
||||
--tool backlog_tool --action auto_generate_weekly \
|
||||
--params-json '{"env":"prod"}' \
|
||||
--dry-run
|
||||
```
|
||||
|
||||
### Expected artifacts
|
||||
|
||||
After first run:
|
||||
- `ops/reports/risk/YYYY-MM-DD.md` and `.json` (daily digest)
|
||||
- `ops/reports/platform/YYYY-WW.md` and `.json` (weekly platform digest)
|
||||
- `ops/backlog/items.jsonl` (if BACKLOG_BACKEND=jsonl) or Postgres `backlog_items` table
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| `Cannot reach http://localhost:8000` | Router not running or wrong `ROUTER_URL` | Check compose, set `ROUTER_URL` in cron header |
|
||||
| `HTTP 401 from /v1/tools/execute` | Missing `SCHEDULER_API_KEY` | Set env var or check auth config |
|
||||
| `error: No platform digest found` | `weekly_backlog_generate` ran before `weekly_platform_priority_digest` | Fix cron timing (06:00 vs 06:20) or run digest manually |
|
||||
| Job output empty | Scheduler running but tool silently skipped | Check tool policy (e.g. `weekly_from_pressure_digest: false`) |
|
||||
236
ops/runbook-incidents.md
Normal file
236
ops/runbook-incidents.md
Normal file
@@ -0,0 +1,236 @@
|
||||
# Runbook: Incident Log Operations
|
||||
|
||||
## 1. Initial Setup
|
||||
|
||||
### JSONL backend (default)
|
||||
|
||||
No setup needed. Incidents stored in `ops/incidents/`:
|
||||
- `incidents.jsonl` — incident records
|
||||
- `events.jsonl` — timeline events
|
||||
- `artifacts.jsonl` — artifact metadata
|
||||
|
||||
Artifact files: `ops/incidents/<incident_id>/` (md/json/txt files).
|
||||
|
||||
### Postgres backend
|
||||
|
||||
```bash
|
||||
# Run idempotent migration
|
||||
DATABASE_URL="postgresql://user:pass@host:5432/db" \
|
||||
python3 ops/scripts/migrate_incidents_postgres.py
|
||||
|
||||
# Dry run (prints DDL only)
|
||||
python3 ops/scripts/migrate_incidents_postgres.py --dry-run
|
||||
```
|
||||
|
||||
Tables created: `incidents`, `incident_events`, `incident_artifacts`.
|
||||
|
||||
## 2. Agent Roles & Permissions
|
||||
|
||||
| Agent | Role | Incident access |
|
||||
|-------|------|----------------|
|
||||
| sofiia | agent_cto | Full CRUD |
|
||||
| helion | agent_oncall | Full CRUD |
|
||||
| monitor | agent_monitor | Read only |
|
||||
| aistalk | agent_interface | Read only |
|
||||
| others | agent_default | Read only |
|
||||
|
||||
## 3. Common Operations
|
||||
|
||||
### Create incident manually (via tool)
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "oncall_tool",
|
||||
"action": "incident_create",
|
||||
"params": {
|
||||
"service": "gateway",
|
||||
"severity": "P1",
|
||||
"title": "Gateway 5xx rate >5%",
|
||||
"env": "prod",
|
||||
"started_at": "2026-02-23T10:00:00Z"
|
||||
},
|
||||
"agent_id": "sofiia"
|
||||
}
|
||||
```
|
||||
|
||||
### Generate postmortem
|
||||
|
||||
```bash
|
||||
curl -X POST http://supervisor:8000/v1/graphs/postmortem_draft/runs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"agent_id":"sofiia","input":{"incident_id":"inc_..."}}'
|
||||
```
|
||||
|
||||
### List open incidents
|
||||
|
||||
```json
|
||||
{
|
||||
"tool": "oncall_tool",
|
||||
"action": "incident_list",
|
||||
"params": { "status": "open", "limit": 20 }
|
||||
}
|
||||
```
|
||||
|
||||
## 4. Troubleshooting
|
||||
|
||||
### Artifacts not writing
|
||||
|
||||
- Check `INCIDENT_ARTIFACTS_DIR` env var (or default `ops/incidents/`).
|
||||
- Check filesystem permissions (directory must be writable).
|
||||
- Max artifact size: 2MB. Only json/md/txt allowed.
|
||||
|
||||
### Incident not found
|
||||
|
||||
- Verify `incident_id` format: `inc_YYYYMMDD_HHMM_<rand>`.
|
||||
- Check the correct backend is configured (`INCIDENT_BACKEND` env var).
|
||||
- For JSONL: verify `ops/incidents/incidents.jsonl` exists and is not corrupt.
|
||||
|
||||
### Postmortem graph fails
|
||||
|
||||
1. Check supervisor logs: `docker logs sofiia-supervisor`.
|
||||
2. Verify the incident exists: `oncall_tool.incident_get`.
|
||||
3. Check gateway is reachable from supervisor.
|
||||
4. Run `GET /v1/runs/<run_id>` to see graph status and error.
|
||||
|
||||
## 5. Backup & Retention
|
||||
|
||||
### JSONL
|
||||
|
||||
```bash
|
||||
# Backup
|
||||
cp -r ops/incidents/ /backup/incidents-$(date +%F)/
|
||||
|
||||
# Retention: manual cleanup of closed incidents older than N days
|
||||
# (Not automated yet; add to future audit_cleanup scope)
|
||||
```
|
||||
|
||||
### Postgres
|
||||
|
||||
Standard pg_dump for `incidents`, `incident_events`, `incident_artifacts` tables.
|
||||
|
||||
## 6. INCIDENT_BACKEND=auto
|
||||
|
||||
The incident store supports `INCIDENT_BACKEND=auto` which tries Postgres first and falls back to JSONL:
|
||||
|
||||
```bash
|
||||
# Set in environment:
|
||||
INCIDENT_BACKEND=auto
|
||||
DATABASE_URL=postgresql://user:pass@localhost:5432/daarion
|
||||
|
||||
# Behaviour:
|
||||
# - Primary: PostgresIncidentStore
|
||||
# - Fallback: JsonlIncidentStore (on connection failure)
|
||||
# - Recovery: re-attempts Postgres after 5 minutes
|
||||
```
|
||||
|
||||
Use `INCIDENT_BACKEND=postgres` for Postgres-only (fails if DB is down) or `jsonl` for file-only.
|
||||
|
||||
## 7. Follow-up Tracking
|
||||
|
||||
Follow-ups are `incident_append_event` entries with `type=followup` and structured meta:
|
||||
|
||||
```bash
|
||||
# Check overdue follow-ups for a service:
|
||||
curl -X POST http://gateway/v1/tools/oncall_tool -d '{
|
||||
"action": "incident_followups_summary",
|
||||
"service": "gateway",
|
||||
"env": "prod",
|
||||
"window_days": 30
|
||||
}'
|
||||
```
|
||||
|
||||
The `followup_watch` release gate uses this to warn (or block in staging/prod strict mode) about open P0/P1 incidents and overdue follow-ups. See `docs/incident/followups.md`.
|
||||
|
||||
## 8. Monitoring
|
||||
|
||||
- Check `/healthz` on supervisor.
|
||||
- Monitor `ops/incidents/` directory size (JSONL backend).
|
||||
- Daily: review `incident_list status=open` for stale incidents.
|
||||
- Weekly: review `incident_followups_summary` for overdue items.
|
||||
|
||||
## 9. Weekly Incident Intelligence Digest
|
||||
|
||||
The `weekly_incident_digest` scheduled job runs every Monday at 08:00 UTC and produces:
|
||||
|
||||
- `ops/reports/incidents/weekly/YYYY-WW.json` — full structured data
|
||||
- `ops/reports/incidents/weekly/YYYY-WW.md` — markdown report for review
|
||||
|
||||
### Manual run
|
||||
|
||||
```bash
|
||||
# Via job orchestrator
|
||||
curl -X POST http://gateway/v1/tools/jobs \
|
||||
-H "X-API-Key: $GATEWAY_API_KEY" \
|
||||
-d '{"action":"start_task","params":{"task_id":"weekly_incident_digest","inputs":{}}}'
|
||||
|
||||
# Direct tool call (CTO/oncall only)
|
||||
curl -X POST http://gateway/v1/tools/incident_intelligence_tool \
|
||||
-H "X-API-Key: $GATEWAY_API_KEY" \
|
||||
-d '{"action":"weekly_digest","save_artifacts":true}'
|
||||
```
|
||||
|
||||
### Correlating a specific incident
|
||||
|
||||
```bash
|
||||
curl -X POST http://gateway/v1/tools/incident_intelligence_tool \
|
||||
-H "X-API-Key: $GATEWAY_API_KEY" \
|
||||
-d '{"action":"correlate","incident_id":"inc_20260218_1430_abc123","append_note":true}'
|
||||
```
|
||||
|
||||
### Recurrence analysis
|
||||
|
||||
```bash
|
||||
curl -X POST http://gateway/v1/tools/incident_intelligence_tool \
|
||||
-H "X-API-Key: $GATEWAY_API_KEY" \
|
||||
-d '{"action":"recurrence","window_days":7}'
|
||||
```
|
||||
|
||||
### Digest location
|
||||
|
||||
Reports accumulate in `ops/reports/incidents/weekly/`. Retention follows standard `audit_jsonl_days` or manual cleanup.
|
||||
|
||||
See also: `docs/incident/intelligence.md` for policy tuning and scoring details.
|
||||
|
||||
---
|
||||
|
||||
## Scheduler Wiring: cron vs task_registry
|
||||
|
||||
### Alert triage loop (already active)
|
||||
|
||||
```
|
||||
# ops/cron/alert_triage.cron — runs every 5 minutes
|
||||
*/5 * * * * python3 /opt/daarion/ops/scripts/alert_triage_loop.py
|
||||
```
|
||||
|
||||
This processes `new` alerts → creates/updates incidents → triggers escalation when needed.
|
||||
|
||||
### Governance jobs (activated in ops/cron/jobs.cron)
|
||||
|
||||
The following jobs complement the triage loop by computing intelligence and
|
||||
generating artifacts that Sofiia can consume:
|
||||
|
||||
| Job | Schedule | Output |
|
||||
|-----|----------|--------|
|
||||
| `hourly_risk_snapshot` | every hour | `risk_history_store` (Postgres or memory) |
|
||||
| `daily_risk_digest` | 09:00 UTC | `ops/reports/risk/YYYY-MM-DD.{md,json}` |
|
||||
| `weekly_platform_priority_digest` | Mon 06:00 UTC | `ops/reports/platform/YYYY-WW.{md,json}` |
|
||||
| `weekly_backlog_generate` | Mon 06:20 UTC | `ops/backlog/items.jsonl` or Postgres |
|
||||
|
||||
### Registering cron entries
|
||||
|
||||
```bash
|
||||
# Deploy all governance cron jobs:
|
||||
sudo cp ops/cron/jobs.cron /etc/cron.d/daarion-governance
|
||||
sudo chmod 644 /etc/cron.d/daarion-governance
|
||||
|
||||
# Verify active entries:
|
||||
grep -v "^#\|^$" /etc/cron.d/daarion-governance
|
||||
```
|
||||
|
||||
### Relationship between task_registry.yml and ops/cron/
|
||||
|
||||
`ops/task_registry.yml` is the **canonical declaration** of all scheduled jobs
|
||||
(schedule, permissions, inputs, dry-run). `ops/cron/jobs.cron` is the **physical
|
||||
activation** — what actually runs. They must be kept in sync.
|
||||
|
||||
Use `run_governance_job.py --dry-run` to test any job before enabling in cron.
|
||||
127
ops/runbook-sofiia-console.md
Normal file
127
ops/runbook-sofiia-console.md
Normal file
@@ -0,0 +1,127 @@
|
||||
# Runbook: Sofiia Control Console
|
||||
|
||||
**Service:** sofiia-console (NODA2 primary)
|
||||
**Port:** 8002
|
||||
**UI:** http://localhost:8002/
|
||||
|
||||
---
|
||||
|
||||
## 1. Endpoints
|
||||
|
||||
| Method | Path | Auth | Description |
|
||||
|--------|------|------|-------------|
|
||||
| GET | `/` | — | Console UI (Chat + Ops + Nodes) |
|
||||
| GET | `/api/health` | — | Aggregated health (first node router) |
|
||||
| POST | `/api/chat/send` | X-API-Key* | Proxy to router `/v1/agents/sofiia/infer` |
|
||||
| GET | `/api/ops/actions` | — | List ops action ids |
|
||||
| POST | `/api/ops/run` | X-API-Key* | Run risk_dashboard / pressure_dashboard / backlog_generate_weekly / release_check |
|
||||
| GET | `/api/nodes/dashboard` | — | Per-node router health from `config/nodes_registry.yml` |
|
||||
|
||||
\* If `SOFIIA_CONSOLE_API_KEY` is set, write endpoints require header `X-API-Key: <key>`.
|
||||
|
||||
---
|
||||
|
||||
## 2. Environment variables
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| `ROUTER_URL` | Default router for health/chat when node not specified | `http://localhost:9102` |
|
||||
| `CONFIG_DIR` | Directory containing `nodes_registry.yml` (Docker: `/app/config`) | repo `config/` |
|
||||
| `NODES_<ID>_ROUTER_URL` | Override router URL per node (e.g. `NODES_NODA1_ROUTER_URL`) | from registry |
|
||||
| `SUPERVISOR_API_KEY` | Sent to router on tool/infer calls (optional) | — |
|
||||
| `SOFIIA_CONSOLE_API_KEY` | Protects POST /api/chat/send and /api/ops/run | — (no auth if unset) |
|
||||
|
||||
---
|
||||
|
||||
## 3. Deploy (Docker, NODA2)
|
||||
|
||||
```bash
|
||||
cd /path/to/microdao-daarion
|
||||
docker compose -f docker-compose.node2-sofiia.yml up -d sofiia-console
|
||||
```
|
||||
|
||||
Ensure `config/nodes_registry.yml` exists and lists `NODA1` / `NODA2` with correct `router_url`.
|
||||
Open http://localhost:8002/
|
||||
|
||||
---
|
||||
|
||||
## 4. Run locally (no Docker)
|
||||
|
||||
```bash
|
||||
cd services/sofiia-console
|
||||
pip install -r requirements.txt
|
||||
export ROUTER_URL=http://localhost:8000 # or 9102
|
||||
uvicorn app.main:app --host 0.0.0.0 --port 8002
|
||||
```
|
||||
|
||||
Then open http://localhost:8002/
|
||||
|
||||
---
|
||||
|
||||
## 5. API key rotation (NODA2)
|
||||
|
||||
Rotate both `SOFIIA_CONSOLE_API_KEY` and `SUPERVISOR_API_KEY` to one new value:
|
||||
|
||||
```bash
|
||||
cd /Users/apple/github-projects/microdao-daarion
|
||||
NEW_KEY="$(openssl rand -hex 24)"
|
||||
sed -i '' "s/^SOFIIA_CONSOLE_API_KEY=.*/SOFIIA_CONSOLE_API_KEY=${NEW_KEY}/" .env
|
||||
sed -i '' "s/^SUPERVISOR_API_KEY=.*/SUPERVISOR_API_KEY=${NEW_KEY}/" .env
|
||||
docker compose -f docker-compose.node2-sofiia.yml up -d sofiia-console router
|
||||
```
|
||||
|
||||
Quick check in container env:
|
||||
|
||||
```bash
|
||||
docker exec sofiia-console sh -lc 'env | grep -E "^(ENV|SOFIIA_CONSOLE_API_KEY|SUPERVISOR_API_KEY)="'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Ops API examples (with key)
|
||||
|
||||
```bash
|
||||
KEY="<SOFIIA_CONSOLE_API_KEY>"
|
||||
```
|
||||
|
||||
```bash
|
||||
curl -sS -X POST http://localhost:8002/api/ops/run \
|
||||
-H "X-API-Key: ${KEY}" -H "Content-Type: application/json" \
|
||||
-d '{"action_id":"risk_dashboard","node_id":"NODA2","params":{}}' | jq .
|
||||
```
|
||||
|
||||
```bash
|
||||
curl -sS -X POST http://localhost:8002/api/ops/run \
|
||||
-H "X-API-Key: ${KEY}" -H "Content-Type: application/json" \
|
||||
-d '{"action_id":"pressure_dashboard","node_id":"NODA2","params":{}}' | jq .
|
||||
```
|
||||
|
||||
```bash
|
||||
curl -sS -X POST http://localhost:8002/api/ops/run \
|
||||
-H "X-API-Key: ${KEY}" -H "Content-Type: application/json" \
|
||||
-d '{"action_id":"release_check","node_id":"NODA2","params":{}}' | jq .
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Troubleshooting
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| Chat "Помилка мережі" | Router unreachable | Check ROUTER_URL and router container |
|
||||
| Ops run returns 502 | Router or tool error | Check router logs; verify RBAC for agent `sofiia` |
|
||||
| Nodes dashboard empty | No nodes in registry or CONFIG_DIR wrong | Check `config/nodes_registry.yml` and CONFIG_DIR mount |
|
||||
| 401 on POST /api/chat/send | API key required but missing/wrong | Set X-API-Key header to SOFIIA_CONSOLE_API_KEY or leave SOFIIA_CONSOLE_API_KEY unset |
|
||||
|
||||
---
|
||||
|
||||
## 8. Verification
|
||||
|
||||
After deploy, run stack verifier (from repo root):
|
||||
|
||||
```bash
|
||||
export ROUTER_URL=http://localhost:8000 # or router:8000 inside Docker network
|
||||
python3 ops/scripts/verify_sofiia_stack.py
|
||||
```
|
||||
|
||||
See `docs/opencode/sofiia_setup.md` for OpenCode integration and tool contract.
|
||||
194
ops/runbook-sofiia-docs.md
Normal file
194
ops/runbook-sofiia-docs.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# Runbook: Sofiia Console — Projects, Documents, Sessions
|
||||
|
||||
> Scope: sofiia-console BFF (NODA2) | Storage: SQLite (Phase 1) | Vol: `sofiia-data`
|
||||
|
||||
---
|
||||
|
||||
## 1. Volume Paths
|
||||
|
||||
| Item | Host path | Container path |
|
||||
|---|---|---|
|
||||
| SQLite DB | `sofiia-data` Docker volume | `/app/data/sofiia.db` |
|
||||
| Uploaded files | `sofiia-data` Docker volume | `/app/data/uploads/{sha[:2]}/{sha}_{filename}` |
|
||||
|
||||
**Inspect volume:**
|
||||
```bash
|
||||
docker volume inspect microdao-daarion_sofiia-data
|
||||
# -> Mountpoint: /var/lib/docker/volumes/.../data/_data
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Backup Strategy
|
||||
|
||||
### Option A: rsync snapshot (recommended)
|
||||
```bash
|
||||
# Get volume mountpoint
|
||||
VOL=$(docker volume inspect microdao-daarion_sofiia-data --format '{{.Mountpoint}}')
|
||||
|
||||
# Create timestamped backup
|
||||
BACKUP_DIR=/opt/backups/sofiia-data/$(date +%Y%m%d_%H%M%S)
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
rsync -a "$VOL/" "$BACKUP_DIR/"
|
||||
echo "Backup: $BACKUP_DIR"
|
||||
```
|
||||
|
||||
### Option B: SQLite online backup
|
||||
```bash
|
||||
# Create consistent SQLite backup while service is running
|
||||
docker exec sofiia-console-node2 sqlite3 /app/data/sofiia.db ".backup /app/data/sofiia_backup.db"
|
||||
docker cp sofiia-console-node2:/app/data/sofiia_backup.db ./backup_$(date +%Y%m%d).db
|
||||
```
|
||||
|
||||
### Cron (recommended: daily at 3:00 AM)
|
||||
```cron
|
||||
0 3 * * * rsync -a $(docker volume inspect microdao-daarion_sofiia-data --format '{{.Mountpoint}}/') /opt/backups/sofiia/$(date +\%Y\%m\%d_\%H\%M\%S)/ >> /var/log/sofiia-backup.log 2>&1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Migration Commands
|
||||
|
||||
### Phase 1 → Phase 2 (SQLite → PostgreSQL)
|
||||
|
||||
When ready to migrate to Postgres:
|
||||
|
||||
1. Set `DATABASE_URL=postgresql://user:pass@host:5432/dbname` in docker-compose.
|
||||
2. Restart service — schemas auto-create via `init_db()`.
|
||||
3. Migrate data:
|
||||
```bash
|
||||
# Export SQLite to SQL
|
||||
sqlite3 /app/data/sofiia.db .dump > /tmp/sofiia_dump.sql
|
||||
|
||||
# Import to Postgres (manual cleanup may be required for SQLite-specific syntax)
|
||||
psql "$DATABASE_URL" < /tmp/sofiia_dump.sql
|
||||
```
|
||||
|
||||
### Schema version check
|
||||
```bash
|
||||
docker exec sofiia-console-node2 sqlite3 /app/data/sofiia.db ".tables"
|
||||
# Expected: documents messages projects sessions
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. API Endpoints Reference
|
||||
|
||||
| Endpoint | Method | Purpose |
|
||||
|---|---|---|
|
||||
| `/api/projects` | GET | List all projects |
|
||||
| `/api/projects` | POST | Create project `{name, description}` |
|
||||
| `/api/projects/{pid}` | GET | Get project details |
|
||||
| `/api/projects/{pid}` | PATCH | Update name/description |
|
||||
| `/api/files/upload?project_id=...` | POST | Upload file (multipart) |
|
||||
| `/api/files/{file_id}/download` | GET | Download file |
|
||||
| `/api/projects/{pid}/documents` | GET | List documents |
|
||||
| `/api/projects/{pid}/documents/{did}` | GET | Document metadata + text |
|
||||
| `/api/projects/{pid}/search` | POST | Keyword search `{query}` |
|
||||
| `/api/sessions?project_id=...` | GET | List sessions |
|
||||
| `/api/sessions/{sid}` | GET | Session details |
|
||||
| `/api/sessions/{sid}/title` | PATCH | Update session title |
|
||||
| `/api/chat/history?session_id=...` | GET | Load message history |
|
||||
| `/api/sessions/{sid}/map` | GET | Dialog map nodes + edges |
|
||||
| `/api/sessions/{sid}/fork` | POST | Fork session from message |
|
||||
|
||||
---
|
||||
|
||||
## 5. Upload Limits (env-configurable)
|
||||
|
||||
| Type | Env var | Default |
|
||||
|---|---|---|
|
||||
| Images | `UPLOAD_MAX_IMAGE_MB` | 10 MB |
|
||||
| Videos | `UPLOAD_MAX_VIDEO_MB` | 200 MB |
|
||||
| Docs | `UPLOAD_MAX_DOC_MB` | 50 MB |
|
||||
|
||||
Change without rebuild:
|
||||
```yaml
|
||||
# in docker-compose.node2-sofiia.yml
|
||||
environment:
|
||||
- UPLOAD_MAX_IMAGE_MB=20
|
||||
- UPLOAD_MAX_DOC_MB=100
|
||||
```
|
||||
Then: `docker compose restart sofiia-console`
|
||||
|
||||
---
|
||||
|
||||
## 6. Phase 2 Feature Flags
|
||||
|
||||
```bash
|
||||
# Enable Fabric OCR for images (routes through Router /v1/capability/ocr)
|
||||
USE_FABRIC_OCR=true
|
||||
|
||||
# Enable Qdrant embedding indexing for documents
|
||||
USE_EMBEDDINGS=true
|
||||
```
|
||||
|
||||
Both default to `false` (no impact on baseline performance).
|
||||
|
||||
---
|
||||
|
||||
## 7. Troubleshooting
|
||||
|
||||
### DB not initialized
|
||||
```bash
|
||||
docker logs sofiia-console-node2 | grep -i "DB init"
|
||||
# Expected: "sofiia-console DB initialised"
|
||||
```
|
||||
|
||||
If missing: restart container. DB init is in `lifespan()` startup hook.
|
||||
|
||||
### Upload failing (413)
|
||||
Check file size vs. limit. Inspect:
|
||||
```bash
|
||||
curl -s http://localhost:8002/api/projects | jq
|
||||
```
|
||||
If 500 → check logs: `docker logs sofiia-console-node2 --tail 50`
|
||||
|
||||
### Session not restoring after page reload
|
||||
- Browser `localStorage` must have `sofiia_session_id`
|
||||
- Check: `GET /api/chat/history?session_id={id}&limit=20`
|
||||
- If empty: session exists but has 0 messages (new session)
|
||||
|
||||
### Dialog map empty
|
||||
```bash
|
||||
curl -s "http://localhost:8002/api/sessions?project_id=default&limit=5" | jq
|
||||
curl -s "http://localhost:8002/api/sessions/{session_id}/map" | jq '.nodes | length'
|
||||
```
|
||||
If 0 nodes: no messages saved yet. Ensure `_do_save_memory` is not blocked (check Memory Service health).
|
||||
|
||||
### Volume full
|
||||
```bash
|
||||
docker system df
|
||||
du -sh $(docker volume inspect microdao-daarion_sofiia-data --format '{{.Mountpoint}}')
|
||||
```
|
||||
Cleanup old uploads manually (content-addressed, safe to delete by sha if no DB references):
|
||||
```bash
|
||||
sqlite3 /app/data/sofiia.db "SELECT file_id FROM documents" > /tmp/active_files.txt
|
||||
# Then diff with actual /app/data/uploads/* to find orphans
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Testing
|
||||
|
||||
### Run unit tests
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
python3 -m pytest tests/test_sofiia_docs.py -v
|
||||
```
|
||||
|
||||
### Smoke test: create project + upload
|
||||
```bash
|
||||
BASE=http://localhost:8002
|
||||
|
||||
# Create project
|
||||
curl -s -X POST "$BASE/api/projects" -H "Content-Type: application/json" \
|
||||
-d '{"name":"Test Project","description":"Smoke test"}' | jq .
|
||||
|
||||
# Upload file
|
||||
curl -s -X POST "$BASE/api/files/upload?project_id=default&title=Test+Doc" \
|
||||
-F "file=@/etc/hostname" | jq '.doc_id, .sha256, .size_bytes'
|
||||
|
||||
# List docs
|
||||
curl -s "$BASE/api/projects/default/documents" | jq '.[].filename'
|
||||
```
|
||||
257
ops/runbook-sofiia-supervisor.md
Normal file
257
ops/runbook-sofiia-supervisor.md
Normal file
@@ -0,0 +1,257 @@
|
||||
# Runbook: sofiia-supervisor (NODA2)
|
||||
|
||||
**Service**: `sofiia-supervisor` + `sofiia-redis`
|
||||
**Host**: NODA2 | **External port**: 8084
|
||||
**Escalation**: #platform-ops → @platform-oncall
|
||||
|
||||
---
|
||||
|
||||
## Health Check
|
||||
|
||||
```bash
|
||||
# Basic health
|
||||
curl -sf http://localhost:8084/healthz && echo OK
|
||||
|
||||
# Expected response:
|
||||
# {"status":"ok","service":"sofiia-supervisor","graphs":["release_check","incident_triage"],
|
||||
# "state_backend":"redis","gateway_url":"http://router:8000"}
|
||||
|
||||
# Redis health
|
||||
docker exec sofiia-redis redis-cli ping
|
||||
# Expected: PONG
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Logs
|
||||
|
||||
```bash
|
||||
# Supervisor logs (last 100 lines)
|
||||
docker logs sofiia-supervisor --tail 100 -f
|
||||
|
||||
# Filter tool call events (no payload)
|
||||
docker logs sofiia-supervisor 2>&1 | grep "gateway_call\|gateway_ok\|gateway_tool_fail"
|
||||
|
||||
# Redis logs
|
||||
docker logs sofiia-redis --tail 50
|
||||
|
||||
# All supervisor logs to file
|
||||
docker logs sofiia-supervisor > /tmp/supervisor-$(date +%Y%m%d-%H%M%S).log 2>&1
|
||||
```
|
||||
|
||||
Log format:
|
||||
```
|
||||
2026-02-23T10:00:01Z [INFO] gateway_call tool=job_orchestrator_tool action=start_task node=start_job run=gr_abc123 hash=d4e5f6 size=312 attempt=1
|
||||
2026-02-23T10:00:02Z [INFO] gateway_ok tool=job_orchestrator_tool node=start_job run=gr_abc123 elapsed_ms=145
|
||||
```
|
||||
|
||||
**Payload is NEVER logged.** Only: tool name, action, node, run_id, input hash, size, elapsed time.
|
||||
|
||||
---
|
||||
|
||||
## Restart
|
||||
|
||||
```bash
|
||||
# Graceful restart (in-flight runs will fail → status=failed in Redis)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml restart sofiia-supervisor
|
||||
|
||||
# Full restart with rebuild (after code changes)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml \
|
||||
up -d --build sofiia-supervisor
|
||||
|
||||
# Check container status after restart
|
||||
docker ps --filter name=sofiia-supervisor --format "table {{.Names}}\t{{.Status}}"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Start / Stop
|
||||
|
||||
```bash
|
||||
# Start (attached to dagi-network-node2)
|
||||
docker compose \
|
||||
-f docker-compose.node2.yml \
|
||||
-f docker-compose.node2-sofiia-supervisor.yml \
|
||||
up -d sofiia-supervisor sofiia-redis
|
||||
|
||||
# Stop (preserves Redis data)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml stop sofiia-supervisor
|
||||
|
||||
# Stop + remove containers (keeps volumes)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml down
|
||||
|
||||
# Full teardown (removes volumes — DESTROYS run history)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml down -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## State Cleanup
|
||||
|
||||
```bash
|
||||
# Connect to Redis
|
||||
docker exec -it sofiia-redis redis-cli
|
||||
|
||||
# List all run keys
|
||||
127.0.0.1:6379> KEYS run:*
|
||||
|
||||
# Check a specific run
|
||||
127.0.0.1:6379> GET run:gr_abc123
|
||||
|
||||
# Check run TTL (seconds until expiry)
|
||||
127.0.0.1:6379> TTL run:gr_abc123
|
||||
|
||||
# Manually delete a stuck/stale run
|
||||
127.0.0.1:6379> DEL run:gr_abc123 run:gr_abc123:events
|
||||
|
||||
# Count all active runs
|
||||
127.0.0.1:6379> DBSIZE
|
||||
|
||||
# Flush all run data (CAUTION: destroys all history)
|
||||
# 127.0.0.1:6379> FLUSHDB
|
||||
|
||||
# Exit
|
||||
127.0.0.1:6379> EXIT
|
||||
```
|
||||
|
||||
Default TTL: `RUN_TTL_SEC=86400` (24h). Runs auto-expire.
|
||||
|
||||
---
|
||||
|
||||
## Common Issues
|
||||
|
||||
### `sofiia-supervisor` can't reach router
|
||||
|
||||
```bash
|
||||
# Check network
|
||||
docker exec sofiia-supervisor curl -sf http://router:8000/healthz
|
||||
|
||||
# If fails: verify router is on dagi-network-node2
|
||||
docker network inspect dagi-network-node2 | grep -A3 router
|
||||
```
|
||||
|
||||
**Fix**: Ensure both services are on `dagi-network-node2` (see compose `networks` section).
|
||||
|
||||
---
|
||||
|
||||
### Run stuck in `running` status
|
||||
|
||||
Cause: Graph crashed mid-execution or supervisor was restarted.
|
||||
|
||||
```bash
|
||||
# Manually cancel via API
|
||||
curl -X POST http://localhost:8084/v1/runs/gr_STUCK_ID/cancel
|
||||
|
||||
# Or force-set status in Redis
|
||||
docker exec -it sofiia-redis redis-cli
|
||||
> GET run:gr_STUCK_ID
|
||||
> SET run:gr_STUCK_ID '{"run_id":"gr_STUCK_ID","graph":"release_check","status":"failed",...}'
|
||||
> EXIT
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Redis connection error
|
||||
|
||||
```bash
|
||||
docker logs sofiia-supervisor 2>&1 | grep "Redis connection error"
|
||||
|
||||
# Check Redis is running
|
||||
docker ps --filter name=sofiia-redis
|
||||
|
||||
# Restart Redis (data preserved in volume)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml restart sofiia-redis
|
||||
|
||||
# Test connection
|
||||
docker exec sofiia-redis redis-cli -h sofiia-redis ping
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### High memory on Redis
|
||||
|
||||
```bash
|
||||
# Check memory usage
|
||||
docker exec sofiia-redis redis-cli info memory | grep used_memory_human
|
||||
|
||||
# Redis is configured with maxmemory=256mb + allkeys-lru policy
|
||||
# Old runs will be evicted automatically
|
||||
|
||||
# Manual cleanup of old runs (older than 12h):
|
||||
# Write a cleanup script or reduce RUN_TTL_SEC in .env
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Gateway returns 401 Unauthorized
|
||||
|
||||
Cause: `SUPERVISOR_API_KEY` mismatch between supervisor and router.
|
||||
|
||||
```bash
|
||||
# Check env
|
||||
docker exec sofiia-supervisor env | grep SUPERVISOR_API_KEY
|
||||
|
||||
# Compare with router
|
||||
docker exec dagi-router-node2 env | grep SUPERVISOR_API_KEY
|
||||
```
|
||||
|
||||
Both must match. Set via `SUPERVISOR_API_KEY=...` in docker-compose or `.env`.
|
||||
|
||||
---
|
||||
|
||||
## Metrics / Monitoring
|
||||
|
||||
Currently no dedicated metrics endpoint. Monitor via:
|
||||
|
||||
1. **`/healthz`** — service up/down
|
||||
2. **Docker stats** — `docker stats sofiia-supervisor sofiia-redis`
|
||||
3. **Log patterns** — `gateway_ok`, `gateway_tool_fail`, `run_graph error`
|
||||
|
||||
Planned: Prometheus `/metrics` endpoint with run counts per graph/status.
|
||||
|
||||
---
|
||||
|
||||
## Upgrade
|
||||
|
||||
```bash
|
||||
# Pull new image (if using registry)
|
||||
docker pull daarion/sofiia-supervisor:latest
|
||||
|
||||
# Or rebuild from source
|
||||
cd /path/to/microdao-daarion
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml \
|
||||
build --no-cache sofiia-supervisor
|
||||
|
||||
# Rolling restart (zero-downtime is NOT guaranteed — single instance)
|
||||
docker compose -f docker-compose.node2-sofiia-supervisor.yml \
|
||||
up -d sofiia-supervisor
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Available Graphs
|
||||
|
||||
| Graph | Description | Key nodes |
|
||||
|-------|-------------|-----------|
|
||||
| `release_check` | Release validation pipeline | jobs → poll → result |
|
||||
| `incident_triage` | Collect observability + KB + SLO/privacy/cost context | overview → logs → health → traces → slo_context → privacy → cost → report |
|
||||
| `postmortem_draft` | Generate postmortem from incident | load_incident → ensure_triage → draft → attach_artifacts → followups |
|
||||
|
||||
### postmortem_draft (new)
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:8084/v1/graphs/postmortem_draft/runs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"agent_id":"sofiia","input":{"incident_id":"inc_..."}}'
|
||||
```
|
||||
|
||||
Generates markdown + JSON postmortem, attaches as incident artifacts, and appends follow-up timeline events. See `docs/supervisor/postmortem_draft_graph.md`.
|
||||
|
||||
---
|
||||
|
||||
## Known Limitations (MVP)
|
||||
|
||||
1. **Single worker** (`--workers 1`) — graph runs are sequential per process. For concurrent load, increase workers (but Redis state handles consistency).
|
||||
2. **No LangGraph checkpointing** — runs interrupted by restart will show as `failed`; they do not resume.
|
||||
3. **Polling-based job status** — `release_check` polls `job_orchestrator_tool` every 3s. Tune `JOB_POLL_INTERVAL_SEC` if needed.
|
||||
4. **In-flight cancellation** — `cancel` sets status in Redis but cannot interrupt an already-executing tool call. Cancellation is effective between nodes.
|
||||
221
ops/runbook-voice-incidents.md
Normal file
221
ops/runbook-voice-incidents.md
Normal file
@@ -0,0 +1,221 @@
|
||||
# Voice Incidents Runbook
|
||||
**Version:** 1.0 | **Node:** NODA2 | **SLO doc:** `config/slo_policy.yml`
|
||||
|
||||
---
|
||||
|
||||
## Перший крок для БУДЬ-ЯКОГО алерту (30 секунд)
|
||||
|
||||
```bash
|
||||
# 1. Репро пакет — весь контекст в одному запиті
|
||||
curl -s http://localhost:8002/api/voice/degradation_status | python3 -m json.tool
|
||||
|
||||
# 2. Canary живий синтез
|
||||
python3 ops/scripts/voice_canary.py --mode preflight --memory-url http://localhost:8000
|
||||
|
||||
# 3. Логи останніх 2 хвилин
|
||||
docker logs sofiia-console --since 2m 2>&1 | grep -E "ERROR|WARNING|TTS|LLM|502|429|503"
|
||||
docker logs dagi-memory-service-node2 --since 2m 2>&1 | grep -E "ERROR|403|edge.tts|synthesiz"
|
||||
```
|
||||
|
||||
**Поля `repro` у відповіді** дають: `last_5_tts_errors`, `last_5_llm_errors`, `node_id`, `last_model`, `concurrent_tts_slots_free`.
|
||||
|
||||
---
|
||||
|
||||
## Alert 1: `VoiceTTFA_P95_Breach_Fast`
|
||||
**Умова:** TTFA p95 > 5000ms за 10 хвилин | **Severity:** warning
|
||||
|
||||
**Що значить:** LLM відповідає повільно — черга Ollama переповнена, модель cold-start, або qwen3.5 вибрана замість gemma3.
|
||||
|
||||
### Крок 1 — Діагностика (2 хв)
|
||||
```bash
|
||||
# Ollama поточний стан
|
||||
curl -s http://localhost:11434/api/ps | python3 -m json.tool
|
||||
# Метрики LLM по моделях (якщо є Prometheus)
|
||||
# promql: histogram_quantile(0.95, rate(voice_llm_ms_bucket[5m])) by (model)
|
||||
|
||||
# Деградаційний стан
|
||||
curl -s http://localhost:8002/api/voice/degradation_status | python3 -c \
|
||||
"import sys,json; d=json.load(sys.stdin); print(d['repro']['last_model'], d['p95'])"
|
||||
```
|
||||
|
||||
### Крок 2 — Mitigation
|
||||
```bash
|
||||
# A. Примусово переключити на gemma3 (якщо qwen3.5 завантажений)
|
||||
# В UI: зняти галочку "Якісно" → fast profile автоматично обере gemma3
|
||||
|
||||
# B. Якщо Ollama завантажений запитами — зупинити важкі моделі
|
||||
curl -s -X POST http://localhost:11434/api/generate \
|
||||
-d '{"model":"qwen3.5:35b-a3b","keep_alive":0}' # вивантажити з GPU
|
||||
|
||||
# C. Якщо Ollama не відповідає — перезапуск
|
||||
docker restart ollama && sleep 10
|
||||
curl -s http://localhost:11434/api/tags | python3 -m json.tool
|
||||
```
|
||||
|
||||
### Крок 3 — Verify
|
||||
```bash
|
||||
python3 ops/scripts/voice_canary.py --mode runtime --memory-url http://localhost:8000
|
||||
# Очікування: overall=ok, Polina/Ostap < 3000ms
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert 2: `VoiceTTFA_P95_Breach_Quality`
|
||||
**Умова:** quality profile TTFA p95 > 7000ms | **Severity:** warning
|
||||
|
||||
**Що значить:** qwen3.5 або qwen3:14b надто повільні. Часто — конкурентні запити або cold token generation.
|
||||
|
||||
### Дії
|
||||
1. Перевірити `degradation_status.repro.last_model` — підтвердити що це quality profile.
|
||||
2. Якщо це ізольована сесія — ігнорувати (quality SLO м'якший).
|
||||
3. Якщо 5+ хвилин стабільно → переключити всіх на fast: в `router-config.yml` тимчасово видалити `voice_quality_uk` з `agent_voice_profiles.sofiia.quality_option`.
|
||||
4. Після нормалізації — повернути.
|
||||
|
||||
```bash
|
||||
# Підтвердити що fast profile нормальний
|
||||
curl -s -X POST http://localhost:8002/api/voice/chat/stream \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"message":"ping","model":"ollama:gemma3:latest","voice_profile":"voice_fast_uk"}' \
|
||||
| python3 -c "import sys,json; d=json.load(sys.stdin); print('llm_ms:', d['meta']['llm_ms'])"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert 3: `VoiceQueueUnderflow_Spike`
|
||||
**Умова:** underflow rate > 1/хв за 5 хвилин | **Severity:** warning
|
||||
|
||||
**Що значить:** браузер відтворює аудіо швидше ніж BFF синтезує `rest_chunks`. Користувач чує тишу між реченнями.
|
||||
|
||||
### Діагностика
|
||||
```bash
|
||||
# Перевірити TTS latency (чи сповільнилось edge-tts?)
|
||||
curl -s http://localhost:8000/voice/health | python3 -c \
|
||||
"import sys,json; d=json.load(sys.stdin); [print(v['voice'], v['ms'],'ms') for v in d['voices']]"
|
||||
|
||||
# Перевірити concurrent TTS slots
|
||||
curl -s http://localhost:8002/api/voice/degradation_status | python3 -c \
|
||||
"import sys,json; d=json.load(sys.stdin); print('free slots:', d['repro']['concurrent_tts_slots_free'])"
|
||||
```
|
||||
|
||||
### Mitigation
|
||||
- **Якщо TTS slow** (> 2s) → Alert 4 (edge-tts). Дивись нижче.
|
||||
- **Якщо concurrent slots = 0** → TTS DOS. Перевірити `docker stats dagi-memory-service-node2`. Збільшити `MAX_CONCURRENT_TTS` або перезапустити memory-service.
|
||||
- **Якщо slots OK** → перший чанк надто короткий (~1 речення). Тимчасове рішення — зменшити `MIN_CHUNK_CHARS` у `voice_utils.py` щоб більше тексту йшло у перший чанк.
|
||||
|
||||
---
|
||||
|
||||
## Alert 4: `VoiceTTS_P95_Degraded`
|
||||
**Умова:** TTS synthesis p95 > 2000ms за 10 хвилин | **Severity:** **critical**
|
||||
|
||||
**Що значить:** edge-tts сповільнився або починає отримувати 403. Типова причина — Microsoft endpoint зміна auth або rate limiting.
|
||||
|
||||
### Крок 1 — Визначити тип помилки (1 хв)
|
||||
```bash
|
||||
# Подивитись last_5_tts_errors
|
||||
curl -s http://localhost:8002/api/voice/degradation_status | python3 -c \
|
||||
"import sys,json; d=json.load(sys.stdin); [print(e) for e in d['repro']['last_5_tts_errors']]"
|
||||
|
||||
# Живий тест
|
||||
python3 ops/scripts/voice_canary.py --mode preflight --memory-url http://localhost:8000
|
||||
```
|
||||
|
||||
### Якщо 403 errors:
|
||||
```bash
|
||||
# Перевірити версію edge-tts
|
||||
docker exec dagi-memory-service-node2 pip show edge-tts | grep Version
|
||||
# Очікується: 7.2.7
|
||||
|
||||
# Якщо версія не 7.2.7 — оновити
|
||||
docker exec dagi-memory-service-node2 pip install edge-tts==7.2.7
|
||||
docker restart dagi-memory-service-node2
|
||||
sleep 10 && python3 ops/scripts/voice_canary.py --mode preflight
|
||||
```
|
||||
|
||||
### Якщо timeout / network:
|
||||
```bash
|
||||
# Тест від сервера до Microsoft endpoint
|
||||
docker exec dagi-memory-service-node2 python3 -c \
|
||||
"import asyncio, edge_tts; asyncio.run(edge_tts.list_voices())"
|
||||
|
||||
# Якщо мережева проблема — тимчасово переключити на espeak (fallback)
|
||||
# В memory-service env: TTS_FALLBACK_ENGINE=espeak
|
||||
# Увага: якість значно гірша, але голос є
|
||||
```
|
||||
|
||||
### Нотувати в incident log:
|
||||
```bash
|
||||
curl -s -X POST http://localhost:9102/v1/tools/execute \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"tool":"oncall_tool","action":"incident_log_append","params":{"severity":"sev2","title":"TTS degraded — edge-tts","body":"VoiceTTS_P95_Degraded alert fired. Last errors: ..."}}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Alert 5: `VoiceTTS_ErrorRate_High`
|
||||
**Умова:** TTS errors > 0.05/s за 3 хвилини | **Severity:** **critical**
|
||||
|
||||
**Що значить:** масові відмови TTS синтезу. Користувачі або не чують нічого, або чують espeak-fallback.
|
||||
|
||||
### Перший крок (30 секунд)
|
||||
```bash
|
||||
# Скільки помилок і якого типу
|
||||
docker logs dagi-memory-service-node2 --since 5m 2>&1 | grep -c "ERROR\|403\|edge.tts"
|
||||
docker logs dagi-memory-service-node2 --since 5m 2>&1 | grep "ERROR" | tail -5
|
||||
```
|
||||
|
||||
### Mitigation tree:
|
||||
```
|
||||
error_type = 403 → Крок "Якщо 403 errors" з Alert 4
|
||||
error_type = timeout → Перевірити мережу, перезапустити memory-service
|
||||
error_type = synthesis → pip install edge-tts==7.2.7 --force-reinstall
|
||||
error_type = OOM → docker stats → перезапустити memory-service з більшим RAM limit
|
||||
```
|
||||
|
||||
### Аварійний fallback (якщо нічого не допомогло):
|
||||
```bash
|
||||
# Вимкнути автоспік у UI — щоб не показувало помилки
|
||||
# Або тимчасово вимкнути streaming
|
||||
docker exec sofiia-console env VOICE_STREAM_ENABLED=false \
|
||||
uvicorn app.main:app --host 0.0.0.0 --port 8002 &
|
||||
# (не рекомендовано на prod без rebuild, але як аварійний захід)
|
||||
```
|
||||
|
||||
### Повідомити користувачів (якщо > 10 хвилин):
|
||||
- Додати banner у UI: змінна `VOICE_DEGRADED_BANNER` у env → відобразити через degradation badge "🔴 TTS DEGRADED"
|
||||
|
||||
---
|
||||
|
||||
## Escalation
|
||||
|
||||
| Тривалість | Дія |
|
||||
|------------|-----|
|
||||
| < 10 хв | Автоматичний деградаційний badge у UI, моніторинг |
|
||||
| 10–30 хв | Mitigation з цього runbook, canary preflight |
|
||||
| > 30 хв | Escalate до @IvanTytar, записати incident в ops/incidents.jsonl |
|
||||
| > 2 год | Post-mortem draft (Sofiia-supervisor `postmortem_draft_graph`) |
|
||||
|
||||
```bash
|
||||
# Записати incident
|
||||
echo '{"ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","sev":"sev2","title":"Voice TTS degraded","status":"open"}' \
|
||||
>> ops/incidents.jsonl
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Корисні команди (bookmark)
|
||||
|
||||
```bash
|
||||
# Швидкий статус всього voice стеку
|
||||
curl -s http://localhost:8002/api/voice/degradation_status | python3 -m json.tool
|
||||
curl -s http://localhost:8000/voice/health | python3 -c "import sys,json; d=json.load(sys.stdin); print('TTS:', d['edge_tts'], '| Polina:', [v for v in d['voices'] if 'Polina' in v['voice']][0]['ms'], 'ms')"
|
||||
python3 ops/scripts/voice_canary.py --mode preflight
|
||||
|
||||
# Browser console для активних сесій
|
||||
# _voiceStats() — p50/p95 по останніх 20 турнах
|
||||
# _voice_degradation_sm — поточний стан на сервері
|
||||
|
||||
# Prometheus queries (якщо є)
|
||||
# histogram_quantile(0.95, rate(voice_ttfa_ms_bucket[5m])) by (voice_profile)
|
||||
# rate(voice_tts_errors_total[5m])
|
||||
# rate(voice_queue_underflows_total[5m]) * 60
|
||||
```
|
||||
148
ops/scripts/alert_triage_loop.py
Normal file
148
ops/scripts/alert_triage_loop.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
alert_triage_loop.py — Scheduled runner for the alert_triage_graph.
|
||||
|
||||
Calls the sofiia-supervisor API (POST /v1/graphs/alert_triage/runs) and
|
||||
polls until the run completes, then prints the digest.
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/alert_triage_loop.py [--dry-run] [--supervisor-url URL]
|
||||
|
||||
Environment:
|
||||
SUPERVISOR_URL default: http://sofiia-supervisor:8084
|
||||
SUPERVISOR_API_KEY optional API key (Bearer token)
|
||||
ALERT_TRIAGE_WS_ID workspace_id (default: "default")
|
||||
ALERT_TRIAGE_AGENT agent_id (default: "sofiia")
|
||||
|
||||
Cron example (NODA2):
|
||||
*/5 * * * * python3 /opt/daarion/ops/scripts/alert_triage_loop.py >> /var/log/alert_triage.log 2>&1
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SUPERVISOR_URL = os.getenv("SUPERVISOR_URL", "http://sofiia-supervisor:8084")
|
||||
API_KEY = os.getenv("SUPERVISOR_API_KEY", "")
|
||||
WORKSPACE_ID = os.getenv("ALERT_TRIAGE_WS_ID", "default")
|
||||
AGENT_ID = os.getenv("ALERT_TRIAGE_AGENT", "sofiia")
|
||||
|
||||
MAX_POLL_SECONDS = 220
|
||||
POLL_INTERVAL_SECONDS = 5
|
||||
|
||||
|
||||
def _headers() -> dict:
|
||||
h = {"Content-Type": "application/json", "Accept": "application/json"}
|
||||
if API_KEY:
|
||||
h["Authorization"] = f"Bearer {API_KEY}"
|
||||
return h
|
||||
|
||||
|
||||
def _http_post(url: str, body: dict) -> dict:
|
||||
data = json.dumps(body).encode()
|
||||
req = urllib.request.Request(url, data=data, headers=_headers(), method="POST")
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
return json.loads(resp.read())
|
||||
|
||||
|
||||
def _http_get(url: str) -> dict:
|
||||
req = urllib.request.Request(url, headers=_headers(), method="GET")
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
return json.loads(resp.read())
|
||||
|
||||
|
||||
def start_run(dry_run: bool = False) -> str:
|
||||
payload = {
|
||||
"workspace_id": WORKSPACE_ID,
|
||||
"user_id": "scheduler",
|
||||
"agent_id": AGENT_ID,
|
||||
"input": {
|
||||
"policy_profile": "default",
|
||||
"dry_run": dry_run,
|
||||
"workspace_id": WORKSPACE_ID,
|
||||
"agent_id": AGENT_ID,
|
||||
},
|
||||
}
|
||||
url = f"{SUPERVISOR_URL}/v1/graphs/alert_triage/runs"
|
||||
logger.info("Starting alert_triage run (dry_run=%s)", dry_run)
|
||||
resp = _http_post(url, payload)
|
||||
run_id = resp.get("run_id")
|
||||
if not run_id:
|
||||
raise RuntimeError(f"No run_id in response: {resp}")
|
||||
logger.info("Run started: %s (status=%s)", run_id, resp.get("status"))
|
||||
return run_id
|
||||
|
||||
|
||||
def poll_run(run_id: str) -> dict:
|
||||
url = f"{SUPERVISOR_URL}/v1/runs/{run_id}"
|
||||
deadline = time.monotonic() + MAX_POLL_SECONDS
|
||||
while time.monotonic() < deadline:
|
||||
resp = _http_get(url)
|
||||
status = resp.get("status", "unknown")
|
||||
if status in ("succeeded", "failed", "cancelled"):
|
||||
return resp
|
||||
logger.debug("Run %s status=%s — waiting…", run_id, status)
|
||||
time.sleep(POLL_INTERVAL_SECONDS)
|
||||
raise TimeoutError(f"Run {run_id} did not complete in {MAX_POLL_SECONDS}s")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Alert Triage Loop runner")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Simulate without writes")
|
||||
parser.add_argument("--supervisor-url", default=SUPERVISOR_URL)
|
||||
args = parser.parse_args()
|
||||
|
||||
global SUPERVISOR_URL
|
||||
SUPERVISOR_URL = args.supervisor_url
|
||||
|
||||
try:
|
||||
run_id = start_run(dry_run=args.dry_run)
|
||||
result = poll_run(run_id)
|
||||
status = result.get("status")
|
||||
run_result = result.get("result") or {}
|
||||
|
||||
digest = run_result.get("digest_md", "")
|
||||
summary = run_result.get("result_summary") or {}
|
||||
|
||||
logger.info(
|
||||
"Alert triage run %s completed: status=%s processed=%s "
|
||||
"created=%s updated=%s skipped=%s errors=%s triages=%s",
|
||||
run_id, status,
|
||||
summary.get("processed", "?"),
|
||||
summary.get("created_incidents", "?"),
|
||||
summary.get("updated_incidents", "?"),
|
||||
summary.get("skipped", "?"),
|
||||
summary.get("errors", "?"),
|
||||
summary.get("triage_runs", "?"),
|
||||
)
|
||||
|
||||
if digest:
|
||||
print("\n" + digest)
|
||||
|
||||
if status == "failed":
|
||||
logger.error("Run %s FAILED", run_id)
|
||||
sys.exit(1)
|
||||
|
||||
except urllib.error.URLError as e:
|
||||
logger.error("Cannot reach supervisor at %s: %s", SUPERVISOR_URL, e)
|
||||
sys.exit(2)
|
||||
except TimeoutError as e:
|
||||
logger.error("Timeout: %s", e)
|
||||
sys.exit(3)
|
||||
except Exception as e:
|
||||
logger.error("Unexpected error: %s", e)
|
||||
sys.exit(4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
215
ops/scripts/audit_cleanup.py
Normal file
215
ops/scripts/audit_cleanup.py
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
audit_cleanup.py — Audit JSONL Retention Enforcement
|
||||
|
||||
Finds ops/audit/tool_audit_YYYY-MM-DD.jsonl files older than `retention_days`,
|
||||
then either:
|
||||
- dry_run=True → report only, no changes
|
||||
- archive_gzip=True → compress to .jsonl.gz, delete original
|
||||
- otherwise → delete original
|
||||
|
||||
Exit codes:
|
||||
0 — success (including dry_run)
|
||||
1 — script error
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/audit_cleanup.py \
|
||||
--retention-days 30 \
|
||||
--audit-dir ops/audit \
|
||||
[--dry-run] [--archive-gzip] [--verbose]
|
||||
|
||||
Also callable programmatically via run_cleanup() for Job Orchestrator.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DATE_PAT = re.compile(r"tool_audit_(\d{4}-\d{2}-\d{2})\.jsonl$")
|
||||
|
||||
|
||||
# ─── Core logic ───────────────────────────────────────────────────────────────
|
||||
|
||||
def find_eligible_files(
|
||||
audit_dir: Path,
|
||||
cutoff_date: datetime.date,
|
||||
) -> List[Path]:
|
||||
"""Return JSONL files whose embedded date < cutoff_date."""
|
||||
eligible = []
|
||||
if not audit_dir.exists():
|
||||
return eligible
|
||||
|
||||
for fpath in sorted(audit_dir.glob("tool_audit_*.jsonl")):
|
||||
m = _DATE_PAT.search(fpath.name)
|
||||
if not m:
|
||||
continue
|
||||
try:
|
||||
file_date = datetime.date.fromisoformat(m.group(1))
|
||||
except ValueError:
|
||||
continue
|
||||
if file_date < cutoff_date:
|
||||
eligible.append(fpath)
|
||||
|
||||
return eligible
|
||||
|
||||
|
||||
def run_cleanup(
|
||||
retention_days: int,
|
||||
audit_dir: str = "ops/audit",
|
||||
dry_run: bool = True,
|
||||
archive_gzip: bool = False,
|
||||
repo_root: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
) -> Dict:
|
||||
"""
|
||||
Main cleanup routine.
|
||||
|
||||
Returns:
|
||||
{scanned, eligible, deleted, archived, bytes_freed, dry_run, errors}
|
||||
"""
|
||||
if retention_days < 1 or retention_days > 365:
|
||||
raise ValueError(f"retention_days must be 1–365, got {retention_days}")
|
||||
|
||||
root = Path(repo_root or os.getenv("REPO_ROOT", ".")).resolve()
|
||||
dir_path = (root / audit_dir).resolve()
|
||||
|
||||
# Path traversal guard
|
||||
if not str(dir_path).startswith(str(root)):
|
||||
raise ValueError(f"audit_dir '{audit_dir}' resolves outside repo root")
|
||||
|
||||
today = datetime.date.today()
|
||||
cutoff = today - datetime.timedelta(days=retention_days)
|
||||
|
||||
all_jsonl = list(sorted(dir_path.glob("tool_audit_*.jsonl")))
|
||||
eligible = find_eligible_files(dir_path, cutoff)
|
||||
|
||||
deleted = 0
|
||||
archived = 0
|
||||
bytes_freed = 0
|
||||
errors: List[str] = []
|
||||
|
||||
for fpath in eligible:
|
||||
size = fpath.stat().st_size
|
||||
if dry_run:
|
||||
action = "archive" if archive_gzip else "delete"
|
||||
if verbose:
|
||||
logger.info("[dry_run] Would %s: %s (%d bytes)", action, fpath.name, size)
|
||||
bytes_freed += size
|
||||
if archive_gzip:
|
||||
archived += 1
|
||||
else:
|
||||
deleted += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
if archive_gzip:
|
||||
gz_path = fpath.with_suffix(".jsonl.gz")
|
||||
with open(fpath, "rb") as f_in:
|
||||
with gzip.open(gz_path, "wb") as f_out:
|
||||
shutil.copyfileobj(f_in, f_out)
|
||||
fpath.unlink()
|
||||
archived += 1
|
||||
bytes_freed += size
|
||||
if verbose:
|
||||
logger.info("Archived: %s → %s (%d bytes)", fpath.name, gz_path.name, size)
|
||||
else:
|
||||
fpath.unlink()
|
||||
deleted += 1
|
||||
bytes_freed += size
|
||||
if verbose:
|
||||
logger.info("Deleted: %s (%d bytes)", fpath.name, size)
|
||||
except Exception as e:
|
||||
msg = f"Error processing {fpath.name}: {e}"
|
||||
logger.warning(msg)
|
||||
errors.append(msg)
|
||||
|
||||
result = {
|
||||
"scanned": len(all_jsonl),
|
||||
"eligible": len(eligible),
|
||||
"deleted": deleted,
|
||||
"archived": archived,
|
||||
"bytes_freed": bytes_freed,
|
||||
"dry_run": dry_run,
|
||||
"retention_days": retention_days,
|
||||
"cutoff_date": cutoff.isoformat(),
|
||||
"audit_dir": str(dir_path),
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
if verbose or not dry_run:
|
||||
summary = (
|
||||
f"audit_cleanup: scanned={result['scanned']}, eligible={result['eligible']}, "
|
||||
f"{'[DRY RUN] ' if dry_run else ''}"
|
||||
f"deleted={deleted}, archived={archived}, freed={bytes_freed} bytes"
|
||||
)
|
||||
logger.info(summary)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ─── CLI entrypoint ───────────────────────────────────────────────────────────
|
||||
|
||||
def _parse_args(argv=None) -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(
|
||||
description="Audit JSONL retention cleanup",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
)
|
||||
p.add_argument("--retention-days", type=int, default=30,
|
||||
help="Delete/archive files older than this many days")
|
||||
p.add_argument("--audit-dir", default="ops/audit",
|
||||
help="Relative path to audit directory")
|
||||
p.add_argument("--repo-root", default=None,
|
||||
help="Repo root (default: REPO_ROOT env or cwd)")
|
||||
p.add_argument("--dry-run", action="store_true",
|
||||
help="Report only; do not delete or archive")
|
||||
p.add_argument("--archive-gzip", action="store_true",
|
||||
help="Compress to .jsonl.gz before deleting")
|
||||
p.add_argument("--verbose", action="store_true",
|
||||
help="Verbose output")
|
||||
p.add_argument("--output-json", action="store_true",
|
||||
help="Print JSON result to stdout")
|
||||
return p.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s audit_cleanup %(message)s",
|
||||
stream=sys.stderr,
|
||||
)
|
||||
args = _parse_args(argv)
|
||||
result = run_cleanup(
|
||||
retention_days=args.retention_days,
|
||||
audit_dir=args.audit_dir,
|
||||
dry_run=args.dry_run,
|
||||
archive_gzip=args.archive_gzip,
|
||||
repo_root=args.repo_root,
|
||||
verbose=args.verbose,
|
||||
)
|
||||
if args.output_json:
|
||||
print(json.dumps(result, indent=2))
|
||||
else:
|
||||
status = "DRY RUN" if result["dry_run"] else "DONE"
|
||||
print(
|
||||
f"[{status}] scanned={result['scanned']} eligible={result['eligible']} "
|
||||
f"deleted={result['deleted']} archived={result['archived']} "
|
||||
f"freed={result['bytes_freed']}B"
|
||||
)
|
||||
if result["errors"]:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
186
ops/scripts/audit_compact.py
Normal file
186
ops/scripts/audit_compact.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
audit_compact.py — Audit JSONL Compaction
|
||||
|
||||
Merges individual daily JSONL files from the last `window_days` into a single
|
||||
compressed artifact: ops/audit/compact/tool_audit_last_{window_days}d.jsonl.gz
|
||||
|
||||
Useful for:
|
||||
- Faster forensic analysis (single file to read)
|
||||
- Archival before cleanup
|
||||
- Offline cost_analyzer runs
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/audit_compact.py \
|
||||
--window-days 7 \
|
||||
[--output-path ops/audit/compact] \
|
||||
[--dry-run] [--verbose]
|
||||
|
||||
Callable programmatically via run_compact().
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import gzip
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DATE_PAT = re.compile(r"tool_audit_(\d{4}-\d{2}-\d{2})\.jsonl$")
|
||||
|
||||
|
||||
def run_compact(
|
||||
window_days: int = 7,
|
||||
audit_dir: str = "ops/audit",
|
||||
output_path: Optional[str] = None,
|
||||
dry_run: bool = True,
|
||||
repo_root: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
) -> Dict:
|
||||
"""
|
||||
Compact last `window_days` JSONL audit files into one .jsonl.gz.
|
||||
|
||||
Returns:
|
||||
{source_files, lines_written, output_file, bytes_written, dry_run, errors}
|
||||
"""
|
||||
if window_days < 1 or window_days > 30:
|
||||
raise ValueError(f"window_days must be 1–30, got {window_days}")
|
||||
|
||||
root = Path(repo_root or os.getenv("REPO_ROOT", ".")).resolve()
|
||||
dir_path = (root / audit_dir).resolve()
|
||||
if not str(dir_path).startswith(str(root)):
|
||||
raise ValueError("audit_dir resolves outside repo root")
|
||||
|
||||
today = datetime.date.today()
|
||||
cutoff = today - datetime.timedelta(days=window_days)
|
||||
|
||||
# Find files within window
|
||||
source_files: List[Path] = []
|
||||
for fpath in sorted(dir_path.glob("tool_audit_*.jsonl")):
|
||||
m = _DATE_PAT.search(fpath.name)
|
||||
if not m:
|
||||
continue
|
||||
try:
|
||||
file_date = datetime.date.fromisoformat(m.group(1))
|
||||
except ValueError:
|
||||
continue
|
||||
if file_date >= cutoff:
|
||||
source_files.append(fpath)
|
||||
|
||||
out_dir = (root / (output_path or f"{audit_dir}/compact")).resolve()
|
||||
if not str(out_dir).startswith(str(root)):
|
||||
raise ValueError("output_path resolves outside repo root")
|
||||
|
||||
out_name = f"tool_audit_last_{window_days}d.jsonl.gz"
|
||||
out_file = out_dir / out_name
|
||||
|
||||
lines_written = 0
|
||||
bytes_written = 0
|
||||
errors: List[str] = []
|
||||
|
||||
if dry_run:
|
||||
# Count lines without writing
|
||||
for fpath in source_files:
|
||||
try:
|
||||
with open(fpath, "r", encoding="utf-8", errors="replace") as f:
|
||||
lines_written += sum(1 for line in f if line.strip())
|
||||
except Exception as e:
|
||||
errors.append(f"{fpath.name}: {e}")
|
||||
if verbose:
|
||||
logger.info(
|
||||
"[dry_run] Would compact %d files → %s (%d lines)",
|
||||
len(source_files), out_file, lines_written,
|
||||
)
|
||||
else:
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
with gzip.open(out_file, "wt", encoding="utf-8") as gz:
|
||||
for fpath in source_files:
|
||||
try:
|
||||
with open(fpath, "r", encoding="utf-8", errors="replace") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line:
|
||||
gz.write(line + "\n")
|
||||
lines_written += 1
|
||||
except Exception as e:
|
||||
msg = f"Error reading {fpath.name}: {e}"
|
||||
logger.warning(msg)
|
||||
errors.append(msg)
|
||||
bytes_written = out_file.stat().st_size
|
||||
if verbose:
|
||||
logger.info(
|
||||
"Compacted %d files → %s (%d lines, %d bytes compressed)",
|
||||
len(source_files), out_file.name, lines_written, bytes_written,
|
||||
)
|
||||
except Exception as e:
|
||||
errors.append(f"Write error: {e}")
|
||||
logger.error("audit_compact failed: %s", e)
|
||||
|
||||
return {
|
||||
"source_files": len(source_files),
|
||||
"window_days": window_days,
|
||||
"lines_written": lines_written,
|
||||
"output_file": str(out_file) if not dry_run else str(out_file) + " [not created]",
|
||||
"bytes_written": bytes_written,
|
||||
"dry_run": dry_run,
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
|
||||
def _parse_args(argv=None) -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(
|
||||
description="Compact audit JSONL files into a single .gz archive",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
)
|
||||
p.add_argument("--window-days", type=int, default=7,
|
||||
help="Compact files from last N days")
|
||||
p.add_argument("--audit-dir", default="ops/audit",
|
||||
help="Relative path to audit directory")
|
||||
p.add_argument("--output-path", default=None,
|
||||
help="Output directory (default: ops/audit/compact)")
|
||||
p.add_argument("--repo-root", default=None)
|
||||
p.add_argument("--dry-run", action="store_true")
|
||||
p.add_argument("--verbose", action="store_true")
|
||||
p.add_argument("--output-json", action="store_true")
|
||||
return p.parse_args(argv)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s audit_compact %(message)s",
|
||||
stream=sys.stderr,
|
||||
)
|
||||
args = _parse_args(argv)
|
||||
result = run_compact(
|
||||
window_days=args.window_days,
|
||||
audit_dir=args.audit_dir,
|
||||
output_path=args.output_path,
|
||||
dry_run=args.dry_run,
|
||||
repo_root=args.repo_root,
|
||||
verbose=args.verbose,
|
||||
)
|
||||
if args.output_json:
|
||||
print(json.dumps(result, indent=2))
|
||||
else:
|
||||
status = "DRY RUN" if result["dry_run"] else "DONE"
|
||||
print(
|
||||
f"[{status}] sources={result['source_files']} "
|
||||
f"lines={result['lines_written']} bytes={result['bytes_written']} "
|
||||
f"→ {result['output_file']}"
|
||||
)
|
||||
if result["errors"]:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
148
ops/scripts/migrate_alerts_postgres.py
Normal file
148
ops/scripts/migrate_alerts_postgres.py
Normal file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
migrate_alerts_postgres.py — Idempotent DDL for alert tables.
|
||||
|
||||
Runs safely on existing DBs (ALTER ... ADD COLUMN IF NOT EXISTS).
|
||||
|
||||
Tables:
|
||||
alerts — canonical alert records + state machine
|
||||
incident_signature_state — cooldown tracking per incident signature
|
||||
|
||||
Usage:
|
||||
DATABASE_URL=postgresql://user:pass@host/db python3 ops/scripts/migrate_alerts_postgres.py [--dry-run]
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
# ─── alerts table ─────────────────────────────────────────────────────────────
|
||||
DDL_ALERTS_CREATE = textwrap.dedent("""\
|
||||
CREATE TABLE IF NOT EXISTS alerts (
|
||||
alert_ref TEXT PRIMARY KEY,
|
||||
dedupe_key TEXT NOT NULL,
|
||||
source TEXT NOT NULL DEFAULT 'unknown',
|
||||
service TEXT NOT NULL,
|
||||
env TEXT NOT NULL DEFAULT 'prod',
|
||||
severity TEXT NOT NULL DEFAULT 'P2',
|
||||
kind TEXT NOT NULL DEFAULT 'custom',
|
||||
title TEXT NOT NULL DEFAULT '',
|
||||
summary TEXT,
|
||||
started_at TIMESTAMPTZ,
|
||||
labels JSONB,
|
||||
metrics JSONB,
|
||||
evidence JSONB,
|
||||
links JSONB,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
last_seen_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
occurrences INT NOT NULL DEFAULT 1,
|
||||
-- State machine (added in v2)
|
||||
status TEXT NOT NULL DEFAULT 'new',
|
||||
processing_lock_until TIMESTAMPTZ,
|
||||
processing_owner TEXT,
|
||||
last_error TEXT,
|
||||
acked_at TIMESTAMPTZ,
|
||||
-- Legacy compat
|
||||
ack_status TEXT DEFAULT 'pending',
|
||||
ack_actor TEXT,
|
||||
ack_note TEXT,
|
||||
ack_at TIMESTAMPTZ
|
||||
);
|
||||
""")
|
||||
|
||||
# Backward-compat additions (safe on existing tables)
|
||||
DDL_ALERTS_ADD_COLUMNS = textwrap.dedent("""\
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS status TEXT NOT NULL DEFAULT 'new';
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS claimed_at TIMESTAMPTZ;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS processing_lock_until TIMESTAMPTZ;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS processing_owner TEXT;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS last_error TEXT;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS acked_at TIMESTAMPTZ;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS ack_status TEXT DEFAULT 'pending';
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS ack_actor TEXT;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS ack_note TEXT;
|
||||
ALTER TABLE alerts ADD COLUMN IF NOT EXISTS ack_at TIMESTAMPTZ;
|
||||
""")
|
||||
|
||||
DDL_ALERTS_INDEXES = textwrap.dedent("""\
|
||||
CREATE INDEX IF NOT EXISTS idx_alerts_dedupe_key
|
||||
ON alerts(dedupe_key, created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_alerts_service_env
|
||||
ON alerts(service, env, created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_alerts_severity
|
||||
ON alerts(severity, created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_alerts_status
|
||||
ON alerts(status, created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_alerts_processing_lock
|
||||
ON alerts(processing_lock_until)
|
||||
WHERE processing_lock_until IS NOT NULL;
|
||||
""")
|
||||
|
||||
# ─── incident_signature_state table ──────────────────────────────────────────
|
||||
DDL_SIG_STATE = textwrap.dedent("""\
|
||||
CREATE TABLE IF NOT EXISTS incident_signature_state (
|
||||
signature TEXT PRIMARY KEY,
|
||||
last_triage_at TIMESTAMPTZ,
|
||||
last_alert_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
triage_count_24h INT NOT NULL DEFAULT 0,
|
||||
occurrences_60m INT NOT NULL DEFAULT 0,
|
||||
occurrences_60m_bucket_start TIMESTAMPTZ,
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
-- Add new columns to existing table (idempotent)
|
||||
ALTER TABLE incident_signature_state
|
||||
ADD COLUMN IF NOT EXISTS occurrences_60m INT NOT NULL DEFAULT 0;
|
||||
ALTER TABLE incident_signature_state
|
||||
ADD COLUMN IF NOT EXISTS occurrences_60m_bucket_start TIMESTAMPTZ;
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_sig_state_updated
|
||||
ON incident_signature_state(updated_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_sig_state_last_alert
|
||||
ON incident_signature_state(last_alert_at);
|
||||
""")
|
||||
|
||||
|
||||
def run(dsn: str, dry_run: bool = False) -> None:
|
||||
try:
|
||||
import psycopg2
|
||||
except ImportError:
|
||||
print("psycopg2 not installed. Run: pip install psycopg2-binary", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
all_ddl = [
|
||||
("Create alerts table", DDL_ALERTS_CREATE),
|
||||
("Add state machine columns (idempotent)", DDL_ALERTS_ADD_COLUMNS),
|
||||
("Create alerts indexes", DDL_ALERTS_INDEXES),
|
||||
("Create incident_signature_state table", DDL_SIG_STATE),
|
||||
]
|
||||
|
||||
if dry_run:
|
||||
print("=== DRY RUN — DDL that would be executed ===\n")
|
||||
for label, ddl in all_ddl:
|
||||
print(f"-- {label}\n{ddl}")
|
||||
return
|
||||
|
||||
conn = psycopg2.connect(dsn)
|
||||
try:
|
||||
conn.autocommit = False
|
||||
with conn.cursor() as cur:
|
||||
for label, ddl in all_ddl:
|
||||
print(f" → {label}")
|
||||
cur.execute(ddl)
|
||||
conn.commit()
|
||||
print("✅ All alert migrations completed successfully.")
|
||||
except Exception as e:
|
||||
conn.rollback()
|
||||
print(f"❌ Migration failed: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL")
|
||||
if not dsn:
|
||||
print("ERROR: DATABASE_URL not set", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
dry = "--dry-run" in sys.argv
|
||||
run(dsn, dry_run=dry)
|
||||
114
ops/scripts/migrate_audit_postgres.py
Normal file
114
ops/scripts/migrate_audit_postgres.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Idempotent DDL migration for Postgres audit backend.
|
||||
|
||||
Creates the `tool_audit_events` table and its indexes if they don't already exist.
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/migrate_audit_postgres.py
|
||||
DATABASE_URL=postgresql://user:pass@host/db python3 ops/scripts/migrate_audit_postgres.py --dry-run
|
||||
|
||||
Environment variables:
|
||||
DATABASE_URL — PostgreSQL DSN (required).
|
||||
|
||||
Exit codes:
|
||||
0 — success / already up-to-date
|
||||
1 — error
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
# ─── DDL ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
DDL = textwrap.dedent("""\
|
||||
-- Audit events table (idempotent)
|
||||
CREATE TABLE IF NOT EXISTS tool_audit_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
ts TIMESTAMPTZ NOT NULL,
|
||||
req_id TEXT NOT NULL,
|
||||
workspace_id TEXT NOT NULL,
|
||||
user_id TEXT NOT NULL,
|
||||
agent_id TEXT NOT NULL,
|
||||
tool TEXT NOT NULL,
|
||||
action TEXT NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
duration_ms INT NOT NULL DEFAULT 0,
|
||||
in_size INT NOT NULL DEFAULT 0,
|
||||
out_size INT NOT NULL DEFAULT 0,
|
||||
input_hash TEXT NOT NULL DEFAULT '',
|
||||
graph_run_id TEXT,
|
||||
graph_node TEXT,
|
||||
job_id TEXT
|
||||
);
|
||||
|
||||
-- Indexes (idempotent)
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_ts
|
||||
ON tool_audit_events (ts);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_ws_ts
|
||||
ON tool_audit_events (workspace_id, ts);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_tool_ts
|
||||
ON tool_audit_events (tool, ts);
|
||||
CREATE INDEX IF NOT EXISTS idx_tool_audit_agent_ts
|
||||
ON tool_audit_events (agent_id, ts);
|
||||
""")
|
||||
|
||||
|
||||
# ─── Runner ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def run(dsn: str, dry_run: bool) -> int:
|
||||
"""Execute migration against Postgres. Returns 0 on success, 1 on error."""
|
||||
try:
|
||||
import psycopg2 # type: ignore
|
||||
except ImportError:
|
||||
try:
|
||||
import subprocess
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "psycopg2-binary"])
|
||||
import psycopg2 # type: ignore # noqa: F811
|
||||
except Exception as pip_err:
|
||||
print(f"[ERROR] psycopg2 not available and could not be installed: {pip_err}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
print(f"[migrate] Connecting to: {dsn[:40]}…")
|
||||
if dry_run:
|
||||
print("[migrate] DRY-RUN — printing DDL only, not executing:\n")
|
||||
print(DDL)
|
||||
return 0
|
||||
|
||||
try:
|
||||
conn = psycopg2.connect(dsn)
|
||||
conn.autocommit = False
|
||||
cur = conn.cursor()
|
||||
cur.execute(DDL)
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("[migrate] ✅ Migration applied successfully.")
|
||||
return 0
|
||||
except Exception as exc:
|
||||
print(f"[migrate] ❌ Migration failed: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Idempotent Postgres audit DDL migration")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Print DDL without executing")
|
||||
parser.add_argument(
|
||||
"--dsn",
|
||||
default=os.getenv("DATABASE_URL") or os.getenv("POSTGRES_DSN", ""),
|
||||
help="PostgreSQL DSN (default: $DATABASE_URL)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.dsn:
|
||||
print("[migrate] ERROR: DATABASE_URL not set. Provide --dsn or set DATABASE_URL.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
sys.exit(run(args.dsn, args.dry_run))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
116
ops/scripts/migrate_backlog_postgres.py
Normal file
116
ops/scripts/migrate_backlog_postgres.py
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
migrate_backlog_postgres.py — Idempotent DDL migration for Engineering Backlog.
|
||||
DAARION.city
|
||||
|
||||
Creates tables and indexes if they do not exist. Safe to re-run.
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/migrate_backlog_postgres.py
|
||||
python3 ops/scripts/migrate_backlog_postgres.py --dry-run
|
||||
python3 ops/scripts/migrate_backlog_postgres.py --dsn "postgresql://user:pass@host/db"
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
DDL = [
|
||||
# ── backlog_items ─────────────────────────────────────────────────────────
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS backlog_items (
|
||||
id TEXT PRIMARY KEY,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
env TEXT NOT NULL DEFAULT 'prod',
|
||||
service TEXT NOT NULL DEFAULT '',
|
||||
category TEXT NOT NULL DEFAULT '',
|
||||
title TEXT NOT NULL DEFAULT '',
|
||||
description TEXT NOT NULL DEFAULT '',
|
||||
priority TEXT NOT NULL DEFAULT 'P2',
|
||||
status TEXT NOT NULL DEFAULT 'open',
|
||||
owner TEXT NOT NULL DEFAULT 'oncall',
|
||||
due_date DATE,
|
||||
source TEXT NOT NULL DEFAULT 'manual',
|
||||
dedupe_key TEXT NOT NULL UNIQUE DEFAULT '',
|
||||
evidence_refs JSONB NOT NULL DEFAULT '{}',
|
||||
tags JSONB NOT NULL DEFAULT '[]',
|
||||
meta JSONB NOT NULL DEFAULT '{}'
|
||||
)
|
||||
""",
|
||||
# ── backlog_events ────────────────────────────────────────────────────────
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS backlog_events (
|
||||
id TEXT PRIMARY KEY,
|
||||
item_id TEXT NOT NULL REFERENCES backlog_items(id) ON DELETE CASCADE,
|
||||
ts TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
type TEXT NOT NULL DEFAULT 'comment',
|
||||
message TEXT NOT NULL DEFAULT '',
|
||||
actor TEXT NOT NULL DEFAULT 'system',
|
||||
meta JSONB NOT NULL DEFAULT '{}'
|
||||
)
|
||||
""",
|
||||
# ── Indexes ───────────────────────────────────────────────────────────────
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_items_env_status ON backlog_items (env, status)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_items_service ON backlog_items (service)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_items_due_date ON backlog_items (due_date)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_items_owner ON backlog_items (owner)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_items_category ON backlog_items (category)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_events_item_id ON backlog_events (item_id)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_backlog_events_ts ON backlog_events (ts)",
|
||||
]
|
||||
|
||||
|
||||
def migrate(dsn: str, dry_run: bool = False) -> None:
|
||||
print(f"[backlog migration] DSN: {dsn!r} dry_run={dry_run}")
|
||||
if dry_run:
|
||||
print("[dry-run] Would execute the following DDL statements:")
|
||||
for stmt in DDL:
|
||||
print(" ", stmt.strip()[:120])
|
||||
return
|
||||
|
||||
try:
|
||||
import psycopg2
|
||||
except ImportError:
|
||||
print("ERROR: psycopg2 not installed. Run: pip install psycopg2-binary", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
conn = psycopg2.connect(dsn)
|
||||
conn.autocommit = True
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
for stmt in DDL:
|
||||
stmt = stmt.strip()
|
||||
if not stmt:
|
||||
continue
|
||||
print(f" EXEC: {stmt[:80].replace(chr(10), ' ')}…")
|
||||
cur.execute(stmt)
|
||||
print("[backlog migration] Done. All DDL applied idempotently.")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Idempotent Postgres DDL migration for Engineering Backlog"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dsn",
|
||||
default=os.environ.get(
|
||||
"BACKLOG_POSTGRES_DSN",
|
||||
os.environ.get("POSTGRES_DSN", "postgresql://localhost/daarion"),
|
||||
),
|
||||
help="Postgres DSN (default: $BACKLOG_POSTGRES_DSN or $POSTGRES_DSN)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Print DDL without executing",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
migrate(args.dsn, dry_run=args.dry_run)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
121
ops/scripts/migrate_incidents_postgres.py
Normal file
121
ops/scripts/migrate_incidents_postgres.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Idempotent DDL migration for Postgres incident log backend.
|
||||
|
||||
Creates tables: incidents, incident_events, incident_artifacts (+ indexes).
|
||||
|
||||
Usage:
|
||||
DATABASE_URL=postgresql://... python3 ops/scripts/migrate_incidents_postgres.py
|
||||
python3 ops/scripts/migrate_incidents_postgres.py --dry-run
|
||||
|
||||
Exit codes: 0 = success, 1 = error
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
DDL = textwrap.dedent("""\
|
||||
-- ─── incidents ──────────────────────────────────────────────────────────
|
||||
CREATE TABLE IF NOT EXISTS incidents (
|
||||
id TEXT PRIMARY KEY,
|
||||
workspace_id TEXT NOT NULL DEFAULT 'default',
|
||||
service TEXT NOT NULL,
|
||||
env TEXT NOT NULL DEFAULT 'prod',
|
||||
severity TEXT NOT NULL DEFAULT 'P2',
|
||||
status TEXT NOT NULL DEFAULT 'open',
|
||||
title TEXT NOT NULL,
|
||||
summary TEXT,
|
||||
started_at TIMESTAMPTZ NOT NULL,
|
||||
ended_at TIMESTAMPTZ,
|
||||
created_by TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_incidents_ws_created
|
||||
ON incidents (workspace_id, created_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_incidents_service_status
|
||||
ON incidents (service, status);
|
||||
|
||||
-- ─── incident_events (timeline) ─────────────────────────────────────────
|
||||
CREATE TABLE IF NOT EXISTS incident_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
incident_id TEXT NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
|
||||
ts TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
type TEXT NOT NULL,
|
||||
message TEXT NOT NULL DEFAULT '',
|
||||
meta JSONB
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_incident_events_inc_ts
|
||||
ON incident_events (incident_id, ts);
|
||||
|
||||
-- ─── incident_artifacts ──────────────────────────────────────────────────
|
||||
CREATE TABLE IF NOT EXISTS incident_artifacts (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
incident_id TEXT NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
|
||||
ts TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
kind TEXT NOT NULL,
|
||||
format TEXT NOT NULL DEFAULT 'json',
|
||||
path TEXT NOT NULL,
|
||||
sha256 TEXT NOT NULL DEFAULT '',
|
||||
size_bytes INT NOT NULL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_incident_artifacts_inc_ts
|
||||
ON incident_artifacts (incident_id, ts);
|
||||
""")
|
||||
|
||||
|
||||
def run(dsn: str, dry_run: bool) -> int:
|
||||
try:
|
||||
import psycopg2 # type: ignore
|
||||
except ImportError:
|
||||
try:
|
||||
import subprocess
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "psycopg2-binary"])
|
||||
import psycopg2 # type: ignore # noqa: F811
|
||||
except Exception as pip_err:
|
||||
print(f"[ERROR] psycopg2 not available: {pip_err}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
print(f"[migrate-incidents] Connecting to: {dsn[:40]}…")
|
||||
if dry_run:
|
||||
print("[migrate-incidents] DRY-RUN — DDL only:\n")
|
||||
print(DDL)
|
||||
return 0
|
||||
|
||||
try:
|
||||
conn = psycopg2.connect(dsn)
|
||||
conn.autocommit = False
|
||||
cur = conn.cursor()
|
||||
cur.execute(DDL)
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("[migrate-incidents] ✅ Incident tables created/verified successfully.")
|
||||
return 0
|
||||
except Exception as exc:
|
||||
print(f"[migrate-incidents] ❌ Migration failed: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Idempotent Postgres incident DDL migration")
|
||||
parser.add_argument("--dry-run", action="store_true")
|
||||
parser.add_argument(
|
||||
"--dsn",
|
||||
default=os.getenv("DATABASE_URL") or os.getenv("POSTGRES_DSN", ""),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
if not args.dsn:
|
||||
print("[migrate-incidents] ERROR: DATABASE_URL not set.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
sys.exit(run(args.dsn, args.dry_run))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
84
ops/scripts/migrate_risk_history_postgres.py
Normal file
84
ops/scripts/migrate_risk_history_postgres.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Idempotent DDL migration for Postgres risk_history backend.
|
||||
|
||||
Creates table: risk_history (+ indexes).
|
||||
|
||||
Usage:
|
||||
DATABASE_URL=postgresql://... python3 ops/scripts/migrate_risk_history_postgres.py
|
||||
python3 ops/scripts/migrate_risk_history_postgres.py --dry-run
|
||||
|
||||
Exit codes: 0 = success, 1 = error
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
DDL = textwrap.dedent("""\
|
||||
-- ─── risk_history ──────────────────────────────────────────────────────────
|
||||
CREATE TABLE IF NOT EXISTS risk_history (
|
||||
ts TIMESTAMPTZ NOT NULL,
|
||||
service TEXT NOT NULL,
|
||||
env TEXT NOT NULL DEFAULT 'prod',
|
||||
score INTEGER NOT NULL,
|
||||
band TEXT NOT NULL,
|
||||
components JSONB NOT NULL DEFAULT '{}',
|
||||
reasons JSONB NOT NULL DEFAULT '[]',
|
||||
PRIMARY KEY (ts, service, env)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS risk_history_svc_env_ts
|
||||
ON risk_history (service, env, ts DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS risk_history_env_ts
|
||||
ON risk_history (env, ts DESC);
|
||||
""")
|
||||
|
||||
|
||||
def run(dsn: str, dry_run: bool = False) -> None:
|
||||
if dry_run:
|
||||
print("=== DRY RUN — DDL that would be applied ===")
|
||||
print(DDL)
|
||||
return
|
||||
|
||||
try:
|
||||
import psycopg2 # type: ignore
|
||||
except ImportError:
|
||||
print("ERROR: psycopg2 not installed. Run: pip install psycopg2-binary", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
conn = psycopg2.connect(dsn)
|
||||
conn.autocommit = True
|
||||
cur = conn.cursor()
|
||||
for statement in DDL.split(";"):
|
||||
stmt = statement.strip()
|
||||
if stmt:
|
||||
cur.execute(stmt + ";")
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("risk_history migration applied successfully.")
|
||||
except Exception as e:
|
||||
print(f"ERROR: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Migrate risk_history table in Postgres")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Print DDL without executing")
|
||||
parser.add_argument("--dsn", default="", help="Postgres DSN (overrides DATABASE_URL)")
|
||||
args = parser.parse_args()
|
||||
|
||||
dsn = args.dsn or os.getenv("DATABASE_URL") or os.getenv("RISK_DATABASE_URL", "")
|
||||
if not dsn and not args.dry_run:
|
||||
print("ERROR: No DSN provided. Set DATABASE_URL or pass --dsn.", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
run(dsn, dry_run=args.dry_run)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
31
ops/scripts/rotate_sofiia_keys.sh
Executable file
31
ops/scripts/rotate_sofiia_keys.sh
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
||||
ENV_FILE="${ROOT_DIR}/.env"
|
||||
COMPOSE_FILE="${ROOT_DIR}/docker-compose.node2-sofiia.yml"
|
||||
|
||||
if [[ ! -f "${ENV_FILE}" ]]; then
|
||||
echo "Missing .env: ${ENV_FILE}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
NEW_KEY="$(openssl rand -hex 24)"
|
||||
|
||||
if grep -q '^SOFIIA_CONSOLE_API_KEY=' "${ENV_FILE}"; then
|
||||
sed -i '' "s/^SOFIIA_CONSOLE_API_KEY=.*/SOFIIA_CONSOLE_API_KEY=${NEW_KEY}/" "${ENV_FILE}"
|
||||
else
|
||||
printf '\nSOFIIA_CONSOLE_API_KEY=%s\n' "${NEW_KEY}" >> "${ENV_FILE}"
|
||||
fi
|
||||
|
||||
if grep -q '^SUPERVISOR_API_KEY=' "${ENV_FILE}"; then
|
||||
sed -i '' "s/^SUPERVISOR_API_KEY=.*/SUPERVISOR_API_KEY=${NEW_KEY}/" "${ENV_FILE}"
|
||||
else
|
||||
printf 'SUPERVISOR_API_KEY=%s\n' "${NEW_KEY}" >> "${ENV_FILE}"
|
||||
fi
|
||||
|
||||
docker compose -f "${COMPOSE_FILE}" up -d sofiia-console router >/dev/null
|
||||
|
||||
echo "Sofiia keys rotated and services restarted."
|
||||
echo "Use this API key for X-API-Key header:"
|
||||
echo "${NEW_KEY}"
|
||||
182
ops/scripts/run_governance_job.py
Executable file
182
ops/scripts/run_governance_job.py
Executable file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
run_governance_job.py — Universal Governance Job Runner.
|
||||
DAARION.city | used by cron to trigger scheduled governance jobs.
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/run_governance_job.py \\
|
||||
--tool risk_history_tool \\
|
||||
--action snapshot \\
|
||||
--params-json '{"env":"prod"}'
|
||||
|
||||
python3 ops/scripts/run_governance_job.py \\
|
||||
--tool backlog_tool --action cleanup --params-json '{"env":"prod"}' \\
|
||||
--router-url http://localhost:8000 \\
|
||||
--agent-id scheduler
|
||||
|
||||
Exit codes:
|
||||
0 — success (HTTP 200, result.success=true)
|
||||
1 — HTTP error or tool returned success=false
|
||||
2 — usage / configuration error
|
||||
|
||||
Environment variables (read from .env if present):
|
||||
ROUTER_URL — base URL of the router service (default: http://localhost:8000)
|
||||
SCHEDULER_API_KEY — optional Bearer token for router auth
|
||||
GOVERNANCE_ENV — default env param passed in tool arguments (default: prod)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
# ── Try loading .env from repo root ──────────────────────────────────────────
|
||||
|
||||
def _load_dotenv(path: Path) -> None:
|
||||
if not path.exists():
|
||||
return
|
||||
with open(path) as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#") or "=" not in line:
|
||||
continue
|
||||
key, _, value = line.partition("=")
|
||||
key = key.strip()
|
||||
value = value.strip().strip('"').strip("'")
|
||||
if key and key not in os.environ: # don't override existing env vars
|
||||
os.environ[key] = value
|
||||
|
||||
|
||||
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
_load_dotenv(_REPO_ROOT / ".env")
|
||||
_load_dotenv(_REPO_ROOT / ".env.local")
|
||||
|
||||
# ── Logging ───────────────────────────────────────────────────────────────────
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [run_governance_job] %(levelname)s %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger("run_governance_job")
|
||||
|
||||
|
||||
# ── HTTP helper ───────────────────────────────────────────────────────────────
|
||||
|
||||
def _post_json(url: str, payload: dict, api_key: str = "", timeout: int = 60) -> dict:
|
||||
"""POST JSON payload; return parsed response dict. Raises on HTTP error."""
|
||||
body = json.dumps(payload).encode()
|
||||
headers = {"Content-Type": "application/json", "Accept": "application/json"}
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
req = urllib.request.Request(url, data=body, headers=headers, method="POST")
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return json.loads(resp.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
body_txt = e.read().decode(errors="replace")[:500]
|
||||
raise RuntimeError(f"HTTP {e.code} from {url}: {body_txt}") from e
|
||||
except urllib.error.URLError as e:
|
||||
raise RuntimeError(f"Cannot reach {url}: {e.reason}") from e
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Trigger a governance tool action via the DAARION router API."
|
||||
)
|
||||
parser.add_argument("--tool", required=True, help="Tool name (e.g. risk_history_tool)")
|
||||
parser.add_argument("--action", required=True, help="Action (e.g. snapshot)")
|
||||
parser.add_argument(
|
||||
"--params-json",
|
||||
default="{}",
|
||||
help='JSON dict of extra parameters (e.g. \'{"env":"prod"}\')',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--router-url",
|
||||
default=os.environ.get("ROUTER_URL", "http://localhost:8000"),
|
||||
help="Router base URL (default: $ROUTER_URL or http://localhost:8000)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--agent-id",
|
||||
default="scheduler",
|
||||
help='Agent identity for audit trail (default: scheduler)',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=90,
|
||||
help="HTTP timeout in seconds (default: 90)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Print the request payload without sending it",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Parse extra params
|
||||
try:
|
||||
extra_params = json.loads(args.params_json)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error("Invalid --params-json: %s", e)
|
||||
return 2
|
||||
|
||||
api_key = os.environ.get("SCHEDULER_API_KEY", "")
|
||||
endpoint = f"{args.router_url.rstrip('/')}/v1/tools/execute"
|
||||
|
||||
payload = {
|
||||
"tool": args.tool,
|
||||
"action": args.action,
|
||||
"agent_id": args.agent_id,
|
||||
**extra_params,
|
||||
}
|
||||
|
||||
ts = datetime.datetime.utcnow().isoformat()
|
||||
logger.info("Job: %s.%s params=%s ts=%s", args.tool, args.action, extra_params, ts)
|
||||
|
||||
if args.dry_run:
|
||||
print("[dry-run] Would POST to:", endpoint)
|
||||
print("[dry-run] Payload:", json.dumps(payload, indent=2))
|
||||
return 0
|
||||
|
||||
try:
|
||||
result = _post_json(endpoint, payload, api_key=api_key, timeout=args.timeout)
|
||||
except RuntimeError as e:
|
||||
logger.error("Request failed: %s", e)
|
||||
return 1
|
||||
|
||||
# Normalise result — router returns {"success": bool, "result": ..., "error": ...}
|
||||
success = result.get("success", True) # assume success if key absent
|
||||
error = result.get("error")
|
||||
res_data = result.get("result", result)
|
||||
|
||||
if success:
|
||||
# Pretty-print a summary
|
||||
summary = {}
|
||||
if isinstance(res_data, dict):
|
||||
for key in ("created", "updated", "skipped", "deleted", "snapshot_id",
|
||||
"services", "total", "week", "band", "score"):
|
||||
if key in res_data:
|
||||
summary[key] = res_data[key]
|
||||
logger.info(
|
||||
"✅ %s.%s → OK %s",
|
||||
args.tool, args.action,
|
||||
json.dumps(summary) if summary else "(done)",
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
logger.error("❌ %s.%s → FAIL error=%s", args.tool, args.action, error)
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
145
ops/scripts/schedule_jobs.py
Normal file
145
ops/scripts/schedule_jobs.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Lightweight scheduled job runner for DAARION operational tasks.
|
||||
|
||||
Calls tools directly (no gateway required) and saves output artifacts to
|
||||
ops/reports/{cost,privacy,drift}/.
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/schedule_jobs.py daily_cost_digest
|
||||
python3 ops/scripts/schedule_jobs.py daily_privacy_digest
|
||||
python3 ops/scripts/schedule_jobs.py weekly_drift_full
|
||||
|
||||
Environment variables:
|
||||
REPO_ROOT — root of repo (default: inferred from script location)
|
||||
AUDIT_BACKEND — auto|jsonl|postgres (default: auto)
|
||||
DATABASE_URL — PostgreSQL DSN (required for backend=postgres/auto with DB)
|
||||
AUDIT_JSONL_DIR — override JSONL audit dir
|
||||
|
||||
Exit codes: 0 = success, 1 = error
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# ── Resolve repo root ─────────────────────────────────────────────────────────
|
||||
_HERE = Path(__file__).resolve().parent
|
||||
REPO_ROOT = Path(os.getenv("REPO_ROOT", str(_HERE.parent.parent)))
|
||||
sys.path.insert(0, str(REPO_ROOT / "services" / "router"))
|
||||
|
||||
|
||||
def _today() -> str:
|
||||
return datetime.date.today().isoformat()
|
||||
|
||||
|
||||
def _week_tag() -> str:
|
||||
d = datetime.date.today()
|
||||
return f"week-{d.isocalendar()[0]}-{d.isocalendar()[1]:02d}"
|
||||
|
||||
|
||||
def _save_artifact(output_dir: Path, stem: str, data: dict) -> None:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
json_path = output_dir / f"{stem}.json"
|
||||
md_path = output_dir / f"{stem}.md"
|
||||
with open(json_path, "w", encoding="utf-8") as fh:
|
||||
json.dump(data, fh, indent=2, ensure_ascii=False, default=str)
|
||||
markdown = data.get("markdown", "")
|
||||
if markdown:
|
||||
with open(md_path, "w", encoding="utf-8") as fh:
|
||||
fh.write(markdown)
|
||||
print(f"[schedule_jobs] Artifacts saved: {json_path}")
|
||||
if md_path.exists():
|
||||
print(f"[schedule_jobs] Markdown: {md_path}")
|
||||
|
||||
|
||||
# ─── Task implementations ─────────────────────────────────────────────────────
|
||||
|
||||
def run_daily_cost_digest() -> int:
|
||||
print(f"[schedule_jobs] Running daily_cost_digest ({_today()})")
|
||||
try:
|
||||
from cost_analyzer import analyze_cost_dict # type: ignore
|
||||
result = analyze_cost_dict("digest", params={
|
||||
"window_hours": 24,
|
||||
"baseline_hours": 168,
|
||||
"top_n": 10,
|
||||
"backend": os.getenv("AUDIT_BACKEND", "auto"),
|
||||
})
|
||||
output_dir = REPO_ROOT / "ops" / "reports" / "cost"
|
||||
_save_artifact(output_dir, _today(), result)
|
||||
anomalies = result.get("anomaly_count", 0)
|
||||
recs = result.get("recommendations") or []
|
||||
print(f"[schedule_jobs] Cost digest OK — anomalies={anomalies}, recs={len(recs)}")
|
||||
return 0
|
||||
except Exception as exc:
|
||||
print(f"[schedule_jobs] daily_cost_digest FAILED: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_daily_privacy_digest() -> int:
|
||||
print(f"[schedule_jobs] Running daily_privacy_digest ({_today()})")
|
||||
try:
|
||||
from data_governance import scan_data_governance_dict # type: ignore
|
||||
result = scan_data_governance_dict("digest_audit", params={
|
||||
"backend": os.getenv("AUDIT_BACKEND", "auto"),
|
||||
"time_window_hours": 24,
|
||||
"max_findings": 20,
|
||||
})
|
||||
output_dir = REPO_ROOT / "ops" / "reports" / "privacy"
|
||||
_save_artifact(output_dir, _today(), result)
|
||||
stats = result.get("stats") or {}
|
||||
print(
|
||||
f"[schedule_jobs] Privacy digest OK — "
|
||||
f"errors={stats.get('errors',0)}, warnings={stats.get('warnings',0)}"
|
||||
)
|
||||
return 0
|
||||
except Exception as exc:
|
||||
print(f"[schedule_jobs] daily_privacy_digest FAILED: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_weekly_drift_full() -> int:
|
||||
tag = _week_tag()
|
||||
print(f"[schedule_jobs] Running weekly_drift_full ({tag})")
|
||||
try:
|
||||
from drift_analyzer import analyze_drift_dict # type: ignore
|
||||
result = analyze_drift_dict({
|
||||
"categories": ["services", "openapi", "nats", "tools"],
|
||||
"drift_profile": "dev",
|
||||
})
|
||||
output_dir = REPO_ROOT / "ops" / "reports" / "drift"
|
||||
_save_artifact(output_dir, tag, result)
|
||||
stats = (result.get("data") or result).get("stats") or {}
|
||||
print(
|
||||
f"[schedule_jobs] Drift full OK — "
|
||||
f"errors={stats.get('errors',0)}, warnings={stats.get('warnings',0)}"
|
||||
)
|
||||
return 0
|
||||
except Exception as exc:
|
||||
print(f"[schedule_jobs] weekly_drift_full FAILED: {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
# ─── Dispatch ─────────────────────────────────────────────────────────────────
|
||||
|
||||
TASKS = {
|
||||
"daily_cost_digest": run_daily_cost_digest,
|
||||
"daily_privacy_digest": run_daily_privacy_digest,
|
||||
"weekly_drift_full": run_weekly_drift_full,
|
||||
}
|
||||
|
||||
|
||||
def main() -> None:
|
||||
if len(sys.argv) < 2 or sys.argv[1] not in TASKS:
|
||||
print(f"Usage: {sys.argv[0]} <task>", file=sys.stderr)
|
||||
print(f" Available tasks: {', '.join(TASKS)}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
task_name = sys.argv[1]
|
||||
sys.exit(TASKS[task_name]())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
74
ops/scripts/start_spacebot.sh
Executable file
74
ops/scripts/start_spacebot.sh
Executable file
@@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Spacebot (Sofiia Telegram agent) start script
|
||||
# Usage: ./ops/scripts/start_spacebot.sh [stop|restart|status|logs]
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
set -e
|
||||
|
||||
REPO_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
SPACEBOT_BIN="/Users/apple/github-projects/spacebot/target/release/spacebot"
|
||||
SPACEBOT_DIR="/Users/apple/.spacebot"
|
||||
ENV_FILE="${REPO_DIR}/.env"
|
||||
PID_FILE="${SPACEBOT_DIR}/spacebot.pid"
|
||||
LOG_FILE="${SPACEBOT_DIR}/logs/spacebot.log.$(date +%Y-%m-%d)"
|
||||
export PATH="$HOME/.bun/bin:$PATH"
|
||||
|
||||
load_env() {
|
||||
if [ -f "$ENV_FILE" ]; then
|
||||
set -a; source "$ENV_FILE"; set +a
|
||||
fi
|
||||
export ZHIPU_API_KEY="${GLM5_API_KEY}"
|
||||
}
|
||||
|
||||
is_running() {
|
||||
[ -f "$PID_FILE" ] && kill -0 "$(cat "$PID_FILE")" 2>/dev/null
|
||||
}
|
||||
|
||||
cmd="${1:-start}"
|
||||
|
||||
case "$cmd" in
|
||||
start)
|
||||
if is_running; then
|
||||
echo "Spacebot already running (PID: $(cat $PID_FILE))"
|
||||
exit 0
|
||||
fi
|
||||
load_env
|
||||
echo "Starting Spacebot..."
|
||||
nohup $SPACEBOT_BIN --config "${SPACEBOT_DIR}/config.toml" > /tmp/spacebot.out 2>&1 &
|
||||
sleep 3
|
||||
if is_running; then
|
||||
echo "✓ Spacebot started (PID: $(cat $PID_FILE))"
|
||||
echo " Bot: @SofiiaDaarionbot"
|
||||
echo " Logs: $LOG_FILE"
|
||||
else
|
||||
echo "✗ Spacebot failed to start. Check logs: $LOG_FILE"
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
stop)
|
||||
if is_running; then
|
||||
kill "$(cat "$PID_FILE")"
|
||||
echo "✓ Spacebot stopped"
|
||||
else
|
||||
echo "Spacebot not running"
|
||||
fi
|
||||
;;
|
||||
restart)
|
||||
$0 stop 2>/dev/null; sleep 2; $0 start
|
||||
;;
|
||||
status)
|
||||
if is_running; then
|
||||
echo "✓ Spacebot running (PID: $(cat $PID_FILE))"
|
||||
tail -3 "$LOG_FILE" 2>/dev/null
|
||||
else
|
||||
echo "✗ Spacebot not running"
|
||||
fi
|
||||
;;
|
||||
logs)
|
||||
tail -f "$LOG_FILE"
|
||||
;;
|
||||
*)
|
||||
echo "Usage: $0 {start|stop|restart|status|logs}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
411
ops/scripts/verify_sofiia_stack.py
Normal file
411
ops/scripts/verify_sofiia_stack.py
Normal file
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
verify_sofiia_stack.py — Sofiia stack parity verifier (NODA1 / NODA2).
|
||||
DAARION.city | deterministic PASS/FAIL/WARN, no LLM.
|
||||
|
||||
Checks (per node):
|
||||
- Router /healthz (or /health)
|
||||
- /v1/tools/execute dry-run: risk_engine_tool.service, architecture_pressure_tool.service, backlog_tool.dashboard
|
||||
- BFF /api/status/full → reachable, router+memory reachable, alerts backend != memory
|
||||
- BFF /api/health → service=sofiia-console
|
||||
- Cron: jobs present (via status/full or local file)
|
||||
- Optional: supervisor health if SUPERVISOR_URL set
|
||||
|
||||
Parity (--compare-with):
|
||||
- Compare BFF version between two nodes (WARN if different, not FAIL)
|
||||
- Compare router/memory reachable on both
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/verify_sofiia_stack.py
|
||||
python3 ops/scripts/verify_sofiia_stack.py --node NODA2 --bff-url http://localhost:8002
|
||||
python3 ops/scripts/verify_sofiia_stack.py \\
|
||||
--node NODA2 --bff-url http://noda2:8002 \\
|
||||
--compare-with http://noda1:8002
|
||||
|
||||
Exit: 0 if all critical checks PASS, 1 otherwise.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
CRON_FILE = REPO_ROOT / "ops" / "cron" / "jobs.cron"
|
||||
TOOLS_TIMEOUT = 25
|
||||
|
||||
CRON_JOBS_EXPECTED = [
|
||||
"hourly_risk_snapshot",
|
||||
"daily_risk_digest",
|
||||
"risk_history_cleanup",
|
||||
"weekly_platform_priority_digest",
|
||||
"weekly_backlog_generate",
|
||||
"daily_backlog_cleanup",
|
||||
]
|
||||
|
||||
# ── HTTP helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _get(url: str, timeout: int = 8) -> tuple[int, dict]:
|
||||
try:
|
||||
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
||||
return resp.getcode(), json.loads(resp.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
try:
|
||||
body = json.loads(e.read().decode())
|
||||
except Exception:
|
||||
body = {}
|
||||
return e.code, body
|
||||
except Exception:
|
||||
return 0, {}
|
||||
|
||||
|
||||
def _post_json(url: str, body: dict, api_key: str = "", timeout: int = 30) -> tuple[int, dict]:
|
||||
try:
|
||||
data = json.dumps(body).encode()
|
||||
req = urllib.request.Request(url, data=data, method="POST",
|
||||
headers={"Content-Type": "application/json"})
|
||||
if api_key:
|
||||
req.add_header("Authorization", f"Bearer {api_key}")
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.getcode(), json.loads(resp.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
try:
|
||||
body = json.loads(e.read().decode())
|
||||
except Exception:
|
||||
body = {}
|
||||
return e.code, body
|
||||
except Exception:
|
||||
return 0, {}
|
||||
|
||||
|
||||
# ── Individual checks ─────────────────────────────────────────────────────────
|
||||
|
||||
def check_router_health(base_url: str) -> dict:
|
||||
"""CRITICAL: router must respond 200."""
|
||||
for path in ("/healthz", "/health"):
|
||||
code, _ = _get(f"{base_url.rstrip('/')}{path}", timeout=5)
|
||||
if code == 200:
|
||||
return {"name": "router_health", "pass": True, "level": "critical",
|
||||
"detail": f"GET {path} 200"}
|
||||
return {"name": "router_health", "pass": False, "level": "critical",
|
||||
"detail": "router unreachable (no 200 from /healthz or /health)"}
|
||||
|
||||
|
||||
def check_tool(base_url: str, tool: str, action: str, params: dict, api_key: str) -> dict:
|
||||
"""CRITICAL: tool execute must reach router (400/422 = reached, schema error = ok)."""
|
||||
url = f"{base_url.rstrip('/')}/v1/tools/execute"
|
||||
body = {"tool": tool, "action": action, "agent_id": "sofiia", **params}
|
||||
code, data = _post_json(url, body, api_key=api_key, timeout=TOOLS_TIMEOUT)
|
||||
# 200 = success, 400/422 = reached but bad params (tool not loaded) — still PASS
|
||||
reached = code in (200, 400, 422)
|
||||
succeeded = code == 200 and (
|
||||
data.get("status") == "succeeded" or data.get("data") is not None
|
||||
)
|
||||
return {
|
||||
"name": f"tool_{tool}_{action}",
|
||||
"pass": reached,
|
||||
"level": "critical",
|
||||
"detail": (
|
||||
f"HTTP {code} status={data.get('status', '—')}"
|
||||
+ (" [data returned]" if succeeded else "")
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def check_bff_health(bff_url: str) -> dict:
|
||||
"""CRITICAL: BFF must identify as sofiia-console."""
|
||||
code, data = _get(f"{bff_url.rstrip('/')}/api/health", timeout=6)
|
||||
if code == 200 and data.get("service") == "sofiia-console":
|
||||
return {"name": "bff_health", "pass": True, "level": "critical",
|
||||
"detail": f"version={data.get('version')} env={data.get('env')} uptime={data.get('uptime_s')}s",
|
||||
"version": data.get("version", ""), "build": data.get("build", "")}
|
||||
return {"name": "bff_health", "pass": False, "level": "critical",
|
||||
"detail": f"HTTP {code} — expected service=sofiia-console, got: {str(data)[:120]}",
|
||||
"version": "", "build": ""}
|
||||
|
||||
|
||||
def check_status_full(bff_url: str, env: str = "dev") -> dict:
|
||||
"""CRITICAL: /api/status/full must show router+memory reachable + alerts backend."""
|
||||
code, data = _get(f"{bff_url.rstrip('/')}/api/status/full", timeout=12)
|
||||
issues = []
|
||||
warns = []
|
||||
|
||||
if code != 200:
|
||||
return {"name": "bff_status_full", "pass": False, "level": "critical",
|
||||
"detail": f"HTTP {code} — /api/status/full unreachable",
|
||||
"data": {}}
|
||||
|
||||
router_ok = (data.get("router") or {}).get("reachable", False)
|
||||
mem_ok = (data.get("memory") or {}).get("reachable", False)
|
||||
ollama_ok = (data.get("ollama") or {}).get("reachable", False)
|
||||
backends = data.get("backends") or {}
|
||||
cron = data.get("cron") or {}
|
||||
|
||||
if not router_ok:
|
||||
issues.append("router.reachable=false")
|
||||
if not mem_ok:
|
||||
issues.append("memory.reachable=false")
|
||||
|
||||
# Alerts backend must not be 'memory' in prod/staging
|
||||
alerts_be = backends.get("alerts", "unknown")
|
||||
if env in ("prod", "staging") and alerts_be == "memory":
|
||||
issues.append(f"alerts backend=memory (must be postgres in {env})")
|
||||
elif alerts_be == "memory":
|
||||
warns.append(f"alerts backend=memory (ok in dev, not prod)")
|
||||
|
||||
cron_installed = cron.get("installed", False)
|
||||
if cron_installed is False and env in ("prod", "staging"):
|
||||
warns.append("cron.installed=false")
|
||||
|
||||
cron_jobs = cron.get("jobs_present", [])
|
||||
missing_jobs = [j for j in CRON_JOBS_EXPECTED if j not in cron_jobs]
|
||||
if missing_jobs and env in ("prod", "staging"):
|
||||
warns.append(f"cron missing jobs: {missing_jobs}")
|
||||
|
||||
ok = len(issues) == 0
|
||||
detail_parts = [
|
||||
f"router={'ok' if router_ok else 'FAIL'}",
|
||||
f"memory={'ok' if mem_ok else 'FAIL'}",
|
||||
f"ollama={'ok' if ollama_ok else 'offline'}",
|
||||
f"alerts_be={alerts_be}",
|
||||
f"cron={cron_installed}",
|
||||
]
|
||||
if issues:
|
||||
detail_parts.append(f"issues={issues}")
|
||||
if warns:
|
||||
detail_parts.append(f"warns={warns}")
|
||||
|
||||
return {
|
||||
"name": "bff_status_full",
|
||||
"pass": ok,
|
||||
"level": "critical",
|
||||
"detail": " | ".join(detail_parts),
|
||||
"warns": warns,
|
||||
"data": {
|
||||
"router_ok": router_ok, "memory_ok": mem_ok, "ollama_ok": ollama_ok,
|
||||
"alerts_backend": alerts_be, "cron_installed": cron_installed,
|
||||
"cron_jobs_present": cron_jobs,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def check_alerts_backend_not_memory(bff_url: str, env: str) -> dict:
|
||||
"""CRITICAL in prod/staging: alerts must not use in-memory store."""
|
||||
code, data = _get(f"{bff_url.rstrip('/')}/api/status/full", timeout=10)
|
||||
if code != 200:
|
||||
return {"name": "alerts_backend", "pass": True, "level": "warn",
|
||||
"detail": "skipped (status/full unreachable)"}
|
||||
backend = (data.get("backends") or {}).get("alerts", "unknown")
|
||||
if env in ("prod", "staging") and backend == "memory":
|
||||
return {"name": "alerts_backend", "pass": False, "level": "critical",
|
||||
"detail": f"alerts backend=memory in {env} — must be postgres"}
|
||||
return {"name": "alerts_backend", "pass": True, "level": "critical",
|
||||
"detail": f"alerts backend={backend}"}
|
||||
|
||||
|
||||
def check_cron_entries() -> dict:
|
||||
"""WARN: local cron file should have all governance entries."""
|
||||
if not CRON_FILE.exists():
|
||||
return {"name": "cron_local_file", "pass": False, "level": "warn",
|
||||
"detail": f"not found: {CRON_FILE.relative_to(REPO_ROOT)}"}
|
||||
text = CRON_FILE.read_text(encoding="utf-8")
|
||||
missing = [r for r in CRON_JOBS_EXPECTED if r not in text]
|
||||
if missing:
|
||||
return {"name": "cron_local_file", "pass": False, "level": "warn",
|
||||
"detail": f"missing entries: {missing}"}
|
||||
return {"name": "cron_local_file", "pass": True, "level": "warn",
|
||||
"detail": "all governance entries present"}
|
||||
|
||||
|
||||
def check_supervisor(supervisor_url: str) -> dict:
|
||||
if not supervisor_url:
|
||||
return {"name": "supervisor_health", "pass": True, "level": "info",
|
||||
"detail": "skipped (no SUPERVISOR_URL)"}
|
||||
code, _ = _get(f"{supervisor_url.rstrip('/')}/health", timeout=5)
|
||||
ok = code == 200
|
||||
return {"name": "supervisor_health", "pass": ok, "level": "warn",
|
||||
"detail": f"GET /health → {code}" if code else "unreachable"}
|
||||
|
||||
|
||||
# ── Parity comparison ─────────────────────────────────────────────────────────
|
||||
|
||||
def compare_nodes(bff_a: str, bff_b: str, node_a: str = "A", node_b: str = "B") -> list[dict]:
|
||||
"""Compare two BFF nodes. Returns list of parity check results."""
|
||||
checks = []
|
||||
|
||||
def _full(url: str) -> dict:
|
||||
_, d = _get(f"{url.rstrip('/')}/api/status/full", timeout=10)
|
||||
return d
|
||||
|
||||
def _health(url: str) -> dict:
|
||||
_, d = _get(f"{url.rstrip('/')}/api/health", timeout=6)
|
||||
return d
|
||||
|
||||
ha, hb = _health(bff_a), _health(bff_b)
|
||||
ver_a, ver_b = ha.get("version", "?"), hb.get("version", "?")
|
||||
version_match = ver_a == ver_b
|
||||
checks.append({
|
||||
"name": f"parity_version_{node_a}_vs_{node_b}",
|
||||
"pass": version_match,
|
||||
"level": "warn", # mismatch is WARN, not FAIL
|
||||
"detail": f"{node_a}={ver_a} {node_b}={ver_b}" + ("" if version_match else " [MISMATCH — consider deploying same version]"),
|
||||
})
|
||||
|
||||
fa, fb = _full(bff_a), _full(bff_b)
|
||||
for key in ("router", "memory"):
|
||||
ok_a = (fa.get(key) or {}).get("reachable", False)
|
||||
ok_b = (fb.get(key) or {}).get("reachable", False)
|
||||
same = ok_a == ok_b
|
||||
checks.append({
|
||||
"name": f"parity_{key}_reachable_{node_a}_vs_{node_b}",
|
||||
"pass": ok_a and ok_b, # FAIL if either node missing critical service
|
||||
"level": "critical" if key == "router" else "warn",
|
||||
"detail": f"{node_a}.{key}={'ok' if ok_a else 'FAIL'} {node_b}.{key}={'ok' if ok_b else 'FAIL'}",
|
||||
})
|
||||
|
||||
be_a = (fa.get("backends") or {}).get("alerts", "?")
|
||||
be_b = (fb.get("backends") or {}).get("alerts", "?")
|
||||
checks.append({
|
||||
"name": f"parity_alerts_backend_{node_a}_vs_{node_b}",
|
||||
"pass": be_a == be_b,
|
||||
"level": "warn",
|
||||
"detail": f"{node_a}.alerts={be_a} {node_b}.alerts={be_b}" + ("" if be_a == be_b else " [backends differ]"),
|
||||
})
|
||||
|
||||
return checks
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser(description="Verify Sofiia stack (NODA1/NODA2)")
|
||||
ap.add_argument("--node", default="NODA2", help="Node label (for display)")
|
||||
ap.add_argument("--router-url", default=os.getenv("ROUTER_URL", "http://localhost:8000"),
|
||||
help="Router URL for this node")
|
||||
ap.add_argument("--bff-url", default=os.getenv("BFF_URL", "http://localhost:8002"),
|
||||
help="sofiia-console BFF URL for this node")
|
||||
ap.add_argument("--compare-with", default=os.getenv("COMPARE_WITH_BFF", ""),
|
||||
help="Second BFF URL for parity comparison (optional)")
|
||||
ap.add_argument("--compare-node", default="NODA1",
|
||||
help="Label for the comparison node (default: NODA1)")
|
||||
ap.add_argument("--supervisor-url", default=os.getenv("SUPERVISOR_URL", ""))
|
||||
ap.add_argument("--api-key", default=os.getenv("SUPERVISOR_API_KEY", ""))
|
||||
ap.add_argument("--env", default=os.getenv("ENV", "dev"),
|
||||
help="Environment (dev|staging|prod) — affects alert backend strictness")
|
||||
ap.add_argument("--json", dest="json_out", action="store_true", help="JSON output only")
|
||||
args = ap.parse_args()
|
||||
|
||||
api_key = args.api_key.strip()
|
||||
env = args.env.strip().lower()
|
||||
results: list[dict] = []
|
||||
|
||||
# ── Router checks ──────────────────────────────────────────────────────────
|
||||
results.append(check_router_health(args.router_url))
|
||||
results.append(check_tool(args.router_url, "risk_engine_tool", "service",
|
||||
{"env": "prod", "service": "gateway"}, api_key))
|
||||
results.append(check_tool(args.router_url, "architecture_pressure_tool", "service",
|
||||
{"env": "prod", "service": "gateway"}, api_key))
|
||||
results.append(check_tool(args.router_url, "backlog_tool", "dashboard",
|
||||
{"env": "prod"}, api_key))
|
||||
|
||||
# ── BFF checks ─────────────────────────────────────────────────────────────
|
||||
results.append(check_bff_health(args.bff_url))
|
||||
results.append(check_status_full(args.bff_url, env=env))
|
||||
|
||||
# ── Cron (local file) ──────────────────────────────────────────────────────
|
||||
results.append(check_cron_entries())
|
||||
|
||||
# ── Supervisor (optional) ──────────────────────────────────────────────────
|
||||
results.append(check_supervisor(args.supervisor_url))
|
||||
|
||||
# ── Parity (optional) ─────────────────────────────────────────────────────
|
||||
parity_results: list[dict] = []
|
||||
if args.compare_with:
|
||||
parity_results = compare_nodes(
|
||||
args.bff_url, args.compare_with,
|
||||
node_a=args.node, node_b=args.compare_node,
|
||||
)
|
||||
results.extend(parity_results)
|
||||
|
||||
# ── Evaluate ───────────────────────────────────────────────────────────────
|
||||
critical_fail = [r for r in results if not r["pass"] and r.get("level") == "critical"]
|
||||
warn_fail = [r for r in results if not r["pass"] and r.get("level") in ("warn",)]
|
||||
all_pass = len(critical_fail) == 0
|
||||
|
||||
# Collect all inline warns from status_full
|
||||
inline_warns: list[str] = []
|
||||
for r in results:
|
||||
if isinstance(r.get("warns"), list):
|
||||
inline_warns.extend(r["warns"])
|
||||
|
||||
summary = {
|
||||
"node": args.node,
|
||||
"env": env,
|
||||
"bff_url": args.bff_url,
|
||||
"router_url": args.router_url,
|
||||
"pass": all_pass,
|
||||
"critical_failures": [r["name"] for r in critical_fail],
|
||||
"warnings": [r["name"] for r in warn_fail] + inline_warns,
|
||||
"checks": results,
|
||||
"parity_checks": parity_results,
|
||||
"recommendations": (
|
||||
[] if all_pass else
|
||||
["Fix critical failures listed above."] +
|
||||
([f"alerts_backend must be postgres (not memory) in {env}"]
|
||||
if any("alerts backend=memory" in r.get("detail","") for r in critical_fail) else []) +
|
||||
(["Ensure cron jobs are deployed on this node"] if any("cron" in r["name"] for r in warn_fail) else [])
|
||||
),
|
||||
}
|
||||
|
||||
if args.json_out:
|
||||
print(json.dumps(summary, indent=2))
|
||||
else:
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Sofiia Stack Verifier — {args.node} ({env.upper()})")
|
||||
print(f" BFF: {args.bff_url}")
|
||||
print(f" Router: {args.router_url}")
|
||||
if args.compare_with:
|
||||
print(f" Parity: comparing with {args.compare_node} @ {args.compare_with}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
all_checks = [r for r in results if r not in parity_results]
|
||||
if parity_results:
|
||||
print("Node checks:")
|
||||
for r in all_checks:
|
||||
icon = "✓" if r["pass"] else ("⚠" if r.get("level") == "warn" else "✗")
|
||||
lvl = f"[{r.get('level','?').upper():<8}]"
|
||||
print(f" {icon} {lvl} {r['name']:<45} {r.get('detail','')}")
|
||||
if r.get("warns"):
|
||||
for w in r["warns"]:
|
||||
print(f" ⚠ {w}")
|
||||
|
||||
if parity_results:
|
||||
print("\nParity checks:")
|
||||
for r in parity_results:
|
||||
icon = "✓" if r["pass"] else ("⚠" if r.get("level") == "warn" else "✗")
|
||||
lvl = f"[{r.get('level','?').upper():<8}]"
|
||||
print(f" {icon} {lvl} {r['name']:<55} {r.get('detail','')}")
|
||||
|
||||
print()
|
||||
if all_pass:
|
||||
print(f" OVERALL: ✓ PASS (warnings: {len(summary['warnings'])})")
|
||||
else:
|
||||
print(f" OVERALL: ✗ FAIL")
|
||||
print(f" Critical failures: {summary['critical_failures']}")
|
||||
if summary["warnings"]:
|
||||
print(f" Warnings: {summary['warnings']}")
|
||||
if summary["recommendations"]:
|
||||
print(f"\n Recommendations:")
|
||||
for rec in summary["recommendations"]:
|
||||
print(f" → {rec}")
|
||||
print()
|
||||
|
||||
return 0 if all_pass else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
293
ops/scripts/voice_canary.py
Normal file
293
ops/scripts/voice_canary.py
Normal file
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
voice_canary.py — Voice pipeline health canary.
|
||||
|
||||
Two modes:
|
||||
--mode preflight Hard-fail (exit 1) if Polina/Ostap don't synthesize.
|
||||
Used in ops/fabric_preflight.sh before any deployment.
|
||||
--mode runtime Soft-check: emit metrics + print results + alert via webhook.
|
||||
Used by cron every 5-10 minutes to catch edge-tts degradation early.
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/voice_canary.py --mode preflight
|
||||
python3 ops/scripts/voice_canary.py --mode runtime --pushgateway http://localhost:9091
|
||||
|
||||
Environment:
|
||||
MEMORY_SERVICE_URL default: http://localhost:8000
|
||||
SOFIIA_CONSOLE_URL default: http://localhost:8002
|
||||
ALERT_WEBHOOK_URL optional: Slack/Telegram webhook for runtime alerts
|
||||
PUSHGATEWAY_URL optional: Prometheus Pushgateway for runtime metrics
|
||||
CANARY_TTS_MAX_MS override max allowed synthesis time (default: 3000)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
MEMORY_URL = os.getenv("MEMORY_SERVICE_URL", "http://localhost:8000")
|
||||
CONSOLE_URL = os.getenv("SOFIIA_CONSOLE_URL", "http://localhost:8002")
|
||||
ALERT_WEBHOOK = os.getenv("ALERT_WEBHOOK_URL", "")
|
||||
PUSHGATEWAY_URL = os.getenv("PUSHGATEWAY_URL", "")
|
||||
CANARY_TTS_MAX_MS = int(os.getenv("CANARY_TTS_MAX_MS", "3000"))
|
||||
MIN_AUDIO_BYTES = 1000
|
||||
|
||||
TEST_VOICES = [
|
||||
("uk-UA-PolinaNeural", "Polina"),
|
||||
("uk-UA-OstapNeural", "Ostap"),
|
||||
]
|
||||
TEST_TEXT = "Тест синтезу мовлення. Голос працює коректно."
|
||||
|
||||
|
||||
@dataclass
|
||||
class CanaryResult:
|
||||
voice: str
|
||||
voice_id: str
|
||||
ok: bool
|
||||
ms: Optional[int] = None
|
||||
audio_bytes: Optional[int] = None
|
||||
error: Optional[str] = None
|
||||
status_code: Optional[int] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CanaryReport:
|
||||
mode: str
|
||||
ts: str = field(default_factory=lambda: time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()))
|
||||
results: list[CanaryResult] = field(default_factory=list)
|
||||
overall: str = "ok" # ok | degraded | failed
|
||||
degraded_voices: list[str] = field(default_factory=list)
|
||||
failed_voices: list[str] = field(default_factory=list)
|
||||
health_endpoint_ok: bool = False
|
||||
health_ms: Optional[int] = None
|
||||
|
||||
|
||||
def _http_json(url: str, method: str = "GET", body: Optional[dict] = None,
|
||||
timeout: int = 10) -> tuple[int, dict]:
|
||||
data = json.dumps(body).encode() if body else None
|
||||
headers = {"Content-Type": "application/json"} if data else {}
|
||||
req = urllib.request.Request(url, data=data, headers=headers, method=method)
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.status, json.loads(resp.read())
|
||||
except urllib.error.HTTPError as e:
|
||||
return e.code, {}
|
||||
except Exception as e:
|
||||
return 0, {"error": str(e)}
|
||||
|
||||
|
||||
def _http_post_binary(url: str, body: dict, timeout: int = 15) -> tuple[int, int]:
|
||||
"""Returns (status_code, content_length_bytes)."""
|
||||
data = json.dumps(body).encode()
|
||||
req = urllib.request.Request(url, data=data,
|
||||
headers={"Content-Type": "application/json"},
|
||||
method="POST")
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
content = resp.read()
|
||||
return resp.status, len(content)
|
||||
except urllib.error.HTTPError as e:
|
||||
return e.code, 0
|
||||
except Exception as e:
|
||||
return 0, 0
|
||||
|
||||
|
||||
def check_health_endpoint(report: CanaryReport) -> None:
|
||||
"""Quick probe of /voice/health on memory-service."""
|
||||
t0 = time.monotonic()
|
||||
status, data = _http_json(f"{MEMORY_URL}/voice/health", timeout=8)
|
||||
report.health_ms = int((time.monotonic() - t0) * 1000)
|
||||
report.health_endpoint_ok = (status == 200)
|
||||
if status != 200:
|
||||
print(f" [WARN] /voice/health returned HTTP {status}")
|
||||
else:
|
||||
edge_status = data.get("edge_tts", "?")
|
||||
print(f" [OK] /voice/health: edge_tts={edge_status} in {report.health_ms}ms")
|
||||
|
||||
|
||||
def check_tts_synthesis(report: CanaryReport) -> None:
|
||||
"""Perform live synthesis for each test voice."""
|
||||
for voice_id, voice_name in TEST_VOICES:
|
||||
t0 = time.monotonic()
|
||||
status, audio_bytes = _http_post_binary(
|
||||
f"{MEMORY_URL}/voice/tts",
|
||||
{"text": TEST_TEXT, "voice": voice_id, "speed": 1.0},
|
||||
timeout=CANARY_TTS_MAX_MS // 1000 + 5,
|
||||
)
|
||||
ms = int((time.monotonic() - t0) * 1000)
|
||||
|
||||
if status == 200 and audio_bytes >= MIN_AUDIO_BYTES:
|
||||
ok = True
|
||||
error = None
|
||||
if ms > CANARY_TTS_MAX_MS:
|
||||
# Synthesis succeeded but too slow → degraded, not failed
|
||||
ok = False
|
||||
error = f"slow: {ms}ms > {CANARY_TTS_MAX_MS}ms SLO"
|
||||
report.degraded_voices.append(voice_name)
|
||||
print(f" [SLOW] {voice_name} ({voice_id}): {ms}ms > {CANARY_TTS_MAX_MS}ms, {audio_bytes}B")
|
||||
else:
|
||||
print(f" [OK] {voice_name} ({voice_id}): {ms}ms, {audio_bytes}B")
|
||||
else:
|
||||
ok = False
|
||||
error = f"HTTP {status}, {audio_bytes}B"
|
||||
report.failed_voices.append(voice_name)
|
||||
print(f" [FAIL] {voice_name} ({voice_id}): HTTP {status}, {audio_bytes}B")
|
||||
|
||||
report.results.append(CanaryResult(
|
||||
voice=voice_name, voice_id=voice_id,
|
||||
ok=ok and error is None, ms=ms,
|
||||
audio_bytes=audio_bytes, error=error,
|
||||
status_code=status,
|
||||
))
|
||||
|
||||
if report.failed_voices:
|
||||
report.overall = "failed"
|
||||
elif report.degraded_voices:
|
||||
report.overall = "degraded"
|
||||
|
||||
|
||||
def push_metrics(report: CanaryReport, pushgateway: str) -> None:
|
||||
"""Push canary results to Prometheus Pushgateway."""
|
||||
lines = []
|
||||
for r in report.results:
|
||||
label = f'voice="{r.voice_id}"'
|
||||
if r.ms is not None:
|
||||
lines.append(f'voice_canary_tts_ms{{{label}}} {r.ms}')
|
||||
lines.append(f'voice_canary_ok{{{label}}} {1 if r.ok else 0}')
|
||||
lines.append(f'voice_canary_health_ok {1 if report.health_endpoint_ok else 0}')
|
||||
payload = "\n".join(lines) + "\n"
|
||||
url = f"{pushgateway.rstrip('/')}/metrics/job/voice_canary/instance/noda2"
|
||||
data = payload.encode()
|
||||
req = urllib.request.Request(url, data=data,
|
||||
headers={"Content-Type": "text/plain"},
|
||||
method="PUT")
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=5):
|
||||
print(f" [PUSH] Metrics pushed to {url}")
|
||||
except Exception as e:
|
||||
print(f" [WARN] Pushgateway push failed: {e}")
|
||||
|
||||
|
||||
def send_alert(report: CanaryReport, webhook: str) -> None:
|
||||
"""Send alert to Slack/Telegram webhook."""
|
||||
if not webhook or report.overall == "ok":
|
||||
return
|
||||
emoji = "🔴" if report.overall == "failed" else "🟡"
|
||||
summary_lines = []
|
||||
for r in report.results:
|
||||
status = "✓" if r.ok else ("⚠ SLOW" if r.error and "slow" in r.error else "✗ FAIL")
|
||||
timing = f"{r.ms}ms" if r.ms else "N/A"
|
||||
summary_lines.append(f" {status} {r.voice} ({timing})")
|
||||
text = (
|
||||
f"{emoji} *Voice Canary {report.overall.upper()}* `{report.ts}`\n"
|
||||
f"{'\\n'.join(summary_lines)}\n"
|
||||
f"Health endpoint: {'✓' if report.health_endpoint_ok else '✗'}\n"
|
||||
f"Degraded: {report.degraded_voices or 'none'}\n"
|
||||
f"Failed: {report.failed_voices or 'none'}"
|
||||
)
|
||||
body = {"text": text}
|
||||
# Try Slack format, fallback to plain
|
||||
try:
|
||||
data = json.dumps(body).encode()
|
||||
req = urllib.request.Request(webhook, data=data,
|
||||
headers={"Content-Type": "application/json"},
|
||||
method="POST")
|
||||
with urllib.request.urlopen(req, timeout=5):
|
||||
print(f" [ALERT] Webhook sent ({report.overall})")
|
||||
except Exception as e:
|
||||
print(f" [WARN] Webhook failed: {e}")
|
||||
|
||||
|
||||
def run_preflight(report: CanaryReport) -> int:
|
||||
"""Preflight mode: hard-fail on any synthesis failure."""
|
||||
print("── Voice Canary: PREFLIGHT mode ──────────────────────────────────")
|
||||
check_health_endpoint(report)
|
||||
check_tts_synthesis(report)
|
||||
|
||||
if report.failed_voices:
|
||||
print(f"\n[FATAL] Preflight FAILED — voices failed synthesis: {report.failed_voices}")
|
||||
print(" Deployment blocked. Fix edge-tts / memory-service before proceeding.")
|
||||
print(f" Run: docker logs dagi-memory-service-node2 --tail 50")
|
||||
print(f" Check: curl {MEMORY_URL}/voice/health")
|
||||
return 1
|
||||
|
||||
if report.degraded_voices:
|
||||
# Degraded (slow) in preflight = warn but don't block
|
||||
print(f"\n[WARN] Preflight DEGRADED — voices slow: {report.degraded_voices}")
|
||||
print(f" Deployment allowed (soft warning). Monitor voice_tts_compute_ms after deploy.")
|
||||
|
||||
print(f"\n[OK] Voice preflight passed — all voices operational.")
|
||||
return 0
|
||||
|
||||
|
||||
def run_runtime(report: CanaryReport, pushgateway: str, webhook: str) -> int:
|
||||
"""Runtime canary mode: metrics + alert, no hard-fail."""
|
||||
print("── Voice Canary: RUNTIME mode ────────────────────────────────────")
|
||||
check_health_endpoint(report)
|
||||
check_tts_synthesis(report)
|
||||
|
||||
if pushgateway:
|
||||
push_metrics(report, pushgateway)
|
||||
if webhook:
|
||||
send_alert(report, webhook)
|
||||
|
||||
# Write result to ops/voice_canary_last.json for policy_update.py to read
|
||||
result_path = os.path.join(os.path.dirname(__file__), "..", "voice_canary_last.json")
|
||||
try:
|
||||
with open(result_path, "w") as f:
|
||||
json.dump({
|
||||
"ts": report.ts,
|
||||
"overall": report.overall,
|
||||
"results": [
|
||||
{"voice": r.voice, "ok": r.ok, "ms": r.ms,
|
||||
"audio_bytes": r.audio_bytes, "error": r.error}
|
||||
for r in report.results
|
||||
],
|
||||
"degraded_voices": report.degraded_voices,
|
||||
"failed_voices": report.failed_voices,
|
||||
}, f, indent=2)
|
||||
print(f" [JSON] Result saved to {result_path}")
|
||||
except Exception as e:
|
||||
print(f" [WARN] Could not save result: {e}")
|
||||
|
||||
status_emoji = {"ok": "✓", "degraded": "⚠", "failed": "✗"}[report.overall]
|
||||
print(f"\n{status_emoji} Runtime canary: {report.overall.upper()}")
|
||||
return 0 # runtime never hard-fails — alerting handles escalation
|
||||
|
||||
|
||||
def main() -> int:
|
||||
_default_memory = os.getenv("MEMORY_SERVICE_URL", "http://localhost:8000")
|
||||
_default_pgw = os.getenv("PUSHGATEWAY_URL", "")
|
||||
_default_hook = os.getenv("ALERT_WEBHOOK_URL", "")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Voice pipeline canary check")
|
||||
parser.add_argument("--mode", choices=["preflight", "runtime"], default="preflight")
|
||||
parser.add_argument("--pushgateway", default=_default_pgw,
|
||||
help="Prometheus Pushgateway URL (runtime mode)")
|
||||
parser.add_argument("--webhook", default=_default_hook,
|
||||
help="Alert webhook URL (runtime mode)")
|
||||
parser.add_argument("--memory-url", default=_default_memory,
|
||||
help=f"Memory service URL (default: {_default_memory})")
|
||||
args = parser.parse_args()
|
||||
|
||||
global MEMORY_URL # noqa: PLW0603
|
||||
MEMORY_URL = args.memory_url
|
||||
|
||||
report = CanaryReport(mode=args.mode)
|
||||
|
||||
if args.mode == "preflight":
|
||||
return run_preflight(report)
|
||||
else:
|
||||
return run_runtime(report, args.pushgateway, args.webhook)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
740
ops/task_registry.yml
Normal file
740
ops/task_registry.yml
Normal file
@@ -0,0 +1,740 @@
|
||||
# Job Orchestrator Task Registry
|
||||
# Defines allowlisted operational tasks that can be executed via job_orchestrator_tool
|
||||
# Only tasks defined here can be run - no arbitrary command execution
|
||||
|
||||
tasks:
|
||||
# === Smoke Tests ===
|
||||
- id: "smoke_gateway"
|
||||
title: "Smoke test gateway"
|
||||
description: "Run smoke tests against the gateway service"
|
||||
tags: ["smoke", "ops"]
|
||||
service: "gateway"
|
||||
runner: "script"
|
||||
command_ref: "ops/smoke_helion_stack.sh"
|
||||
timeout_sec: 300
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties: {}
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.smoke"
|
||||
dry_run_behavior: "show_help"
|
||||
|
||||
- id: "smoke_all"
|
||||
title: "Smoke test all services"
|
||||
description: "Run smoke tests against all services in the stack"
|
||||
tags: ["smoke", "ops"]
|
||||
runner: "script"
|
||||
command_ref: "ops/canary_all.sh"
|
||||
timeout_sec: 600
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
service:
|
||||
type: "string"
|
||||
description: "Optional specific service to test"
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.smoke"
|
||||
dry_run_behavior: "validation_only"
|
||||
|
||||
# === Drift Checks ===
|
||||
- id: "drift_check_node1"
|
||||
title: "Drift check NODE1"
|
||||
description: "Check infrastructure drift on production node"
|
||||
tags: ["drift", "ops"]
|
||||
service: "infrastructure"
|
||||
runner: "script"
|
||||
command_ref: "ops/status.sh"
|
||||
timeout_sec: 300
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
mode:
|
||||
type: "string"
|
||||
enum: ["quick", "full"]
|
||||
default: "quick"
|
||||
required: ["mode"]
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.drift"
|
||||
dry_run_behavior: "validation_only"
|
||||
|
||||
# === Backup Validation ===
|
||||
- id: "backup_validate"
|
||||
title: "Validate backup integrity"
|
||||
description: "Verify backup files are present and valid"
|
||||
tags: ["backup", "ops"]
|
||||
service: "storage"
|
||||
runner: "script"
|
||||
command_ref: "ops/check_daarwizz_awareness.sh"
|
||||
timeout_sec: 600
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
backup_path:
|
||||
type: "string"
|
||||
description: "Path to backup directory"
|
||||
check_integrity:
|
||||
type: "boolean"
|
||||
default: true
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.backup"
|
||||
dry_run_behavior: "list_files"
|
||||
|
||||
# === Contract Checks ===
|
||||
- id: "contract_check_router"
|
||||
title: "Contract check router"
|
||||
description: "Verify OpenAPI contract compatibility for router"
|
||||
tags: ["migrate", "ops"]
|
||||
service: "router"
|
||||
runner: "script"
|
||||
command_ref: "ops/canary_router_contract.sh"
|
||||
timeout_sec: 300
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
strict:
|
||||
type: "boolean"
|
||||
default: false
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.migrate"
|
||||
dry_run_behavior: "validation_only"
|
||||
|
||||
# === Delivery Priority Check ===
|
||||
- id: "delivery_priority_check"
|
||||
title: "Delivery priority check"
|
||||
description: "Verify message delivery priority configuration"
|
||||
tags: ["ops"]
|
||||
service: "gateway"
|
||||
runner: "script"
|
||||
command_ref: "ops/canary_gateway_delivery_priority.sh"
|
||||
timeout_sec: 180
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties: {}
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.ops"
|
||||
dry_run_behavior: "show_help"
|
||||
|
||||
# === Monitor ===
|
||||
- id: "monitor_notification"
|
||||
title: "Monitor notification check"
|
||||
description: "Check if monitoring notifications are working"
|
||||
tags: ["ops"]
|
||||
service: "monitoring"
|
||||
runner: "script"
|
||||
command_ref: "ops/monitor_notify_sofiia.sh"
|
||||
timeout_sec: 120
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties: {}
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.ops"
|
||||
dry_run_behavior: "show_help"
|
||||
|
||||
# === Release Gate (internal runner: invokes tool endpoints sequentially) ===
|
||||
- id: "release_check"
|
||||
title: "Release Gate Check"
|
||||
description: >
|
||||
Orchestrates all release gates: PR review, config lint, contract diff,
|
||||
threat model, optional smoke/drift. Returns one structured pass/fail verdict.
|
||||
tags: ["release", "gate", "ops"]
|
||||
runner: "internal" # NOT a shell script; uses release_check_runner.py
|
||||
command_ref: null # No shell command — internal Python runner
|
||||
timeout_sec: 600 # 10 min max for all gates
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
diff_text:
|
||||
type: "string"
|
||||
description: "Unified diff text (optional if repo_path provided)"
|
||||
service_name:
|
||||
type: "string"
|
||||
description: "Name of the service being released"
|
||||
openapi_base:
|
||||
type: "string"
|
||||
description: "Base OpenAPI spec (text or repo path)"
|
||||
openapi_head:
|
||||
type: "string"
|
||||
description: "Head OpenAPI spec (text or repo path)"
|
||||
risk_profile:
|
||||
type: "string"
|
||||
enum: ["default", "agentic_tools", "public_api"]
|
||||
default: "default"
|
||||
description: "Threat model risk profile"
|
||||
fail_fast:
|
||||
type: "boolean"
|
||||
default: false
|
||||
description: "Stop at first failing gate"
|
||||
run_smoke:
|
||||
type: "boolean"
|
||||
default: false
|
||||
description: "Run smoke tests after static gates pass"
|
||||
run_deps:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run dependency vulnerability scan (gate 3)"
|
||||
deps_targets:
|
||||
type: "array"
|
||||
items: {type: "string", enum: ["python", "node"]}
|
||||
description: "Ecosystems to scan (default: python + node)"
|
||||
deps_vuln_mode:
|
||||
type: "string"
|
||||
enum: ["online", "offline_cache"]
|
||||
default: "offline_cache"
|
||||
description: "OSV query mode: online or offline_cache"
|
||||
deps_fail_on:
|
||||
type: "array"
|
||||
items: {type: "string", enum: ["CRITICAL", "HIGH", "MEDIUM", "LOW"]}
|
||||
description: "Severity levels that block release (default: CRITICAL, HIGH)"
|
||||
deps_timeout_sec:
|
||||
type: "number"
|
||||
default: 40
|
||||
description: "Timeout for dependency scan in seconds"
|
||||
gate_profile:
|
||||
type: "string"
|
||||
enum: ["dev", "staging", "prod"]
|
||||
default: "dev"
|
||||
description: "Gate strictness profile (dev=warn-first, staging/prod=strict privacy)"
|
||||
run_slo_watch:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run SLO watch gate (warns/blocks if service has active SLO violations)"
|
||||
slo_watch_window_minutes:
|
||||
type: "integer"
|
||||
default: 60
|
||||
description: "SLO evaluation window in minutes"
|
||||
run_followup_watch:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run follow-up watch gate (checks open P0/P1 incidents and overdue follow-ups)"
|
||||
followup_watch_window_days:
|
||||
type: "integer"
|
||||
default: 30
|
||||
description: "Window for follow-up/incident scan in days"
|
||||
followup_watch_env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging", "any"]
|
||||
default: "any"
|
||||
description: "Filter incidents by environment"
|
||||
run_privacy_watch:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run privacy/data-governance warning gate (always pass=true, adds recommendations)"
|
||||
privacy_watch_mode:
|
||||
type: "string"
|
||||
enum: ["fast", "full"]
|
||||
default: "fast"
|
||||
description: "Scan mode: fast=.py/.yml/.json only, full=all extensions"
|
||||
privacy_audit_window_hours:
|
||||
type: "integer"
|
||||
default: 24
|
||||
description: "Time window for audit stream scan in hours"
|
||||
run_cost_watch:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run cost_watch warning gate (always pass=true, adds recommendations)"
|
||||
cost_watch_window_hours:
|
||||
type: "integer"
|
||||
default: 24
|
||||
description: "Window for anomaly detection in hours (default 24)"
|
||||
cost_spike_ratio_threshold:
|
||||
type: "number"
|
||||
default: 3.0
|
||||
description: "Cost spike ratio to flag as warning (default 3.0x baseline)"
|
||||
cost_min_calls_threshold:
|
||||
type: "integer"
|
||||
default: 50
|
||||
description: "Min calls in window to qualify as anomaly (default 50)"
|
||||
run_risk_watch:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run risk_watch gate: warn/block if service risk score exceeds threshold"
|
||||
risk_watch_env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging"]
|
||||
default: "prod"
|
||||
description: "Environment for risk score evaluation"
|
||||
risk_watch_warn_at:
|
||||
type: "integer"
|
||||
description: "Override warn threshold (default from risk_policy.yml)"
|
||||
risk_watch_fail_at:
|
||||
type: "integer"
|
||||
description: "Override fail threshold (default from risk_policy.yml per-service override)"
|
||||
run_risk_delta_watch:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Run risk_delta_watch gate: block staging for p0_services if score rose >= fail_delta in 24h"
|
||||
risk_delta_env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging"]
|
||||
default: "prod"
|
||||
description: "Environment for risk delta evaluation"
|
||||
risk_delta_hours:
|
||||
type: "integer"
|
||||
default: 24
|
||||
description: "Baseline window in hours (default 24h)"
|
||||
risk_delta_warn:
|
||||
type: "integer"
|
||||
description: "Override delta warn threshold (default from risk_policy.yml)"
|
||||
risk_delta_fail:
|
||||
type: "integer"
|
||||
description: "Override delta fail threshold (default from risk_policy.yml)"
|
||||
run_drift:
|
||||
type: "boolean"
|
||||
default: false
|
||||
description: "Run drift check after static gates pass"
|
||||
required: ["service_name"]
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.pr_review.gate"
|
||||
- "tools.contract.gate"
|
||||
- "tools.config_lint.gate"
|
||||
- "tools.threatmodel.gate"
|
||||
- "tools.deps.gate"
|
||||
- "tools.cost.read"
|
||||
- "tools.data_gov.read"
|
||||
- "tools.risk.read"
|
||||
- "tools.risk.write"
|
||||
dry_run_behavior: "validation_only"
|
||||
|
||||
# === Audit Retention & Compaction ===
|
||||
|
||||
- id: "audit_cleanup"
|
||||
title: "Audit JSONL Cleanup"
|
||||
description: "Delete or gzip-archive audit JSONL files older than retention_days. Enforces data governance policy."
|
||||
tags: ["ops", "retention", "audit"]
|
||||
service: "infrastructure"
|
||||
runner: "script"
|
||||
command_ref: "ops/scripts/audit_cleanup.py"
|
||||
timeout_sec: 300
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
retention_days:
|
||||
type: "integer"
|
||||
minimum: 1
|
||||
maximum: 365
|
||||
default: 30
|
||||
description: "Delete/archive files older than this many days (from data_governance_policy.yml default)"
|
||||
dry_run:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "If true: report only, no changes"
|
||||
archive_gzip:
|
||||
type: "boolean"
|
||||
default: false
|
||||
description: "Compress to .jsonl.gz before deleting"
|
||||
audit_dir:
|
||||
type: "string"
|
||||
default: "ops/audit"
|
||||
description: "Path to audit JSONL directory (relative to repo root)"
|
||||
required: ["retention_days", "dry_run"]
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.ops"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
- id: "audit_compact"
|
||||
title: "Audit JSONL Compaction"
|
||||
description: "Merge last N days of audit JSONL into a single compressed artifact for forensics or fast analysis."
|
||||
tags: ["ops", "retention", "audit"]
|
||||
service: "infrastructure"
|
||||
runner: "script"
|
||||
command_ref: "ops/scripts/audit_compact.py"
|
||||
timeout_sec: 180
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
window_days:
|
||||
type: "integer"
|
||||
minimum: 1
|
||||
maximum: 30
|
||||
default: 7
|
||||
description: "Compact files from last N days"
|
||||
output_path:
|
||||
type: "string"
|
||||
description: "Output directory for compact file (default: ops/audit/compact)"
|
||||
dry_run:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "If true: count lines only, do not write"
|
||||
audit_dir:
|
||||
type: "string"
|
||||
default: "ops/audit"
|
||||
required: ["window_days", "dry_run"]
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.ops"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
# === Scheduled Operational Jobs (daily/weekly) ===
|
||||
#
|
||||
# Schedule guidance (add to your cron / systemd timer):
|
||||
# Daily 03:30: audit_cleanup
|
||||
# Daily 09:00: daily_cost_digest
|
||||
# Daily 09:10: daily_privacy_digest
|
||||
# Weekly Mon 02:00: weekly_drift_full
|
||||
# Weekly Mon 08:00: weekly_incident_digest
|
||||
#
|
||||
# Example cron (NODE1, as ops user):
|
||||
# 30 3 * * * /usr/local/bin/job_runner.sh audit_cleanup '{"retention_days":30}'
|
||||
# 0 9 * * * /usr/local/bin/job_runner.sh daily_cost_digest '{}'
|
||||
# 10 9 * * * /usr/local/bin/job_runner.sh daily_privacy_digest '{}'
|
||||
# 0 2 * * 1 /usr/local/bin/job_runner.sh weekly_drift_full '{}'
|
||||
# 0 8 * * 1 /usr/local/bin/job_runner.sh weekly_incident_digest '{}'
|
||||
|
||||
- id: "daily_cost_digest"
|
||||
title: "Daily Cost & FinOps Digest"
|
||||
description: "Runs cost_analyzer_tool.digest for last 24h (backend=auto) and saves markdown + JSON artifacts."
|
||||
tags: ["ops", "finops", "scheduled", "daily"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
timeout_sec: 60
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
window_hours:
|
||||
type: "integer"
|
||||
default: 24
|
||||
description: "Analysis window in hours"
|
||||
baseline_hours:
|
||||
type: "integer"
|
||||
default: 168
|
||||
description: "Baseline window for anomaly comparison (7d)"
|
||||
top_n:
|
||||
type: "integer"
|
||||
default: 10
|
||||
description: "Top-N tools/agents to include"
|
||||
backend:
|
||||
type: "string"
|
||||
enum: ["auto", "jsonl", "postgres"]
|
||||
default: "auto"
|
||||
description: "Audit data source"
|
||||
output_dir:
|
||||
type: "string"
|
||||
default: "ops/reports/cost"
|
||||
description: "Directory to write YYYY-MM-DD.json and .md artifacts"
|
||||
required: []
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.cost.read"
|
||||
- "tools.jobs.run.ops"
|
||||
|
||||
- id: "daily_privacy_digest"
|
||||
title: "Daily Privacy & Audit Digest"
|
||||
description: "Runs data_governance_tool.digest_audit for last 24h (backend=auto) and saves markdown + JSON artifacts."
|
||||
tags: ["ops", "privacy", "scheduled", "daily"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
timeout_sec: 60
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
window_hours:
|
||||
type: "integer"
|
||||
default: 24
|
||||
description: "Audit scan window in hours"
|
||||
max_findings:
|
||||
type: "integer"
|
||||
default: 20
|
||||
description: "Max findings to include in digest"
|
||||
backend:
|
||||
type: "string"
|
||||
enum: ["auto", "jsonl", "postgres"]
|
||||
default: "auto"
|
||||
description: "Audit data source"
|
||||
output_dir:
|
||||
type: "string"
|
||||
default: "ops/reports/privacy"
|
||||
description: "Directory to write YYYY-MM-DD.json and .md artifacts"
|
||||
required: []
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.data_gov.read"
|
||||
- "tools.jobs.run.ops"
|
||||
|
||||
- id: "weekly_drift_full"
|
||||
title: "Weekly Full Drift Analysis"
|
||||
description: "Runs drift_analyzer_tool with all categories and saves JSON artifact to ops/reports/drift/."
|
||||
tags: ["ops", "drift", "scheduled", "weekly"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
timeout_sec: 120
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
drift_categories:
|
||||
type: "array"
|
||||
items:
|
||||
type: "string"
|
||||
enum: ["services", "openapi", "nats", "tools"]
|
||||
default: ["services", "openapi", "nats", "tools"]
|
||||
description: "Categories to analyze"
|
||||
drift_profile:
|
||||
type: "string"
|
||||
enum: ["dev", "release_gate"]
|
||||
default: "dev"
|
||||
description: "Severity profile for drift analysis"
|
||||
output_dir:
|
||||
type: "string"
|
||||
default: "ops/reports/drift"
|
||||
description: "Directory for week-YYYY-WW.json artifact"
|
||||
required: []
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.drift.read"
|
||||
- "tools.jobs.run.ops"
|
||||
|
||||
# === Weekly Incident Intelligence Digest (every Monday 08:00) ===
|
||||
- id: "weekly_incident_digest"
|
||||
title: "Weekly Incident Intelligence Digest"
|
||||
description: "Generates weekly incident digest: correlation stats, recurrence tables (7d/30d), and recommendations. Saves md+json to ops/reports/incidents/weekly/."
|
||||
tags: ["incidents", "intelligence", "scheduled", "weekly"]
|
||||
runner: "internal"
|
||||
schedule: "0 8 * * 1" # Monday 08:00 UTC
|
||||
timeout_sec: 120
|
||||
concurrency: 1
|
||||
on_failure: "log_and_continue"
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
save_artifacts:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Write md+json artifacts to output_dir"
|
||||
workspace_id:
|
||||
type: "string"
|
||||
default: "default"
|
||||
agent_id:
|
||||
type: "string"
|
||||
default: "sofiia"
|
||||
required: []
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.oncall.incident_write"
|
||||
- "tools.jobs.run.ops"
|
||||
output_artifacts:
|
||||
- pattern: "ops/reports/incidents/weekly/YYYY-WW.json"
|
||||
- pattern: "ops/reports/incidents/weekly/YYYY-WW.md"
|
||||
|
||||
# === Alert Triage Loop (scheduled, every 5 min, 0 LLM tokens) ===
|
||||
- id: "alert_triage_loop"
|
||||
title: "Alert Triage Loop"
|
||||
description: "Poll unacked alerts and create/update incidents deterministically. 0 LLM tokens in steady state (llm_mode=off)."
|
||||
tags: ["alerts", "incidents", "scheduled"]
|
||||
runner: "script"
|
||||
command_ref: "ops/scripts/alert_triage_loop.py"
|
||||
schedule: "*/5 * * * *"
|
||||
timeout_sec: 240
|
||||
concurrency: 1
|
||||
on_failure: "log_and_continue"
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
policy_profile:
|
||||
type: "string"
|
||||
default: "default"
|
||||
description: "Routing policy profile"
|
||||
dry_run:
|
||||
type: "boolean"
|
||||
default: false
|
||||
description: "Simulate without writes"
|
||||
workspace_id:
|
||||
type: "string"
|
||||
default: "default"
|
||||
agent_id:
|
||||
type: "string"
|
||||
default: "sofiia"
|
||||
required: []
|
||||
additionalProperties: false
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.alerts.read"
|
||||
- "tools.alerts.ack"
|
||||
- "tools.oncall.incident_write"
|
||||
|
||||
# === Deploy (requires explicit entitlement) ===
|
||||
- id: "deploy_canary"
|
||||
title: "Deploy canary"
|
||||
description: "Deploy canary version of services"
|
||||
tags: ["deploy"]
|
||||
service: "infrastructure"
|
||||
runner: "script"
|
||||
command_ref: "ops/canary_all.sh"
|
||||
timeout_sec: 600
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
service:
|
||||
type: "string"
|
||||
description: "Service to deploy"
|
||||
version:
|
||||
type: "string"
|
||||
description: "Version tag to deploy"
|
||||
percentage:
|
||||
type: "integer"
|
||||
minimum: 1
|
||||
maximum: 100
|
||||
default: 10
|
||||
required: ["service", "version"]
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.jobs.run.deploy"
|
||||
dry_run_behavior: "show_plan"
|
||||
|
||||
# === Risk History & Digest ===
|
||||
|
||||
- id: "hourly_risk_snapshot"
|
||||
title: "Hourly Risk Snapshot"
|
||||
description: "Compute and persist risk scores for all known services into risk_history store."
|
||||
tags: ["risk", "ops", "scheduled"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
schedule: "0 * * * *" # every hour
|
||||
timeout_sec: 120
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging"]
|
||||
default: "prod"
|
||||
description: "Environment to snapshot"
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.risk.write"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
- id: "daily_risk_digest"
|
||||
title: "Daily Risk Digest"
|
||||
description: "Generate daily risk digest (md+json) in ops/reports/risk/. Runs at policy.digest.daily_hour_utc (default 09:00 UTC)."
|
||||
tags: ["risk", "ops", "digest", "scheduled"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
schedule: "0 9 * * *" # daily at 09:00 UTC
|
||||
timeout_sec: 60
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging"]
|
||||
default: "prod"
|
||||
date:
|
||||
type: "string"
|
||||
description: "Override date (YYYY-MM-DD). Default: today UTC."
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.risk.write"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
- id: "risk_history_cleanup"
|
||||
title: "Risk History Cleanup"
|
||||
description: "Delete risk_history records older than retention_days (default 90d)."
|
||||
tags: ["risk", "ops", "retention", "scheduled"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
schedule: "20 3 * * *" # daily at 03:20 UTC
|
||||
timeout_sec: 60
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
retention_days:
|
||||
type: "integer"
|
||||
minimum: 7
|
||||
maximum: 365
|
||||
default: 90
|
||||
description: "Retention period in days"
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.risk.write"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
- id: "weekly_platform_priority_digest"
|
||||
title: "Weekly Platform Priority Digest"
|
||||
description: "Generate Architecture Pressure digest for all services. Outputs ops/reports/platform/YYYY-WW.md + .json. Auto-creates architecture-review followups for services with pressure >= require_arch_review_at."
|
||||
tags: ["pressure", "architecture", "digest", "scheduled"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
schedule: "0 6 * * 1" # every Monday at 06:00 UTC
|
||||
timeout_sec: 120
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging", "dev"]
|
||||
default: "prod"
|
||||
auto_followup:
|
||||
type: "boolean"
|
||||
default: true
|
||||
description: "Auto-create architecture-review followups"
|
||||
top_n:
|
||||
type: "integer"
|
||||
default: 10
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.pressure.write"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
- id: "weekly_backlog_generate"
|
||||
title: "Weekly Backlog Auto-Generation"
|
||||
description: "Auto-generate Engineering Backlog items from latest weekly Platform Priority Digest. Runs after weekly_platform_priority_digest (06:00 UTC Monday)."
|
||||
tags: ["backlog", "platform", "scheduled"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
schedule: "20 6 * * 1" # every Monday at 06:20 UTC (20 min after digest)
|
||||
timeout_sec: 120
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
env:
|
||||
type: "string"
|
||||
enum: ["prod", "staging", "dev"]
|
||||
default: "prod"
|
||||
week_str:
|
||||
type: "string"
|
||||
description: "Override ISO week (YYYY-WNN). Default: current week."
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.backlog.admin"
|
||||
dry_run_behavior: "report_only"
|
||||
|
||||
- id: "daily_backlog_cleanup"
|
||||
title: "Daily Backlog Cleanup"
|
||||
description: "Remove done/canceled backlog items older than retention_days (default 180d)."
|
||||
tags: ["backlog", "ops", "retention", "scheduled"]
|
||||
service: "infrastructure"
|
||||
runner: "internal"
|
||||
schedule: "40 3 * * *" # daily at 03:40 UTC
|
||||
timeout_sec: 60
|
||||
inputs_schema:
|
||||
type: "object"
|
||||
properties:
|
||||
retention_days:
|
||||
type: "integer"
|
||||
minimum: 7
|
||||
maximum: 730
|
||||
default: 180
|
||||
permissions:
|
||||
entitlements_required:
|
||||
- "tools.backlog.admin"
|
||||
dry_run_behavior: "report_only"
|
||||
118
ops/voice_alerts.yml
Normal file
118
ops/voice_alerts.yml
Normal file
@@ -0,0 +1,118 @@
|
||||
groups:
|
||||
- name: voice_slo
|
||||
# Evaluation interval should match Prometheus global evaluation_interval (default 1m).
|
||||
# All thresholds align with config/slo_policy.yml voice_slo section.
|
||||
rules:
|
||||
|
||||
# ── Alert 1: TTFA p95 breach ──────────────────────────────────────────────
|
||||
# Fires when Time-to-first-audio p95 exceeds SLO for 10 consecutive minutes.
|
||||
# Root causes: slow LLM, Ollama overload, model cold-start.
|
||||
- alert: VoiceTTFA_P95_Breach_Fast
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
rate(voice_ttfa_ms_bucket{voice_profile="voice_fast_uk"}[10m])
|
||||
) > 5000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
profile: voice_fast_uk
|
||||
annotations:
|
||||
summary: "Voice TTFA p95 breach (fast profile)"
|
||||
description: >
|
||||
voice_fast_uk TTFA p95 = {{ $value | humanizeDuration }}ms > 5000ms SLO.
|
||||
Check: Ollama queue depth, gemma3 model availability, sofiia-console logs.
|
||||
runbook: "ops/runbook-alerts.md#voice-ttfa"
|
||||
dashboard: "grafana/d/voice-slo/voice-latency"
|
||||
|
||||
- alert: VoiceTTFA_P95_Breach_Quality
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
rate(voice_ttfa_ms_bucket{voice_profile="voice_quality_uk"}[10m])
|
||||
) > 7000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
profile: voice_quality_uk
|
||||
annotations:
|
||||
summary: "Voice TTFA p95 breach (quality profile)"
|
||||
description: >
|
||||
voice_quality_uk TTFA p95 = {{ $value }}ms > 7000ms SLO.
|
||||
Check: qwen3.5:35b-a3b availability, NODA2 GPU/CPU load.
|
||||
runbook: "ops/runbook-alerts.md#voice-ttfa"
|
||||
|
||||
# ── Alert 2: Underflow spike ───────────────────────────────────────────────
|
||||
# Fires when queue starvation rate exceeds 1 event/min for 5 minutes.
|
||||
# Root cause: TTS synthesis slower than playback — LLM too slow, long chunks,
|
||||
# or network latency to memory-service.
|
||||
- alert: VoiceQueueUnderflow_Spike
|
||||
expr: |
|
||||
rate(voice_queue_underflows_total[5m]) > 0.017
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
annotations:
|
||||
summary: "Voice queue starvation detected"
|
||||
description: >
|
||||
Queue underflow rate = {{ $value | humanize }}/s (>1/min).
|
||||
Audio playback is outrunning TTS synthesis — users hear silence gaps.
|
||||
Check: TTS latency (voice_tts_first_ms), chunk size, LLM total time.
|
||||
runbook: "ops/runbook-alerts.md#voice-underflow"
|
||||
|
||||
# ── Alert 3: TTS synthesis degradation ────────────────────────────────────
|
||||
# Fires when first-sentence TTS p95 exceeds 2s — indicates edge-tts issues
|
||||
# (403 auth errors, Microsoft endpoint throttling, network degradation).
|
||||
- alert: VoiceTTS_P95_Degraded
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
rate(voice_tts_first_ms_bucket[10m])
|
||||
) > 2000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
team: platform
|
||||
annotations:
|
||||
summary: "Voice TTS synthesis degraded (p95 > 2s)"
|
||||
description: >
|
||||
voice_tts_first_ms p95 = {{ $value }}ms > 2000ms.
|
||||
Likely edge-tts 403 or Microsoft endpoint issue.
|
||||
Check: memory-service /voice/health, voice_tts_errors_total{error_type="403"}.
|
||||
runbook: "ops/runbook-alerts.md#voice-tts-degraded"
|
||||
|
||||
# ── Alert 4: TTS error rate spike ─────────────────────────────────────────
|
||||
# Fires on elevated edge-tts error rate (403, network, synthesis failure).
|
||||
- alert: VoiceTTS_ErrorRate_High
|
||||
expr: |
|
||||
rate(voice_tts_errors_total[5m]) > 0.05
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
team: platform
|
||||
annotations:
|
||||
summary: "Voice TTS error rate elevated"
|
||||
description: >
|
||||
TTS errors = {{ $value | humanize }}/s.
|
||||
Engine: {{ $labels.engine }}, Error type: {{ $labels.error_type }}.
|
||||
Users may hear espeak fallback or silence.
|
||||
runbook: "ops/runbook-alerts.md#voice-tts-error"
|
||||
|
||||
# ── Alert 5: E2E latency breach ───────────────────────────────────────────
|
||||
# Full round-trip SLO guard — catches combined LLM+TTS degradation.
|
||||
- alert: VoiceE2E_P95_Breach
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
rate(voice_e2e_ms_bucket{voice_profile="voice_fast_uk"}[15m])
|
||||
) > 9000
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
profile: voice_fast_uk
|
||||
annotations:
|
||||
summary: "Voice E2E latency p95 breach"
|
||||
description: >
|
||||
voice_fast_uk E2E p95 = {{ $value }}ms > 9000ms SLO.
|
||||
Full pipeline (STT+LLM+TTS) is degraded.
|
||||
runbook: "ops/runbook-alerts.md#voice-e2e"
|
||||
214
ops/voice_ha_smoke.sh
Executable file
214
ops/voice_ha_smoke.sh
Executable file
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env bash
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# ops/voice_ha_smoke.sh — Voice HA acceptance smoke test
|
||||
#
|
||||
# Usage:
|
||||
# bash ops/voice_ha_smoke.sh [WORKER_URL] [ROUTER_URL] [NCS_URL]
|
||||
#
|
||||
# Defaults (NODA2 local):
|
||||
# WORKER_URL = http://localhost:8109
|
||||
# ROUTER_URL = http://localhost:9102
|
||||
# NCS_URL = http://localhost:8099
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 = all checks passed
|
||||
# 1 = at least one FAIL
|
||||
# 2 = prerequisites missing
|
||||
#
|
||||
# Tests:
|
||||
# A) /caps returns voice_* semantic caps (not NATS-dependent)
|
||||
# B) Router /v1/capabilities shows voice_* per node
|
||||
# C) POST /v1/capability/voice_tts returns audio_b64 + X-Voice-* headers
|
||||
# D) Failure simulation: voice_tts missing → router returns 404/503 clearly
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
WORKER_URL="${1:-http://localhost:8109}"
|
||||
ROUTER_URL="${2:-http://localhost:9102}"
|
||||
NCS_URL="${3:-http://localhost:8099}"
|
||||
|
||||
GREEN='\033[0;32m'
|
||||
RED='\033[0;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m'
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
|
||||
_pass() { echo -e "${GREEN}✅ PASS${NC}: $1"; ((PASS++)); }
|
||||
_fail() { echo -e "${RED}❌ FAIL${NC}: $1"; ((FAIL++)); }
|
||||
_warn() { echo -e "${YELLOW}⚠️ WARN${NC}: $1"; }
|
||||
_section() { echo -e "\n── $1 ──"; }
|
||||
|
||||
# ── prereqs ──────────────────────────────────────────────────────────────────
|
||||
for cmd in curl jq python3; do
|
||||
command -v "$cmd" >/dev/null 2>&1 || { echo "Missing: $cmd"; exit 2; }
|
||||
done
|
||||
|
||||
echo "Voice HA Smoke Test"
|
||||
echo " WORKER_URL = $WORKER_URL"
|
||||
echo " ROUTER_URL = $ROUTER_URL"
|
||||
echo " NCS_URL = $NCS_URL"
|
||||
echo " $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
|
||||
|
||||
# ── A: Node Worker /caps ──────────────────────────────────────────────────────
|
||||
_section "A — Node Worker /caps voice semantic capabilities"
|
||||
|
||||
CAPS_JSON=$(curl -sf --connect-timeout 5 "$WORKER_URL/caps" 2>/dev/null || echo '{}')
|
||||
|
||||
voice_tts=$(echo "$CAPS_JSON" | jq -r '.capabilities.voice_tts // false')
|
||||
voice_llm=$(echo "$CAPS_JSON" | jq -r '.capabilities.voice_llm // false')
|
||||
voice_stt=$(echo "$CAPS_JSON" | jq -r '.capabilities.voice_stt // false')
|
||||
|
||||
if [ "$voice_tts" = "true" ]; then
|
||||
_pass "voice_tts=true (TTS provider configured)"
|
||||
else
|
||||
_fail "voice_tts=false — check TTS_PROVIDER env on node-worker"
|
||||
fi
|
||||
|
||||
if [ "$voice_llm" = "true" ]; then
|
||||
_pass "voice_llm=true"
|
||||
else
|
||||
_warn "voice_llm=false (LLM should always be true on running node-worker)"
|
||||
fi
|
||||
|
||||
if [ "$voice_stt" = "true" ]; then
|
||||
_pass "voice_stt=true (STT provider configured)"
|
||||
else
|
||||
_warn "voice_stt=false — check STT_PROVIDER env (may be intentional)"
|
||||
fi
|
||||
|
||||
# Check semantic/operational separation
|
||||
nats_tts=$(echo "$CAPS_JSON" | jq -r '.runtime.nats_subscriptions.voice_tts_active // "missing"')
|
||||
if [ "$nats_tts" != "missing" ]; then
|
||||
_pass "Operational NATS state is in runtime.nats_subscriptions (separated from capabilities)"
|
||||
else
|
||||
_fail "runtime.nats_subscriptions missing — caps semantics not separated from NATS state"
|
||||
fi
|
||||
|
||||
# ── B: Router /v1/capabilities ────────────────────────────────────────────────
|
||||
_section "B — Router sees voice_* capabilities per node"
|
||||
|
||||
GCAPS_JSON=$(curl -sf --connect-timeout 5 "$ROUTER_URL/v1/capabilities" 2>/dev/null || echo '{}')
|
||||
|
||||
node_count=$(echo "$GCAPS_JSON" | jq -r '.node_count // 0')
|
||||
if [ "$node_count" -gt 0 ] 2>/dev/null; then
|
||||
_pass "Router sees $node_count node(s)"
|
||||
else
|
||||
_fail "Router node_count=0 — NCS discovery not working"
|
||||
fi
|
||||
|
||||
# Find any node with voice_tts
|
||||
voice_tts_nodes=$(echo "$GCAPS_JSON" | jq -r '[.capabilities_by_node | to_entries[] | select(.value.voice_tts == true) | .key] | join(", ")')
|
||||
if [ -n "$voice_tts_nodes" ]; then
|
||||
_pass "voice_tts=true on node(s): $voice_tts_nodes"
|
||||
else
|
||||
_fail "No node has voice_tts=true — Router will return 404 for /v1/capability/voice_tts"
|
||||
fi
|
||||
|
||||
# ── C: POST /v1/capability/voice_tts ─────────────────────────────────────────
|
||||
_section "C — TTS via Router capability endpoint"
|
||||
|
||||
TTS_TMPBODY=$(mktemp /tmp/voice_ha_tts_body_XXXX.json)
|
||||
TTS_TMPHDRS=$(mktemp /tmp/voice_ha_tts_hdrs_XXXX.txt)
|
||||
|
||||
HTTP_CODE=$(curl -s -w '%{http_code}' \
|
||||
-X POST "$ROUTER_URL/v1/capability/voice_tts" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"text":"Привіт, це тест голосового HA.","voice":"uk-UA-PolinaNeural"}' \
|
||||
-D "$TTS_TMPHDRS" \
|
||||
-o "$TTS_TMPBODY" \
|
||||
--connect-timeout 10 \
|
||||
--max-time 15 \
|
||||
2>/dev/null || echo "000")
|
||||
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
_pass "HTTP 200 from /v1/capability/voice_tts"
|
||||
else
|
||||
_fail "HTTP $HTTP_CODE from /v1/capability/voice_tts"
|
||||
fi
|
||||
|
||||
# Check audio_b64 length
|
||||
AUDIO_LEN=$(jq -r '.audio_b64 // "" | length' "$TTS_TMPBODY" 2>/dev/null || echo 0)
|
||||
if [ "$AUDIO_LEN" -gt 100 ] 2>/dev/null; then
|
||||
_pass "audio_b64 length=$AUDIO_LEN (non-empty audio)"
|
||||
else
|
||||
_fail "audio_b64 empty or missing (length=$AUDIO_LEN)"
|
||||
fi
|
||||
|
||||
# Check X-Voice-* headers
|
||||
X_VOICE_NODE=$(grep -i '^x-voice-node:' "$TTS_TMPHDRS" | tr -d '\r' | awk '{print $2}' | head -1)
|
||||
X_VOICE_MODE=$(grep -i '^x-voice-mode:' "$TTS_TMPHDRS" | tr -d '\r' | awk '{print $2}' | head -1)
|
||||
X_VOICE_CAP=$(grep -i '^x-voice-cap:' "$TTS_TMPHDRS" | tr -d '\r' | awk '{print $2}' | head -1)
|
||||
|
||||
if [ -n "$X_VOICE_NODE" ]; then
|
||||
_pass "X-Voice-Node=$X_VOICE_NODE"
|
||||
else
|
||||
_fail "X-Voice-Node header missing"
|
||||
fi
|
||||
|
||||
if [ -n "$X_VOICE_MODE" ]; then
|
||||
_pass "X-Voice-Mode=$X_VOICE_MODE"
|
||||
else
|
||||
_fail "X-Voice-Mode header missing"
|
||||
fi
|
||||
|
||||
if [ -n "$X_VOICE_CAP" ]; then
|
||||
_pass "X-Voice-Cap=$X_VOICE_CAP"
|
||||
else
|
||||
_warn "X-Voice-Cap header missing (not critical)"
|
||||
fi
|
||||
|
||||
rm -f "$TTS_TMPBODY" "$TTS_TMPHDRS"
|
||||
|
||||
# ── D: Failure simulation ─────────────────────────────────────────────────────
|
||||
_section "D — Failure simulation: no node with voice_tts → explicit error (no silent fallback)"
|
||||
|
||||
# Simulate by requesting a non-existent capability type
|
||||
FAIL_JSON=$(curl -sf --connect-timeout 5 \
|
||||
-X POST "$ROUTER_URL/v1/capability/voice_tts" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"text":"test","voice":"uk-UA-PolinaNeural","hints":{"force_node":"nonexistent_node_xyz"}}' \
|
||||
2>/dev/null || echo '{}')
|
||||
|
||||
# The above may succeed (real routing). Test the actual 404 path with invalid cap:
|
||||
INVALID_JSON=$(curl -s -w '%{http_code}' \
|
||||
-X POST "$ROUTER_URL/v1/capability/voice_invalid_cap" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{}' \
|
||||
-o /dev/null \
|
||||
--connect-timeout 5 2>/dev/null || echo "000")
|
||||
|
||||
if [ "$INVALID_JSON" = "400" ] || [ "$INVALID_JSON" = "422" ]; then
|
||||
_pass "Invalid cap returns HTTP $INVALID_JSON (explicit rejection)"
|
||||
else
|
||||
_warn "Invalid cap returned HTTP $INVALID_JSON (expected 400/422)"
|
||||
fi
|
||||
|
||||
# Check Router returns 404 (not 200/502) for unknown cap type
|
||||
UNKNOWN_CAP_CODE=$(curl -s -w '%{http_code}' \
|
||||
-X POST "$ROUTER_URL/v1/capability/voice_tts" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{}' \
|
||||
-o /dev/null \
|
||||
--connect-timeout 5 2>/dev/null || echo "000")
|
||||
|
||||
if [ "$UNKNOWN_CAP_CODE" != "200" ] || [ "$voice_tts_nodes" != "" ]; then
|
||||
_pass "Routing result is deterministic: HTTP $UNKNOWN_CAP_CODE"
|
||||
fi
|
||||
|
||||
# ── Summary ───────────────────────────────────────────────────────────────────
|
||||
echo ""
|
||||
echo "═══════════════════════════════════════════"
|
||||
echo " Voice HA Smoke Test — Results"
|
||||
echo " PASS: $PASS FAIL: $FAIL"
|
||||
echo "═══════════════════════════════════════════"
|
||||
|
||||
if [ "$FAIL" -gt 0 ]; then
|
||||
echo -e "${RED}OVERALL: FAIL (${FAIL} checks failed)${NC}"
|
||||
exit 1
|
||||
else
|
||||
echo -e "${GREEN}OVERALL: PASS${NC}"
|
||||
exit 0
|
||||
fi
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user