feat(runtime): sync experience bus and learner stack into main
This commit is contained in:
61
docs/ops/experience_bus_phase1.md
Normal file
61
docs/ops/experience_bus_phase1.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# Experience Bus Phase-1 (Router First)
|
||||
|
||||
## Scope
|
||||
- Source: router `/v1/agents/{id}/infer`
|
||||
- Event subject: `agent.experience.v1.<agent_id>`
|
||||
- JetStream stream: `EXPERIENCE` (`agent.experience.v1.>`)
|
||||
- DB table: `agent_experience_events` (append-only)
|
||||
- Controls: dedup + sampling
|
||||
|
||||
## Env knobs
|
||||
- `EXPERIENCE_BUS_ENABLED=true`
|
||||
- `EXPERIENCE_ENABLE_NATS=true`
|
||||
- `EXPERIENCE_ENABLE_DB=true`
|
||||
- `EXPERIENCE_DATABASE_URL=postgresql://<user>:<pass>@<host>:5432/daarion_memory`
|
||||
- `EXPERIENCE_OK_SAMPLE_PCT=10`
|
||||
- `EXPERIENCE_LATENCY_SPIKE_MS=5000`
|
||||
- `EXPERIENCE_DEDUP_WINDOW_SECONDS=900`
|
||||
- `EXPERIENCE_QUEUE_MAX=2000`
|
||||
|
||||
## Deploy
|
||||
1. Apply migration `migrations/054_agent_experience_events.sql`.
|
||||
2. Deploy router with updated `main.py`, `experience_bus.py`, `agent_metrics.py`.
|
||||
3. Restart router service.
|
||||
|
||||
## Smoke (30 calls)
|
||||
```bash
|
||||
for i in $(seq 1 15); do
|
||||
curl -sS -X POST http://127.0.0.1:9102/v1/agents/agromatrix/infer \
|
||||
-H 'content-type: application/json' \
|
||||
-d "{\"prompt\":\"experience smoke agromatrix $i $(date +%s%N)\"}" >/dev/null
|
||||
done
|
||||
|
||||
for i in $(seq 1 15); do
|
||||
curl -sS -X POST http://127.0.0.1:9102/v1/agents/stepan/infer \
|
||||
-H 'content-type: application/json' \
|
||||
-d "{\"prompt\":\"experience smoke stepan $i $(date +%s%N)\"}" >/dev/null
|
||||
done
|
||||
```
|
||||
|
||||
## Verify JetStream
|
||||
```bash
|
||||
# example (inside nats container with nats CLI)
|
||||
nats stream info EXPERIENCE
|
||||
nats stream view EXPERIENCE --count 5
|
||||
```
|
||||
|
||||
## Verify DB
|
||||
```sql
|
||||
SELECT count(*)
|
||||
FROM agent_experience_events
|
||||
WHERE ts > now() - interval '10 minutes';
|
||||
```
|
||||
|
||||
## Verify lifecycle guard unchanged
|
||||
```bash
|
||||
curl -sS -o /dev/null -w '%{http_code}\n' -X POST http://127.0.0.1:9102/v1/agents/aistalk/infer -H 'content-type: application/json' -d '{"prompt":"ping"}'
|
||||
# expected: 410
|
||||
|
||||
curl -sS -o /dev/null -w '%{http_code}\n' -X POST http://127.0.0.1:9102/v1/agents/devtools/infer -H 'content-type: application/json' -d '{"prompt":"ping"}'
|
||||
# expected: 404
|
||||
```
|
||||
71
docs/ops/experience_bus_phase2.md
Normal file
71
docs/ops/experience_bus_phase2.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# Experience Bus Phase-2 (Lessons Extractor)
|
||||
|
||||
## Scope
|
||||
- Source stream: `EXPERIENCE`
|
||||
- Source subjects: `agent.experience.v1.>`
|
||||
- Consumer mode: durable pull + explicit ack
|
||||
- Output table: `agent_lessons` (append-only)
|
||||
- Output subject: `agent.lesson.v1` (optional publish)
|
||||
|
||||
## Service
|
||||
- Container: `dagi-experience-learner-node1`
|
||||
- Endpoints:
|
||||
- `GET /health`
|
||||
- `GET /metrics`
|
||||
|
||||
## Environment
|
||||
- `NATS_URL=nats://nats:4222`
|
||||
- `EXPERIENCE_STREAM_NAME=EXPERIENCE`
|
||||
- `EXPERIENCE_SUBJECT=agent.experience.v1.>`
|
||||
- `EXPERIENCE_DURABLE=experience-learner-v1`
|
||||
- `EXPERIENCE_ACK_WAIT_SECONDS=30`
|
||||
- `EXPERIENCE_MAX_DELIVER=20`
|
||||
- `EXPERIENCE_FETCH_BATCH=64`
|
||||
- `EXPERIENCE_FETCH_TIMEOUT_SECONDS=2`
|
||||
- `EXPERIENCE_WINDOW_SECONDS=1800`
|
||||
- `EXPERIENCE_OK_SAMPLE_PCT=10`
|
||||
- `EXPERIENCE_LATENCY_SPIKE_MS=5000`
|
||||
- `EXPERIENCE_ERROR_THRESHOLD=3`
|
||||
- `EXPERIENCE_SILENT_THRESHOLD=5`
|
||||
- `EXPERIENCE_LATENCY_THRESHOLD=3`
|
||||
- `EXPERIENCE_EVENT_DEDUP_TTL_SECONDS=3600`
|
||||
- `LEARNER_DATABASE_URL=postgresql://<user>:<pass>@<host>:5432/daarion_memory`
|
||||
- `LESSON_SUBJECT=agent.lesson.v1`
|
||||
- `LESSON_PUBLISH_ENABLED=true`
|
||||
|
||||
## Deploy
|
||||
1. Apply migration `migrations/055_agent_lessons.sql`.
|
||||
2. Deploy service `experience-learner`.
|
||||
3. Verify service health and metrics.
|
||||
|
||||
## Smoke
|
||||
```bash
|
||||
# Generate event traffic (Phase-1 router path)
|
||||
for i in $(seq 1 50); do
|
||||
agent=$([ $((i%2)) -eq 0 ] && echo "aistalk" || echo "devtools")
|
||||
curl -sS -m 8 -o /dev/null \
|
||||
-X POST "http://127.0.0.1:9102/v1/agents/${agent}/infer" \
|
||||
-H "content-type: application/json" \
|
||||
-d "{\"prompt\":\"phase2-smoke-${agent}-${i}-$(date +%s%N)\"}" || true
|
||||
done
|
||||
```
|
||||
|
||||
## Verify
|
||||
```bash
|
||||
# Lessons rows
|
||||
docker exec dagi-postgres psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM agent_lessons WHERE ts > now()-interval '30 minutes';"
|
||||
|
||||
# Idempotency check (run again after reprocessing/redelivery: count(*) must stay equal to count(distinct lesson_key))
|
||||
docker exec dagi-postgres psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*), count(distinct lesson_key) FROM agent_lessons;"
|
||||
|
||||
# Learner metrics
|
||||
curl -sS http://127.0.0.1:9109/metrics | grep -E 'lessons_|js_messages_'
|
||||
```
|
||||
|
||||
## Acceptance
|
||||
- `agent_lessons` receives rows under live event flow.
|
||||
- Reprocessing/redelivery does not duplicate lessons (`lesson_key` unique).
|
||||
- `js_messages_acked_total` increases.
|
||||
- `js_messages_redelivered_total` is observable when replay/redelivery occurs.
|
||||
70
docs/ops/experience_bus_phase3.md
Normal file
70
docs/ops/experience_bus_phase3.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# Experience Bus Phase-3 (Router Runtime Retrieval)
|
||||
|
||||
## Scope
|
||||
- Read path only in `router` before `/v1/agents/{id}/infer`.
|
||||
- Retrieves lessons from `agent_lessons` and injects a compact block:
|
||||
- `Operational Lessons (apply if relevant)`
|
||||
- Attach policy:
|
||||
- after last error / latency spike: always-on, `K=7`
|
||||
- otherwise sampled attach, default `10%`, `K=3`
|
||||
|
||||
## Environment
|
||||
- `LESSONS_ATTACH_ENABLED=true`
|
||||
- `LESSONS_DATABASE_URL=postgresql://<user>:<pass>@<host>:5432/daarion_memory`
|
||||
- `LESSONS_ATTACH_MIN=3`
|
||||
- `LESSONS_ATTACH_MAX=7`
|
||||
- `LESSONS_ATTACH_SAMPLE_PCT=10`
|
||||
- `LESSONS_ATTACH_TIMEOUT_MS=25`
|
||||
- `LESSONS_ATTACH_MAX_CHARS=1200`
|
||||
- `LESSONS_SIGNAL_CACHE_TTL_SECONDS=300`
|
||||
- `EXPERIENCE_LATENCY_SPIKE_MS=5000`
|
||||
|
||||
## Metrics
|
||||
- `lessons_retrieved_total{status="ok|timeout|err"}`
|
||||
- `lessons_attached_total{count="0|1-3|4-7"}`
|
||||
- `lessons_attach_latency_ms`
|
||||
|
||||
## Safety
|
||||
- Lessons block never includes raw user text.
|
||||
- Guard filters skip lessons containing prompt-injection-like markers:
|
||||
- `ignore previous`, `system:`, `developer:`, fenced code blocks.
|
||||
|
||||
## Smoke
|
||||
```bash
|
||||
# 1) Seed synthetic lessons for one agent (example: agromatrix)
|
||||
docker exec dagi-postgres psql -U daarion -d daarion_memory -c "
|
||||
INSERT INTO agent_lessons (lesson_id, lesson_key, ts, scope, agent_id, task_type, trigger, action, avoid, signals, evidence, raw)
|
||||
SELECT
|
||||
gen_random_uuid(),
|
||||
md5(random()::text || clock_timestamp()::text),
|
||||
now() - (g * interval '1 minute'),
|
||||
'agent',
|
||||
'agromatrix',
|
||||
'infer',
|
||||
'when retrying after model timeout',
|
||||
'switch provider or reduce token budget first',
|
||||
'avoid repeating the same failed provider with same payload',
|
||||
'{"error_class":"TimeoutError","provider":"deepseek","model":"deepseek-chat","profile":"reasoning"}'::jsonb,
|
||||
'{"count":3}'::jsonb,
|
||||
'{}'::jsonb
|
||||
FROM generate_series(1,10) g;"
|
||||
|
||||
# 2) Send infer calls
|
||||
for i in $(seq 1 20); do
|
||||
curl -sS -m 12 -o /dev/null \
|
||||
-X POST "http://127.0.0.1:9102/v1/agents/agromatrix/infer" \
|
||||
-H "content-type: application/json" \
|
||||
-d "{\"prompt\":\"phase3-smoke-${i}\",\"metadata\":{\"agent_id\":\"agromatrix\"}}" || true
|
||||
done
|
||||
|
||||
# 3) Check metrics
|
||||
curl -sS http://127.0.0.1:9102/metrics | grep -E 'lessons_retrieved_total|lessons_attached_total|lessons_attach_latency_ms'
|
||||
|
||||
# 4) Simulate DB issue (optional): lessons retrieval should fail-open and infer remains 200
|
||||
# (temporarily point LESSONS_DATABASE_URL to bad DSN + restart router)
|
||||
```
|
||||
|
||||
## Acceptance
|
||||
- Router logs include `lessons_attached=<k>` during sampled or always-on retrieval.
|
||||
- Infer path remains healthy when lessons DB is unavailable.
|
||||
- p95 infer latency impact stays controlled at sampling `10%`.
|
||||
99
docs/ops/experience_bus_phase4.md
Normal file
99
docs/ops/experience_bus_phase4.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# Experience Bus Phase-4 (Gateway Hooks)
|
||||
|
||||
## Scope
|
||||
- Source: `gateway` (Telegram webhook path).
|
||||
- Emits `agent.experience.v1.<agent_id>` events with:
|
||||
- `source="gateway"`
|
||||
- `request_id`/`correlation_id`
|
||||
- `policy.sowa_decision` + normalized `reason`
|
||||
- `feedback.user_signal` (`none|positive|negative|retry|timeout`)
|
||||
- Optional DB append to `agent_experience_events` (fail-open).
|
||||
|
||||
## Environment (gateway)
|
||||
- `NATS_URL=nats://nats:4222`
|
||||
- `EXPERIENCE_BUS_ENABLED=true`
|
||||
- `EXPERIENCE_ENABLE_NATS=true`
|
||||
- `EXPERIENCE_ENABLE_DB=true`
|
||||
- `EXPERIENCE_STREAM_NAME=EXPERIENCE`
|
||||
- `EXPERIENCE_SUBJECT_PREFIX=agent.experience.v1`
|
||||
- `EXPERIENCE_DATABASE_URL=postgresql://<user>:<pass>@<host>:5432/daarion_memory`
|
||||
- `GATEWAY_USER_SIGNAL_RETRY_WINDOW_SECONDS=30`
|
||||
|
||||
## Metrics
|
||||
- `gateway_experience_published_total{status="ok|err"}`
|
||||
- `gateway_policy_decisions_total{sowa_decision,reason}`
|
||||
- `gateway_user_signal_total{user_signal}`
|
||||
- `gateway_webhook_latency_ms`
|
||||
|
||||
## Correlation contract
|
||||
- Gateway creates `request_id` (`correlation_id`) per webhook cycle.
|
||||
- Gateway forwards it to router via:
|
||||
- `metadata.request_id`
|
||||
- `metadata.trace_id`
|
||||
- `X-Request-Id` header
|
||||
- Router writes the same `request_id` into its event payload so that gateway and router records can be joined.
|
||||
|
||||
## Smoke
|
||||
```bash
|
||||
# 1) Send webhook payload (agent-specific endpoint)
|
||||
curl -sS -X POST "http://127.0.0.1:9300/helion/telegram/webhook" \
|
||||
-H "content-type: application/json" \
|
||||
-d '{
|
||||
"update_id": 900001,
|
||||
"message": {
|
||||
"message_id": 101,
|
||||
"date": 1760000000,
|
||||
"text": "дякую",
|
||||
"chat": {"id": "smoke-chat-1", "type": "private"},
|
||||
"from": {"id": 7001, "username": "smoke_user", "is_bot": false}
|
||||
}
|
||||
}'
|
||||
|
||||
# 2) Retry signal (same text sent twice in quick succession, i.e. within GATEWAY_USER_SIGNAL_RETRY_WINDOW_SECONDS)
|
||||
curl -sS -X POST "http://127.0.0.1:9300/helion/telegram/webhook" \
|
||||
-H "content-type: application/json" \
|
||||
-d '{
|
||||
"update_id": 900002,
|
||||
"message": {
|
||||
"message_id": 102,
|
||||
"date": 1760000005,
|
||||
"text": "перевір",
|
||||
"chat": {"id": "smoke-chat-1", "type": "private"},
|
||||
"from": {"id": 7001, "username": "smoke_user", "is_bot": false}
|
||||
}
|
||||
}'
|
||||
|
||||
curl -sS -X POST "http://127.0.0.1:9300/helion/telegram/webhook" \
|
||||
-H "content-type: application/json" \
|
||||
-d '{
|
||||
"update_id": 900003,
|
||||
"message": {
|
||||
"message_id": 103,
|
||||
"date": 1760000010,
|
||||
"text": "перевір",
|
||||
"chat": {"id": "smoke-chat-1", "type": "private"},
|
||||
"from": {"id": 7001, "username": "smoke_user", "is_bot": false}
|
||||
}
|
||||
}'
|
||||
|
||||
# 3) Verify metrics
|
||||
curl -sS http://127.0.0.1:9300/metrics | grep -E 'gateway_experience_published_total|gateway_policy_decisions_total|gateway_user_signal_total|gateway_webhook_latency_ms'
|
||||
|
||||
# 4) Verify DB rows
|
||||
docker exec dagi-postgres psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM agent_experience_events WHERE source='gateway' AND ts > now()-interval '10 minutes';"
|
||||
|
||||
# 5) Verify correlation join (gateway <-> router)
|
||||
docker exec dagi-postgres psql -U daarion -d daarion_memory -P pager=off -c \
|
||||
"SELECT source, agent_id, request_id, task_type, ts
|
||||
FROM agent_experience_events
|
||||
WHERE ts > now()-interval '10 minutes'
|
||||
AND source IN ('gateway','router')
|
||||
ORDER BY ts DESC LIMIT 40;"
|
||||
```
|
||||
|
||||
## Acceptance
|
||||
- Gateway publishes and stores events without blocking webhook path.
|
||||
- `request_id` can join gateway and router records for same conversation turn.
|
||||
- `policy.sowa_decision` and `feedback.user_signal` are present in gateway `raw` event.
|
||||
- If NATS/DB is unavailable, the webhook still follows the normal success path (fail-open telemetry).
|
||||
18
docs/ops/payloads/phase4_1_payload_source_lock.json
Normal file
18
docs/ops/payloads/phase4_1_payload_source_lock.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"update_id": 900002,
|
||||
"message": {
|
||||
"message_id": 1,
|
||||
"date": 1760007001,
|
||||
"chat": {
|
||||
"id": 0,
|
||||
"type": "private"
|
||||
},
|
||||
"from": {
|
||||
"id": 0,
|
||||
"is_bot": false,
|
||||
"first_name": "Source",
|
||||
"username": "lock_smoke"
|
||||
},
|
||||
"text": "source-lock-smoke"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"update_id": 900001,
|
||||
"inline_query": {
|
||||
"id": "phase4-inline-1",
|
||||
"query": "unsupported-smoke",
|
||||
"offset": "",
|
||||
"from": {
|
||||
"id": 12345,
|
||||
"is_bot": false,
|
||||
"first_name": "Smoke"
|
||||
}
|
||||
}
|
||||
}
|
||||
17
docs/ops/payloads/phase5_payload_group_source_lock.json
Normal file
17
docs/ops/payloads/phase5_payload_group_source_lock.json
Normal file
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"update_id": 910002,
|
||||
"message": {
|
||||
"message_id": 1002,
|
||||
"date": 1760010002,
|
||||
"chat": {
|
||||
"id": -1005001002,
|
||||
"type": "group"
|
||||
},
|
||||
"from": {
|
||||
"id": 551002,
|
||||
"is_bot": false,
|
||||
"username": "phase5_lock_user"
|
||||
},
|
||||
"text": "agromatrix, перевір lock smoke"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"update_id": 910001,
|
||||
"message": {
|
||||
"message_id": 1001,
|
||||
"date": 1760010001,
|
||||
"chat": {
|
||||
"id": -1005001001,
|
||||
"type": "group"
|
||||
},
|
||||
"from": {
|
||||
"id": 551001,
|
||||
"is_bot": false,
|
||||
"username": "phase5_group_user"
|
||||
},
|
||||
"sticker": {
|
||||
"file_id": "dummy_sticker_file"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"update_id": 910003,
|
||||
"message": {
|
||||
"message_id": 1003,
|
||||
"date": 1760010003,
|
||||
"chat": {
|
||||
"id": 551003,
|
||||
"type": "private"
|
||||
},
|
||||
"from": {
|
||||
"id": 551003,
|
||||
"is_bot": false,
|
||||
"username": "phase5_private_user"
|
||||
},
|
||||
"text": "що на цьому фото?"
|
||||
}
|
||||
}
|
||||
134
docs/ops/phase4_1_gateway_early_return_coverage.md
Normal file
134
docs/ops/phase4_1_gateway_early_return_coverage.md
Normal file
@@ -0,0 +1,134 @@
|
||||
# Phase-4.1 Gateway Early-Return Coverage
|
||||
|
||||
## Goal
|
||||
Enforce the gateway telemetry invariant:
|
||||
- 1 webhook call -> 1 `source="gateway"` event row.
|
||||
- `request_id` always present.
|
||||
- Early-return branches are emitted with deterministic reasons.
|
||||
|
||||
## Deploy
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
docker compose -f docker-compose.node1.yml up -d --no-deps --build --force-recreate gateway
|
||||
```
|
||||
|
||||
## Seed / Precheck
|
||||
```bash
|
||||
export GATEWAY_WEBHOOK_URL='http://127.0.0.1:9300/agromatrix/telegram/webhook'
|
||||
export PG_CONTAINER='dagi-postgres'
|
||||
|
||||
pre_rows_gateway=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM agent_experience_events WHERE source='gateway' AND ts > now()-interval '10 minutes';")
|
||||
|
||||
pre_rows_join=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM (
|
||||
SELECT g.request_id
|
||||
FROM agent_experience_events g
|
||||
JOIN agent_experience_events r ON r.request_id=g.request_id
|
||||
WHERE g.source='gateway' AND r.source='router' AND g.ts > now()-interval '10 minutes'
|
||||
) x;")
|
||||
|
||||
pre_js=$(curl -sS "http://127.0.0.1:8222/jsz?streams=true" | python3 -c '
|
||||
import json,sys
|
||||
j=json.load(sys.stdin)
|
||||
d=(j.get("account_details") or [{}])[0].get("stream_detail") or []
|
||||
print(next((s.get("state",{}).get("messages",0) for s in d if s.get("name")=="EXPERIENCE"),0))
|
||||
')
|
||||
|
||||
echo "pre_rows_gateway=$pre_rows_gateway"
|
||||
echo "pre_rows_join=$pre_rows_join"
|
||||
echo "pre_js=$pre_js"
|
||||
```
|
||||
|
||||
## Fixed Payload Replay
|
||||
|
||||
### 1) Unsupported (`unsupported_no_message`)
|
||||
Payload file:
|
||||
- `docs/ops/payloads/phase4_1_payload_unsupported_no_message.json`
|
||||
|
||||
```bash
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase4_1_payload_unsupported_no_message.json
|
||||
```
|
||||
|
||||
### 2) Source-lock (`source_lock_duplicate_update`)
|
||||
Payload file:
|
||||
- `docs/ops/payloads/phase4_1_payload_source_lock.json`
|
||||
|
||||
```bash
|
||||
# first request
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase4_1_payload_source_lock.json
|
||||
|
||||
# duplicate replay (same update_id)
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase4_1_payload_source_lock.json
|
||||
```
|
||||
|
||||
## Assertions
|
||||
|
||||
### A) Row delta strictness
|
||||
```bash
|
||||
post_rows_gateway=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM agent_experience_events WHERE source='gateway' AND ts > now()-interval '10 minutes';")
|
||||
|
||||
delta_rows=$((post_rows_gateway-pre_rows_gateway))
|
||||
echo "delta_rows=$delta_rows"
|
||||
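# expected: delta_rows == 3 for this replay batch (1 unsupported + 2 source-lock calls)
# note: assumes no other gateway traffic and that no rows aged out of the 10-minute window between pre- and post-check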
|
||||
```
|
||||
|
||||
### B) Deterministic reasons + unknown policy on unsupported
|
||||
```bash
|
||||
docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -P pager=off -c "
|
||||
SELECT request_id,
|
||||
raw->'policy'->>'sowa_decision' as sowa,
|
||||
raw->'policy'->>'reason' as reason,
|
||||
raw->'feedback'->>'user_signal' as user_signal,
|
||||
raw->'result'->>'http_status' as http_status,
|
||||
ts
|
||||
FROM agent_experience_events
|
||||
WHERE source='gateway' AND ts > now()-interval '10 minutes'
|
||||
ORDER BY ts DESC LIMIT 20;
|
||||
"
|
||||
|
||||
# expected in this batch:
|
||||
# - reason=unsupported_no_message (1 row), policy.sowa_decision=UNKNOWN
|
||||
# - reason=source_lock_duplicate_update (1 row)
|
||||
# - first source-lock request usually reason=prober_request (or normal path reason)
|
||||
```
|
||||
|
||||
### C) Metrics assertions
|
||||
```bash
|
||||
curl -sS http://127.0.0.1:9300/metrics | grep -E \
|
||||
'gateway_experience_emitted_total|gateway_early_return_total|gateway_event_finalize_latency_ms|gateway_experience_published_total'
|
||||
|
||||
# expected:
|
||||
# - gateway_experience_emitted_total{path="early_return",status="ok"} increments
|
||||
# - gateway_early_return_total{reason="unsupported_no_message"} increments
|
||||
# - gateway_early_return_total{reason="source_lock_duplicate_update"} increments
|
||||
```
|
||||
|
||||
### D) Join sanity (normal path unaffected)
|
||||
```bash
|
||||
post_rows_join=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM (
|
||||
SELECT g.request_id
|
||||
FROM agent_experience_events g
|
||||
JOIN agent_experience_events r ON r.request_id=g.request_id
|
||||
WHERE g.source='gateway' AND r.source='router' AND g.ts > now()-interval '10 minutes'
|
||||
) x;")
|
||||
|
||||
echo "post_rows_join=$post_rows_join"
|
||||
# expected: join remains non-zero for normal webhook traffic
|
||||
```
|
||||
|
||||
## PASS Criteria
|
||||
- Strict `delta_rows == N_webhooks` for replay batch.
|
||||
- Exactly one `source='gateway'` row per webhook call.
|
||||
- `request_id` present on all new rows.
|
||||
- Early-return reasons are deterministic (`unsupported_no_message`, `source_lock_duplicate_update`).
|
||||
- Metrics counters for early-return/finalize are incrementing.
|
||||
- Normal gateway<->router join remains healthy.
|
||||
136
docs/ops/phase5_anti_silent_group_ux.md
Normal file
136
docs/ops/phase5_anti_silent_group_ux.md
Normal file
@@ -0,0 +1,136 @@
|
||||
# Phase-5 Anti-Silent / Group UX
|
||||
|
||||
## Goal
|
||||
Reduce user-facing silent outcomes in group/private chat flows while avoiding spam.
|
||||
|
||||
## Invariants
|
||||
- `I1`: no silent user-facing failure for `public_active` agents (`SILENT`/early-return -> short ACK).
|
||||
- `I2`: one-message rule per webhook in group chats.
|
||||
- `I3`: debounce by `(chat_id, agent_id, reason)` with cooldown.
|
||||
- `I4`: evidence in gateway event (`anti_silent_action`, `anti_silent_template`, `chat_type`).
|
||||
|
||||
## Deploy
|
||||
```bash
|
||||
cd /opt/microdao-daarion
|
||||
docker compose -f docker-compose.node1.yml up -d --no-deps --build --force-recreate gateway
|
||||
```
|
||||
|
||||
## Seed / Precheck
|
||||
```bash
|
||||
export GATEWAY_WEBHOOK_URL='http://127.0.0.1:9300/agromatrix/telegram/webhook'
|
||||
export PG_CONTAINER='dagi-postgres'
|
||||
|
||||
pre_rows_gateway=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM agent_experience_events WHERE source='gateway' AND ts > now()-interval '10 minutes';")
|
||||
|
||||
pre_rows_join=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM (
|
||||
SELECT g.request_id
|
||||
FROM agent_experience_events g
|
||||
JOIN agent_experience_events r ON r.request_id=g.request_id
|
||||
WHERE g.source='gateway' AND r.source='router' AND g.ts > now()-interval '10 minutes'
|
||||
) x;")
|
||||
|
||||
pre_js=$(curl -sS "http://127.0.0.1:8222/jsz?streams=true" | python3 -c '
|
||||
import json,sys
|
||||
j=json.load(sys.stdin)
|
||||
d=(j.get("account_details") or [{}])[0].get("stream_detail") or []
|
||||
print(next((s.get("state",{}).get("messages",0) for s in d if s.get("name")=="EXPERIENCE"),0))
|
||||
')
|
||||
|
||||
echo "pre_rows_gateway=$pre_rows_gateway"
|
||||
echo "pre_rows_join=$pre_rows_join"
|
||||
echo "pre_js=$pre_js"
|
||||
```
|
||||
|
||||
## Fixed Payload Replay
|
||||
|
||||
Payloads:
|
||||
- `docs/ops/payloads/phase5_payload_group_unsupported_no_message.json`
|
||||
- `docs/ops/payloads/phase5_payload_group_source_lock.json`
|
||||
- `docs/ops/payloads/phase5_payload_private_photo_unsupported.json`
|
||||
|
||||
```bash
|
||||
# 1) group unsupported_no_message
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase5_payload_group_unsupported_no_message.json
|
||||
|
||||
# 2) group source_lock pair (same update_id)
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase5_payload_group_source_lock.json
|
||||
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase5_payload_group_source_lock.json
|
||||
|
||||
# 3) private photo unsupported (photo follow-up without image context)
|
||||
curl -sS -X POST "$GATEWAY_WEBHOOK_URL" \
|
||||
-H 'content-type: application/json' \
|
||||
-d @docs/ops/payloads/phase5_payload_private_photo_unsupported.json
|
||||
```
|
||||
|
||||
## Assertions
|
||||
|
||||
### A) Strict row delta
|
||||
```bash
|
||||
post_rows_gateway=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM agent_experience_events WHERE source='gateway' AND ts > now()-interval '10 minutes';")
|
||||
|
||||
delta_rows=$((post_rows_gateway-pre_rows_gateway))
|
||||
echo "delta_rows=$delta_rows"
|
||||
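# expected: delta_rows == 4 (1 group unsupported + 2 source-lock calls + 1 private photo follow-up)
# note: assumes no other gateway traffic and that no rows aged out of the 10-minute window between pre- and post-check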
|
||||
```
|
||||
|
||||
### B) Anti-silent evidence in DB
|
||||
```bash
|
||||
docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -P pager=off -c "
|
||||
SELECT request_id,
|
||||
raw->>'chat_type' as chat_type,
|
||||
raw->'policy'->>'sowa_decision' as sowa,
|
||||
raw->'policy'->>'reason' as reason,
|
||||
raw->>'anti_silent_action' as anti_silent_action,
|
||||
raw->>'anti_silent_template' as anti_silent_template,
|
||||
raw->'result'->>'http_status' as http_status,
|
||||
ts
|
||||
FROM agent_experience_events
|
||||
WHERE source='gateway' AND agent_id='agromatrix' AND ts > now()-interval '10 minutes'
|
||||
ORDER BY ts DESC LIMIT 25;
|
||||
"
|
||||
|
||||
# expected:
|
||||
# - reason=unsupported_no_message with sowa=UNKNOWN (group path)
|
||||
# - reason=source_lock_duplicate_update with anti_silent_action in (ACK_EMITTED, ACK_SUPPRESSED_COOLDOWN)
|
||||
# - reason=photo_followup_without_image_context with anti_silent_action=ACK_EMITTED
|
||||
```
|
||||
|
||||
### C) Metrics
|
||||
```bash
|
||||
curl -sS http://127.0.0.1:9300/metrics | grep -E \
|
||||
'gateway_anti_silent_total|gateway_ack_sent_total|gateway_experience_emitted_total|gateway_early_return_total'
|
||||
|
||||
# expected:
|
||||
# - gateway_anti_silent_total{action="ACK_EMITTED",reason="...",chat_type="group|private"} increments
|
||||
# - source-lock repeated request may produce ACK_SUPPRESSED_COOLDOWN depending on timing
|
||||
```
|
||||
|
||||
### D) Join sanity (normal path still healthy)
|
||||
```bash
|
||||
post_rows_join=$(docker exec "$PG_CONTAINER" psql -U daarion -d daarion_memory -tAc \
|
||||
"SELECT count(*) FROM (
|
||||
SELECT g.request_id
|
||||
FROM agent_experience_events g
|
||||
JOIN agent_experience_events r ON r.request_id=g.request_id
|
||||
WHERE g.source='gateway' AND r.source='router' AND g.ts > now()-interval '10 minutes'
|
||||
) x;")
|
||||
|
||||
echo "post_rows_join=$post_rows_join"
|
||||
# expected: non-zero with normal traffic
|
||||
```
|
||||
|
||||
## PASS Criteria
|
||||
- `delta_rows == 4` for fixed replay batch.
|
||||
- Deterministic reason codes present (`unsupported_no_message`, `source_lock_duplicate_update`, `photo_followup_without_image_context`).
|
||||
- `anti_silent_action` is present for anti-silent branches (`ACK_EMITTED` or `ACK_SUPPRESSED_COOLDOWN`).
|
||||
- No evidence of double-event for one webhook request.
|
||||
Reference in New Issue
Block a user