feat(runtime): sync experience bus and learner stack into main
This commit is contained in:
839
services/experience-learner/main.py
Normal file
839
services/experience-learner/main.py
Normal file
@@ -0,0 +1,839 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from collections import OrderedDict, deque
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, Deque, Dict, Optional, Tuple
|
||||
|
||||
import asyncpg
|
||||
import nats
|
||||
from fastapi import FastAPI, Response
|
||||
from nats.aio.msg import Msg
|
||||
from nats.js.api import AckPolicy, ConsumerConfig, DeliverPolicy
|
||||
from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, generate_latest
|
||||
|
||||
|
||||
# Root logger configuration; LOG_LEVEL env var selects verbosity (default INFO).
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
# Module-wide logger for the learner service.
logger = logging.getLogger("experience_learner")
|
||||
|
||||
|
||||
# Prometheus metrics. Label values are fixed low-cardinality tokens chosen in
# code (status/reason strings), never user input.
LESSONS_EXTRACTED = Counter(
    "lessons_extracted_total",
    "Total lessons extracted from experience events",
    ["status"],
)
LESSONS_INSERT = Counter(
    "lessons_insert_total",
    "Total lesson insert attempts",
    ["status"],
)
JS_MESSAGES_ACKED = Counter(
    "js_messages_acked_total",
    "Total JetStream messages acked by learner",
)
JS_MESSAGES_REDELIVERED = Counter(
    "js_messages_redelivered_total",
    "Total redelivered JetStream messages observed by learner",
)
# Why an event was kept for processing (see _should_keep).
EVENTS_SELECTED = Counter(
    "experience_learner_events_selected_total",
    "Events selected for learner processing",
    ["reason"],
)
# Why an event was dropped (invalid payload, dedup, sampled out, ...).
EVENTS_DROPPED = Counter(
    "experience_learner_events_dropped_total",
    "Events dropped by learner filtering/dedup",
    ["reason"],
)
LESSON_PUBLISH = Counter(
    "experience_learner_lessons_published_total",
    "Lesson publish attempts to JetStream",
    ["status"],
)
# Outcome of each anti-silent tuning evaluation (ok or the reason it bailed).
ANTI_SILENT_TUNING_EVALUATED = Counter(
    "experience_learner_anti_silent_tuning_evaluated_total",
    "Anti-silent tuning lesson generation evaluations",
    ["status"],
)
CONSUMER_RUNNING = Gauge(
    "experience_learner_consumer_running",
    "1 when learner consumer loop is running",
)
|
||||
|
||||
|
||||
@dataclass
class EventSample:
    """One observed event inside a sliding-window bucket (see _update_bucket)."""

    # Capture time from time.monotonic(); used for window eviction.
    ts_mono: float
    # Whether the event's result reported ok.
    ok: bool
    # Reported LLM latency in milliseconds (0 when absent).
    latency_ms: int
|
||||
|
||||
|
||||
class ExperienceLearner:
|
||||
    def __init__(self) -> None:
        """Read all configuration from environment variables.

        Raises:
            RuntimeError: when none of the database DSN variables is set.
        """
        # --- NATS / JetStream consumer configuration -------------------
        self.node_id = os.getenv("NODE_ID", "NODA1")
        self.nats_url = os.getenv("NATS_URL", "nats://nats:4222")
        self.stream_name = os.getenv("EXPERIENCE_STREAM_NAME", "EXPERIENCE")
        self.subject = os.getenv("EXPERIENCE_SUBJECT", "agent.experience.v1.>")
        self.lesson_subject = os.getenv("LESSON_SUBJECT", "agent.lesson.v1")
        self.durable = os.getenv("EXPERIENCE_DURABLE", "experience-learner-v1")
        self.deliver_policy = os.getenv("EXPERIENCE_DELIVER_POLICY", "all").lower()
        self.ack_wait_s = float(os.getenv("EXPERIENCE_ACK_WAIT_SECONDS", "30"))
        self.max_deliver = int(os.getenv("EXPERIENCE_MAX_DELIVER", "20"))
        self.fetch_batch = int(os.getenv("EXPERIENCE_FETCH_BATCH", "64"))
        self.fetch_timeout_s = float(os.getenv("EXPERIENCE_FETCH_TIMEOUT_SECONDS", "2"))

        # --- Event selection and lesson thresholds ---------------------
        self.window_s = int(os.getenv("EXPERIENCE_WINDOW_SECONDS", "1800"))
        self.ok_sample_pct = float(os.getenv("EXPERIENCE_OK_SAMPLE_PCT", "10"))
        self.latency_spike_ms = int(os.getenv("EXPERIENCE_LATENCY_SPIKE_MS", "5000"))
        self.error_threshold = int(os.getenv("EXPERIENCE_ERROR_THRESHOLD", "3"))
        self.silent_threshold = int(os.getenv("EXPERIENCE_SILENT_THRESHOLD", "5"))
        self.latency_threshold = int(os.getenv("EXPERIENCE_LATENCY_THRESHOLD", "3"))

        # --- Dedup cache and anti-silent tuning knobs ------------------
        self.event_dedup_ttl_s = int(os.getenv("EXPERIENCE_EVENT_DEDUP_TTL_SECONDS", "3600"))
        self.event_dedup_max = int(os.getenv("EXPERIENCE_EVENT_DEDUP_MAX", "100000"))
        self.publish_lessons = os.getenv("LESSON_PUBLISH_ENABLED", "true").lower() in {"1", "true", "yes"}
        self.anti_silent_tuning_enabled = os.getenv("ANTI_SILENT_TUNING_ENABLED", "true").lower() in {"1", "true", "yes"}
        self.anti_silent_window_days = max(1, int(os.getenv("ANTI_SILENT_TUNING_WINDOW_DAYS", "7")))
        self.anti_silent_min_evidence = max(1, int(os.getenv("ANTI_SILENT_TUNING_MIN_EVIDENCE", "20")))
        # Score threshold and weights are clamped into [0, 1].
        self.anti_silent_min_score = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_MIN_SCORE", "0.75"))))
        self.anti_silent_weight_retry = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_WEIGHT_RETRY", "0.6"))))
        self.anti_silent_weight_negative = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_WEIGHT_NEGATIVE", "0.3"))))
        self.anti_silent_weight_suppressed = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_WEIGHT_SUPPRESSED", "0.1"))))
        self.anti_silent_ttl_days = max(1, int(os.getenv("ANTI_SILENT_TUNING_TTL_DAYS", "7")))

        # --- Database DSN: first non-empty of three env vars wins ------
        self.db_dsn = (
            os.getenv("LEARNER_DATABASE_URL")
            or os.getenv("EXPERIENCE_DATABASE_URL")
            or os.getenv("DATABASE_URL")
        )
        if not self.db_dsn:
            raise RuntimeError("LEARNER_DATABASE_URL (or EXPERIENCE_DATABASE_URL/DATABASE_URL) is required")

        # --- Runtime state, populated by start() -----------------------
        self._running = False
        self._task: Optional[asyncio.Task[Any]] = None
        self._nc = None
        self._js = None
        self._sub = None
        self._pool: Optional[asyncpg.Pool] = None

        # event_id -> monotonic timestamp; insertion-ordered so TTL pruning
        # and size-capped eviction can pop from the front.
        self._seen_events: "OrderedDict[str, float]" = OrderedDict()
        # bucket key (see _bucket_key) -> sliding window of samples.
        self._buckets: Dict[str, Deque[EventSample]] = {}
        self._lock = asyncio.Lock()
|
||||
|
||||
    async def start(self) -> None:
        """Connect to Postgres and NATS, ensure the durable consumer, and
        launch the background consume loop.

        No-op when already running. On success sets CONSUMER_RUNNING=1 and
        stores the background task for stop() to cancel.
        """
        if self._running:
            return
        self._pool = await asyncpg.create_pool(self.db_dsn, min_size=1, max_size=4)
        self._nc = await nats.connect(self.nats_url)
        self._js = self._nc.jetstream()
        await self._ensure_consumer()
        self._sub = await self._js.pull_subscribe(
            self.subject,
            durable=self.durable,
            stream=self.stream_name,
        )
        self._running = True
        CONSUMER_RUNNING.set(1)
        self._task = asyncio.create_task(self._consume_loop(), name="experience-learner")
        logger.info(
            "experience-learner started stream=%s subject=%s durable=%s",
            self.stream_name,
            self.subject,
            self.durable,
        )
|
||||
|
||||
    async def stop(self) -> None:
        """Stop the consume loop, then close the NATS connection and DB pool.

        Safe to call repeatedly; errors while awaiting the cancelled task
        are suppressed because cancellation is the expected outcome.
        """
        self._running = False
        CONSUMER_RUNNING.set(0)
        if self._task:
            self._task.cancel()
            with contextlib.suppress(Exception):
                await self._task
            self._task = None
        if self._nc:
            # Closing the connection invalidates the JetStream context and
            # subscription derived from it, so clear them together.
            await self._nc.close()
            self._nc = None
            self._js = None
            self._sub = None
        if self._pool:
            await self._pool.close()
            self._pool = None
|
||||
|
||||
    async def _ensure_consumer(self) -> None:
        """Create the durable JetStream pull consumer if it does not exist.

        Deliver policy is ALL unless EXPERIENCE_DELIVER_POLICY is anything
        other than "all" (then NEW). NOTE(review): "already exists" is
        detected by matching the server's error text, which is brittle
        across nats-server versions — confirm the deployed client/server
        pair keeps these phrases.
        """
        if self._js is None:
            return
        deliver_policy = DeliverPolicy.ALL if self.deliver_policy == "all" else DeliverPolicy.NEW
        cfg = ConsumerConfig(
            durable_name=self.durable,
            ack_policy=AckPolicy.EXPLICIT,
            ack_wait=self.ack_wait_s,
            max_deliver=self.max_deliver,
            deliver_policy=deliver_policy,
            filter_subject=self.subject,
        )
        try:
            await self._js.add_consumer(self.stream_name, config=cfg)
            logger.info("consumer created durable=%s stream=%s", self.durable, self.stream_name)
        except Exception as exc:
            msg = str(exc).lower()
            if "consumer name already in use" in msg or "consumer already exists" in msg:
                logger.info("consumer exists durable=%s stream=%s", self.durable, self.stream_name)
            else:
                raise
|
||||
|
||||
    async def _consume_loop(self) -> None:
        """Pull-consume loop: fetch message batches until stop() flips
        self._running.

        A fetch timeout is normal idle behavior and just re-polls; any other
        fetch error is logged and retried after a one-second backoff.
        """
        assert self._sub is not None
        while self._running:
            try:
                msgs = await self._sub.fetch(self.fetch_batch, timeout=self.fetch_timeout_s)
            except asyncio.TimeoutError:
                # No messages within the fetch window; poll again.
                continue
            except Exception as exc:
                logger.warning("fetch failed: %s", exc)
                await asyncio.sleep(1.0)
                continue

            for msg in msgs:
                await self._handle_msg(msg)
|
||||
|
||||
async def _handle_msg(self, msg: Msg) -> None:
|
||||
try:
|
||||
metadata = getattr(msg, "metadata", None)
|
||||
if metadata is not None and getattr(metadata, "num_delivered", 1) > 1:
|
||||
JS_MESSAGES_REDELIVERED.inc()
|
||||
|
||||
event = json.loads(msg.data.decode("utf-8", errors="replace"))
|
||||
if not isinstance(event, dict):
|
||||
EVENTS_DROPPED.labels(reason="invalid_payload").inc()
|
||||
await msg.ack()
|
||||
JS_MESSAGES_ACKED.inc()
|
||||
return
|
||||
|
||||
event_id = str(event.get("event_id") or "").strip()
|
||||
if event_id and await self._seen_event(event_id):
|
||||
EVENTS_DROPPED.labels(reason="event_dedup").inc()
|
||||
await msg.ack()
|
||||
JS_MESSAGES_ACKED.inc()
|
||||
return
|
||||
|
||||
keep, reason = self._should_keep(event)
|
||||
if not keep:
|
||||
EVENTS_DROPPED.labels(reason=reason).inc()
|
||||
await msg.ack()
|
||||
JS_MESSAGES_ACKED.inc()
|
||||
return
|
||||
|
||||
EVENTS_SELECTED.labels(reason=reason).inc()
|
||||
lessons = await self._extract_lessons(event)
|
||||
if not lessons:
|
||||
await msg.ack()
|
||||
JS_MESSAGES_ACKED.inc()
|
||||
return
|
||||
|
||||
for lesson in lessons:
|
||||
LESSONS_EXTRACTED.labels(status="ok").inc()
|
||||
insert_status = await self._insert_lesson(lesson)
|
||||
LESSONS_INSERT.labels(status=insert_status).inc()
|
||||
if insert_status == "ok" and self.publish_lessons:
|
||||
await self._publish_lesson(lesson)
|
||||
|
||||
await msg.ack()
|
||||
JS_MESSAGES_ACKED.inc()
|
||||
except Exception as exc:
|
||||
LESSONS_EXTRACTED.labels(status="err").inc()
|
||||
logger.exception("message handling failed: %s", exc)
|
||||
with contextlib.suppress(Exception):
|
||||
await msg.nak()
|
||||
|
||||
async def _seen_event(self, event_id: str) -> bool:
|
||||
now = time.monotonic()
|
||||
async with self._lock:
|
||||
self._prune_seen(now)
|
||||
seen_ts = self._seen_events.get(event_id)
|
||||
if seen_ts is not None and (now - seen_ts) < self.event_dedup_ttl_s:
|
||||
return True
|
||||
self._seen_events[event_id] = now
|
||||
self._seen_events.move_to_end(event_id, last=True)
|
||||
while len(self._seen_events) > self.event_dedup_max:
|
||||
self._seen_events.popitem(last=False)
|
||||
return False
|
||||
|
||||
def _prune_seen(self, now: float) -> None:
|
||||
threshold = now - self.event_dedup_ttl_s
|
||||
while self._seen_events:
|
||||
_, ts = next(iter(self._seen_events.items()))
|
||||
if ts >= threshold:
|
||||
break
|
||||
self._seen_events.popitem(last=False)
|
||||
|
||||
def _should_keep(self, event: Dict[str, Any]) -> Tuple[bool, str]:
|
||||
result = event.get("result") or {}
|
||||
llm = event.get("llm") or {}
|
||||
policy = event.get("policy") or {}
|
||||
ok = bool(result.get("ok"))
|
||||
status = _as_int(result.get("http_status"), 0)
|
||||
latency_ms = _as_int(llm.get("latency_ms"), 0)
|
||||
sowa_decision = str(policy.get("sowa_decision") or "").upper()
|
||||
|
||||
if not ok:
|
||||
return True, "error"
|
||||
if self._is_anti_silent_gateway_event(event):
|
||||
return True, "anti_silent_signal"
|
||||
if sowa_decision == "SILENT":
|
||||
return True, "silent"
|
||||
if status >= 500:
|
||||
return True, "http_5xx"
|
||||
if latency_ms >= self.latency_spike_ms:
|
||||
return True, "latency_spike"
|
||||
if random.random() * 100.0 < self.ok_sample_pct:
|
||||
return True, "ok_sample_in"
|
||||
return False, "ok_sample_out"
|
||||
|
||||
def _is_anti_silent_gateway_event(self, event: Dict[str, Any]) -> bool:
|
||||
if not self.anti_silent_tuning_enabled:
|
||||
return False
|
||||
if str(event.get("source") or "").strip().lower() != "gateway":
|
||||
return False
|
||||
action = str(event.get("anti_silent_action") or "").strip().upper()
|
||||
if action not in {"ACK_EMITTED", "ACK_SUPPRESSED_COOLDOWN"}:
|
||||
return False
|
||||
reason = str((event.get("policy") or {}).get("reason") or "").strip()
|
||||
template_id = str(event.get("anti_silent_template") or "").strip()
|
||||
return bool(reason and template_id)
|
||||
|
||||
async def _extract_lessons(self, event: Dict[str, Any]) -> list[Dict[str, Any]]:
|
||||
lessons: list[Dict[str, Any]] = []
|
||||
|
||||
tuning_lesson = await self._try_extract_anti_silent_tuning_lesson(event)
|
||||
if tuning_lesson is not None:
|
||||
lessons.append(tuning_lesson)
|
||||
|
||||
operational_lesson = await self._try_extract_lesson(event)
|
||||
if operational_lesson is not None:
|
||||
lessons.append(operational_lesson)
|
||||
|
||||
return lessons
|
||||
|
||||
async def _try_extract_lesson(self, event: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
categories = self._lesson_categories(event)
|
||||
if not categories:
|
||||
return None
|
||||
|
||||
for category in categories:
|
||||
bucket_key = self._bucket_key(category, event)
|
||||
count, ok_rate, p95_latency = self._update_bucket(bucket_key, event)
|
||||
threshold = self._threshold_for(category)
|
||||
if count < threshold:
|
||||
continue
|
||||
lesson = self._build_lesson(
|
||||
category=category,
|
||||
event=event,
|
||||
count=count,
|
||||
ok_rate=ok_rate,
|
||||
p95_latency=p95_latency,
|
||||
)
|
||||
return lesson
|
||||
|
||||
return None
|
||||
|
||||
    async def _try_extract_anti_silent_tuning_lesson(self, event: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Derive a template-preference lesson from anti-silent gateway stats.

        For the event's (policy reason, chat type) pair, loads historical
        per-template feedback rates and — when the best-scoring template
        clears the evidence and score thresholds — emits an
        "anti_silent_tuning" lesson whose lesson_key is content-addressed so
        repeats upsert rather than duplicate. Every exit path increments
        ANTI_SILENT_TUNING_EVALUATED with a distinct status label.
        """
        if not self._is_anti_silent_gateway_event(event):
            return None
        if self._pool is None:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="pool_missing").inc()
            return None

        policy = event.get("policy") or {}
        reason = _safe_token(policy.get("reason"))
        chat_type = _safe_token(event.get("chat_type"))
        if not reason or not chat_type:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="missing_fields").inc()
            return None

        stats = await self._anti_silent_stats(reason=reason, chat_type=chat_type)
        if not stats:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="no_data").inc()
            return None

        # Defensive re-check of the evidence floor (the SQL HAVING clause
        # already enforces the same minimum).
        candidates = [item for item in stats if int(item.get("n") or 0) >= self.anti_silent_min_evidence]
        if not candidates:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="insufficient_evidence").inc()
            return None

        # Best: highest score, ties broken by more evidence.
        best = max(candidates, key=lambda item: (float(item.get("score", 0.0)), int(item.get("n", 0))))
        best_score = float(best.get("score", 0.0))
        if best_score < self.anti_silent_min_score:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="below_score").inc()
            return None

        # Worst: lowest score; -n breaks ties toward the better-evidenced one.
        worst = min(candidates, key=lambda item: (float(item.get("score", 0.0)), -int(item.get("n", 0))))
        best_template = str(best.get("template_id") or "").strip().upper()
        if not best_template:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="bad_template").inc()
            return None

        trigger = f"reason={reason};chat_type={chat_type}"
        action = f"prefer_template={best_template}"
        avoid = ""
        worst_template = str(worst.get("template_id") or "").strip().upper()
        if worst_template and worst_template != best_template:
            avoid = f"avoid_template={worst_template}"
        if not avoid:
            avoid = "avoid_template=none"

        # lesson_key hashes the recommendation itself, so an identical
        # recommendation always maps to the same row (upsert path).
        lesson_type = "anti_silent_tuning"
        lesson_key_raw = "|".join([lesson_type, trigger, action])
        lesson_key = hashlib.sha256(lesson_key_raw.encode("utf-8")).hexdigest()
        now_dt = datetime.now(timezone.utc)
        expires_at = (now_dt + timedelta(days=self.anti_silent_ttl_days)).isoformat().replace("+00:00", "Z")
        evidence = {
            "n_best": int(best.get("n") or 0),
            "score_best": round(best_score, 6),
            "retry_rate": round(float(best.get("retry_rate", 0.0)), 6),
            "negative_rate": round(float(best.get("negative_rate", 0.0)), 6),
            "suppressed_rate": round(float(best.get("suppressed_rate", 0.0)), 6),
            "window_days": self.anti_silent_window_days,
            "weights": {
                "retry": self.anti_silent_weight_retry,
                "negative": self.anti_silent_weight_negative,
                "suppressed": self.anti_silent_weight_suppressed,
            },
            "candidates": stats,
        }
        signals = {
            "policy_reason": reason,
            "chat_type": chat_type,
            "lesson_type": lesson_type,
            "trigger_kind": "anti_silent_ack_template",
        }
        lesson: Dict[str, Any] = {
            "lesson_id": str(uuid.uuid4()),
            "lesson_key": lesson_key,
            "lesson_type": lesson_type,
            "ts": now_dt.isoformat().replace("+00:00", "Z"),
            "expires_at": expires_at,
            "scope": "global",
            "agent_id": None,
            "task_type": "webhook",
            "trigger": trigger,
            "action": action,
            "avoid": avoid,
            "signals": signals,
            "evidence": evidence,
        }
        # Shallow snapshot taken before "raw" is added, so raw never nests
        # itself.
        lesson["raw"] = dict(lesson)
        ANTI_SILENT_TUNING_EVALUATED.labels(status="ok").inc()
        return lesson
|
||||
|
||||
    async def _anti_silent_stats(self, *, reason: str, chat_type: str) -> list[Dict[str, Any]]:
        """Aggregate per-template feedback rates from recent gateway events.

        Runs a parameterized query over agent_experience_events within the
        configured window and returns one dict per template containing the
        retry/negative/suppressed rates plus a weighted score in [0, 1]
        (higher is better). Query failures are logged and yield an empty
        list so the caller treats it as "no data".
        """
        if self._pool is None:
            return []
        query = """
            SELECT
                COALESCE(raw->>'anti_silent_template', '') AS template_id,
                COUNT(*)::int AS n,
                AVG(
                    CASE
                        WHEN COALESCE(raw->'feedback'->>'user_signal', 'none') = 'retry' THEN 1.0
                        ELSE 0.0
                    END
                )::float8 AS retry_rate,
                AVG(
                    CASE
                        WHEN COALESCE(raw->'feedback'->>'user_signal', 'none') = 'negative' THEN 1.0
                        ELSE 0.0
                    END
                )::float8 AS negative_rate,
                AVG(
                    CASE
                        WHEN COALESCE(raw->>'anti_silent_action', '') = 'ACK_SUPPRESSED_COOLDOWN' THEN 1.0
                        ELSE 0.0
                    END
                )::float8 AS suppressed_rate
            FROM agent_experience_events
            WHERE source = 'gateway'
              AND ts >= (now() - ($1::int * interval '1 day'))
              AND COALESCE(raw->'policy'->>'reason', '') = $2
              AND COALESCE(raw->>'chat_type', 'unknown') = $3
              AND COALESCE(raw->>'anti_silent_action', '') IN ('ACK_EMITTED', 'ACK_SUPPRESSED_COOLDOWN')
              AND COALESCE(raw->>'anti_silent_template', '') <> ''
            GROUP BY 1
            HAVING COUNT(*) >= $4
        """
        try:
            async with self._pool.acquire() as conn:
                rows = await conn.fetch(
                    query,
                    self.anti_silent_window_days,
                    reason,
                    chat_type,
                    self.anti_silent_min_evidence,
                )
        except Exception as exc:
            logger.warning("anti-silent stats query failed: %s", exc)
            return []

        results: list[Dict[str, Any]] = []
        for row in rows:
            template_id = str(row.get("template_id") or "").strip().upper()
            if not template_id:
                continue
            n = int(row.get("n") or 0)
            retry_rate = float(row.get("retry_rate") or 0.0)
            negative_rate = float(row.get("negative_rate") or 0.0)
            suppressed_rate = float(row.get("suppressed_rate") or 0.0)
            # Score = 1 - weighted badness, clamped into [0, 1].
            score = 1.0 - (
                self.anti_silent_weight_retry * retry_rate
                + self.anti_silent_weight_negative * negative_rate
                + self.anti_silent_weight_suppressed * suppressed_rate
            )
            score = max(0.0, min(1.0, score))
            results.append(
                {
                    "template_id": template_id,
                    "n": n,
                    "retry_rate": retry_rate,
                    "negative_rate": negative_rate,
                    "suppressed_rate": suppressed_rate,
                    "score": score,
                }
            )
        return results
|
||||
|
||||
def _lesson_categories(self, event: Dict[str, Any]) -> list[str]:
|
||||
result = event.get("result") or {}
|
||||
llm = event.get("llm") or {}
|
||||
policy = event.get("policy") or {}
|
||||
categories: list[str] = []
|
||||
|
||||
if not bool(result.get("ok")):
|
||||
categories.append("error_repeat")
|
||||
if str(policy.get("sowa_decision") or "").upper() == "SILENT":
|
||||
categories.append("silent_repeat")
|
||||
if _as_int(llm.get("latency_ms"), 0) >= self.latency_spike_ms:
|
||||
categories.append("latency_spike")
|
||||
return categories
|
||||
|
||||
def _bucket_key(self, category: str, event: Dict[str, Any]) -> str:
|
||||
llm = event.get("llm") or {}
|
||||
result = event.get("result") or {}
|
||||
policy = event.get("policy") or {}
|
||||
parts = [
|
||||
category,
|
||||
str(event.get("agent_id") or ""),
|
||||
str(event.get("task_type") or "infer"),
|
||||
str(result.get("error_class") or ""),
|
||||
str(policy.get("reason") or ""),
|
||||
str(llm.get("provider") or ""),
|
||||
str(llm.get("model") or ""),
|
||||
str(llm.get("profile") or ""),
|
||||
]
|
||||
return "|".join(parts)
|
||||
|
||||
def _update_bucket(self, bucket_key: str, event: Dict[str, Any]) -> Tuple[int, Optional[float], Optional[int]]:
|
||||
now = time.monotonic()
|
||||
llm = event.get("llm") or {}
|
||||
result = event.get("result") or {}
|
||||
sample = EventSample(
|
||||
ts_mono=now,
|
||||
ok=bool(result.get("ok")),
|
||||
latency_ms=_as_int(llm.get("latency_ms"), 0),
|
||||
)
|
||||
bucket = self._buckets.get(bucket_key)
|
||||
if bucket is None:
|
||||
bucket = deque()
|
||||
self._buckets[bucket_key] = bucket
|
||||
bucket.append(sample)
|
||||
|
||||
cutoff = now - self.window_s
|
||||
while bucket and bucket[0].ts_mono < cutoff:
|
||||
bucket.popleft()
|
||||
|
||||
if not bucket:
|
||||
return 0, None, None
|
||||
|
||||
count = len(bucket)
|
||||
ok_count = sum(1 for item in bucket if item.ok)
|
||||
ok_rate = round(ok_count / count, 4) if count > 0 else None
|
||||
latencies = sorted(item.latency_ms for item in bucket)
|
||||
p95_latency = _p95(latencies)
|
||||
return count, ok_rate, p95_latency
|
||||
|
||||
def _threshold_for(self, category: str) -> int:
|
||||
if category == "error_repeat":
|
||||
return self.error_threshold
|
||||
if category == "silent_repeat":
|
||||
return self.silent_threshold
|
||||
return self.latency_threshold
|
||||
|
||||
    def _build_lesson(
        self,
        category: str,
        event: Dict[str, Any],
        count: int,
        ok_rate: Optional[float],
        p95_latency: Optional[int],
    ) -> Dict[str, Any]:
        """Assemble an operational lesson dict for a triggered category.

        Args:
            category: one of "error_repeat" / "silent_repeat" / "latency_spike"
                (anything else falls into the error wording).
            event: the triggering experience event.
            count / ok_rate / p95_latency: window summary from _update_bucket,
                stored as evidence.

        The lesson_key is a SHA-256 over the scope, agent and advisory text,
        so identical advice dedupes on insert.
        """
        llm = event.get("llm") or {}
        result = event.get("result") or {}
        policy = event.get("policy") or {}
        agent_id = str(event.get("agent_id") or "").strip() or None
        task_type = str(event.get("task_type") or "infer")
        # All free-form tokens are sanitized before storage.
        error_class = _safe_token(result.get("error_class"))
        policy_reason = _safe_token(policy.get("reason"))
        sowa_decision = _safe_token(policy.get("sowa_decision"))

        # Category-specific advisory wording.
        if category == "silent_repeat":
            trigger = "Frequent SILENT policy outcomes on active conversation flow."
            action = "Use short ACK/CHALLENGE clarification before silencing response."
            avoid = "Avoid immediate SILENT when user intent might target the agent."
        elif category == "latency_spike":
            trigger = "Repeated latency spikes above configured SLA threshold."
            action = "Prefer faster model/profile and reduce expensive tool rounds."
            avoid = "Avoid routing to slow provider/profile for same task pattern."
        else:
            trigger = f"Repeated inference failures of class '{error_class or 'unknown_error'}'."
            action = "Switch to stable provider/profile and constrain optional tool calls."
            avoid = "Avoid blind retries on the same failing route."

        scope = "agent" if agent_id else "global"
        lesson_key_raw = "|".join(
            [
                scope,
                str(agent_id or ""),
                trigger,
                action,
                avoid,
                str(error_class or ""),
                str(policy_reason or ""),
            ]
        )
        lesson_key = hashlib.sha256(lesson_key_raw.encode("utf-8")).hexdigest()

        lesson = {
            "lesson_id": str(uuid.uuid4()),
            "lesson_key": lesson_key,
            "ts": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "scope": scope,
            "agent_id": agent_id,
            "task_type": task_type,
            "trigger": trigger,
            "action": action,
            "avoid": avoid,
            "signals": {
                "policy_reason": policy_reason,
                "policy_decision": sowa_decision,
                "error_class": error_class,
                "provider": _safe_token(llm.get("provider")),
                "model": _safe_token(llm.get("model")),
                "profile": _safe_token(llm.get("profile")),
            },
            "evidence": {
                "count": count,
                "ok_rate": ok_rate,
                "p95_latency_ms": p95_latency,
            },
        }
        # Shallow snapshot before "raw" is added, so raw never nests itself.
        lesson["raw"] = dict(lesson)
        return lesson
|
||||
|
||||
    async def _insert_lesson(self, lesson: Dict[str, Any]) -> str:
        """Persist a lesson into agent_lessons.

        Operational lessons use INSERT ... ON CONFLICT DO NOTHING (so a
        repeated lesson_key returns "conflict"); anti_silent_tuning lessons
        upsert in place so the latest recommendation wins.

        Returns:
            "ok" when a row was written, "conflict" when an identical
            lesson_key already existed (plain insert path only), "err" on
            any failure or when the pool is unavailable.
        """
        if self._pool is None:
            return "err"
        query_insert = """
            INSERT INTO agent_lessons (
                lesson_id,
                lesson_key,
                ts,
                scope,
                agent_id,
                task_type,
                trigger,
                action,
                avoid,
                signals,
                evidence,
                raw
            ) VALUES (
                $1::uuid,
                $2,
                $3::timestamptz,
                $4,
                $5,
                $6,
                $7,
                $8,
                $9,
                $10::jsonb,
                $11::jsonb,
                $12::jsonb
            )
            ON CONFLICT (lesson_key) DO NOTHING
            RETURNING id
        """
        query_tuning_upsert = """
            INSERT INTO agent_lessons (
                lesson_id,
                lesson_key,
                ts,
                scope,
                agent_id,
                task_type,
                trigger,
                action,
                avoid,
                signals,
                evidence,
                raw
            ) VALUES (
                $1::uuid,
                $2,
                $3::timestamptz,
                $4,
                $5,
                $6,
                $7,
                $8,
                $9,
                $10::jsonb,
                $11::jsonb,
                $12::jsonb
            )
            ON CONFLICT (lesson_key) DO UPDATE SET
                ts = EXCLUDED.ts,
                scope = EXCLUDED.scope,
                agent_id = EXCLUDED.agent_id,
                task_type = EXCLUDED.task_type,
                trigger = EXCLUDED.trigger,
                action = EXCLUDED.action,
                avoid = EXCLUDED.avoid,
                signals = EXCLUDED.signals,
                evidence = EXCLUDED.evidence,
                raw = EXCLUDED.raw
            RETURNING id
        """

        try:
            lesson_id = uuid.UUID(str(lesson["lesson_id"]))
            ts_value = _as_timestamptz(lesson["ts"])
            # Only tuning lessons take the upsert path.
            lesson_type = str(lesson.get("lesson_type") or "").strip().lower()
            query = query_tuning_upsert if lesson_type == "anti_silent_tuning" else query_insert
            async with self._pool.acquire() as conn:
                row_id = await conn.fetchval(
                    query,
                    lesson_id,
                    lesson["lesson_key"],
                    ts_value,
                    lesson["scope"],
                    lesson.get("agent_id"),
                    lesson["task_type"],
                    lesson["trigger"],
                    lesson["action"],
                    lesson["avoid"],
                    json.dumps(lesson["signals"], ensure_ascii=False),
                    json.dumps(lesson["evidence"], ensure_ascii=False),
                    json.dumps(lesson, ensure_ascii=False),
                )
            # DO NOTHING yields no row, so fetchval returns None on conflict.
            if row_id is None:
                return "conflict"
            return "ok"
        except Exception as exc:
            logger.warning("insert lesson failed: %s", exc)
            return "err"
|
||||
|
||||
    async def _publish_lesson(self, lesson: Dict[str, Any]) -> None:
        """Best-effort publish of a lesson to the lesson subject.

        Sets the Nats-Msg-Id header to the lesson_id (JetStream's message
        dedup key). Failures are counted and logged, never raised.
        """
        if self._js is None:
            LESSON_PUBLISH.labels(status="skipped").inc()
            return
        payload = json.dumps(lesson, ensure_ascii=False).encode("utf-8")
        headers = {"Nats-Msg-Id": str(lesson["lesson_id"])}
        try:
            await self._js.publish(self.lesson_subject, payload, headers=headers)
            LESSON_PUBLISH.labels(status="ok").inc()
        except Exception as exc:
            LESSON_PUBLISH.labels(status="err").inc()
            logger.warning("publish lesson failed: %s", exc)
|
||||
|
||||
async def health(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"status": "ok" if self._running else "starting",
|
||||
"node_id": self.node_id,
|
||||
"stream": self.stream_name,
|
||||
"subject": self.subject,
|
||||
"durable": self.durable,
|
||||
"nats_connected": self._nc is not None and self._nc.is_connected,
|
||||
"db_connected": self._pool is not None,
|
||||
"running": self._running,
|
||||
}
|
||||
|
||||
|
||||
def _safe_token(value: Any) -> Optional[str]:
|
||||
if value is None:
|
||||
return None
|
||||
text = str(value)
|
||||
text = re.sub(r"(?i)bearer\s+[A-Za-z0-9._-]+", "bearer [redacted]", text)
|
||||
text = re.sub(r"(?i)(api[_-]?key|token|password|secret)\s*[:=]\s*[^\s,;]+", r"\1=[redacted]", text)
|
||||
text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[redacted-email]", text)
|
||||
text = re.sub(r"https?://[^\s]+", "[redacted-url]", text)
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text[:180] if text else None
|
||||
|
||||
|
||||
def _as_int(value: Any, default: int) -> int:
|
||||
try:
|
||||
return int(value)
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
|
||||
def _as_timestamptz(value: Any) -> datetime:
|
||||
if isinstance(value, datetime):
|
||||
return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
|
||||
try:
|
||||
parsed = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
|
||||
return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
|
||||
except Exception:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
def _p95(sorted_values: list[int]) -> Optional[int]:
|
||||
if not sorted_values:
|
||||
return None
|
||||
idx = int(round(0.95 * (len(sorted_values) - 1)))
|
||||
return sorted_values[min(max(idx, 0), len(sorted_values) - 1)]
|
||||
|
||||
app = FastAPI(title="Experience Learner")
# Constructed at import time: raises RuntimeError when no database DSN env
# var is set (see ExperienceLearner.__init__).
learner = ExperienceLearner()
|
||||
|
||||
|
||||
# NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
# lifespan handlers — confirm before upgrading the framework.
@app.on_event("startup")
async def startup() -> None:
    """Start the learner when the application boots."""
    await learner.start()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
async def shutdown() -> None:
    """Stop the learner cleanly on application shutdown."""
    await learner.stop()
|
||||
|
||||
|
||||
@app.get("/health")
async def health() -> Dict[str, Any]:
    """Expose the learner's status snapshot."""
    return await learner.health()
|
||||
|
||||
|
||||
@app.get("/metrics")
async def metrics() -> Response:
    """Prometheus scrape endpoint (text exposition format)."""
    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
|
||||
Reference in New Issue
Block a user