from __future__ import annotations

import asyncio
import contextlib
import hashlib
import json
import logging
import os
import random
import re
import time
import uuid
from collections import OrderedDict, deque
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Deque, Dict, Optional, Tuple

import asyncpg
import nats
from fastapi import FastAPI, Response
from nats.aio.msg import Msg
from nats.js.api import AckPolicy, ConsumerConfig, DeliverPolicy
from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, generate_latest

logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger("experience_learner")

LESSONS_EXTRACTED = Counter(
    "lessons_extracted_total",
    "Total lessons extracted from experience events",
    ["status"],
)
LESSONS_INSERT = Counter(
    "lessons_insert_total",
    "Total lesson insert attempts",
    ["status"],
)
JS_MESSAGES_ACKED = Counter(
    "js_messages_acked_total",
    "Total JetStream messages acked by learner",
)
JS_MESSAGES_REDELIVERED = Counter(
    "js_messages_redelivered_total",
    "Total redelivered JetStream messages observed by learner",
)
EVENTS_SELECTED = Counter(
    "experience_learner_events_selected_total",
    "Events selected for learner processing",
    ["reason"],
)
EVENTS_DROPPED = Counter(
    "experience_learner_events_dropped_total",
    "Events dropped by learner filtering/dedup",
    ["reason"],
)
LESSON_PUBLISH = Counter(
    "experience_learner_lessons_published_total",
    "Lesson publish attempts to JetStream",
    ["status"],
)
ANTI_SILENT_TUNING_EVALUATED = Counter(
    "experience_learner_anti_silent_tuning_evaluated_total",
    "Anti-silent tuning lesson generation evaluations",
    ["status"],
)
CONSUMER_RUNNING = Gauge(
    "experience_learner_consumer_running",
    "1 when learner consumer loop is running",
)


@dataclass
class EventSample:
    ts_mono: float
    ok: bool
    latency_ms: int


class ExperienceLearner:
    def __init__(self) -> None:
        self.node_id = os.getenv("NODE_ID", "NODA1")
        self.nats_url = os.getenv("NATS_URL", "nats://nats:4222")
        self.stream_name = os.getenv("EXPERIENCE_STREAM_NAME", "EXPERIENCE")
        self.subject = os.getenv("EXPERIENCE_SUBJECT", "agent.experience.v1.>")
        self.lesson_subject = os.getenv("LESSON_SUBJECT", "agent.lesson.v1")
        self.durable = os.getenv("EXPERIENCE_DURABLE", "experience-learner-v1")
        self.deliver_policy = os.getenv("EXPERIENCE_DELIVER_POLICY", "all").lower()
        self.ack_wait_s = float(os.getenv("EXPERIENCE_ACK_WAIT_SECONDS", "30"))
        self.max_deliver = int(os.getenv("EXPERIENCE_MAX_DELIVER", "20"))
        self.fetch_batch = int(os.getenv("EXPERIENCE_FETCH_BATCH", "64"))
        self.fetch_timeout_s = float(os.getenv("EXPERIENCE_FETCH_TIMEOUT_SECONDS", "2"))
        self.window_s = int(os.getenv("EXPERIENCE_WINDOW_SECONDS", "1800"))
        self.ok_sample_pct = float(os.getenv("EXPERIENCE_OK_SAMPLE_PCT", "10"))
        self.latency_spike_ms = int(os.getenv("EXPERIENCE_LATENCY_SPIKE_MS", "5000"))
        self.error_threshold = int(os.getenv("EXPERIENCE_ERROR_THRESHOLD", "3"))
        self.silent_threshold = int(os.getenv("EXPERIENCE_SILENT_THRESHOLD", "5"))
        self.latency_threshold = int(os.getenv("EXPERIENCE_LATENCY_THRESHOLD", "3"))
        self.event_dedup_ttl_s = int(os.getenv("EXPERIENCE_EVENT_DEDUP_TTL_SECONDS", "3600"))
        self.event_dedup_max = int(os.getenv("EXPERIENCE_EVENT_DEDUP_MAX", "100000"))
        self.publish_lessons = os.getenv("LESSON_PUBLISH_ENABLED", "true").lower() in {"1", "true", "yes"}
        self.anti_silent_tuning_enabled = os.getenv("ANTI_SILENT_TUNING_ENABLED", "true").lower() in {"1", "true", "yes"}
        self.anti_silent_window_days = max(1, int(os.getenv("ANTI_SILENT_TUNING_WINDOW_DAYS", "7")))
        self.anti_silent_min_evidence = max(1, int(os.getenv("ANTI_SILENT_TUNING_MIN_EVIDENCE", "20")))
        self.anti_silent_min_score = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_MIN_SCORE", "0.75"))))
        self.anti_silent_weight_retry = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_WEIGHT_RETRY", "0.6"))))
        self.anti_silent_weight_negative = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_WEIGHT_NEGATIVE", "0.3"))))
        self.anti_silent_weight_suppressed = max(0.0, min(1.0, float(os.getenv("ANTI_SILENT_TUNING_WEIGHT_SUPPRESSED", "0.1"))))
        self.anti_silent_ttl_days = max(1, int(os.getenv("ANTI_SILENT_TUNING_TTL_DAYS", "7")))
        self.db_dsn = (
            os.getenv("LEARNER_DATABASE_URL")
            or os.getenv("EXPERIENCE_DATABASE_URL")
            or os.getenv("DATABASE_URL")
        )
        if not self.db_dsn:
            raise RuntimeError("LEARNER_DATABASE_URL (or EXPERIENCE_DATABASE_URL/DATABASE_URL) is required")
        self._running = False
        self._task: Optional[asyncio.Task[Any]] = None
        self._nc = None
        self._js = None
        self._sub = None
        self._pool: Optional[asyncpg.Pool] = None
        self._seen_events: "OrderedDict[str, float]" = OrderedDict()
        self._buckets: Dict[str, Deque[EventSample]] = {}
        self._lock = asyncio.Lock()

    async def start(self) -> None:
        if self._running:
            return
        self._pool = await asyncpg.create_pool(self.db_dsn, min_size=1, max_size=4)
        self._nc = await nats.connect(self.nats_url)
        self._js = self._nc.jetstream()
        await self._ensure_consumer()
        self._sub = await self._js.pull_subscribe(
            self.subject,
            durable=self.durable,
            stream=self.stream_name,
        )
        self._running = True
        CONSUMER_RUNNING.set(1)
        self._task = asyncio.create_task(self._consume_loop(), name="experience-learner")
        logger.info(
            "experience-learner started stream=%s subject=%s durable=%s",
            self.stream_name,
            self.subject,
            self.durable,
        )

    async def stop(self) -> None:
        self._running = False
        CONSUMER_RUNNING.set(0)
        if self._task:
            self._task.cancel()
            # CancelledError derives from BaseException on Python 3.8+, so it must
            # be suppressed explicitly; suppress(Exception) alone would let the
            # cancellation escape from stop().
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await self._task
            self._task = None
        if self._nc:
            await self._nc.close()
            self._nc = None
            self._js = None
            self._sub = None
        if self._pool:
            await self._pool.close()
            self._pool = None

    async def _ensure_consumer(self) -> None:
        if self._js is None:
            return
        deliver_policy = DeliverPolicy.ALL if self.deliver_policy == "all" else DeliverPolicy.NEW
        cfg = ConsumerConfig(
            durable_name=self.durable,
            ack_policy=AckPolicy.EXPLICIT,
            ack_wait=self.ack_wait_s,
            max_deliver=self.max_deliver,
            deliver_policy=deliver_policy,
            filter_subject=self.subject,
        )
        try:
            await self._js.add_consumer(self.stream_name, config=cfg)
            logger.info("consumer created durable=%s stream=%s", self.durable, self.stream_name)
        except Exception as exc:
            msg = str(exc).lower()
            if "consumer name already in use" in msg or "consumer already exists" in msg:
                logger.info("consumer exists durable=%s stream=%s", self.durable, self.stream_name)
            else:
                raise

    async def _consume_loop(self) -> None:
        assert self._sub is not None
        while self._running:
            try:
                msgs = await self._sub.fetch(self.fetch_batch, timeout=self.fetch_timeout_s)
            except asyncio.TimeoutError:
                continue
            except Exception as exc:
                logger.warning("fetch failed: %s", exc)
                await asyncio.sleep(1.0)
                continue
            for msg in msgs:
                await self._handle_msg(msg)
    # Processing is at-least-once: the consumer acks explicitly, naks on failure,
    # and absorbs redeliveries via the event_id dedup cache in _seen_event.
    # Uninteresting or malformed messages are acked immediately so the stream
    # does not redeliver them.
    async def _handle_msg(self, msg: Msg) -> None:
        try:
            metadata = getattr(msg, "metadata", None)
            if metadata is not None and getattr(metadata, "num_delivered", 1) > 1:
                JS_MESSAGES_REDELIVERED.inc()
            event = json.loads(msg.data.decode("utf-8", errors="replace"))
            if not isinstance(event, dict):
                EVENTS_DROPPED.labels(reason="invalid_payload").inc()
                await msg.ack()
                JS_MESSAGES_ACKED.inc()
                return
            event_id = str(event.get("event_id") or "").strip()
            if event_id and await self._seen_event(event_id):
                EVENTS_DROPPED.labels(reason="event_dedup").inc()
                await msg.ack()
                JS_MESSAGES_ACKED.inc()
                return
            keep, reason = self._should_keep(event)
            if not keep:
                EVENTS_DROPPED.labels(reason=reason).inc()
                await msg.ack()
                JS_MESSAGES_ACKED.inc()
                return
            EVENTS_SELECTED.labels(reason=reason).inc()
            lessons = await self._extract_lessons(event)
            if not lessons:
                await msg.ack()
                JS_MESSAGES_ACKED.inc()
                return
            for lesson in lessons:
                LESSONS_EXTRACTED.labels(status="ok").inc()
                insert_status = await self._insert_lesson(lesson)
                LESSONS_INSERT.labels(status=insert_status).inc()
                if insert_status == "ok" and self.publish_lessons:
                    await self._publish_lesson(lesson)
            await msg.ack()
            JS_MESSAGES_ACKED.inc()
        except Exception as exc:
            LESSONS_EXTRACTED.labels(status="err").inc()
            logger.exception("message handling failed: %s", exc)
            with contextlib.suppress(Exception):
                await msg.nak()

    async def _seen_event(self, event_id: str) -> bool:
        # Bounded TTL cache: prune expired entries under the lock, then cap total size.
        now = time.monotonic()
        async with self._lock:
            self._prune_seen(now)
            seen_ts = self._seen_events.get(event_id)
            if seen_ts is not None and (now - seen_ts) < self.event_dedup_ttl_s:
                return True
            self._seen_events[event_id] = now
            self._seen_events.move_to_end(event_id, last=True)
            while len(self._seen_events) > self.event_dedup_max:
                self._seen_events.popitem(last=False)
            return False

    def _prune_seen(self, now: float) -> None:
        threshold = now - self.event_dedup_ttl_s
        while self._seen_events:
            _, ts = next(iter(self._seen_events.items()))
            if ts >= threshold:
                break
            self._seen_events.popitem(last=False)
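    # Illustrative payload shape, inferred from the accessors in this module (the
    # producer schema is defined elsewhere and may carry more fields; template
    # and reason values below are made up for illustration):
    #   {
    #     "event_id": "...", "source": "gateway", "agent_id": "...",
    #     "task_type": "infer", "chat_type": "group",
    #     "anti_silent_action": "ACK_EMITTED", "anti_silent_template": "T1",
    #     "result": {"ok": false, "http_status": 502, "error_class": "timeout"},
    #     "llm": {"latency_ms": 1234, "provider": "...", "model": "...", "profile": "..."},
    #     "policy": {"sowa_decision": "SILENT", "reason": "..."},
    #     "feedback": {"user_signal": "retry"}
    #   }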
    def _should_keep(self, event: Dict[str, Any]) -> Tuple[bool, str]:
        result = event.get("result") or {}
        llm = event.get("llm") or {}
        policy = event.get("policy") or {}
        ok = bool(result.get("ok"))
        status = _as_int(result.get("http_status"), 0)
        latency_ms = _as_int(llm.get("latency_ms"), 0)
        sowa_decision = str(policy.get("sowa_decision") or "").upper()
        if not ok:
            return True, "error"
        if self._is_anti_silent_gateway_event(event):
            return True, "anti_silent_signal"
        if sowa_decision == "SILENT":
            return True, "silent"
        if status >= 500:
            return True, "http_5xx"
        if latency_ms >= self.latency_spike_ms:
            return True, "latency_spike"
        if random.random() * 100.0 < self.ok_sample_pct:
            return True, "ok_sample_in"
        return False, "ok_sample_out"

    def _is_anti_silent_gateway_event(self, event: Dict[str, Any]) -> bool:
        if not self.anti_silent_tuning_enabled:
            return False
        if str(event.get("source") or "").strip().lower() != "gateway":
            return False
        action = str(event.get("anti_silent_action") or "").strip().upper()
        if action not in {"ACK_EMITTED", "ACK_SUPPRESSED_COOLDOWN"}:
            return False
        reason = str((event.get("policy") or {}).get("reason") or "").strip()
        template_id = str(event.get("anti_silent_template") or "").strip()
        return bool(reason and template_id)

    async def _extract_lessons(self, event: Dict[str, Any]) -> list[Dict[str, Any]]:
        lessons: list[Dict[str, Any]] = []
        tuning_lesson = await self._try_extract_anti_silent_tuning_lesson(event)
        if tuning_lesson is not None:
            lessons.append(tuning_lesson)
        operational_lesson = await self._try_extract_lesson(event)
        if operational_lesson is not None:
            lessons.append(operational_lesson)
        return lessons

    async def _try_extract_lesson(self, event: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        categories = self._lesson_categories(event)
        if not categories:
            return None
        for category in categories:
            bucket_key = self._bucket_key(category, event)
            count, ok_rate, p95_latency = self._update_bucket(bucket_key, event)
            threshold = self._threshold_for(category)
            if count < threshold:
                continue
            lesson = self._build_lesson(
                category=category,
                event=event,
                count=count,
                ok_rate=ok_rate,
                p95_latency=p95_latency,
            )
            return lesson
        return None

    async def _try_extract_anti_silent_tuning_lesson(self, event: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        if not self._is_anti_silent_gateway_event(event):
            return None
        if self._pool is None:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="pool_missing").inc()
            return None
        policy = event.get("policy") or {}
        reason = _safe_token(policy.get("reason"))
        chat_type = _safe_token(event.get("chat_type"))
        if not reason or not chat_type:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="missing_fields").inc()
            return None
        stats = await self._anti_silent_stats(reason=reason, chat_type=chat_type)
        if not stats:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="no_data").inc()
            return None
        candidates = [item for item in stats if int(item.get("n") or 0) >= self.anti_silent_min_evidence]
        if not candidates:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="insufficient_evidence").inc()
            return None
        best = max(candidates, key=lambda item: (float(item.get("score", 0.0)), int(item.get("n", 0))))
        best_score = float(best.get("score", 0.0))
        if best_score < self.anti_silent_min_score:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="below_score").inc()
            return None
        worst = min(candidates, key=lambda item: (float(item.get("score", 0.0)), -int(item.get("n", 0))))
        best_template = str(best.get("template_id") or "").strip().upper()
        if not best_template:
            ANTI_SILENT_TUNING_EVALUATED.labels(status="bad_template").inc()
            return None
        trigger = f"reason={reason};chat_type={chat_type}"
        action = f"prefer_template={best_template}"
        avoid = ""
        worst_template = str(worst.get("template_id") or "").strip().upper()
        if worst_template and worst_template != best_template:
            avoid = f"avoid_template={worst_template}"
        if not avoid:
            avoid = "avoid_template=none"
        lesson_type = "anti_silent_tuning"
        lesson_key_raw = "|".join([lesson_type, trigger, action])
        lesson_key = hashlib.sha256(lesson_key_raw.encode("utf-8")).hexdigest()
        now_dt = datetime.now(timezone.utc)
        expires_at = (now_dt + timedelta(days=self.anti_silent_ttl_days)).isoformat().replace("+00:00", "Z")
        evidence = {
            "n_best": int(best.get("n") or 0),
            "score_best": round(best_score, 6),
            "retry_rate": round(float(best.get("retry_rate", 0.0)), 6),
            "negative_rate": round(float(best.get("negative_rate", 0.0)), 6),
            "suppressed_rate": round(float(best.get("suppressed_rate", 0.0)), 6),
            "window_days": self.anti_silent_window_days,
            "weights": {
                "retry": self.anti_silent_weight_retry,
                "negative": self.anti_silent_weight_negative,
                "suppressed": self.anti_silent_weight_suppressed,
            },
            "candidates": stats,
        }
        signals = {
            "policy_reason": reason,
            "chat_type": chat_type,
            "lesson_type": lesson_type,
            "trigger_kind": "anti_silent_ack_template",
        }
        lesson: Dict[str, Any] = {
            "lesson_id": str(uuid.uuid4()),
            "lesson_key": lesson_key,
            "lesson_type": lesson_type,
            "ts": now_dt.isoformat().replace("+00:00", "Z"),
            "expires_at": expires_at,
            "scope": "global",
            "agent_id": None,
            "task_type": "webhook",
            "trigger": trigger,
            "action": action,
            "avoid": avoid,
            "signals": signals,
            "evidence": evidence,
        }
        lesson["raw"] = dict(lesson)
        ANTI_SILENT_TUNING_EVALUATED.labels(status="ok").inc()
        return lesson
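    # Template scoring used by _anti_silent_stats below:
    #   score = clamp01(1 - (w_retry * retry_rate
    #                        + w_negative * negative_rate
    #                        + w_suppressed * suppressed_rate))
    # Worked example with the default weights (0.6, 0.3, 0.1): a template with
    # retry_rate=0.10, negative_rate=0.20, suppressed_rate=0.50 scores
    #   1 - (0.6*0.10 + 0.3*0.20 + 0.1*0.50) = 1 - 0.17 = 0.83,
    # clearing the default ANTI_SILENT_TUNING_MIN_SCORE of 0.75.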
    async def _anti_silent_stats(self, *, reason: str, chat_type: str) -> list[Dict[str, Any]]:
        if self._pool is None:
            return []
        query = """
            SELECT
                COALESCE(raw->>'anti_silent_template', '') AS template_id,
                COUNT(*)::int AS n,
                AVG(
                    CASE WHEN COALESCE(raw->'feedback'->>'user_signal', 'none') = 'retry'
                         THEN 1.0 ELSE 0.0 END
                )::float8 AS retry_rate,
                AVG(
                    CASE WHEN COALESCE(raw->'feedback'->>'user_signal', 'none') = 'negative'
                         THEN 1.0 ELSE 0.0 END
                )::float8 AS negative_rate,
                AVG(
                    CASE WHEN COALESCE(raw->>'anti_silent_action', '') = 'ACK_SUPPRESSED_COOLDOWN'
                         THEN 1.0 ELSE 0.0 END
                )::float8 AS suppressed_rate
            FROM agent_experience_events
            WHERE source = 'gateway'
              AND ts >= (now() - ($1::int * interval '1 day'))
              AND COALESCE(raw->'policy'->>'reason', '') = $2
              AND COALESCE(raw->>'chat_type', 'unknown') = $3
              AND COALESCE(raw->>'anti_silent_action', '') IN ('ACK_EMITTED', 'ACK_SUPPRESSED_COOLDOWN')
              AND COALESCE(raw->>'anti_silent_template', '') <> ''
            GROUP BY 1
            HAVING COUNT(*) >= $4
        """
        try:
            async with self._pool.acquire() as conn:
                rows = await conn.fetch(
                    query,
                    self.anti_silent_window_days,
                    reason,
                    chat_type,
                    self.anti_silent_min_evidence,
                )
        except Exception as exc:
            logger.warning("anti-silent stats query failed: %s", exc)
            return []
        results: list[Dict[str, Any]] = []
        for row in rows:
            template_id = str(row.get("template_id") or "").strip().upper()
            if not template_id:
                continue
            n = int(row.get("n") or 0)
            retry_rate = float(row.get("retry_rate") or 0.0)
            negative_rate = float(row.get("negative_rate") or 0.0)
            suppressed_rate = float(row.get("suppressed_rate") or 0.0)
            score = 1.0 - (
                self.anti_silent_weight_retry * retry_rate
                + self.anti_silent_weight_negative * negative_rate
                + self.anti_silent_weight_suppressed * suppressed_rate
            )
            score = max(0.0, min(1.0, score))
            results.append(
                {
                    "template_id": template_id,
                    "n": n,
                    "retry_rate": retry_rate,
                    "negative_rate": negative_rate,
                    "suppressed_rate": suppressed_rate,
                    "score": score,
                }
            )
        return results

    def _lesson_categories(self, event: Dict[str, Any]) -> list[str]:
        result = event.get("result") or {}
        llm = event.get("llm") or {}
        policy = event.get("policy") or {}
        categories: list[str] = []
        if not bool(result.get("ok")):
            categories.append("error_repeat")
        if str(policy.get("sowa_decision") or "").upper() == "SILENT":
            categories.append("silent_repeat")
        if _as_int(llm.get("latency_ms"), 0) >= self.latency_spike_ms:
            categories.append("latency_spike")
        return categories

    def _bucket_key(self, category: str, event: Dict[str, Any]) -> str:
        llm = event.get("llm") or {}
        result = event.get("result") or {}
        policy = event.get("policy") or {}
        parts = [
            category,
            str(event.get("agent_id") or ""),
            str(event.get("task_type") or "infer"),
            str(result.get("error_class") or ""),
            str(policy.get("reason") or ""),
            str(llm.get("provider") or ""),
            str(llm.get("model") or ""),
            str(llm.get("profile") or ""),
        ]
        return "|".join(parts)

    def _update_bucket(self, bucket_key: str, event: Dict[str, Any]) -> Tuple[int, Optional[float], Optional[int]]:
        now = time.monotonic()
        llm = event.get("llm") or {}
        result = event.get("result") or {}
        sample = EventSample(
            ts_mono=now,
            ok=bool(result.get("ok")),
            latency_ms=_as_int(llm.get("latency_ms"), 0),
        )
        bucket = self._buckets.get(bucket_key)
        if bucket is None:
            bucket = deque()
            self._buckets[bucket_key] = bucket
        bucket.append(sample)
        cutoff = now - self.window_s
        while bucket and bucket[0].ts_mono < cutoff:
            bucket.popleft()
        if not bucket:
            return 0, None, None
        count = len(bucket)
        ok_count = sum(1 for item in bucket if item.ok)
        ok_rate = round(ok_count / count, 4) if count > 0 else None
        latencies = sorted(item.latency_ms for item in bucket)
        p95_latency = _p95(latencies)
        return count, ok_rate, p95_latency

    def _threshold_for(self, category: str) -> int:
        if category == "error_repeat":
            return self.error_threshold
        if category == "silent_repeat":
            return self.silent_threshold
        return self.latency_threshold

    def _build_lesson(
        self,
        category: str,
        event: Dict[str, Any],
        count: int,
        ok_rate: Optional[float],
        p95_latency: Optional[int],
    ) -> Dict[str, Any]:
        llm = event.get("llm") or {}
        result = event.get("result") or {}
        policy = event.get("policy") or {}
        agent_id = str(event.get("agent_id") or "").strip() or None
        task_type = str(event.get("task_type") or "infer")
        error_class = _safe_token(result.get("error_class"))
        policy_reason = _safe_token(policy.get("reason"))
        sowa_decision = _safe_token(policy.get("sowa_decision"))
        if category == "silent_repeat":
            trigger = "Frequent SILENT policy outcomes on active conversation flow."
            action = "Use short ACK/CHALLENGE clarification before silencing response."
            avoid = "Avoid immediate SILENT when user intent might target the agent."
        elif category == "latency_spike":
            trigger = "Repeated latency spikes above configured SLA threshold."
            action = "Prefer faster model/profile and reduce expensive tool rounds."
            avoid = "Avoid routing to slow provider/profile for same task pattern."
        else:
            trigger = f"Repeated inference failures of class '{error_class or 'unknown_error'}'."
            action = "Switch to stable provider/profile and constrain optional tool calls."
            avoid = "Avoid blind retries on the same failing route."
        scope = "agent" if agent_id else "global"
        lesson_key_raw = "|".join(
            [
                scope,
                str(agent_id or ""),
                trigger,
                action,
                avoid,
                str(error_class or ""),
                str(policy_reason or ""),
            ]
        )
        lesson_key = hashlib.sha256(lesson_key_raw.encode("utf-8")).hexdigest()
        lesson = {
            "lesson_id": str(uuid.uuid4()),
            "lesson_key": lesson_key,
            "ts": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "scope": scope,
            "agent_id": agent_id,
            "task_type": task_type,
            "trigger": trigger,
            "action": action,
            "avoid": avoid,
            "signals": {
                "policy_reason": policy_reason,
                "policy_decision": sowa_decision,
                "error_class": error_class,
                "provider": _safe_token(llm.get("provider")),
                "model": _safe_token(llm.get("model")),
                "profile": _safe_token(llm.get("profile")),
            },
            "evidence": {
                "count": count,
                "ok_rate": ok_rate,
                "p95_latency_ms": p95_latency,
            },
        }
        lesson["raw"] = dict(lesson)
        return lesson
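    # The statements in _insert_lesson below assume roughly this table; the real
    # DDL lives outside this module, so treat this as a sketch consistent with
    # the column list, the ON CONFLICT (lesson_key) target, and RETURNING id:
    #   CREATE TABLE agent_lessons (
    #       id          bigserial PRIMARY KEY,
    #       lesson_id   uuid NOT NULL,
    #       lesson_key  text NOT NULL UNIQUE,
    #       ts          timestamptz NOT NULL,
    #       scope       text, agent_id text, task_type text,
    #       trigger     text, action text, avoid text,
    #       signals     jsonb, evidence jsonb, raw jsonb
    #   );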
    async def _insert_lesson(self, lesson: Dict[str, Any]) -> str:
        if self._pool is None:
            return "err"
        query_insert = """
            INSERT INTO agent_lessons (
                lesson_id, lesson_key, ts, scope, agent_id, task_type,
                trigger, action, avoid, signals, evidence, raw
            ) VALUES (
                $1::uuid, $2, $3::timestamptz, $4, $5, $6,
                $7, $8, $9, $10::jsonb, $11::jsonb, $12::jsonb
            )
            ON CONFLICT (lesson_key) DO NOTHING
            RETURNING id
        """
        query_tuning_upsert = """
            INSERT INTO agent_lessons (
                lesson_id, lesson_key, ts, scope, agent_id, task_type,
                trigger, action, avoid, signals, evidence, raw
            ) VALUES (
                $1::uuid, $2, $3::timestamptz, $4, $5, $6,
                $7, $8, $9, $10::jsonb, $11::jsonb, $12::jsonb
            )
            ON CONFLICT (lesson_key) DO UPDATE SET
                ts = EXCLUDED.ts,
                scope = EXCLUDED.scope,
                agent_id = EXCLUDED.agent_id,
                task_type = EXCLUDED.task_type,
                trigger = EXCLUDED.trigger,
                action = EXCLUDED.action,
                avoid = EXCLUDED.avoid,
                signals = EXCLUDED.signals,
                evidence = EXCLUDED.evidence,
                raw = EXCLUDED.raw
            RETURNING id
        """
        try:
            lesson_id = uuid.UUID(str(lesson["lesson_id"]))
            ts_value = _as_timestamptz(lesson["ts"])
            lesson_type = str(lesson.get("lesson_type") or "").strip().lower()
            query = query_tuning_upsert if lesson_type == "anti_silent_tuning" else query_insert
            async with self._pool.acquire() as conn:
                row_id = await conn.fetchval(
                    query,
                    lesson_id,
                    lesson["lesson_key"],
                    ts_value,
                    lesson["scope"],
                    lesson.get("agent_id"),
                    lesson["task_type"],
                    lesson["trigger"],
                    lesson["action"],
                    lesson["avoid"],
                    json.dumps(lesson["signals"], ensure_ascii=False),
                    json.dumps(lesson["evidence"], ensure_ascii=False),
                    json.dumps(lesson, ensure_ascii=False),
                )
            if row_id is None:
                return "conflict"
            return "ok"
        except Exception as exc:
            logger.warning("insert lesson failed: %s", exc)
            return "err"

    async def _publish_lesson(self, lesson: Dict[str, Any]) -> None:
        if self._js is None:
            LESSON_PUBLISH.labels(status="skipped").inc()
            return
        payload = json.dumps(lesson, ensure_ascii=False).encode("utf-8")
        headers = {"Nats-Msg-Id": str(lesson["lesson_id"])}
        try:
            await self._js.publish(self.lesson_subject, payload, headers=headers)
            LESSON_PUBLISH.labels(status="ok").inc()
        except Exception as exc:
            LESSON_PUBLISH.labels(status="err").inc()
            logger.warning("publish lesson failed: %s", exc)

    async def health(self) -> Dict[str, Any]:
        return {
            "status": "ok" if self._running else "starting",
            "node_id": self.node_id,
            "stream": self.stream_name,
            "subject": self.subject,
            "durable": self.durable,
            "nats_connected": self._nc is not None and self._nc.is_connected,
            "db_connected": self._pool is not None,
            "running": self._running,
        }


def _safe_token(value: Any) -> Optional[str]:
    if value is None:
        return None
    text = str(value)
    text = re.sub(r"(?i)bearer\s+[A-Za-z0-9._-]+", "bearer [redacted]", text)
    text = re.sub(r"(?i)(api[_-]?key|token|password|secret)\s*[:=]\s*[^\s,;]+", r"\1=[redacted]", text)
    text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[redacted-email]", text)
    text = re.sub(r"https?://[^\s]+", "[redacted-url]", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text[:180] if text else None


def _as_int(value: Any, default: int) -> int:
    try:
        return int(value)
    except Exception:
        return default


def _as_timestamptz(value: Any) -> datetime:
    if isinstance(value, datetime):
        return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
    try:
        parsed = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
        return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    except Exception:
        return datetime.now(timezone.utc)


def _p95(sorted_values: list[int]) -> Optional[int]:
    if not sorted_values:
        return None
    idx = int(round(0.95 * (len(sorted_values) - 1)))
    return sorted_values[min(max(idx, 0), len(sorted_values) - 1)]


app = FastAPI(title="Experience Learner")
learner = ExperienceLearner()


@app.on_event("startup")
async def startup() -> None:
    await learner.start()


@app.on_event("shutdown")
async def shutdown() -> None:
    await learner.stop()


@app.get("/health")
async def health() -> Dict[str, Any]:
    return await learner.health()


@app.get("/metrics")
async def metrics() -> Response:
    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)