feat: MD pipeline — market-data-service hardening + SenpAI NATS consumer

Producer (market-data-service):
- Backpressure: smart drop policy (heartbeats→quotes→trades preserved)
- Heartbeat monitor: synthetic HeartbeatEvent on provider silence
- Graceful shutdown: WS→bus→storage→DB engine cleanup sequence
- Bybit V5 public WS provider (backup for Binance, no API key needed)
- FailoverManager: health-based provider switching with recovery
- NATS output adapter: md.events.{type}.{symbol} for SenpAI
- /bus-stats endpoint for backpressure monitoring
- Dockerfile + docker-compose.node1.yml integration
- 36 tests (parsing + bus + failover), requirements.lock

Consumer (senpai-md-consumer):
- NATSConsumer: subscribe md.events.>, queue group senpai-md, backpressure
- State store: LatestState + RollingWindow (deque, 60s)
- Feature engine: 11 features (mid, spread, VWAP, return, vol, latency)
- Rule-based signals: long/short on return+volume+spread conditions
- Publisher: rate-limited features + signals + alerts to NATS
- HTTP API: /health, /metrics, /state/latest, /features/latest, /stats
- 10 Prometheus metrics
- Dockerfile + docker-compose.senpai.yml
- 41 tests (parsing + state + features + rate-limit), requirements.lock

CI: ruff + pytest + smoke import for both services
Tests: 77 total passed, lint clean
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Apple
2026-02-09 11:46:15 -08:00
parent c50843933f
commit 09dee24342
47 changed files with 3930 additions and 56 deletions

View File

@@ -1,6 +1,11 @@
"""
Async event bus — fan-out from providers to consumers.
Features:
- Backpressure with smart drop policy (drop quotes before trades)
- Heartbeat timer per provider (detects dead channels)
- Graceful drain on shutdown
Usage:
bus = EventBus()
bus.add_consumer(storage_consumer)
@@ -13,9 +18,10 @@ from __future__ import annotations
import asyncio
import logging
import time
from typing import Protocol
from app.domain.events import Event
from app.domain.events import Event, EventType, HeartbeatEvent
logger = logging.getLogger(__name__)
@@ -26,37 +32,105 @@ class EventConsumer(Protocol):
async def handle(self, event: Event) -> None: ...
# Events that can be dropped under backpressure (least critical first).
# Lower number = dropped earlier as the queue fills; publish() compares
# this priority against its fill-percentage thresholds.
# NOTE(review): publish() drops priority <= 1 at 90% fill, so BOOK_L2
# (priority 2) is never shed by the 90% rule even though the class
# docstring says book snapshots are — confirm which is intended.
_DROPPABLE_PRIORITY = {
    EventType.HEARTBEAT: 0,  # always droppable
    EventType.QUOTE: 1,      # drop quotes before trades
    EventType.BOOK_L2: 2,    # drop book snapshots before trades
    EventType.TRADE: 3,      # trades are most critical — last to drop
}
class EventBus:
"""
Simple async fan-out bus.
Async fan-out bus with backpressure and heartbeat monitoring.
Every published event is dispatched to all registered consumers
concurrently (gather). A slow consumer doesn't block others thanks
to the internal queue + worker pattern.
Backpressure policy:
- Queue 80% full → start dropping HEARTBEAT events
- Queue 90% full → also drop QUOTE events
- Queue 100% full → drop oldest (any type)
Heartbeat timer:
- Emits synthetic HeartbeatEvent if a provider sends nothing
for `heartbeat_interval` seconds, making dead channels visible.
"""
def __init__(self, queue_size: int = 10_000) -> None:
def __init__(
self,
queue_size: int = 10_000,
heartbeat_interval: float = 10.0,
) -> None:
self._consumers: list[EventConsumer] = []
self._queue: asyncio.Queue[Event | None] = asyncio.Queue(maxsize=queue_size)
self._max_size = queue_size
self._running = False
self._task: asyncio.Task | None = None
self._heartbeat_interval = heartbeat_interval
self._heartbeat_tasks: dict[str, asyncio.Task] = {}
self._provider_last_seen: dict[str, float] = {}
# Backpressure counters
self._dropped: dict[str, int] = {}
def add_consumer(self, consumer: EventConsumer) -> None:
    """Register a consumer; every event the worker dequeues is fanned out to it."""
    self._consumers.append(consumer)
    logger.info("bus.consumer_added", extra={"consumer": type(consumer).__name__})
def register_provider(self, provider_name: str) -> None:
    """Register a provider for heartbeat monitoring.

    Seeds the provider's last-seen timestamp with the current monotonic
    time. Must be called before start(): start() creates one monitor
    task per entry already present in `_provider_last_seen`.
    """
    self._provider_last_seen[provider_name] = time.monotonic()
async def publish(self, event: Event) -> None:
"""Put event into internal queue (non-blocking if queue not full)."""
"""
Put event into internal queue with backpressure.
Drop policy under pressure:
- 80%+ → drop heartbeats
- 90%+ → drop quotes/book snapshots
- 100% → drop oldest event
"""
current = self._queue.qsize()
fill_pct = current / self._max_size if self._max_size > 0 else 0
# Track provider activity for heartbeat timer
self._provider_last_seen[event.provider] = time.monotonic()
priority = _DROPPABLE_PRIORITY.get(event.event_type, 3)
# Backpressure: drop low-priority events when queue is filling up
if fill_pct >= 0.9 and priority <= 1:
# Drop heartbeats and quotes
self._dropped[event.event_type.value] = self._dropped.get(event.event_type.value, 0) + 1
if self._dropped[event.event_type.value] % 1000 == 1:
logger.warning(
"bus.backpressure_drop",
extra={
"type": event.event_type.value,
"fill_pct": f"{fill_pct:.0%}",
"total_dropped": self._dropped,
},
)
return
if fill_pct >= 0.8 and priority == 0:
# Drop heartbeats only
return
try:
self._queue.put_nowait(event)
except asyncio.QueueFull:
logger.warning("bus.queue_full, dropping oldest event")
# Drop oldest to keep queue moving
# Last resort: drop oldest to make room
try:
self._queue.get_nowait()
dropped = self._queue.get_nowait()
logger.warning(
"bus.queue_full_drop_oldest",
extra={"dropped_type": dropped.event_type.value if dropped else "None"},
)
except asyncio.QueueEmpty:
pass
self._queue.put_nowait(event)
try:
self._queue.put_nowait(event)
except asyncio.QueueFull:
pass # truly stuck
async def _worker(self) -> None:
"""Background worker that drains the queue and fans out."""
@@ -75,20 +149,79 @@ class EventBus:
extra={"consumer": consumer_name, "error": str(result)},
)
async def _heartbeat_monitor(self, provider_name: str) -> None:
    """Emit a synthetic HeartbeatEvent if `provider_name` goes silent.

    Runs as one background task per registered provider (spawned by
    start(), cancelled by stop()). Wakes every `heartbeat_interval`
    seconds; if the provider's `_provider_last_seen` entry is older than
    the interval, logs a warning and publishes a HeartbeatEvent through
    the normal bus path so downstream consumers/metrics still observe
    the dead channel.
    """
    while self._running:
        await asyncio.sleep(self._heartbeat_interval)
        # Re-check after sleeping: stop() may have cleared the flag.
        if not self._running:
            break
        # Providers with no entry default to 0 → always treated as silent.
        last = self._provider_last_seen.get(provider_name, 0)
        elapsed = time.monotonic() - last
        if elapsed > self._heartbeat_interval:
            # Provider is silent — emit heartbeat so metrics/logs see it
            logger.warning(
                "bus.provider_silent",
                extra={
                    "provider": provider_name,
                    "silent_seconds": f"{elapsed:.1f}",
                },
            )
            hb = HeartbeatEvent(provider=provider_name)
            # publish() refreshes _provider_last_seen for this provider,
            # so the next warning fires only after another full silent
            # interval (at most one synthetic heartbeat per interval).
            await self.publish(hb)
async def start(self) -> None:
"""Start the bus worker."""
"""Start the bus worker and heartbeat monitors."""
self._running = True
self._task = asyncio.create_task(self._worker())
logger.info("bus.started", extra={"consumers": len(self._consumers)})
# Start heartbeat monitors for registered providers
for pname in self._provider_last_seen:
task = asyncio.create_task(self._heartbeat_monitor(pname))
self._heartbeat_tasks[pname] = task
logger.info(
"bus.started",
extra={
"consumers": len(self._consumers),
"providers_monitored": list(self._provider_last_seen.keys()),
},
)
async def stop(self) -> None:
"""Graceful shutdown: drain queue then stop."""
"""Graceful shutdown: stop heartbeats, drain queue, stop worker."""
self._running = False
await self._queue.put(None) # sentinel
# Cancel heartbeat monitors
for task in self._heartbeat_tasks.values():
task.cancel()
for task in self._heartbeat_tasks.values():
try:
await task
except asyncio.CancelledError:
pass
self._heartbeat_tasks.clear()
# Drain remaining events
remaining = self._queue.qsize()
if remaining > 0:
logger.info("bus.draining", extra={"remaining": remaining})
# Send sentinel to stop worker
await self._queue.put(None)
if self._task:
await self._task
if self._dropped:
logger.info("bus.drop_stats", extra={"dropped": self._dropped})
logger.info("bus.stopped")
@property
def queue_size(self) -> int:
    """Number of events currently buffered in the internal queue."""
    return self._queue.qsize()
@property
def fill_percent(self) -> float:
    """Fraction of the internal queue currently occupied (0.0–1.0)."""
    if self._max_size <= 0:
        # Degenerate zero-capacity bus: report empty instead of dividing by zero.
        return 0
    return self._queue.qsize() / self._max_size

View File

@@ -0,0 +1,170 @@
"""
Provider failover manager.
Tracks provider health per symbol and recommends the best active source.
Policy:
- Each provider has a "health score" per symbol (0.0–1.0)
- Score decreases on gaps (heartbeat timeout) and error events
- Score increases on each successful trade/quote received
- When primary provider's score drops below threshold → switch to backup
Usage:
failover = FailoverManager(primary="binance", backups=["bybit"])
failover.record_event("binance", "BTCUSDT") # bumps score
failover.record_gap("binance", "BTCUSDT") # decreases score
best = failover.get_best_provider("BTCUSDT") # → "binance" or "bybit"
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class ProviderHealth:
    """Health tracker for one provider+symbol pair.

    The score lives in [0.0, 1.0]; 1.0 means fully healthy. Each
    successful event nudges the score up by `recovery_step`, each gap
    (heartbeat timeout / missing data) knocks it down by `gap_penalty`.
    The defaults (0.01 up, 0.2 down) preserve the original deliberately
    asymmetric behavior — one gap takes ~20 good events to undo — but
    both rates are now tunable per instance.
    """

    score: float = 1.0           # current health in [0.0, 1.0]
    event_count: int = 0         # total successful events seen
    gap_count: int = 0           # total gaps/timeouts recorded
    last_event_ts: float = 0.0   # monotonic timestamp of last event
    last_gap_ts: float = 0.0     # monotonic timestamp of last gap
    recovery_step: float = 0.01  # score gained per successful event
    gap_penalty: float = 0.2     # score lost per gap

    def record_event(self) -> None:
        """Bump health score on successful event (clamped at 1.0)."""
        self.event_count += 1
        self.last_event_ts = time.monotonic()
        # Recover towards 1.0 gradually
        self.score = min(1.0, self.score + self.recovery_step)

    def record_gap(self) -> None:
        """Decrease health score on gap/timeout (clamped at 0.0)."""
        self.gap_count += 1
        self.last_gap_ts = time.monotonic()
        self.score = max(0.0, self.score - self.gap_penalty)
class FailoverManager:
    """
    Per-symbol provider health tracking with automatic failover.

    Keeps a ProviderHealth record for every (provider, symbol) pair and
    answers "which provider should feed this symbol right now?". The
    active provider is swapped out when its score sinks below
    `switch_threshold`; the primary is restored once its own score
    climbs back to `recovery_threshold`.
    """

    def __init__(
        self,
        primary: str,
        backups: list[str] | None = None,
        switch_threshold: float = 0.3,
        recovery_threshold: float = 0.7,
    ) -> None:
        self._primary = primary
        self._backups = backups or []
        self._all_providers = [primary, *self._backups]
        self._switch_threshold = switch_threshold
        self._recovery_threshold = recovery_threshold
        # provider → symbol → ProviderHealth
        self._health: dict[str, dict[str, ProviderHealth]] = {}
        # symbol → provider currently serving it
        self._active: dict[str, str] = {}

    def _get_health(self, provider: str, symbol: str) -> ProviderHealth:
        """Fetch the health record for (provider, symbol), creating it lazily."""
        per_symbol = self._health.setdefault(provider, {})
        return per_symbol.setdefault(symbol, ProviderHealth())

    def record_event(self, provider: str, symbol: str) -> None:
        """Credit `provider` with one successful event for `symbol`."""
        self._get_health(provider, symbol).record_event()

    def record_gap(self, provider: str, symbol: str) -> None:
        """Penalize `provider` for a gap/timeout on `symbol` and log it."""
        health = self._get_health(provider, symbol)
        health.record_gap()
        logger.warning(
            "failover.gap_recorded",
            extra={
                "provider": provider,
                "symbol": symbol,
                "score": f"{health.score:.2f}",
                "gaps": health.gap_count,
            },
        )

    def get_best_provider(self, symbol: str) -> str:
        """
        Return the currently recommended provider for `symbol`.

        1. A healthy active provider (score >= switch_threshold) is kept.
        2. A degraded one is replaced by the highest-scoring alternative
           (only if strictly better; ties keep the earlier provider).
        3. While running on a backup, the primary is restored as soon as
           its score reaches recovery_threshold.
        """
        active = self._active.get(symbol, self._primary)
        active_health = self._get_health(active, symbol)

        if active_health.score < self._switch_threshold:
            # Degraded: scan the alternatives for a strictly better score.
            chosen, chosen_score = active, active_health.score
            for name in self._all_providers:
                if name == active:
                    continue
                score = self._get_health(name, symbol).score
                if score > chosen_score:
                    chosen, chosen_score = name, score
            if chosen != active:
                logger.warning(
                    "failover.switching",
                    extra={
                        "symbol": symbol,
                        "from": active,
                        "to": chosen,
                        "old_score": f"{active_health.score:.2f}",
                        "new_score": f"{chosen_score:.2f}",
                    },
                )
                self._active[symbol] = chosen
            return chosen

        # Healthy, but on a backup: return to primary once it recovers.
        if active != self._primary:
            primary_health = self._get_health(self._primary, symbol)
            if primary_health.score >= self._recovery_threshold:
                logger.info(
                    "failover.returning_to_primary",
                    extra={
                        "symbol": symbol,
                        "primary_score": f"{primary_health.score:.2f}",
                    },
                )
                self._active[symbol] = self._primary
                return self._primary

        self._active[symbol] = active
        return active

    def get_status(self) -> dict:
        """Snapshot of every tracked (provider, symbol) pair for monitoring."""
        return {
            f"{provider}/{symbol}": {
                "score": round(health.score, 2),
                "events": health.event_count,
                "gaps": health.gap_count,
                "active": self._active.get(symbol) == provider,
            }
            for provider, per_symbol in self._health.items()
            for symbol, health in per_symbol.items()
        }