feat: MD pipeline — market-data-service hardening + SenpAI NATS consumer

Producer (market-data-service):
- Backpressure: smart drop policy (heartbeats→quotes→trades preserved)
- Heartbeat monitor: synthetic HeartbeatEvent on provider silence
- Graceful shutdown: WS→bus→storage→DB engine cleanup sequence
- Bybit V5 public WS provider (backup for Binance, no API key needed)
- FailoverManager: health-based provider switching with recovery
- NATS output adapter: md.events.{type}.{symbol} for SenpAI
- /bus-stats endpoint for backpressure monitoring
- Dockerfile + docker-compose.node1.yml integration
- 36 tests (parsing + bus + failover), requirements.lock

Consumer (senpai-md-consumer):
- NATSConsumer: subscribe md.events.>, queue group senpai-md, backpressure
- State store: LatestState + RollingWindow (deque, 60s)
- Feature engine: 11 features (mid, spread, VWAP, return, vol, latency)
- Rule-based signals: long/short on return+volume+spread conditions
- Publisher: rate-limited features + signals + alerts to NATS
- HTTP API: /health, /metrics, /state/latest, /features/latest, /stats
- 10 Prometheus metrics
- Dockerfile + docker-compose.senpai.yml
- 41 tests (parsing + state + features + rate-limit), requirements.lock

CI: ruff + pytest + smoke import for both services
Tests: 77 total passed, lint clean
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Apple
2026-02-09 11:46:15 -08:00
parent c50843933f
commit 09dee24342
47 changed files with 3930 additions and 56 deletions

View File

@@ -1,6 +1,11 @@
"""
Async event bus — fan-out from providers to consumers.
Features:
- Backpressure with smart drop policy (drop quotes before trades)
- Heartbeat timer per provider (detects dead channels)
- Graceful drain on shutdown
Usage:
bus = EventBus()
bus.add_consumer(storage_consumer)
@@ -13,9 +18,10 @@ from __future__ import annotations
import asyncio
import logging
import time
from typing import Protocol
from app.domain.events import Event
from app.domain.events import Event, EventType, HeartbeatEvent
logger = logging.getLogger(__name__)
@@ -26,37 +32,105 @@ class EventConsumer(Protocol):
async def handle(self, event: Event) -> None: ...
# Events that can be dropped under backpressure (least critical first).
# Lower number = dropped earlier as the queue fills; publish() compares
# this priority against its fill-percentage thresholds.
# NOTE(review): publish() drops priority <= 1 at 90% fill, so BOOK_L2
# (priority 2) is never shed by the 90% rule even though the class
# docstring says book snapshots are — confirm which is intended.
_DROPPABLE_PRIORITY = {
    EventType.HEARTBEAT: 0,  # always droppable
    EventType.QUOTE: 1,      # drop quotes before trades
    EventType.BOOK_L2: 2,    # drop book snapshots before trades
    EventType.TRADE: 3,      # trades are most critical — last to drop
}
class EventBus:
"""
Simple async fan-out bus.
Async fan-out bus with backpressure and heartbeat monitoring.
Every published event is dispatched to all registered consumers
concurrently (gather). A slow consumer doesn't block others thanks
to the internal queue + worker pattern.
Backpressure policy:
- Queue 80% full → start dropping HEARTBEAT events
- Queue 90% full → also drop QUOTE events
- Queue 100% full → drop oldest (any type)
Heartbeat timer:
- Emits synthetic HeartbeatEvent if a provider sends nothing
for `heartbeat_interval` seconds, making dead channels visible.
"""
def __init__(self, queue_size: int = 10_000) -> None:
def __init__(
self,
queue_size: int = 10_000,
heartbeat_interval: float = 10.0,
) -> None:
self._consumers: list[EventConsumer] = []
self._queue: asyncio.Queue[Event | None] = asyncio.Queue(maxsize=queue_size)
self._max_size = queue_size
self._running = False
self._task: asyncio.Task | None = None
self._heartbeat_interval = heartbeat_interval
self._heartbeat_tasks: dict[str, asyncio.Task] = {}
self._provider_last_seen: dict[str, float] = {}
# Backpressure counters
self._dropped: dict[str, int] = {}
def add_consumer(self, consumer: EventConsumer) -> None:
    """Register a consumer; every event the worker dequeues is fanned out to it."""
    self._consumers.append(consumer)
    logger.info("bus.consumer_added", extra={"consumer": type(consumer).__name__})
def register_provider(self, provider_name: str) -> None:
    """Register a provider for heartbeat monitoring.

    Seeds the provider's last-seen timestamp with the current monotonic
    time. Must be called before start(): start() creates one monitor
    task per entry already present in `_provider_last_seen`.
    """
    self._provider_last_seen[provider_name] = time.monotonic()
async def publish(self, event: Event) -> None:
"""Put event into internal queue (non-blocking if queue not full)."""
"""
Put event into internal queue with backpressure.
Drop policy under pressure:
- 80%+ → drop heartbeats
- 90%+ → drop quotes/book snapshots
- 100% → drop oldest event
"""
current = self._queue.qsize()
fill_pct = current / self._max_size if self._max_size > 0 else 0
# Track provider activity for heartbeat timer
self._provider_last_seen[event.provider] = time.monotonic()
priority = _DROPPABLE_PRIORITY.get(event.event_type, 3)
# Backpressure: drop low-priority events when queue is filling up
if fill_pct >= 0.9 and priority <= 1:
# Drop heartbeats and quotes
self._dropped[event.event_type.value] = self._dropped.get(event.event_type.value, 0) + 1
if self._dropped[event.event_type.value] % 1000 == 1:
logger.warning(
"bus.backpressure_drop",
extra={
"type": event.event_type.value,
"fill_pct": f"{fill_pct:.0%}",
"total_dropped": self._dropped,
},
)
return
if fill_pct >= 0.8 and priority == 0:
# Drop heartbeats only
return
try:
self._queue.put_nowait(event)
except asyncio.QueueFull:
logger.warning("bus.queue_full, dropping oldest event")
# Drop oldest to keep queue moving
# Last resort: drop oldest to make room
try:
self._queue.get_nowait()
dropped = self._queue.get_nowait()
logger.warning(
"bus.queue_full_drop_oldest",
extra={"dropped_type": dropped.event_type.value if dropped else "None"},
)
except asyncio.QueueEmpty:
pass
self._queue.put_nowait(event)
try:
self._queue.put_nowait(event)
except asyncio.QueueFull:
pass # truly stuck
async def _worker(self) -> None:
"""Background worker that drains the queue and fans out."""
@@ -75,20 +149,79 @@ class EventBus:
extra={"consumer": consumer_name, "error": str(result)},
)
async def _heartbeat_monitor(self, provider_name: str) -> None:
    """Emit a synthetic HeartbeatEvent if `provider_name` goes silent.

    Runs as one background task per registered provider (spawned by
    start(), cancelled by stop()). Wakes every `heartbeat_interval`
    seconds; if the provider's `_provider_last_seen` entry is older than
    the interval, logs a warning and publishes a HeartbeatEvent through
    the normal bus path so downstream consumers/metrics still observe
    the dead channel.
    """
    while self._running:
        await asyncio.sleep(self._heartbeat_interval)
        # Re-check after sleeping: stop() may have cleared the flag.
        if not self._running:
            break
        # Providers with no entry default to 0 → always treated as silent.
        last = self._provider_last_seen.get(provider_name, 0)
        elapsed = time.monotonic() - last
        if elapsed > self._heartbeat_interval:
            # Provider is silent — emit heartbeat so metrics/logs see it
            logger.warning(
                "bus.provider_silent",
                extra={
                    "provider": provider_name,
                    "silent_seconds": f"{elapsed:.1f}",
                },
            )
            hb = HeartbeatEvent(provider=provider_name)
            # publish() refreshes _provider_last_seen for this provider,
            # so the next warning fires only after another full silent
            # interval (at most one synthetic heartbeat per interval).
            await self.publish(hb)
async def start(self) -> None:
"""Start the bus worker."""
"""Start the bus worker and heartbeat monitors."""
self._running = True
self._task = asyncio.create_task(self._worker())
logger.info("bus.started", extra={"consumers": len(self._consumers)})
# Start heartbeat monitors for registered providers
for pname in self._provider_last_seen:
task = asyncio.create_task(self._heartbeat_monitor(pname))
self._heartbeat_tasks[pname] = task
logger.info(
"bus.started",
extra={
"consumers": len(self._consumers),
"providers_monitored": list(self._provider_last_seen.keys()),
},
)
async def stop(self) -> None:
"""Graceful shutdown: drain queue then stop."""
"""Graceful shutdown: stop heartbeats, drain queue, stop worker."""
self._running = False
await self._queue.put(None) # sentinel
# Cancel heartbeat monitors
for task in self._heartbeat_tasks.values():
task.cancel()
for task in self._heartbeat_tasks.values():
try:
await task
except asyncio.CancelledError:
pass
self._heartbeat_tasks.clear()
# Drain remaining events
remaining = self._queue.qsize()
if remaining > 0:
logger.info("bus.draining", extra={"remaining": remaining})
# Send sentinel to stop worker
await self._queue.put(None)
if self._task:
await self._task
if self._dropped:
logger.info("bus.drop_stats", extra={"dropped": self._dropped})
logger.info("bus.stopped")
@property
def queue_size(self) -> int:
    """Number of events currently buffered in the internal queue."""
    return self._queue.qsize()
@property
def fill_percent(self) -> float:
    """Fraction of the internal queue currently occupied (0.0–1.0)."""
    if self._max_size <= 0:
        # Degenerate zero-capacity bus: report empty instead of dividing by zero.
        return 0
    return self._queue.qsize() / self._max_size

View File

@@ -0,0 +1,170 @@
"""
Provider failover manager.
Tracks provider health per symbol and recommends the best active source.
Policy:
- Each provider has a "health score" per symbol (0.0–1.0)
- Score decreases on gaps (heartbeat timeout) and error events
- Score increases on each successful trade/quote received
- When primary provider's score drops below threshold → switch to backup
Usage:
failover = FailoverManager(primary="binance", backups=["bybit"])
failover.record_event("binance", "BTCUSDT") # bumps score
failover.record_gap("binance", "BTCUSDT") # decreases score
best = failover.get_best_provider("BTCUSDT") # → "binance" or "bybit"
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class ProviderHealth:
    """Health tracker for one provider+symbol pair.

    The score lives in [0.0, 1.0]; 1.0 means fully healthy. Each
    successful event nudges the score up by `recovery_step`, each gap
    (heartbeat timeout / missing data) knocks it down by `gap_penalty`.
    The defaults (0.01 up, 0.2 down) preserve the original deliberately
    asymmetric behavior — one gap takes ~20 good events to undo — but
    both rates are now tunable per instance.
    """

    score: float = 1.0           # current health in [0.0, 1.0]
    event_count: int = 0         # total successful events seen
    gap_count: int = 0           # total gaps/timeouts recorded
    last_event_ts: float = 0.0   # monotonic timestamp of last event
    last_gap_ts: float = 0.0     # monotonic timestamp of last gap
    recovery_step: float = 0.01  # score gained per successful event
    gap_penalty: float = 0.2     # score lost per gap

    def record_event(self) -> None:
        """Bump health score on successful event (clamped at 1.0)."""
        self.event_count += 1
        self.last_event_ts = time.monotonic()
        # Recover towards 1.0 gradually
        self.score = min(1.0, self.score + self.recovery_step)

    def record_gap(self) -> None:
        """Decrease health score on gap/timeout (clamped at 0.0)."""
        self.gap_count += 1
        self.last_gap_ts = time.monotonic()
        self.score = max(0.0, self.score - self.gap_penalty)
class FailoverManager:
    """
    Per-symbol provider health tracking with automatic failover.

    Keeps a ProviderHealth record for every (provider, symbol) pair and
    answers "which provider should feed this symbol right now?". The
    active provider is swapped out when its score sinks below
    `switch_threshold`; the primary is restored once its own score
    climbs back to `recovery_threshold`.
    """

    def __init__(
        self,
        primary: str,
        backups: list[str] | None = None,
        switch_threshold: float = 0.3,
        recovery_threshold: float = 0.7,
    ) -> None:
        self._primary = primary
        self._backups = backups or []
        self._all_providers = [primary, *self._backups]
        self._switch_threshold = switch_threshold
        self._recovery_threshold = recovery_threshold
        # provider → symbol → ProviderHealth
        self._health: dict[str, dict[str, ProviderHealth]] = {}
        # symbol → provider currently serving it
        self._active: dict[str, str] = {}

    def _get_health(self, provider: str, symbol: str) -> ProviderHealth:
        """Fetch the health record for (provider, symbol), creating it lazily."""
        per_symbol = self._health.setdefault(provider, {})
        return per_symbol.setdefault(symbol, ProviderHealth())

    def record_event(self, provider: str, symbol: str) -> None:
        """Credit `provider` with one successful event for `symbol`."""
        self._get_health(provider, symbol).record_event()

    def record_gap(self, provider: str, symbol: str) -> None:
        """Penalize `provider` for a gap/timeout on `symbol` and log it."""
        health = self._get_health(provider, symbol)
        health.record_gap()
        logger.warning(
            "failover.gap_recorded",
            extra={
                "provider": provider,
                "symbol": symbol,
                "score": f"{health.score:.2f}",
                "gaps": health.gap_count,
            },
        )

    def get_best_provider(self, symbol: str) -> str:
        """
        Return the currently recommended provider for `symbol`.

        1. A healthy active provider (score >= switch_threshold) is kept.
        2. A degraded one is replaced by the highest-scoring alternative
           (only if strictly better; ties keep the earlier provider).
        3. While running on a backup, the primary is restored as soon as
           its score reaches recovery_threshold.
        """
        active = self._active.get(symbol, self._primary)
        active_health = self._get_health(active, symbol)

        if active_health.score < self._switch_threshold:
            # Degraded: scan the alternatives for a strictly better score.
            chosen, chosen_score = active, active_health.score
            for name in self._all_providers:
                if name == active:
                    continue
                score = self._get_health(name, symbol).score
                if score > chosen_score:
                    chosen, chosen_score = name, score
            if chosen != active:
                logger.warning(
                    "failover.switching",
                    extra={
                        "symbol": symbol,
                        "from": active,
                        "to": chosen,
                        "old_score": f"{active_health.score:.2f}",
                        "new_score": f"{chosen_score:.2f}",
                    },
                )
                self._active[symbol] = chosen
            return chosen

        # Healthy, but on a backup: return to primary once it recovers.
        if active != self._primary:
            primary_health = self._get_health(self._primary, symbol)
            if primary_health.score >= self._recovery_threshold:
                logger.info(
                    "failover.returning_to_primary",
                    extra={
                        "symbol": symbol,
                        "primary_score": f"{primary_health.score:.2f}",
                    },
                )
                self._active[symbol] = self._primary
                return self._primary

        self._active[symbol] = active
        return active

    def get_status(self) -> dict:
        """Snapshot of every tracked (provider, symbol) pair for monitoring."""
        return {
            f"{provider}/{symbol}": {
                "score": round(health.score, 2),
                "events": health.event_count,
                "gaps": health.gap_count,
                "active": self._active.get(symbol) == provider,
            }
            for provider, per_symbol in self._health.items()
            for symbol, health in per_symbol.items()
        }