feat: MD pipeline — market-data-service hardening + SenpAI NATS consumer
Producer (market-data-service):
- Backpressure: smart drop policy (heartbeats→quotes→trades preserved)
- Heartbeat monitor: synthetic HeartbeatEvent on provider silence
- Graceful shutdown: WS→bus→storage→DB engine cleanup sequence
- Bybit V5 public WS provider (backup for Binance, no API key needed)
- FailoverManager: health-based provider switching with recovery
- NATS output adapter: md.events.{type}.{symbol} for SenpAI
- /bus-stats endpoint for backpressure monitoring
- Dockerfile + docker-compose.node1.yml integration
- 36 tests (parsing + bus + failover), requirements.lock
Consumer (senpai-md-consumer):
- NATSConsumer: subscribe md.events.>, queue group senpai-md, backpressure
- State store: LatestState + RollingWindow (deque, 60s)
- Feature engine: 11 features (mid, spread, VWAP, return, vol, latency)
- Rule-based signals: long/short on return+volume+spread conditions
- Publisher: rate-limited features + signals + alerts to NATS
- HTTP API: /health, /metrics, /state/latest, /features/latest, /stats
- 10 Prometheus metrics
- Dockerfile + docker-compose.senpai.yml
- 41 tests (parsing + state + features + rate-limit), requirements.lock
CI: ruff + pytest + smoke import for both services
Tests: 77 total passed, lint clean
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -1,6 +1,11 @@
|
||||
"""
|
||||
Async event bus — fan-out from providers to consumers.
|
||||
|
||||
Features:
|
||||
- Backpressure with smart drop policy (drop quotes before trades)
|
||||
- Heartbeat timer per provider (detects dead channels)
|
||||
- Graceful drain on shutdown
|
||||
|
||||
Usage:
|
||||
bus = EventBus()
|
||||
bus.add_consumer(storage_consumer)
|
||||
@@ -13,9 +18,10 @@ from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Protocol
|
||||
|
||||
from app.domain.events import Event
|
||||
from app.domain.events import Event, EventType, HeartbeatEvent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -26,37 +32,105 @@ class EventConsumer(Protocol):
|
||||
async def handle(self, event: Event) -> None: ...
|
||||
|
||||
|
||||
# Events that can be dropped under backpressure (least critical first)
|
||||
_DROPPABLE_PRIORITY = {
|
||||
EventType.HEARTBEAT: 0, # always droppable
|
||||
EventType.QUOTE: 1, # drop quotes before trades
|
||||
EventType.BOOK_L2: 2, # drop book snapshots before trades
|
||||
EventType.TRADE: 3, # trades are most critical — last to drop
|
||||
}
|
||||
|
||||
|
||||
class EventBus:
|
||||
"""
|
||||
Simple async fan-out bus.
|
||||
Async fan-out bus with backpressure and heartbeat monitoring.
|
||||
|
||||
Every published event is dispatched to all registered consumers
|
||||
concurrently (gather). A slow consumer doesn't block others thanks
|
||||
to the internal queue + worker pattern.
|
||||
Backpressure policy:
|
||||
- Queue 80% full → start dropping HEARTBEAT events
|
||||
- Queue 90% full → also drop QUOTE events
|
||||
- Queue 100% full → drop oldest (any type)
|
||||
|
||||
Heartbeat timer:
|
||||
- Emits synthetic HeartbeatEvent if a provider sends nothing
|
||||
for `heartbeat_interval` seconds, making dead channels visible.
|
||||
"""
|
||||
|
||||
def __init__(self, queue_size: int = 10_000) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
queue_size: int = 10_000,
|
||||
heartbeat_interval: float = 10.0,
|
||||
) -> None:
|
||||
self._consumers: list[EventConsumer] = []
|
||||
self._queue: asyncio.Queue[Event | None] = asyncio.Queue(maxsize=queue_size)
|
||||
self._max_size = queue_size
|
||||
self._running = False
|
||||
self._task: asyncio.Task | None = None
|
||||
self._heartbeat_interval = heartbeat_interval
|
||||
self._heartbeat_tasks: dict[str, asyncio.Task] = {}
|
||||
self._provider_last_seen: dict[str, float] = {}
|
||||
# Backpressure counters
|
||||
self._dropped: dict[str, int] = {}
|
||||
|
||||
def add_consumer(self, consumer: EventConsumer) -> None:
|
||||
self._consumers.append(consumer)
|
||||
logger.info("bus.consumer_added", extra={"consumer": type(consumer).__name__})
|
||||
|
||||
def register_provider(self, provider_name: str) -> None:
|
||||
"""Register a provider for heartbeat monitoring."""
|
||||
self._provider_last_seen[provider_name] = time.monotonic()
|
||||
|
||||
async def publish(self, event: Event) -> None:
|
||||
"""Put event into internal queue (non-blocking if queue not full)."""
|
||||
"""
|
||||
Put event into internal queue with backpressure.
|
||||
|
||||
Drop policy under pressure:
|
||||
- 80%+ → drop heartbeats
|
||||
- 90%+ → drop quotes/book snapshots
|
||||
- 100% → drop oldest event
|
||||
"""
|
||||
current = self._queue.qsize()
|
||||
fill_pct = current / self._max_size if self._max_size > 0 else 0
|
||||
|
||||
# Track provider activity for heartbeat timer
|
||||
self._provider_last_seen[event.provider] = time.monotonic()
|
||||
|
||||
priority = _DROPPABLE_PRIORITY.get(event.event_type, 3)
|
||||
|
||||
# Backpressure: drop low-priority events when queue is filling up
|
||||
if fill_pct >= 0.9 and priority <= 1:
|
||||
# Drop heartbeats and quotes
|
||||
self._dropped[event.event_type.value] = self._dropped.get(event.event_type.value, 0) + 1
|
||||
if self._dropped[event.event_type.value] % 1000 == 1:
|
||||
logger.warning(
|
||||
"bus.backpressure_drop",
|
||||
extra={
|
||||
"type": event.event_type.value,
|
||||
"fill_pct": f"{fill_pct:.0%}",
|
||||
"total_dropped": self._dropped,
|
||||
},
|
||||
)
|
||||
return
|
||||
|
||||
if fill_pct >= 0.8 and priority == 0:
|
||||
# Drop heartbeats only
|
||||
return
|
||||
|
||||
try:
|
||||
self._queue.put_nowait(event)
|
||||
except asyncio.QueueFull:
|
||||
logger.warning("bus.queue_full, dropping oldest event")
|
||||
# Drop oldest to keep queue moving
|
||||
# Last resort: drop oldest to make room
|
||||
try:
|
||||
self._queue.get_nowait()
|
||||
dropped = self._queue.get_nowait()
|
||||
logger.warning(
|
||||
"bus.queue_full_drop_oldest",
|
||||
extra={"dropped_type": dropped.event_type.value if dropped else "None"},
|
||||
)
|
||||
except asyncio.QueueEmpty:
|
||||
pass
|
||||
self._queue.put_nowait(event)
|
||||
try:
|
||||
self._queue.put_nowait(event)
|
||||
except asyncio.QueueFull:
|
||||
pass # truly stuck
|
||||
|
||||
async def _worker(self) -> None:
|
||||
"""Background worker that drains the queue and fans out."""
|
||||
@@ -75,20 +149,79 @@ class EventBus:
|
||||
extra={"consumer": consumer_name, "error": str(result)},
|
||||
)
|
||||
|
||||
async def _heartbeat_monitor(self, provider_name: str) -> None:
|
||||
"""Emit synthetic heartbeat if provider goes silent."""
|
||||
while self._running:
|
||||
await asyncio.sleep(self._heartbeat_interval)
|
||||
if not self._running:
|
||||
break
|
||||
|
||||
last = self._provider_last_seen.get(provider_name, 0)
|
||||
elapsed = time.monotonic() - last
|
||||
|
||||
if elapsed > self._heartbeat_interval:
|
||||
# Provider is silent — emit heartbeat so metrics/logs see it
|
||||
logger.warning(
|
||||
"bus.provider_silent",
|
||||
extra={
|
||||
"provider": provider_name,
|
||||
"silent_seconds": f"{elapsed:.1f}",
|
||||
},
|
||||
)
|
||||
hb = HeartbeatEvent(provider=provider_name)
|
||||
await self.publish(hb)
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Start the bus worker."""
|
||||
"""Start the bus worker and heartbeat monitors."""
|
||||
self._running = True
|
||||
self._task = asyncio.create_task(self._worker())
|
||||
logger.info("bus.started", extra={"consumers": len(self._consumers)})
|
||||
|
||||
# Start heartbeat monitors for registered providers
|
||||
for pname in self._provider_last_seen:
|
||||
task = asyncio.create_task(self._heartbeat_monitor(pname))
|
||||
self._heartbeat_tasks[pname] = task
|
||||
|
||||
logger.info(
|
||||
"bus.started",
|
||||
extra={
|
||||
"consumers": len(self._consumers),
|
||||
"providers_monitored": list(self._provider_last_seen.keys()),
|
||||
},
|
||||
)
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Graceful shutdown: drain queue then stop."""
|
||||
"""Graceful shutdown: stop heartbeats, drain queue, stop worker."""
|
||||
self._running = False
|
||||
await self._queue.put(None) # sentinel
|
||||
|
||||
# Cancel heartbeat monitors
|
||||
for task in self._heartbeat_tasks.values():
|
||||
task.cancel()
|
||||
for task in self._heartbeat_tasks.values():
|
||||
try:
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._heartbeat_tasks.clear()
|
||||
|
||||
# Drain remaining events
|
||||
remaining = self._queue.qsize()
|
||||
if remaining > 0:
|
||||
logger.info("bus.draining", extra={"remaining": remaining})
|
||||
|
||||
# Send sentinel to stop worker
|
||||
await self._queue.put(None)
|
||||
if self._task:
|
||||
await self._task
|
||||
|
||||
if self._dropped:
|
||||
logger.info("bus.drop_stats", extra={"dropped": self._dropped})
|
||||
|
||||
logger.info("bus.stopped")
|
||||
|
||||
@property
|
||||
def queue_size(self) -> int:
|
||||
return self._queue.qsize()
|
||||
|
||||
@property
|
||||
def fill_percent(self) -> float:
|
||||
return self._queue.qsize() / self._max_size if self._max_size > 0 else 0
|
||||
|
||||
170
services/market-data-service/app/core/failover.py
Normal file
170
services/market-data-service/app/core/failover.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Provider failover manager.
|
||||
|
||||
Tracks provider health per symbol and recommends the best active source.
|
||||
|
||||
Policy:
|
||||
- Each provider has a "health score" per symbol (0.0 – 1.0)
|
||||
- Score decreases on gaps (heartbeat timeout) and error events
|
||||
- Score increases on each successful trade/quote received
|
||||
- When primary provider's score drops below threshold → switch to backup
|
||||
|
||||
Usage:
|
||||
failover = FailoverManager(primary="binance", backups=["bybit"])
|
||||
failover.record_event("binance", "BTCUSDT") # bumps score
|
||||
failover.record_gap("binance", "BTCUSDT") # decreases score
|
||||
best = failover.get_best_provider("BTCUSDT") # → "binance" or "bybit"
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProviderHealth:
|
||||
"""Health tracker for one provider+symbol pair."""
|
||||
|
||||
score: float = 1.0
|
||||
event_count: int = 0
|
||||
gap_count: int = 0
|
||||
last_event_ts: float = 0.0
|
||||
last_gap_ts: float = 0.0
|
||||
|
||||
def record_event(self) -> None:
|
||||
"""Bump health score on successful event."""
|
||||
self.event_count += 1
|
||||
self.last_event_ts = time.monotonic()
|
||||
# Recover towards 1.0 gradually
|
||||
self.score = min(1.0, self.score + 0.01)
|
||||
|
||||
def record_gap(self) -> None:
|
||||
"""Decrease health score on gap/timeout."""
|
||||
self.gap_count += 1
|
||||
self.last_gap_ts = time.monotonic()
|
||||
self.score = max(0.0, self.score - 0.2)
|
||||
|
||||
|
||||
class FailoverManager:
|
||||
"""
|
||||
Tracks provider health and recommends best source per symbol.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
primary: str,
|
||||
backups: list[str] | None = None,
|
||||
switch_threshold: float = 0.3,
|
||||
recovery_threshold: float = 0.7,
|
||||
) -> None:
|
||||
self._primary = primary
|
||||
self._backups = backups or []
|
||||
self._all_providers = [primary] + self._backups
|
||||
self._switch_threshold = switch_threshold
|
||||
self._recovery_threshold = recovery_threshold
|
||||
|
||||
# provider → symbol → ProviderHealth
|
||||
self._health: dict[str, dict[str, ProviderHealth]] = {}
|
||||
|
||||
# symbol → currently active provider
|
||||
self._active: dict[str, str] = {}
|
||||
|
||||
def _get_health(self, provider: str, symbol: str) -> ProviderHealth:
|
||||
"""Get or create health tracker."""
|
||||
if provider not in self._health:
|
||||
self._health[provider] = {}
|
||||
if symbol not in self._health[provider]:
|
||||
self._health[provider][symbol] = ProviderHealth()
|
||||
return self._health[provider][symbol]
|
||||
|
||||
def record_event(self, provider: str, symbol: str) -> None:
|
||||
"""Record a successful event from provider for symbol."""
|
||||
self._get_health(provider, symbol).record_event()
|
||||
|
||||
def record_gap(self, provider: str, symbol: str) -> None:
|
||||
"""Record a gap/timeout for provider+symbol."""
|
||||
h = self._get_health(provider, symbol)
|
||||
h.record_gap()
|
||||
logger.warning(
|
||||
"failover.gap_recorded",
|
||||
extra={
|
||||
"provider": provider,
|
||||
"symbol": symbol,
|
||||
"score": f"{h.score:.2f}",
|
||||
"gaps": h.gap_count,
|
||||
},
|
||||
)
|
||||
|
||||
def get_best_provider(self, symbol: str) -> str:
|
||||
"""
|
||||
Return the currently recommended provider for this symbol.
|
||||
|
||||
Logic:
|
||||
1. If active provider score >= switch_threshold → keep it
|
||||
2. If active provider drops below → switch to healthiest backup
|
||||
3. If active provider recovers above recovery_threshold → switch back to primary
|
||||
"""
|
||||
current = self._active.get(symbol, self._primary)
|
||||
current_health = self._get_health(current, symbol)
|
||||
|
||||
# Check if current provider is degraded
|
||||
if current_health.score < self._switch_threshold:
|
||||
# Find best backup
|
||||
best_provider = current
|
||||
best_score = current_health.score
|
||||
|
||||
for p in self._all_providers:
|
||||
if p == current:
|
||||
continue
|
||||
h = self._get_health(p, symbol)
|
||||
if h.score > best_score:
|
||||
best_provider = p
|
||||
best_score = h.score
|
||||
|
||||
if best_provider != current:
|
||||
logger.warning(
|
||||
"failover.switching",
|
||||
extra={
|
||||
"symbol": symbol,
|
||||
"from": current,
|
||||
"to": best_provider,
|
||||
"old_score": f"{current_health.score:.2f}",
|
||||
"new_score": f"{best_score:.2f}",
|
||||
},
|
||||
)
|
||||
self._active[symbol] = best_provider
|
||||
return best_provider
|
||||
|
||||
# Check if primary has recovered and we're on a backup
|
||||
if current != self._primary:
|
||||
primary_health = self._get_health(self._primary, symbol)
|
||||
if primary_health.score >= self._recovery_threshold:
|
||||
logger.info(
|
||||
"failover.returning_to_primary",
|
||||
extra={
|
||||
"symbol": symbol,
|
||||
"primary_score": f"{primary_health.score:.2f}",
|
||||
},
|
||||
)
|
||||
self._active[symbol] = self._primary
|
||||
return self._primary
|
||||
|
||||
self._active[symbol] = current
|
||||
return current
|
||||
|
||||
def get_status(self) -> dict:
|
||||
"""Return full failover status for monitoring."""
|
||||
status = {}
|
||||
for provider, symbols in self._health.items():
|
||||
for symbol, health in symbols.items():
|
||||
key = f"{provider}/{symbol}"
|
||||
status[key] = {
|
||||
"score": round(health.score, 2),
|
||||
"events": health.event_count,
|
||||
"gaps": health.gap_count,
|
||||
"active": self._active.get(symbol) == provider,
|
||||
}
|
||||
return status
|
||||
Reference in New Issue
Block a user