feat: MD pipeline — market-data-service hardening + SenpAI NATS consumer

Producer (market-data-service):
- Backpressure: smart drop policy (drops heartbeats first, then quotes; trades always preserved)
- Heartbeat monitor: synthetic HeartbeatEvent on provider silence
- Graceful shutdown: WS→bus→storage→DB engine cleanup sequence
- Bybit V5 public WS provider (backup for Binance, no API key needed)
- FailoverManager: health-based provider switching with recovery
- NATS output adapter: md.events.{type}.{symbol} for SenpAI
- /bus-stats endpoint for backpressure monitoring
- Dockerfile + docker-compose.node1.yml integration
- 36 tests (parsing + bus + failover), requirements.lock

Consumer (senpai-md-consumer):
- NATSConsumer: subscribe md.events.>, queue group senpai-md, backpressure
- State store: LatestState + RollingWindow (deque, 60s)
- Feature engine: 11 features (incl. mid, spread, VWAP, return, vol, latency)
- Rule-based signals: long/short on return+volume+spread conditions
- Publisher: rate-limited features + signals + alerts to NATS
- HTTP API: /health, /metrics, /state/latest, /features/latest, /stats
- 10 Prometheus metrics
- Dockerfile + docker-compose.senpai.yml
- 41 tests (parsing + state + features + rate-limit), requirements.lock

CI: ruff + pytest + smoke import for both services
Tests: 77 total passed, lint clean
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Apple
2026-02-09 11:46:15 -08:00
parent c50843933f
commit 09dee24342
47 changed files with 3930 additions and 56 deletions

View File

@@ -18,7 +18,6 @@ import asyncio
import logging
import signal
import sys
from contextlib import asynccontextmanager
import structlog
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
@@ -26,14 +25,18 @@ from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from app.config import settings
from app.core.bus import EventBus
from app.consumers.metrics import MetricsConsumer
from app.consumers.nats_output import NatsOutputConsumer
from app.consumers.print import PrintConsumer
from app.consumers.storage import StorageConsumer
from app.db.schema import init_db
from app.db.schema import engine, init_db
from app.db import repo
from app.providers import MarketDataProvider, get_provider
logger = structlog.get_logger()
# Global reference to bus (for HTTP status endpoint)
_bus: EventBus | None = None
# ── Logging setup ──────────────────────────────────────────────────────
@@ -105,6 +108,18 @@ async def _http_handler(reader: asyncio.StreamReader, writer: asyncio.StreamWrit
}
body = json.dumps(result, ensure_ascii=False).encode()
content_type = "application/json"
elif path == "/bus-stats":
import json as _json
bus_info = {"queue_size": 0, "fill_percent": 0.0}
if _bus:
bus_info = {
"queue_size": _bus.queue_size,
"fill_percent": round(_bus.fill_percent * 100, 1),
"max_size": _bus._max_size,
}
body = _json.dumps(bus_info).encode()
content_type = "application/json"
else:
body = b'{"error":"not found"}'
content_type = "application/json"
@@ -179,8 +194,13 @@ async def main(provider_names: list[str], symbols: list[str]) -> None:
# Init database
await init_db()
# Setup bus + consumers
bus = EventBus()
global _bus
# Setup bus + consumers (heartbeat interval from config)
bus = EventBus(
queue_size=10_000,
heartbeat_interval=settings.heartbeat_timeout / 2, # check twice per timeout
)
storage = StorageConsumer()
await storage.start()
@@ -192,16 +212,29 @@ async def main(provider_names: list[str], symbols: list[str]) -> None:
printer = PrintConsumer()
bus.add_consumer(printer)
# Optional: NATS output adapter
nats_consumer = None
if settings.nats_configured:
nats_consumer = NatsOutputConsumer()
await nats_consumer.start()
bus.add_consumer(nats_consumer)
logger.info("nats_output.enabled", subject_prefix=settings.nats_subject_prefix)
else:
logger.info("nats_output.disabled", hint="Set NATS_URL + NATS_ENABLED=true to enable")
# Create providers and register them for heartbeat monitoring
providers: list[MarketDataProvider] = []
for name in provider_names:
p = get_provider(name)
providers.append(p)
bus.register_provider(p.name)
_bus = bus
await bus.start()
# Start HTTP server
http_server = await start_http_server()
# Create providers
providers: list[MarketDataProvider] = []
for name in provider_names:
providers.append(get_provider(name))
# Run all providers concurrently
tasks = []
for p in providers:
@@ -224,21 +257,43 @@ async def main(provider_names: list[str], symbols: list[str]) -> None:
# Wait for shutdown
await shutdown_event.wait()
# Cleanup
# ── Graceful shutdown sequence ──────────────────────────────────────
logger.info("service.shutting_down")
# 1. Cancel provider streaming tasks (with timeout)
for task in tasks:
task.cancel()
done, pending = await asyncio.wait(tasks, timeout=5.0)
for task in pending:
logger.warning("service.task_force_cancel", extra={"task": task.get_name()})
task.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
# 2. Close provider WebSocket connections
for p in providers:
await p.close()
try:
await p.close()
except Exception as e:
logger.warning("service.provider_close_error", extra={"provider": p.name, "error": str(e)})
# 3. Stop bus (drains remaining events to consumers)
await bus.stop()
# 4. Stop storage consumer (flush JSONL)
await storage.stop()
# 4b. Stop NATS output (flush + close)
if nats_consumer:
await nats_consumer.stop()
# 5. Close HTTP server
http_server.close()
await http_server.wait_closed()
logger.info("service.stopped")
# 6. Close SQLAlchemy engine (flush connections)
await engine.dispose()
logger.info("service.stopped", extra={"exit": "clean"})
# ── CLI ────────────────────────────────────────────────────────────────
@@ -270,7 +325,7 @@ def cli():
symbols = [s.strip() for s in args.symbols.split(",") if s.strip()]
if args.provider.lower() == "all":
provider_names = ["binance", "alpaca"]
provider_names = ["binance", "alpaca", "bybit"]
else:
provider_names = [p.strip() for p in args.provider.split(",") if p.strip()]