snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
16
services/parser-pipeline/Dockerfile
Normal file
16
services/parser-pipeline/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN pip install --no-cache-dir \
|
||||
fastapi \
|
||||
uvicorn \
|
||||
httpx \
|
||||
nats-py \
|
||||
cohere
|
||||
|
||||
COPY main.py .
|
||||
|
||||
EXPOSE 8101
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8101"]
|
||||
384
services/parser-pipeline/main.py
Normal file
384
services/parser-pipeline/main.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""
|
||||
Parser Pipeline
|
||||
===============
|
||||
Асинхронний пайплайн для обробки документів.
|
||||
|
||||
Слухає події з NATS JetStream:
|
||||
- attachment.created.document -> парсить документ
|
||||
- attachment.created.image -> описує зображення
|
||||
- attachment.created.audio -> транскрибує аудіо
|
||||
|
||||
Публікує результати:
|
||||
- attachment.parsed.{type}
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import logging
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# Configuration
|
||||
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
|
||||
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
||||
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
|
||||
COHERE_API_KEY = os.getenv("COHERE_API_KEY", "")
|
||||
|
||||
# Stream configuration
|
||||
STREAM_NAME = "ATTACHMENTS"
|
||||
CONSUMER_NAME = "parser-pipeline"
|
||||
|
||||
class ProcessingResult:
|
||||
def __init__(self, success: bool, content: str = "", metadata: Dict = None, error: str = None):
|
||||
self.success = success
|
||||
self.content = content
|
||||
self.metadata = metadata or {}
|
||||
self.error = error
|
||||
|
||||
|
||||
class ParserPipeline:
|
||||
"""
|
||||
Асинхронний обробник документів.
|
||||
|
||||
Підписується на NATS JetStream і обробляє файли:
|
||||
- PDF, DOCX, TXT -> текст
|
||||
- Images -> опис через Vision
|
||||
- Audio -> транскрипція через STT
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.nc = None
|
||||
self.js = None
|
||||
self.http_client = httpx.AsyncClient(timeout=120.0)
|
||||
self.running = False
|
||||
|
||||
async def connect(self):
|
||||
"""Підключення до NATS JetStream"""
|
||||
try:
|
||||
import nats
|
||||
from nats.js.api import ConsumerConfig, DeliverPolicy, AckPolicy
|
||||
|
||||
self.nc = await nats.connect(NATS_URL)
|
||||
self.js = self.nc.jetstream()
|
||||
|
||||
# Ensure stream exists
|
||||
try:
|
||||
await self.js.stream_info(STREAM_NAME)
|
||||
logger.info(f"Connected to stream: {STREAM_NAME}")
|
||||
except Exception:
|
||||
# Create stream if not exists
|
||||
await self.js.add_stream(
|
||||
name=STREAM_NAME,
|
||||
subjects=["attachment.>"]
|
||||
)
|
||||
logger.info(f"Created stream: {STREAM_NAME}")
|
||||
|
||||
# Create durable consumer
|
||||
try:
|
||||
consumer_config = ConsumerConfig(
|
||||
durable_name=CONSUMER_NAME,
|
||||
deliver_policy=DeliverPolicy.ALL,
|
||||
ack_policy=AckPolicy.EXPLICIT,
|
||||
filter_subject="attachment.created.>",
|
||||
max_deliver=3, # Retry up to 3 times
|
||||
)
|
||||
await self.js.add_consumer(STREAM_NAME, consumer_config)
|
||||
logger.info(f"Consumer created: {CONSUMER_NAME}")
|
||||
except Exception as e:
|
||||
logger.info(f"Consumer exists or error: {e}")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"NATS connection failed: {e}")
|
||||
return False
|
||||
|
||||
async def start(self):
|
||||
"""Запуск обробки повідомлень"""
|
||||
if not await self.connect():
|
||||
logger.error("Failed to connect to NATS")
|
||||
return
|
||||
|
||||
self.running = True
|
||||
logger.info("Parser Pipeline started")
|
||||
|
||||
# Subscribe to attachment events
|
||||
sub = await self.js.pull_subscribe(
|
||||
"attachment.created.>",
|
||||
CONSUMER_NAME,
|
||||
stream=STREAM_NAME
|
||||
)
|
||||
|
||||
while self.running:
|
||||
try:
|
||||
# Fetch messages (batch of 10, wait 5 seconds)
|
||||
messages = await sub.fetch(10, timeout=5)
|
||||
|
||||
for msg in messages:
|
||||
try:
|
||||
await self.process_message(msg)
|
||||
await msg.ack()
|
||||
except Exception as e:
|
||||
logger.error(f"Message processing failed: {e}")
|
||||
await msg.nak(delay=10) # Retry after 10 seconds
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
# No messages, continue polling
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error(f"Fetch error: {e}")
|
||||
await asyncio.sleep(1)
|
||||
|
||||
async def stop(self):
|
||||
"""Зупинка пайплайну"""
|
||||
self.running = False
|
||||
if self.nc:
|
||||
await self.nc.close()
|
||||
await self.http_client.aclose()
|
||||
logger.info("Parser Pipeline stopped")
|
||||
|
||||
async def process_message(self, msg):
|
||||
"""Обробляє одне повідомлення"""
|
||||
try:
|
||||
data = json.loads(msg.data.decode())
|
||||
file_type = data.get("file_type", "unknown")
|
||||
file_id = data.get("file_id")
|
||||
storage_path = data.get("storage_path")
|
||||
agent_id = data.get("agent_id", "helion")
|
||||
user_id = data.get("user_id")
|
||||
|
||||
logger.info(f"Processing: {file_id} ({file_type})")
|
||||
|
||||
# Process based on type
|
||||
if file_type == "document":
|
||||
result = await self.process_document(storage_path, data.get("mime_type"))
|
||||
elif file_type == "image":
|
||||
result = await self.process_image(storage_path)
|
||||
elif file_type == "audio":
|
||||
result = await self.process_audio(storage_path)
|
||||
else:
|
||||
logger.warning(f"Unknown file type: {file_type}")
|
||||
return
|
||||
|
||||
if result.success:
|
||||
# Index to RAG
|
||||
await self.index_to_rag(
|
||||
file_id=file_id,
|
||||
content=result.content,
|
||||
metadata=result.metadata,
|
||||
agent_id=agent_id,
|
||||
user_id=user_id
|
||||
)
|
||||
|
||||
# Publish parsed event
|
||||
await self.publish_result(file_id, file_type, result, agent_id)
|
||||
logger.info(f"✅ Processed: {file_id}")
|
||||
else:
|
||||
logger.error(f"❌ Failed: {file_id} - {result.error}")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Invalid JSON: {e}")
|
||||
|
||||
async def process_document(self, path: str, mime_type: str) -> ProcessingResult:
|
||||
"""Парсить документ через Swapper"""
|
||||
try:
|
||||
resp = await self.http_client.post(
|
||||
f"{SWAPPER_URL}/document",
|
||||
json={"file_path": path, "mime_type": mime_type},
|
||||
timeout=60.0
|
||||
)
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
return ProcessingResult(
|
||||
success=True,
|
||||
content=data.get("text", ""),
|
||||
metadata={
|
||||
"pages": data.get("pages", 1),
|
||||
"mime_type": mime_type,
|
||||
"processed_at": datetime.utcnow().isoformat()
|
||||
}
|
||||
)
|
||||
else:
|
||||
return ProcessingResult(success=False, error=f"API error: {resp.status_code}")
|
||||
|
||||
except Exception as e:
|
||||
return ProcessingResult(success=False, error=str(e))
|
||||
|
||||
async def process_image(self, path: str) -> ProcessingResult:
|
||||
"""Описує зображення через Vision"""
|
||||
try:
|
||||
import base64
|
||||
|
||||
with open(path, "rb") as f:
|
||||
image_data = base64.b64encode(f.read()).decode()
|
||||
|
||||
resp = await self.http_client.post(
|
||||
f"{SWAPPER_URL}/vision",
|
||||
json={
|
||||
"image_base64": image_data,
|
||||
"prompt": "Опиши це зображення детально українською. Вкажи всі важливі деталі."
|
||||
},
|
||||
timeout=30.0
|
||||
)
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
description = data.get("description", data.get("text", ""))
|
||||
return ProcessingResult(
|
||||
success=True,
|
||||
content=description,
|
||||
metadata={
|
||||
"type": "image_description",
|
||||
"processed_at": datetime.utcnow().isoformat()
|
||||
}
|
||||
)
|
||||
else:
|
||||
return ProcessingResult(success=False, error=f"Vision API error: {resp.status_code}")
|
||||
|
||||
except Exception as e:
|
||||
return ProcessingResult(success=False, error=str(e))
|
||||
|
||||
async def process_audio(self, path: str) -> ProcessingResult:
|
||||
"""Транскрибує аудіо через STT"""
|
||||
try:
|
||||
import base64
|
||||
|
||||
with open(path, "rb") as f:
|
||||
audio_data = base64.b64encode(f.read()).decode()
|
||||
|
||||
resp = await self.http_client.post(
|
||||
f"{SWAPPER_URL}/stt",
|
||||
json={
|
||||
"audio_base64": audio_data,
|
||||
"language": "uk"
|
||||
},
|
||||
timeout=120.0
|
||||
)
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
return ProcessingResult(
|
||||
success=True,
|
||||
content=data.get("text", ""),
|
||||
metadata={
|
||||
"language": data.get("language", "uk"),
|
||||
"duration": data.get("duration"),
|
||||
"processed_at": datetime.utcnow().isoformat()
|
||||
}
|
||||
)
|
||||
else:
|
||||
return ProcessingResult(success=False, error=f"STT API error: {resp.status_code}")
|
||||
|
||||
except Exception as e:
|
||||
return ProcessingResult(success=False, error=str(e))
|
||||
|
||||
async def index_to_rag(self, file_id: str, content: str, metadata: Dict,
|
||||
agent_id: str, user_id: str):
|
||||
"""Індексує контент у RAG (Qdrant через Memory Service)"""
|
||||
if not content or len(content) < 10:
|
||||
return
|
||||
|
||||
try:
|
||||
# Generate embedding via Cohere
|
||||
if COHERE_API_KEY:
|
||||
import cohere
|
||||
co = cohere.Client(COHERE_API_KEY)
|
||||
embed_resp = co.embed(
|
||||
texts=[content[:8000]], # Limit text length
|
||||
model="embed-multilingual-v3.0",
|
||||
input_type="search_document"
|
||||
)
|
||||
vector = embed_resp.embeddings[0]
|
||||
else:
|
||||
logger.warning("No Cohere API key, skipping embedding")
|
||||
return
|
||||
|
||||
# Store in Memory Service
|
||||
resp = await self.http_client.post(
|
||||
f"{MEMORY_SERVICE_URL}/artifacts/store",
|
||||
json={
|
||||
"artifact_id": file_id,
|
||||
"content": content[:10000], # Limit content
|
||||
"vector": vector,
|
||||
"agent_id": agent_id,
|
||||
"user_id": user_id,
|
||||
"metadata": metadata,
|
||||
"artifact_type": "document"
|
||||
},
|
||||
timeout=30.0
|
||||
)
|
||||
|
||||
if resp.status_code in [200, 201]:
|
||||
logger.info(f"Indexed to RAG: {file_id}")
|
||||
else:
|
||||
logger.warning(f"RAG indexing failed: {resp.status_code}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"RAG indexing error: {e}")
|
||||
|
||||
async def publish_result(self, file_id: str, file_type: str,
|
||||
result: ProcessingResult, agent_id: str):
|
||||
"""Публікує результат у NATS"""
|
||||
if not self.js:
|
||||
return
|
||||
|
||||
event = {
|
||||
"event_id": f"parsed-{file_id}",
|
||||
"event_type": f"attachment.parsed.{file_type}",
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"file_id": file_id,
|
||||
"agent_id": agent_id,
|
||||
"content_length": len(result.content),
|
||||
"metadata": result.metadata
|
||||
}
|
||||
|
||||
subject = f"attachment.parsed.{file_type}"
|
||||
await self.js.publish(subject, json.dumps(event).encode())
|
||||
|
||||
|
||||
# ==================== FastAPI App for Health Check ====================
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
app = FastAPI(title="Parser Pipeline", version="1.0.0")
|
||||
pipeline = ParserPipeline()
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
# Start pipeline in background
|
||||
asyncio.create_task(pipeline.start())
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown():
|
||||
await pipeline.stop()
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {
|
||||
"status": "healthy" if pipeline.running else "starting",
|
||||
"service": "parser-pipeline",
|
||||
"nats_connected": pipeline.nc is not None
|
||||
}
|
||||
|
||||
@app.get("/stats")
|
||||
async def stats():
|
||||
"""Статистика обробки"""
|
||||
return {
|
||||
"running": pipeline.running,
|
||||
"nats_connected": pipeline.nc is not None,
|
||||
"stream": STREAM_NAME,
|
||||
"consumer": CONSUMER_NAME
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8101)
|
||||
Reference in New Issue
Block a user