gateway: add public invoke/jobs facade with redis queue worker and SSE

This commit is contained in:
NODA1 System
2026-02-20 17:55:47 +01:00
parent 7e82a427e3
commit 2e76ef9ccb
7 changed files with 619 additions and 55 deletions

View File

@@ -0,0 +1 @@
"""DAARION public facade package."""

View File

@@ -0,0 +1,118 @@
import asyncio
from datetime import datetime, timezone
import json
import os
import uuid
from typing import Any, Dict, List
from fastapi import APIRouter, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from .redis_jobs import create_job, enqueue_job, get_job
from .registry_api import _load_registry
router = APIRouter(prefix="/v1", tags=["daarion-facade"])
# Statuses that end an SSE stream, and the full set mapped to named SSE events;
# any other status value is emitted under the generic "status" event name.
EVENT_TERMINAL_STATUSES = {"done", "failed"}
EVENT_KNOWN_STATUSES = {"queued", "running", "done", "failed"}
# How often the /jobs/{id}/events stream polls Redis for changes (seconds).
EVENT_POLL_SECONDS = float(os.getenv("DAARION_JOB_EVENTS_POLL_SECONDS", "0.5"))
# Inner payload for /v1/invoke: the prompt plus optional image attachments.
# (Comments instead of a class docstring on purpose: pydantic surfaces
# docstrings in the generated OpenAPI schema.)
class InvokeInput(BaseModel):
    prompt: str = Field(min_length=1)  # must be non-empty
    images: List[str] = Field(default_factory=list)  # forwarded to the router as-is; encoding not validated here — TODO confirm expected format
class InvokeRequest(BaseModel):
agent_id: str
input: InvokeInput
metadata: Dict[str, Any] = Field(default_factory=dict)
# 202 Accepted body for POST /v1/invoke: where to poll for the queued job.
class InvokeResponse(BaseModel):
    job_id: str
    status: str  # always "queued" at creation time
    status_url: str  # relative URL of GET /v1/jobs/{job_id}
def _sse_message(event: str, payload: Dict[str, Any]) -> str:
return f"event: {event}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n"
@router.post("/invoke", status_code=status.HTTP_202_ACCEPTED, response_model=InvokeResponse)
async def invoke(payload: InvokeRequest) -> InvokeResponse:
    """Validate the agent id, persist a "queued" job document, enqueue it.

    Returns 202 with the job id and a status URL; 404 for unknown agents.
    """
    agents = _load_registry().get("agents", {})
    if payload.agent_id not in agents:
        raise HTTPException(status_code=404, detail=f"Unknown agent_id: {payload.agent_id}")
    new_id = f"job_{uuid.uuid4().hex}"
    timestamp = datetime.now(timezone.utc).isoformat()
    # Full document shape the worker and SSE stream rely on; result/error are
    # filled in later by the worker.
    document = {
        "job_id": new_id,
        "status": "queued",
        "agent_id": payload.agent_id,
        "input": payload.input.model_dump(),
        "metadata": payload.metadata,
        "result": None,
        "error": None,
        "created_at": timestamp,
        "updated_at": timestamp,
        "started_at": None,
        "finished_at": None,
    }
    # Persist first, then enqueue, so the worker always finds the document.
    await create_job(new_id, document)
    await enqueue_job(new_id)
    return InvokeResponse(job_id=new_id, status="queued", status_url=f"/v1/jobs/{new_id}")
@router.get("/jobs/{job_id}")
async def job_status(job_id: str) -> Dict[str, Any]:
    """Return the stored job document, or 404 when unknown or expired."""
    document = await get_job(job_id)
    if not document:
        raise HTTPException(status_code=404, detail="Job not found")
    return document
@router.get("/jobs/{job_id}/events")
async def job_events(job_id: str, request: Request) -> StreamingResponse:
    """Stream job status changes as Server-Sent Events until a terminal state.

    Polls Redis every EVENT_POLL_SECONDS and emits one SSE frame per observed
    (status, updated_at) change. The stream ends on "done"/"failed", on client
    disconnect, or when the job document disappears (e.g. TTL expiry).
    """
    existing = await get_job(job_id)
    if not existing:
        raise HTTPException(status_code=404, detail="Job not found")
    async def event_stream():
        last_state = None
        # Ask clients to wait 1s before reconnecting after a dropped stream.
        yield "retry: 1000\n\n"
        while True:
            if await request.is_disconnected():
                break
            job = await get_job(job_id)
            if not job:
                # Job vanished mid-stream (expired or deleted) — report and stop.
                yield _sse_message("failed", {"job_id": job_id, "status": "failed", "error": {"message": "Job not found"}})
                break
            status_value = str(job.get("status", "unknown"))
            updated_at = str(job.get("updated_at", ""))
            # De-duplicate on (status, updated_at) so repeated polls of an
            # unchanged document emit nothing.
            state = (status_value, updated_at)
            if state != last_state:
                # Unknown statuses fall back to a generic "status" event name.
                event_name = status_value if status_value in EVENT_KNOWN_STATUSES else "status"
                yield _sse_message(event_name, job)
                last_state = state
            if status_value in EVENT_TERMINAL_STATUSES:
                break
            await asyncio.sleep(EVENT_POLL_SECONDS)
    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            # Disable nginx response buffering so events flush immediately.
            "X-Accel-Buffering": "no",
        },
    )

View File

@@ -0,0 +1,84 @@
import asyncio
import json
import os
from typing import Any, Dict, Optional
from redis.asyncio import Redis
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
# Key layout: daarion:jobs:<job_id> holds a job document, daarion:jobs:queue
# is the work list consumed by the worker.
JOB_KEY_PREFIX = "daarion:jobs"
QUEUE_KEY = "daarion:jobs:queue"
# Job documents expire after 72 hours by default.
JOB_TTL_SECONDS = int(os.getenv("DAARION_JOB_TTL_SECONDS", str(72 * 3600)))
# Lazily-created module-wide client; managed by redis_client()/close_redis().
_redis: Optional[Redis] = None
def _job_key(job_id: str) -> str:
    """Redis key under which the job document for *job_id* is stored."""
    return ":".join((JOB_KEY_PREFIX, job_id))
async def redis_client() -> Redis:
    """Return the shared Redis connection, creating it lazily on first use."""
    global _redis
    client = _redis
    if client is None:
        # decode_responses=True so we work with str, not bytes, throughout.
        client = Redis.from_url(REDIS_URL, decode_responses=True)
        _redis = client
    return client
async def close_redis() -> None:
    """Tear down the shared client so the next redis_client() reconnects."""
    global _redis
    if _redis is None:
        return
    await _redis.close()
    _redis = None
async def create_job(job_id: str, payload: Dict[str, Any]) -> None:
    """Persist *payload* as the job document, (re)setting its TTL."""
    client = await redis_client()
    encoded = json.dumps(payload, ensure_ascii=False)
    await client.set(_job_key(job_id), encoded, ex=JOB_TTL_SECONDS)
async def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Fetch and decode the job document; None when missing or corrupt."""
    client = await redis_client()
    raw = await client.get(_job_key(job_id))
    if raw:
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            # Treat an unparseable document the same as a missing one.
            pass
    return None
async def update_job(job_id: str, patch: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Merge *patch* into the stored document; None when the job is gone.

    Read-modify-write (not atomic): concurrent updates can race — TODO confirm
    a single worker is the only writer.
    """
    doc = await get_job(job_id)
    if not doc:
        return None
    merged = {**doc, **patch}
    await create_job(job_id, merged)
    return merged
async def enqueue_job(job_id: str) -> None:
    """Push *job_id* onto the queue (LPUSH; paired with BRPOP for FIFO)."""
    client = await redis_client()
    await client.lpush(QUEUE_KEY, job_id)
async def dequeue_job(block_seconds: int = 5) -> Optional[str]:
    """Pop the next job id, blocking up to *block_seconds*; None on timeout."""
    client = await redis_client()
    # BRPOP returns (key, value) or None on timeout.
    popped = await client.brpop(QUEUE_KEY, timeout=block_seconds)
    return popped[1] if popped else None
async def wait_for_redis(timeout_seconds: int = 30) -> None:
    """Block until Redis answers PING; re-raise the last error past the deadline."""
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout_seconds
    while True:
        try:
            client = await redis_client()
            await client.ping()
        except Exception:
            if loop.time() >= deadline:
                raise
            await asyncio.sleep(1)
        else:
            return

View File

@@ -0,0 +1,107 @@
import asyncio
from datetime import datetime, timezone
import logging
import os
from typing import Any, Dict
import httpx
from .redis_jobs import close_redis, dequeue_job, get_job, update_job, wait_for_redis
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger("daarion-gateway-worker")
# ROUTER_BASE_URL is preferred; ROUTER_URL is kept as a fallback name.
ROUTER_BASE_URL = os.getenv("ROUTER_BASE_URL", os.getenv("ROUTER_URL", "http://router:8000"))
# Per-request timeout for router inference calls (seconds).
ROUTER_TIMEOUT_SECONDS = float(os.getenv("ROUTER_WORKER_TIMEOUT", "60"))
def _now() -> str:
return datetime.now(timezone.utc).isoformat()
async def _call_router(agent_id: str, input_payload: Dict[str, Any], metadata: Dict[str, Any]) -> Dict[str, Any]:
    """POST the job input to the router's infer endpoint and trim the reply.

    Raises httpx errors (timeouts, HTTP status errors) to the caller.
    """
    request_body: Dict[str, Any] = {
        "prompt": input_payload.get("prompt", ""),
        "metadata": metadata or {},
    }
    # Only include images when present so the router sees a minimal body.
    image_list = input_payload.get("images") or []
    if image_list:
        request_body["images"] = image_list
    endpoint = f"{ROUTER_BASE_URL}/v1/agents/{agent_id}/infer"
    async with httpx.AsyncClient(timeout=ROUTER_TIMEOUT_SECONDS) as client:
        resp = await client.post(endpoint, json=request_body)
        resp.raise_for_status()
        data = resp.json()
    # Normalize to the subset of fields the job document stores.
    return {
        "response": data.get("response", ""),
        "model": data.get("model"),
        "backend": data.get("backend"),
        "tokens_used": data.get("tokens_used"),
    }
async def run_once(job_id: str) -> None:
    """Process one job: mark it running, call the router, record the outcome.

    Failures are captured into the job document as status "failed" and never
    re-raised, so the worker loop keeps draining the queue.
    """
    job = await get_job(job_id)
    if not job:
        logger.warning("job_missing: %s", job_id)
        return
    # Fix: take ONE clock reading per status transition so started_at/updated_at
    # (and finished_at/updated_at) agree exactly — the SSE stream de-duplicates
    # events on updated_at, and the original's two _now() calls could differ.
    started = _now()
    await update_job(job_id, {"status": "running", "started_at": started, "updated_at": started})
    agent_id = job.get("agent_id")
    input_payload = job.get("input") or {}
    metadata = job.get("metadata") or {}
    try:
        result = await _call_router(agent_id, input_payload, metadata)
        finished = _now()
        await update_job(
            job_id,
            {
                "status": "done",
                "result": result,
                "error": None,
                "finished_at": finished,
                "updated_at": finished,
            },
        )
        logger.info("job_done: %s agent=%s", job_id, agent_id)
    except Exception as e:
        finished = _now()
        await update_job(
            job_id,
            {
                "status": "failed",
                "error": {"type": e.__class__.__name__, "message": str(e)},
                "finished_at": finished,
                "updated_at": finished,
            },
        )
        logger.exception("job_failed: %s agent=%s", job_id, agent_id)
async def worker_loop() -> None:
    """Drain the Redis job queue forever; unexpected errors never kill the loop."""
    await wait_for_redis(60)
    logger.info("worker_started router=%s", ROUTER_BASE_URL)
    while True:
        try:
            pending = await dequeue_job(block_seconds=10)
            if pending:
                await run_once(pending)
        except asyncio.CancelledError:
            # Let task cancellation propagate for clean shutdown.
            raise
        except Exception:
            logger.exception("worker_loop_error")
            # Back off briefly so a persistent failure doesn't spin hot.
            await asyncio.sleep(1)
async def _main() -> None:
    """Run the worker and close the Redis client on the SAME event loop.

    Fix: the original called asyncio.run(close_redis()) in a finally block,
    which spins up a second event loop to close a connection bound to the
    first (already-closed) loop; cleanup must happen before the loop exits.
    """
    try:
        await worker_loop()
    finally:
        try:
            await close_redis()
        except Exception:
            # Best-effort cleanup: never mask the original exit reason.
            pass


if __name__ == "__main__":
    asyncio.run(_main())