feat(docs): add standard file processing and router document ingest/query
This commit is contained in:
@@ -17,12 +17,12 @@ from typing import Optional, Dict, Any, List
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
|
||||
from router_client import send_to_router
|
||||
from memory_client import memory_client
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
|
||||
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
|
||||
|
||||
|
||||
class QAItem(BaseModel):
|
||||
@@ -84,6 +84,28 @@ class DocumentService:
|
||||
"""Initialize document service"""
|
||||
self.memory_client = memory_client
|
||||
|
||||
async def _router_post_json(
    self,
    path: str,
    payload: Dict[str, Any],
    timeout: float = 45.0,
) -> Dict[str, Any]:
    """POST ``payload`` as JSON to the router and return the decoded reply.

    The target URL is ``ROUTER_URL`` (trailing slashes stripped) followed
    by ``path``.

    Args:
        path: Path appended verbatim to the router base URL.
        payload: JSON-serialisable request body.
        timeout: Timeout handed to ``httpx.AsyncClient``.

    Returns:
        The decoded JSON object. If the body is not valid JSON, or is
        valid JSON but not an object, an ``{"ok": False, "error": ...}``
        dict is returned instead so callers can always use ``.get``.

    Raises:
        RuntimeError: If the router answers with an HTTP status >= 400.
    """
    import httpx

    base = ROUTER_URL.rstrip("/")
    url = f"{base}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(url, json=payload)

        try:
            body = resp.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            # subclasses; degrade to an error-shaped dict rather than crash.
            body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}

        if resp.status_code >= 400:
            # An error body may be valid JSON without being an object (a list
            # or a bare string); only call .get on a real dict so the caller
            # still receives RuntimeError rather than AttributeError.
            err = None
            if isinstance(body, dict):
                err = body.get("detail") or body.get("error")
            if not err:
                err = f"HTTP {resp.status_code}"
            raise RuntimeError(f"Router error on {path}: {err}")

        if not isinstance(body, dict):
            return {"ok": False, "error": "Invalid router response type"}
        return body
|
||||
|
||||
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
|
||||
if not file_name:
|
||||
return False
|
||||
@@ -462,7 +484,8 @@ class DocumentService:
|
||||
doc_url: Optional[str] = None,
|
||||
file_name: Optional[str] = None,
|
||||
dao_id: str = None,
|
||||
user_id: str = None
|
||||
user_id: str = None,
|
||||
agent_id: str = "daarwizz",
|
||||
) -> IngestResult:
|
||||
"""
|
||||
Ingest document chunks into RAG/Memory.
|
||||
@@ -488,64 +511,60 @@ class DocumentService:
|
||||
file_name = file_name or doc_context.file_name
|
||||
dao_id = dao_id or doc_context.dao_id
|
||||
|
||||
if not doc_id and not doc_url:
|
||||
if not doc_url:
|
||||
return IngestResult(
|
||||
success=False,
|
||||
error="No document ID or URL provided"
|
||||
error="No document URL available for ingest"
|
||||
)
|
||||
|
||||
# Build request to Router with ingest flag
|
||||
router_request = {
|
||||
"mode": "doc_parse",
|
||||
"agent": "parser",
|
||||
|
||||
parsed = await self.parse_document(
|
||||
session_id=session_id,
|
||||
doc_url=doc_url,
|
||||
file_name=file_name or "document",
|
||||
dao_id=dao_id or "",
|
||||
user_id=user_id or "",
|
||||
output_mode="markdown",
|
||||
metadata={"source": self._extract_source(session_id), "mode": "ingest"},
|
||||
)
|
||||
if not parsed.success:
|
||||
return IngestResult(success=False, error=parsed.error or "Document parse failed")
|
||||
|
||||
effective_doc_id = doc_id or parsed.doc_id
|
||||
if not effective_doc_id:
|
||||
effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
|
||||
|
||||
doc_text = (parsed.markdown or "").strip()
|
||||
if not doc_text:
|
||||
return IngestResult(success=False, error="No extractable text for ingestion")
|
||||
|
||||
payload = {
|
||||
"agent_id": (agent_id or "daarwizz").lower(),
|
||||
"doc_id": effective_doc_id,
|
||||
"file_name": file_name or "document",
|
||||
"text": doc_text,
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"metadata": {
|
||||
"source": self._extract_source(session_id),
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"session_id": session_id,
|
||||
},
|
||||
"payload": {
|
||||
"output_mode": "chunks", # Use chunks for RAG ingestion
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"ingest": True, # Flag for ingestion
|
||||
"source": self._extract_source(session_id),
|
||||
},
|
||||
}
|
||||
|
||||
if doc_url:
|
||||
router_request["payload"]["doc_url"] = doc_url
|
||||
router_request["payload"]["file_name"] = file_name or "document.pdf"
|
||||
|
||||
if doc_id:
|
||||
router_request["payload"]["doc_id"] = doc_id
|
||||
|
||||
logger.info(f"Ingesting document: session={session_id}, doc_id={doc_id}")
|
||||
|
||||
# Send to Router
|
||||
response = await send_to_router(router_request)
|
||||
|
||||
if not isinstance(response, dict):
|
||||
return IngestResult(
|
||||
success=False,
|
||||
error="Invalid response from router"
|
||||
)
|
||||
|
||||
data = response.get("data", {})
|
||||
chunks = data.get("chunks", [])
|
||||
|
||||
if chunks:
|
||||
response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)
|
||||
|
||||
if response.get("ok"):
|
||||
return IngestResult(
|
||||
success=True,
|
||||
doc_id=doc_id or data.get("doc_id"),
|
||||
ingested_chunks=len(chunks),
|
||||
status="ingested"
|
||||
)
|
||||
else:
|
||||
return IngestResult(
|
||||
success=False,
|
||||
status="failed",
|
||||
error="No chunks to ingest"
|
||||
doc_id=response.get("doc_id") or effective_doc_id,
|
||||
ingested_chunks=int(response.get("chunks_stored", 0) or 0),
|
||||
status="ingested",
|
||||
)
|
||||
|
||||
return IngestResult(
|
||||
success=False,
|
||||
doc_id=effective_doc_id,
|
||||
status="failed",
|
||||
error=response.get("error", "Router ingest failed"),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Document ingestion failed: {e}", exc_info=True)
|
||||
@@ -625,38 +644,30 @@ class DocumentService:
|
||||
}],
|
||||
)
|
||||
|
||||
# Build RAG query request
|
||||
router_request = {
|
||||
"mode": "rag_query",
|
||||
"agent": agent_id,
|
||||
"metadata": {
|
||||
"source": self._extract_source(session_id),
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"session_id": session_id,
|
||||
},
|
||||
"payload": {
|
||||
"question": question,
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"doc_id": doc_id,
|
||||
},
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
|
||||
)
|
||||
|
||||
# Send to Router
|
||||
response = await send_to_router(router_request)
|
||||
|
||||
if not isinstance(response, dict):
|
||||
|
||||
response = await self._router_post_json(
|
||||
"/v1/documents/query",
|
||||
{
|
||||
"agent_id": (agent_id or "daarwizz").lower(),
|
||||
"question": question,
|
||||
"doc_id": doc_id,
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"limit": 5,
|
||||
},
|
||||
timeout=60.0,
|
||||
)
|
||||
|
||||
if isinstance(response, dict) and not response.get("ok", False):
|
||||
return QAResult(
|
||||
success=False,
|
||||
error="Invalid response from router"
|
||||
error=response.get("error", "Document query failed"),
|
||||
)
|
||||
|
||||
data = response.get("data", {})
|
||||
|
||||
data = response.get("data", {}) if isinstance(response, dict) else {}
|
||||
answer = data.get("answer") or data.get("text")
|
||||
sources = data.get("citations", []) or data.get("sources", [])
|
||||
|
||||
@@ -717,7 +728,8 @@ async def ingest_document(
|
||||
doc_url: Optional[str] = None,
|
||||
file_name: Optional[str] = None,
|
||||
dao_id: Optional[str] = None,
|
||||
user_id: Optional[str] = None
|
||||
user_id: Optional[str] = None,
|
||||
agent_id: str = "daarwizz",
|
||||
) -> IngestResult:
|
||||
"""Ingest document chunks into RAG/Memory"""
|
||||
return await doc_service.ingest_document(
|
||||
@@ -726,7 +738,8 @@ async def ingest_document(
|
||||
doc_url=doc_url,
|
||||
file_name=file_name,
|
||||
dao_id=dao_id,
|
||||
user_id=user_id
|
||||
user_id=user_id,
|
||||
agent_id=agent_id,
|
||||
)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user