feat(docs): add standard file processing and router document ingest/query

This commit is contained in:
NODA1 System
2026-02-21 14:02:59 +01:00
parent 3e3546ea89
commit 5d52cf81c4
7 changed files with 755 additions and 104 deletions

View File

@@ -1871,23 +1871,53 @@ async def process_document(
Dict з результатом обробки
"""
mime_type = document.get("mime_type", "")
mime_type_l = (mime_type or "").lower()
file_name = document.get("file_name", "")
file_id = document.get("file_id")
file_name_lower = file_name.lower()
allowed_exts = {".pdf", ".docx", ".txt", ".md", ".csv", ".xlsx", ".zip"}
allowed_exts = {
".pdf", ".doc", ".docx", ".rtf", ".odt",
".txt", ".md", ".markdown",
".csv", ".tsv", ".xls", ".xlsx", ".xlsm", ".ods",
".ppt", ".pptx", ".odp",
".json", ".yaml", ".yml", ".xml", ".html", ".htm",
".zip",
".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff",
}
is_allowed = any(file_name_lower.endswith(ext) for ext in allowed_exts)
if mime_type == "application/pdf":
if mime_type_l == "application/pdf":
is_allowed = True
if mime_type in {
if mime_type_l in {
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/rtf",
"text/rtf",
"application/vnd.oasis.opendocument.text",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-excel.sheet.macroenabled.12",
"application/vnd.oasis.opendocument.spreadsheet",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.oasis.opendocument.presentation",
"text/plain",
"text/markdown",
"text/csv",
"text/tab-separated-values",
"application/json",
"application/yaml",
"application/x-yaml",
"text/yaml",
"application/xml",
"text/xml",
"text/html",
"application/zip",
"application/x-zip-compressed",
}:
is_allowed = True
if mime_type_l.startswith("image/"):
is_allowed = True
if is_allowed and file_id:
logger.info(f"{agent_config.name}: Document from {username} (tg:{user_id}), file_id: {file_id}, file_name: {file_name}")
@@ -2027,7 +2057,7 @@ async def process_document(
telegram_token = agent_config.get_telegram_token()
await send_telegram_message(
chat_id,
"Наразі підтримуються формати: PDF, DOCX, TXT, MD, CSV, XLSX, ZIP.",
"Підтримуються формати: PDF/DOC/DOCX/RTF/ODT, TXT/MD/CSV/TSV, XLS/XLSX/XLSM/ODS, PPT/PPTX/ODP, JSON/YAML/XML/HTML, ZIP, зображення.",
telegram_token,
)
return {"ok": False, "error": "Unsupported document type"}
@@ -3681,7 +3711,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
doc_url=file_url,
file_name=file_name,
dao_id=dao_id,
user_id=f"tg:{user_id}"
user_id=f"tg:{user_id}",
agent_id=agent_config.agent_id,
)
if result.success:
@@ -3705,7 +3736,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
result = await ingest_document(
session_id=session_id,
dao_id=dao_id,
user_id=f"tg:{user_id}"
user_id=f"tg:{user_id}",
agent_id=agent_config.agent_id,
)
if result.success:

View File

@@ -17,12 +17,12 @@ from typing import Optional, Dict, Any, List
from pydantic import BaseModel
from datetime import datetime
from router_client import send_to_router
from memory_client import memory_client
logger = logging.getLogger(__name__)
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
class QAItem(BaseModel):
@@ -84,6 +84,28 @@ class DocumentService:
"""Initialize document service"""
self.memory_client = memory_client
async def _router_post_json(
    self,
    path: str,
    payload: Dict[str, Any],
    timeout: float = 45.0,
) -> Dict[str, Any]:
    """POST a JSON payload to the router and return the decoded JSON object.

    Args:
        path: Path appended to ``ROUTER_URL`` (trailing slash of the base
            URL is stripped first), e.g. ``/v1/documents/ingest``.
        payload: Dictionary sent as the JSON request body.
        timeout: Timeout in seconds handed to ``httpx.AsyncClient``.

    Returns:
        The decoded response body when it is a JSON object. For a
        non-error status whose body is not valid JSON, or is valid JSON
        but not an object, a ``{"ok": False, "error": ...}`` dict is
        returned instead.

    Raises:
        RuntimeError: If the router responds with an HTTP status >= 400.
            The message carries the body's ``detail`` or ``error`` field
            when available, otherwise ``HTTP <status>``.
    """
    # Imported lazily, as in the original, so the module loads without httpx.
    import httpx

    base = ROUTER_URL.rstrip("/")
    url = f"{base}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(url, json=payload)
        status = resp.status_code

        # Deliberately best-effort: an undecodable body must not mask the
        # HTTP status handling below.
        try:
            body = resp.json()
        except Exception:
            body = {"ok": False, "error": f"Invalid JSON from router ({status})"}

        if status >= 400:
            # The body may be valid JSON without being an object (list,
            # string, number); only look up error fields on a dict so the
            # caller always gets RuntimeError rather than AttributeError.
            err = None
            if isinstance(body, dict):
                err = body.get("detail") or body.get("error")
            raise RuntimeError(f"Router error on {path}: {err or f'HTTP {status}'}")

        if not isinstance(body, dict):
            return {"ok": False, "error": "Invalid router response type"}
        return body
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
if not file_name:
return False
@@ -462,7 +484,8 @@ class DocumentService:
doc_url: Optional[str] = None,
file_name: Optional[str] = None,
dao_id: str = None,
user_id: str = None
user_id: str = None,
agent_id: str = "daarwizz",
) -> IngestResult:
"""
Ingest document chunks into RAG/Memory.
@@ -488,64 +511,60 @@ class DocumentService:
file_name = file_name or doc_context.file_name
dao_id = dao_id or doc_context.dao_id
if not doc_id and not doc_url:
if not doc_url:
return IngestResult(
success=False,
error="No document ID or URL provided"
error="No document URL available for ingest"
)
# Build request to Router with ingest flag
router_request = {
"mode": "doc_parse",
"agent": "parser",
parsed = await self.parse_document(
session_id=session_id,
doc_url=doc_url,
file_name=file_name or "document",
dao_id=dao_id or "",
user_id=user_id or "",
output_mode="markdown",
metadata={"source": self._extract_source(session_id), "mode": "ingest"},
)
if not parsed.success:
return IngestResult(success=False, error=parsed.error or "Document parse failed")
effective_doc_id = doc_id or parsed.doc_id
if not effective_doc_id:
effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
doc_text = (parsed.markdown or "").strip()
if not doc_text:
return IngestResult(success=False, error="No extractable text for ingestion")
payload = {
"agent_id": (agent_id or "daarwizz").lower(),
"doc_id": effective_doc_id,
"file_name": file_name or "document",
"text": doc_text,
"dao_id": dao_id,
"user_id": user_id,
"metadata": {
"source": self._extract_source(session_id),
"dao_id": dao_id,
"user_id": user_id,
"session_id": session_id,
},
"payload": {
"output_mode": "chunks", # Use chunks for RAG ingestion
"dao_id": dao_id,
"user_id": user_id,
"ingest": True, # Flag for ingestion
"source": self._extract_source(session_id),
},
}
if doc_url:
router_request["payload"]["doc_url"] = doc_url
router_request["payload"]["file_name"] = file_name or "document.pdf"
if doc_id:
router_request["payload"]["doc_id"] = doc_id
logger.info(f"Ingesting document: session={session_id}, doc_id={doc_id}")
# Send to Router
response = await send_to_router(router_request)
if not isinstance(response, dict):
return IngestResult(
success=False,
error="Invalid response from router"
)
data = response.get("data", {})
chunks = data.get("chunks", [])
if chunks:
response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)
if response.get("ok"):
return IngestResult(
success=True,
doc_id=doc_id or data.get("doc_id"),
ingested_chunks=len(chunks),
status="ingested"
)
else:
return IngestResult(
success=False,
status="failed",
error="No chunks to ingest"
doc_id=response.get("doc_id") or effective_doc_id,
ingested_chunks=int(response.get("chunks_stored", 0) or 0),
status="ingested",
)
return IngestResult(
success=False,
doc_id=effective_doc_id,
status="failed",
error=response.get("error", "Router ingest failed"),
)
except Exception as e:
logger.error(f"Document ingestion failed: {e}", exc_info=True)
@@ -625,38 +644,30 @@ class DocumentService:
}],
)
# Build RAG query request
router_request = {
"mode": "rag_query",
"agent": agent_id,
"metadata": {
"source": self._extract_source(session_id),
"dao_id": dao_id,
"user_id": user_id,
"session_id": session_id,
},
"payload": {
"question": question,
"dao_id": dao_id,
"user_id": user_id,
"doc_id": doc_id,
},
}
logger.info(
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
)
# Send to Router
response = await send_to_router(router_request)
if not isinstance(response, dict):
response = await self._router_post_json(
"/v1/documents/query",
{
"agent_id": (agent_id or "daarwizz").lower(),
"question": question,
"doc_id": doc_id,
"dao_id": dao_id,
"user_id": user_id,
"limit": 5,
},
timeout=60.0,
)
if isinstance(response, dict) and not response.get("ok", False):
return QAResult(
success=False,
error="Invalid response from router"
error=response.get("error", "Document query failed"),
)
data = response.get("data", {})
data = response.get("data", {}) if isinstance(response, dict) else {}
answer = data.get("answer") or data.get("text")
sources = data.get("citations", []) or data.get("sources", [])
@@ -717,7 +728,8 @@ async def ingest_document(
doc_url: Optional[str] = None,
file_name: Optional[str] = None,
dao_id: Optional[str] = None,
user_id: Optional[str] = None
user_id: Optional[str] = None,
agent_id: str = "daarwizz",
) -> IngestResult:
"""Ingest document chunks into RAG/Memory"""
return await doc_service.ingest_document(
@@ -726,7 +738,8 @@ async def ingest_document(
doc_url=doc_url,
file_name=file_name,
dao_id=dao_id,
user_id=user_id
user_id=user_id,
agent_id=agent_id,
)