feat(docs): add standard file processing and router document ingest/query

2026-02-21 14:02:59 +01:00
parent 3e3546ea89
commit 5d52cf81c4
7 changed files with 755 additions and 104 deletions
--- a/gateway-bot/http_api.py
+++ b/gateway-bot/http_api.py
@@ -1871,23 +1871,53 @@ async def process_document(
        Dict з результатом обробки
    """
    mime_type = document.get("mime_type", "")
+    mime_type_l = (mime_type or "").lower()
    file_name = document.get("file_name", "")
    file_id = document.get("file_id")
    
    file_name_lower = file_name.lower()
-    allowed_exts = {".pdf", ".docx", ".txt", ".md", ".csv", ".xlsx", ".zip"}
+    allowed_exts = {
+        ".pdf", ".doc", ".docx", ".rtf", ".odt",
+        ".txt", ".md", ".markdown",
+        ".csv", ".tsv", ".xls", ".xlsx", ".xlsm", ".ods",
+        ".ppt", ".pptx", ".odp",
+        ".json", ".yaml", ".yml", ".xml", ".html", ".htm",
+        ".zip",
+        ".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff",
+    }
    is_allowed = any(file_name_lower.endswith(ext) for ext in allowed_exts)
-    if mime_type == "application/pdf":
+    if mime_type_l == "application/pdf":
        is_allowed = True
-    if mime_type in {
+    if mime_type_l in {
+        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/rtf",
+        "text/rtf",
+        "application/vnd.oasis.opendocument.text",
+        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel.sheet.macroenabled.12",
+        "application/vnd.oasis.opendocument.spreadsheet",
+        "application/vnd.ms-powerpoint",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.oasis.opendocument.presentation",
        "text/plain",
        "text/markdown",
        "text/csv",
+        "text/tab-separated-values",
+        "application/json",
+        "application/yaml",
+        "application/x-yaml",
+        "text/yaml",
+        "application/xml",
+        "text/xml",
+        "text/html",
        "application/zip",
+        "application/x-zip-compressed",
    }:
        is_allowed = True
+    if mime_type_l.startswith("image/"):
+        is_allowed = True
    
    if is_allowed and file_id:
        logger.info(f"{agent_config.name}: Document from {username} (tg:{user_id}), file_id: {file_id}, file_name: {file_name}")
@@ -2027,7 +2057,7 @@ async def process_document(
        telegram_token = agent_config.get_telegram_token()
        await send_telegram_message(
            chat_id,
-            "Наразі підтримуються формати: PDF, DOCX, TXT, MD, CSV, XLSX, ZIP.",
+            "Підтримуються формати: PDF/DOC/DOCX/RTF/ODT, TXT/MD/CSV/TSV, XLS/XLSX/XLSM/ODS, PPT/PPTX/ODP, JSON/YAML/XML/HTML, ZIP, зображення.",
            telegram_token,
        )
        return {"ok": False, "error": "Unsupported document type"}
@@ -3681,7 +3711,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
                                doc_url=file_url,
                                file_name=file_name,
                                dao_id=dao_id,
-                                user_id=f"tg:{user_id}"
+                                user_id=f"tg:{user_id}",
+                                agent_id=agent_config.agent_id,
                            )
                            
                            if result.success:
@@ -3705,7 +3736,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
            result = await ingest_document(
                session_id=session_id,
                dao_id=dao_id,
-                user_id=f"tg:{user_id}"
+                user_id=f"tg:{user_id}",
+                agent_id=agent_config.agent_id,
            )
            
            if result.success:
--- a/gateway-bot/services/doc_service.py
+++ b/gateway-bot/services/doc_service.py
@@ -17,12 +17,12 @@ from typing import Optional, Dict, Any, List
 from pydantic import BaseModel
 from datetime import datetime

-from router_client import send_to_router
 from memory_client import memory_client

 logger = logging.getLogger(__name__)

 SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
+ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")


 class QAItem(BaseModel):
@@ -84,6 +84,28 @@ class DocumentService:
        """Initialize document service"""
        self.memory_client = memory_client

+    async def _router_post_json(
+        self,
+        path: str,
+        payload: Dict[str, Any],
+        timeout: float = 45.0,
+    ) -> Dict[str, Any]:
+        """POST payload as JSON to ROUTER_URL + path and return the reply dict; raise RuntimeError on HTTP >= 400."""
+        import httpx
+        url = f"{ROUTER_URL.rstrip('/')}{path}"
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            resp = await client.post(url, json=payload)
+            try:
+                body = resp.json()
+            except ValueError:  # JSONDecodeError / UnicodeDecodeError: reply body is not JSON
+                body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
+            if not isinstance(body, dict):  # normalise first: a JSON list/str/number has no .get()
+                body = {"ok": False, "error": "Invalid router response type"}
+            if resp.status_code >= 400:
+                err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
+                raise RuntimeError(f"Router error on {path}: {err}")
+            return body
+
    def _is_excel_filename(self, file_name: Optional[str]) -> bool:
        if not file_name:
            return False
@@ -462,7 +484,8 @@ class DocumentService:
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        dao_id: str = None,
-        user_id: str = None
+        user_id: str = None,
+        agent_id: str = "daarwizz",
    ) -> IngestResult:
        """
        Ingest document chunks into RAG/Memory.
@@ -488,64 +511,60 @@ class DocumentService:
                    file_name = file_name or doc_context.file_name
                    dao_id = dao_id or doc_context.dao_id
            
-            if not doc_id and not doc_url:
+            if not doc_url:
                return IngestResult(
                    success=False,
-                    error="No document ID or URL provided"
+                    error="No document URL available for ingest"
                )
-            
-            # Build request to Router with ingest flag
-            router_request = {
-                "mode": "doc_parse",
-                "agent": "parser",
+
+            parsed = await self.parse_document(
+                session_id=session_id,
+                doc_url=doc_url,
+                file_name=file_name or "document",
+                dao_id=dao_id or "",
+                user_id=user_id or "",
+                output_mode="markdown",
+                metadata={"source": self._extract_source(session_id), "mode": "ingest"},
+            )
+            if not parsed.success:
+                return IngestResult(success=False, error=parsed.error or "Document parse failed")
+
+            effective_doc_id = doc_id or parsed.doc_id
+            if not effective_doc_id:
+                effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
+
+            doc_text = (parsed.markdown or "").strip()
+            if not doc_text:
+                return IngestResult(success=False, error="No extractable text for ingestion")
+
+            payload = {
+                "agent_id": (agent_id or "daarwizz").lower(),
+                "doc_id": effective_doc_id,
+                "file_name": file_name or "document",
+                "text": doc_text,
+                "dao_id": dao_id,
+                "user_id": user_id,
                "metadata": {
-                    "source": self._extract_source(session_id),
-                    "dao_id": dao_id,
-                    "user_id": user_id,
                    "session_id": session_id,
-                },
-                "payload": {
-                    "output_mode": "chunks",  # Use chunks for RAG ingestion
-                    "dao_id": dao_id,
-                    "user_id": user_id,
-                    "ingest": True,  # Flag for ingestion
+                    "source": self._extract_source(session_id),
                },
            }
-            
-            if doc_url:
-                router_request["payload"]["doc_url"] = doc_url
-                router_request["payload"]["file_name"] = file_name or "document.pdf"
-            
-            if doc_id:
-                router_request["payload"]["doc_id"] = doc_id
-            
-            logger.info(f"Ingesting document: session={session_id}, doc_id={doc_id}")
-            
-            # Send to Router
-            response = await send_to_router(router_request)
-            
-            if not isinstance(response, dict):
-                return IngestResult(
-                    success=False,
-                    error="Invalid response from router"
-                )
-            
-            data = response.get("data", {})
-            chunks = data.get("chunks", [])
-            
-            if chunks:
+            response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)
+
+            if response.get("ok"):
                return IngestResult(
                    success=True,
-                    doc_id=doc_id or data.get("doc_id"),
-                    ingested_chunks=len(chunks),
-                    status="ingested"
-                )
-            else:
-                return IngestResult(
-                    success=False,
-                    status="failed",
-                    error="No chunks to ingest"
+                    doc_id=response.get("doc_id") or effective_doc_id,
+                    ingested_chunks=int(response.get("chunks_stored", 0) or 0),
+                    status="ingested",
                )
+
+            return IngestResult(
+                success=False,
+                doc_id=effective_doc_id,
+                status="failed",
+                error=response.get("error", "Router ingest failed"),
+            )
                
        except Exception as e:
            logger.error(f"Document ingestion failed: {e}", exc_info=True)
@@ -625,38 +644,30 @@ class DocumentService:
                        }],
                    )
            
-            # Build RAG query request
-            router_request = {
-                "mode": "rag_query",
-                "agent": agent_id,
-                "metadata": {
-                    "source": self._extract_source(session_id),
-                    "dao_id": dao_id,
-                    "user_id": user_id,
-                    "session_id": session_id,
-                },
-                "payload": {
-                    "question": question,
-                    "dao_id": dao_id,
-                    "user_id": user_id,
-                    "doc_id": doc_id,
-                },
-            }
-            
            logger.info(
                f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
            )
-            
-            # Send to Router
-            response = await send_to_router(router_request)
-            
-            if not isinstance(response, dict):
+
+            response = await self._router_post_json(
+                "/v1/documents/query",
+                {
+                    "agent_id": (agent_id or "daarwizz").lower(),
+                    "question": question,
+                    "doc_id": doc_id,
+                    "dao_id": dao_id,
+                    "user_id": user_id,
+                    "limit": 5,
+                },
+                timeout=60.0,
+            )
+
+            if isinstance(response, dict) and not response.get("ok", False):
                return QAResult(
                    success=False,
-                    error="Invalid response from router"
+                    error=response.get("error", "Document query failed"),
                )
-            
-            data = response.get("data", {})
+
+            data = response.get("data", {}) if isinstance(response, dict) else {}
            answer = data.get("answer") or data.get("text")
            sources = data.get("citations", []) or data.get("sources", [])
            
@@ -717,7 +728,8 @@ async def ingest_document(
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
-    user_id: Optional[str] = None
+    user_id: Optional[str] = None,
+    agent_id: str = "daarwizz",
 ) -> IngestResult:
    """Ingest document chunks into RAG/Memory"""
    return await doc_service.ingest_document(
@@ -726,7 +738,8 @@ async def ingest_document(
        doc_url=doc_url,
        file_name=file_name,
        dao_id=dao_id,
-        user_id=user_id
+        user_id=user_id,
+        agent_id=agent_id,
    )