feat(docs): add standard file processing and router document ingest/query

This commit is contained in:
NODA1 System
2026-02-21 14:02:59 +01:00
parent 3e3546ea89
commit 5d52cf81c4
7 changed files with 755 additions and 104 deletions

View File

@@ -1237,6 +1237,234 @@ class MemoryRetrieval:
logger.warning(f"review_shared_pending_case failed: {e}")
return {"ok": False, "error": str(e)}
def _chunk_document_text(
self,
text: str,
chunk_chars: int = 1200,
overlap_chars: int = 180,
) -> List[str]:
"""
Split document text into overlap-aware chunks for RAG indexing.
Keeps paragraph structure when possible.
"""
raw = re.sub(r"\r\n?", "\n", text or "").strip()
if not raw:
return []
paragraphs = [p.strip() for p in re.split(r"\n{2,}", raw) if p and p.strip()]
if not paragraphs:
return []
chunks: List[str] = []
current = ""
max_hard = max(chunk_chars, 600)
def _push_current() -> None:
nonlocal current
if current and len(current.strip()) >= 20:
chunks.append(current.strip())
current = ""
for para in paragraphs:
if len(para) > max_hard * 2:
_push_current()
i = 0
step = max_hard - max(80, min(overlap_chars, max_hard // 2))
while i < len(para):
part = para[i : i + max_hard]
if len(part.strip()) >= 20:
chunks.append(part.strip())
i += max(1, step)
continue
candidate = f"{current}\n\n{para}".strip() if current else para
if len(candidate) <= max_hard:
current = candidate
continue
_push_current()
if overlap_chars > 0 and chunks:
tail = chunks[-1][-overlap_chars:]
current = f"{tail}\n\n{para}".strip()
if len(current) > max_hard:
_push_current()
current = para
else:
current = para
_push_current()
return chunks
async def ingest_document_chunks(
    self,
    agent_id: str,
    doc_id: str,
    file_name: Optional[str],
    text: str,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Ingest normalized document chunks into {agent_id}_docs collection.

    Chunks the document text, embeds every chunk, and upserts the
    resulting points into Qdrant, creating the per-agent collection on
    first use. Returns an ``{"ok": ...}`` status dict; never raises.
    """
    # Guard clauses: both backends must be reachable and the text non-empty.
    if not self.qdrant_client:
        return {"ok": False, "error": "qdrant_unavailable"}
    if not COHERE_API_KEY:
        return {"ok": False, "error": "cohere_unavailable"}
    body = (text or "").strip()
    if not body:
        return {"ok": False, "error": "empty_document"}

    chunks = self._chunk_document_text(body)
    if not chunks:
        return {"ok": False, "error": "no_chunks"}

    collection = f"{(agent_id or 'daarwizz').lower()}_docs"
    points = []
    try:
        from qdrant_client.http import models as qmodels
        import uuid

        # Ensure the docs collection exists (1024-dim cosine vectors).
        try:
            self.qdrant_client.get_collection(collection)
        except Exception:
            self.qdrant_client.create_collection(
                collection_name=collection,
                vectors_config=qmodels.VectorParams(
                    size=1024,
                    distance=qmodels.Distance.COSINE,
                ),
            )
            logger.info(f"✅ Created collection: {collection}")

        total = len(chunks)
        for position, piece in enumerate(chunks):
            vector = await self.get_embedding(piece[:2000])
            if not vector:
                # Skip chunks whose embedding call produced nothing.
                continue
            point_payload: Dict[str, Any] = {
                "text": piece[:6000],
                "doc_id": doc_id,
                "file_name": file_name,
                "agent_id": (agent_id or "").lower(),
                "dao_id": dao_id,
                "user_id": user_id,
                "chunk_index": position,
                "chunks_total": total,
                "type": "document_chunk",
                "timestamp": datetime.utcnow().isoformat(),
            }
            if isinstance(metadata, dict) and metadata:
                point_payload["metadata"] = metadata
            points.append(
                qmodels.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vector,
                    payload=point_payload,
                )
            )

        if not points:
            return {"ok": False, "error": "embedding_failed"}
        self.qdrant_client.upsert(collection_name=collection, points=points)
        return {
            "ok": True,
            "doc_id": doc_id,
            "chunks_total": len(chunks),
            "chunks_stored": len(points),
            "collection": collection,
        }
    except Exception as e:
        logger.warning(f"ingest_document_chunks failed for {collection}: {e}")
        return {"ok": False, "error": str(e)}
async def query_document_chunks(
    self,
    agent_id: str,
    question: str,
    doc_id: Optional[str] = None,
    dao_id: Optional[str] = None,
    limit: int = 5,
    min_score: float = 0.30,
) -> Dict[str, Any]:
    """
    Retrieve top document chunks from {agent_id}_docs for a question.

    Args:
        agent_id: Agent whose ``<agent>_docs`` collection is searched.
        question: Natural-language query; first 2000 chars are embedded.
        doc_id: Optional filter restricting hits to one ingested document.
        dao_id: Optional filter on the ``dao_id`` payload field.
        limit: Maximum hits requested (clamped to the range 1..12).
        min_score: Minimum similarity score for a hit to be kept.
            Generalized from the previously hard-coded 0.30 cutoff;
            the default preserves the old behavior.

    Returns:
        ``{"ok": bool, "chunks": [...], "collection": str, "doc_id": ...}``
        on success; ``{"ok": False, "error": ..., "chunks": []}`` when a
        prerequisite is missing or the search fails. Never raises.
    """
    if not self.qdrant_client:
        return {"ok": False, "error": "qdrant_unavailable", "chunks": []}
    if not COHERE_API_KEY:
        return {"ok": False, "error": "cohere_unavailable", "chunks": []}
    q = (question or "").strip()
    if not q:
        return {"ok": False, "error": "empty_question", "chunks": []}
    embedding = await self.get_embedding(q[:2000])
    if not embedding:
        return {"ok": False, "error": "embedding_failed", "chunks": []}
    collection = f"{(agent_id or 'daarwizz').lower()}_docs"
    try:
        from qdrant_client.http import models as qmodels

        # Build payload filters only for the constraints actually given.
        must_conditions = []
        if doc_id:
            must_conditions.append(
                qmodels.FieldCondition(
                    key="doc_id",
                    match=qmodels.MatchValue(value=doc_id),
                )
            )
        if dao_id:
            must_conditions.append(
                qmodels.FieldCondition(
                    key="dao_id",
                    match=qmodels.MatchValue(value=dao_id),
                )
            )
        query_filter = qmodels.Filter(must=must_conditions) if must_conditions else None
        rows = self.qdrant_client.search(
            collection_name=collection,
            query_vector=embedding,
            query_filter=query_filter,
            limit=max(1, min(int(limit or 5), 12)),
            with_payload=True,
        )
    except Exception as e:
        logger.debug(f"query_document_chunks search failed for {collection}: {e}")
        return {"ok": False, "error": "search_failed", "chunks": [], "collection": collection}
    hits: List[Dict[str, Any]] = []
    for row in rows or []:
        score = float(getattr(row, "score", 0.0) or 0.0)
        # Drop weak matches and near-empty payload texts.
        if score < min_score:
            continue
        payload = getattr(row, "payload", {}) or {}
        text = str(payload.get("text") or "").strip()
        if len(text) < 10:
            continue
        hits.append(
            {
                "text": text,
                "score": score,
                "doc_id": payload.get("doc_id"),
                "file_name": payload.get("file_name"),
                "chunk_index": payload.get("chunk_index"),
                "chunks_total": payload.get("chunks_total"),
            }
        )
    return {
        "ok": bool(hits),
        "chunks": hits,
        "collection": collection,
        "doc_id": doc_id,
    }
async def store_interaction(
self,
channel: str,