feat(docs): add versioned document update and versions APIs

This commit is contained in:
NODA1 System
2026-02-21 16:49:24 +01:00
parent 5d52cf81c4
commit f53e71a0f4
4 changed files with 764 additions and 4 deletions

View File

@@ -1256,6 +1256,18 @@ class DocumentQueryRequest(BaseModel):
limit: int = 5
class DocumentUpdateRequest(BaseModel):
    """Update existing document text and bump version."""

    # Owning agent; the endpoint strips and lowercases this before use.
    agent_id: str
    # Logical document identifier whose indexed chunks are being updated.
    doc_id: str
    # Optional new file name to store with the new version.
    file_name: Optional[str] = None
    # Full replacement text; re-chunked and re-indexed by the update path.
    text: str
    # Optional scoping identifiers carried through to the version record.
    dao_id: Optional[str] = None
    user_id: Optional[str] = None
    # Optional external storage pointer recorded alongside the version.
    storage_ref: Optional[str] = None
    # Free-form metadata merged into chunk payloads and the version row.
    metadata: Optional[Dict[str, Any]] = None
class SharedMemoryReviewRequest(BaseModel):
    """Review decision for a single shared-memory point."""

    # Identifier of the point under review.
    point_id: str
    # Review verdict flag — presumably True accepts and False rejects;
    # NOTE(review): confirm semantics against the handler using this model.
    approve: bool
@@ -2976,6 +2988,7 @@ async def documents_query(request: DocumentQueryRequest):
"doc_id": c_doc_id,
"file_name": c_file,
"chunk_index": c_idx,
"version_no": ch.get("version_no"),
"score": round(c_score, 4),
}
)
@@ -3031,6 +3044,76 @@ async def documents_query(request: DocumentQueryRequest):
}
@app.post("/v1/documents/update")
async def documents_update(request: DocumentUpdateRequest):
    """
    Re-index a document's text under the same doc_id and record a new version.

    Returns the ingest result on success, or an {"ok": False, ...} envelope
    with the failure reason when the update could not be applied.
    """
    if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval:
        raise HTTPException(status_code=503, detail="Memory retrieval not available")

    # Normalize inputs once, then reject any missing required field in
    # the same order the caller supplied them.
    agent_id = (request.agent_id or "").strip().lower()
    doc_id = (request.doc_id or "").strip()
    text = (request.text or "").strip()
    for value, message in (
        (agent_id, "agent_id is required"),
        (doc_id, "doc_id is required"),
        (text, "text is required"),
    ):
        if not value:
            raise HTTPException(status_code=400, detail=message)

    result = await memory_retrieval.update_document_chunks(
        agent_id=agent_id,
        doc_id=doc_id,
        file_name=request.file_name,
        text=text,
        dao_id=request.dao_id,
        user_id=request.user_id,
        metadata=request.metadata,
        storage_ref=request.storage_ref,
    )
    if result.get("ok"):
        return result
    return {
        "ok": False,
        "error": result.get("error", "update_failed"),
        "doc_id": doc_id,
        "collection": result.get("collection"),
    }
@app.get("/v1/documents/{doc_id}/versions")
async def documents_versions(doc_id: str, agent_id: str, limit: int = 20):
    """
    Return the stored version history for one document, newest first.
    """
    if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval:
        raise HTTPException(status_code=503, detail="Memory retrieval not available")

    # Agent ids are stored lowercased; document ids are used verbatim.
    normalized_agent = (agent_id or "").strip().lower()
    if not normalized_agent:
        raise HTTPException(status_code=400, detail="agent_id is required")

    normalized_doc = (doc_id or "").strip()
    if not normalized_doc:
        raise HTTPException(status_code=400, detail="doc_id is required")

    versions = await memory_retrieval.list_document_versions(
        agent_id=normalized_agent,
        doc_id=normalized_doc,
        limit=limit,
    )
    return {
        "ok": True,
        "agent_id": normalized_agent,
        "doc_id": normalized_doc,
        "total": len(versions),
        "items": versions,
    }
@app.get("/v1/models")
async def list_available_models():
"""List all available models across backends"""

View File

@@ -19,10 +19,10 @@ import os
import json
import logging
import re
import hashlib
from typing import Optional, Dict, Any, List
from dataclasses import dataclass, field
from datetime import datetime
import hashlib
import httpx
import asyncpg
@@ -40,6 +40,7 @@ NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "neo4j")
PENDING_QUESTIONS_LIMIT = int(os.getenv("AGENT_PENDING_QUESTIONS_LIMIT", "5"))
SHARED_AGRO_LIBRARY_ENABLED = os.getenv("AGROMATRIX_SHARED_LIBRARY_ENABLED", "true").lower() == "true"
SHARED_AGRO_LIBRARY_REQUIRE_REVIEW = os.getenv("AGROMATRIX_SHARED_LIBRARY_REQUIRE_REVIEW", "true").lower() == "true"
DOC_VERSION_PREVIEW_CHARS = int(os.getenv("DOC_VERSION_PREVIEW_CHARS", "240"))
@dataclass
@@ -245,6 +246,26 @@ class MemoryRetrieval:
ON agent_pending_questions (agent_id, channel, chat_id, user_id, status, created_at DESC);
CREATE UNIQUE INDEX IF NOT EXISTS idx_agent_pending_questions_unique_open
ON agent_pending_questions (agent_id, channel, chat_id, user_id, question_fingerprint, status);
CREATE TABLE IF NOT EXISTS agent_document_versions (
id BIGSERIAL PRIMARY KEY,
agent_id TEXT NOT NULL,
doc_id TEXT NOT NULL,
version_no INTEGER NOT NULL,
text_hash TEXT NOT NULL,
text_len INTEGER NOT NULL DEFAULT 0,
text_preview TEXT,
file_name TEXT,
dao_id TEXT,
user_id TEXT,
storage_ref TEXT,
source TEXT NOT NULL DEFAULT 'ingest',
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (agent_id, doc_id, version_no)
);
CREATE INDEX IF NOT EXISTS idx_agent_document_versions_latest
ON agent_document_versions (agent_id, doc_id, version_no DESC);
"""
)
except Exception as e:
@@ -1295,6 +1316,304 @@ class MemoryRetrieval:
_push_current()
return chunks
async def _next_document_version_no(
self,
agent_id: str,
doc_id: str,
) -> int:
if self.pg_pool:
try:
async with self.pg_pool.acquire() as conn:
value = await conn.fetchval(
"""
SELECT COALESCE(MAX(version_no), 0) + 1
FROM agent_document_versions
WHERE agent_id = $1
AND doc_id = $2
""",
(agent_id or "").lower(),
doc_id,
)
return max(1, int(value or 1))
except Exception as e:
logger.warning(f"next_document_version_no(pg) failed: {e}")
# Fallback: infer from existing chunk payloads in Qdrant.
if self.qdrant_client:
try:
from qdrant_client.http import models as qmodels
collection = f"{(agent_id or 'daarwizz').lower()}_docs"
points, _ = self.qdrant_client.scroll(
collection_name=collection,
scroll_filter=qmodels.Filter(
must=[
qmodels.FieldCondition(
key="doc_id",
match=qmodels.MatchValue(value=doc_id),
)
]
),
limit=256,
with_payload=True,
)
current_max = 0
for p in points or []:
payload = getattr(p, "payload", {}) or {}
ver = payload.get("version_no")
if isinstance(ver, int):
current_max = max(current_max, ver)
elif isinstance(ver, str) and ver.isdigit():
current_max = max(current_max, int(ver))
return current_max + 1 if current_max > 0 else 1
except Exception as e:
logger.debug(f"next_document_version_no(qdrant) fallback failed: {e}")
return 1
async def _latest_document_version_no(
self,
agent_id: str,
doc_id: str,
) -> int:
nxt = await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id)
return max(0, int(nxt) - 1)
async def _record_document_version(
    self,
    agent_id: str,
    doc_id: str,
    version_no: int,
    text: str,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    source: str = "ingest",
    storage_ref: Optional[str] = None,
) -> Dict[str, Any]:
    """Upsert one agent_document_versions row for (agent_id, doc_id, version_no).

    Persists a fingerprint of the text (sha256 hash, length, short preview)
    rather than the full body. Returns {"ok": True, "id": ..., "version_no": ...}
    on success and {"ok": False, "error": ...} on database failure. When no
    Postgres pool is configured the call is a no-op that still reports
    ok=True with id=None, keeping versioning best-effort.
    """
    text_body = (text or "").strip()
    # Fingerprint derives from the trimmed text so identical content always
    # hashes the same regardless of surrounding whitespace.
    text_hash = hashlib.sha256(text_body.encode("utf-8")).hexdigest() if text_body else ""
    text_len = len(text_body)
    preview = text_body[:DOC_VERSION_PREVIEW_CHARS] if text_body else ""
    # Non-dict metadata is discarded rather than stored raw.
    payload = metadata if isinstance(metadata, dict) else {}
    if not self.pg_pool:
        return {"ok": True, "version_no": int(version_no), "id": None}
    try:
        async with self.pg_pool.acquire() as conn:
            # Idempotent upsert: re-recording the same version overwrites
            # the stored fingerprint/attributes instead of failing the
            # (agent_id, doc_id, version_no) unique constraint.
            row = await conn.fetchrow(
                """
                INSERT INTO agent_document_versions
                    (agent_id, doc_id, version_no, text_hash, text_len, text_preview,
                     file_name, dao_id, user_id, storage_ref, source, metadata)
                VALUES
                    ($1, $2, $3, $4, $5, $6,
                     $7, $8, $9, $10, $11, $12::jsonb)
                ON CONFLICT (agent_id, doc_id, version_no)
                DO UPDATE SET
                    text_hash = EXCLUDED.text_hash,
                    text_len = EXCLUDED.text_len,
                    text_preview = EXCLUDED.text_preview,
                    file_name = EXCLUDED.file_name,
                    dao_id = EXCLUDED.dao_id,
                    user_id = EXCLUDED.user_id,
                    storage_ref = EXCLUDED.storage_ref,
                    source = EXCLUDED.source,
                    metadata = EXCLUDED.metadata
                RETURNING id, version_no
                """,
                (agent_id or "").lower(),
                doc_id,
                int(version_no),
                text_hash,
                int(text_len),
                preview,
                file_name,
                dao_id,
                user_id,
                storage_ref,
                source,
                json.dumps(payload),
            )
            return {
                "ok": True,
                "id": int(row["id"]) if row and row.get("id") is not None else None,
                "version_no": int(row["version_no"]) if row and row.get("version_no") is not None else int(version_no),
            }
    except Exception as e:
        # Version bookkeeping failure is logged but never raised; callers
        # inspect the "ok" flag.
        logger.warning(f"record_document_version failed: {e}")
        return {"ok": False, "error": str(e), "version_no": int(version_no)}
async def list_document_versions(
    self,
    agent_id: str,
    doc_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """List stored versions for a document, newest first.

    Primary source is the agent_document_versions table in Postgres. When
    Postgres is unavailable, falls back to aggregating distinct version_no
    values from chunk payloads in Qdrant; fallback rows carry no id, hash,
    length, or preview. limit is clamped to the 1..200 range. Returns an
    empty list when neither backend is reachable.
    """
    rows_out: List[Dict[str, Any]] = []
    if self.pg_pool:
        try:
            async with self.pg_pool.acquire() as conn:
                rows = await conn.fetch(
                    """
                    SELECT id, agent_id, doc_id, version_no, text_hash, text_len, text_preview,
                           file_name, dao_id, user_id, storage_ref, source, metadata, created_at
                    FROM agent_document_versions
                    WHERE agent_id = $1
                      AND doc_id = $2
                    ORDER BY version_no DESC
                    LIMIT $3
                    """,
                    (agent_id or "").lower(),
                    doc_id,
                    max(1, min(int(limit or 20), 200)),
                )
                for r in rows:
                    # metadata may arrive as a dict or as JSON text depending
                    # on the connection's codec setup; normalize to a dict,
                    # wrapping non-object JSON under a "raw" key.
                    meta_raw = r["metadata"]
                    if isinstance(meta_raw, dict):
                        meta_obj = meta_raw
                    elif isinstance(meta_raw, str):
                        try:
                            parsed = json.loads(meta_raw)
                            meta_obj = parsed if isinstance(parsed, dict) else {"raw": parsed}
                        except Exception:
                            meta_obj = {"raw": meta_raw}
                    else:
                        meta_obj = {}
                    rows_out.append(
                        {
                            "id": int(r["id"]),
                            "agent_id": r["agent_id"],
                            "doc_id": r["doc_id"],
                            "version_no": int(r["version_no"]),
                            "text_hash": r["text_hash"],
                            "text_len": int(r["text_len"] or 0),
                            "text_preview": r["text_preview"],
                            "file_name": r["file_name"],
                            "dao_id": r["dao_id"],
                            "user_id": r["user_id"],
                            "storage_ref": r["storage_ref"],
                            "source": r["source"],
                            "metadata": meta_obj,
                            # ISO-8601 string for JSON friendliness.
                            "created_at": r["created_at"].isoformat() if r["created_at"] else None,
                        }
                    )
                return rows_out
        except Exception as e:
            logger.warning(f"list_document_versions failed: {e}")
    # PG unavailable fallback: aggregate distinct versions from Qdrant payloads.
    if self.qdrant_client:
        try:
            from qdrant_client.http import models as qmodels
            collection = f"{(agent_id or 'daarwizz').lower()}_docs"
            offset = None
            seen: Dict[int, Dict[str, Any]] = {}
            # Cap total points scanned so a very large collection cannot
            # stall the request; scaled by the requested limit.
            max_points = max(64, min(int(limit or 20) * 80, 4096))
            fetched = 0
            while fetched < max_points:
                points, next_offset = self.qdrant_client.scroll(
                    collection_name=collection,
                    scroll_filter=qmodels.Filter(
                        must=[
                            qmodels.FieldCondition(
                                key="doc_id",
                                match=qmodels.MatchValue(value=doc_id),
                            )
                        ]
                    ),
                    offset=offset,
                    limit=256,
                    with_payload=True,
                )
                if not points:
                    break
                fetched += len(points)
                for p in points:
                    payload = getattr(p, "payload", {}) or {}
                    # version_no may be stored as int or numeric string;
                    # anything else is treated as version 1.
                    ver_raw = payload.get("version_no")
                    if isinstance(ver_raw, int):
                        ver = ver_raw
                    elif isinstance(ver_raw, str) and ver_raw.isdigit():
                        ver = int(ver_raw)
                    else:
                        ver = 1
                    existing = seen.get(ver)
                    ts = payload.get("timestamp")
                    # Keep the newest payload per version; ISO timestamps
                    # compare correctly as strings.
                    if not existing or (ts and str(ts) > str(existing.get("created_at") or "")):
                        seen[ver] = {
                            "id": None,
                            "agent_id": (agent_id or "").lower(),
                            "doc_id": doc_id,
                            "version_no": int(ver),
                            "text_hash": None,
                            "text_len": None,
                            "text_preview": None,
                            "file_name": payload.get("file_name"),
                            "dao_id": payload.get("dao_id"),
                            "user_id": payload.get("user_id"),
                            "storage_ref": payload.get("storage_ref"),
                            "source": payload.get("source") or "ingest",
                            "metadata": payload.get("metadata") or {},
                            "created_at": ts,
                        }
                if not next_offset:
                    break
                offset = next_offset
            rows_out = sorted(seen.values(), key=lambda x: int(x.get("version_no") or 0), reverse=True)[: max(1, min(int(limit or 20), 200))]
        except Exception:
            # Best-effort fallback: swallow Qdrant errors and return what we have.
            pass
    return rows_out
def _build_doc_filter(
    self,
    doc_id: str,
    dao_id: Optional[str] = None,
):
    """Build a Qdrant filter matching one doc_id, optionally scoped to a dao_id."""
    from qdrant_client.http import models as qmodels

    conditions = [
        qmodels.FieldCondition(
            key="doc_id",
            match=qmodels.MatchValue(value=doc_id),
        )
    ]
    if dao_id:
        conditions.append(
            qmodels.FieldCondition(
                key="dao_id",
                match=qmodels.MatchValue(value=dao_id),
            )
        )
    return qmodels.Filter(must=conditions)
def _delete_document_points(
self,
collection: str,
doc_id: str,
dao_id: Optional[str] = None,
) -> bool:
if not self.qdrant_client:
return False
try:
from qdrant_client.http import models as qmodels
self.qdrant_client.delete(
collection_name=collection,
points_selector=qmodels.FilterSelector(
filter=self._build_doc_filter(doc_id=doc_id, dao_id=dao_id)
),
)
return True
except Exception as e:
logger.warning(f"delete_document_points failed for {collection}/{doc_id}: {e}")
return False
async def ingest_document_chunks(
self,
agent_id: str,
@@ -1304,6 +1623,10 @@ class MemoryRetrieval:
dao_id: Optional[str] = None,
user_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
replace_existing: bool = False,
version_no: Optional[int] = None,
source: str = "ingest",
storage_ref: Optional[str] = None,
) -> Dict[str, Any]:
"""
Ingest normalized document chunks into {agent_id}_docs collection.
@@ -1341,6 +1664,7 @@ class MemoryRetrieval:
logger.info(f"✅ Created collection: {collection}")
total = len(chunks)
resolved_version_no = int(version_no or 0) or await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id)
for idx, chunk in enumerate(chunks):
emb = await self.get_embedding(chunk[:2000])
if not emb:
@@ -1355,6 +1679,9 @@ class MemoryRetrieval:
"chunk_index": idx,
"chunks_total": total,
"type": "document_chunk",
"version_no": int(resolved_version_no),
"source": source,
"storage_ref": storage_ref,
"timestamp": datetime.utcnow().isoformat(),
}
if isinstance(metadata, dict) and metadata:
@@ -1370,18 +1697,70 @@ class MemoryRetrieval:
if not stored_points:
return {"ok": False, "error": "embedding_failed"}
# Keep previous versions in the same collection when updating.
# Query path will select only the latest version_no for doc_id.
self.qdrant_client.upsert(collection_name=collection, points=stored_points)
version_row = await self._record_document_version(
agent_id=agent_id,
doc_id=doc_id,
version_no=resolved_version_no,
text=body,
file_name=file_name,
dao_id=dao_id,
user_id=user_id,
metadata=metadata,
source=source,
storage_ref=storage_ref,
)
return {
"ok": True,
"doc_id": doc_id,
"version_no": int(resolved_version_no),
"version_id": version_row.get("id"),
"chunks_total": len(chunks),
"chunks_stored": len(stored_points),
"replaced_existing": bool(replace_existing),
"collection": collection,
}
except Exception as e:
logger.warning(f"ingest_document_chunks failed for {collection}: {e}")
return {"ok": False, "error": str(e)}
async def update_document_chunks(
    self,
    agent_id: str,
    doc_id: str,
    file_name: Optional[str],
    text: str,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    storage_ref: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Re-ingest a document's text under the same logical doc_id with a
    bumped version number. Older chunk versions stay in the collection
    (replace_existing=False); query paths select the newest version_no.
    """
    bumped = await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id)
    outcome = await self.ingest_document_chunks(
        agent_id=agent_id,
        doc_id=doc_id,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        metadata=metadata,
        replace_existing=False,
        version_no=bumped,
        source="update",
        storage_ref=storage_ref,
    )
    if not outcome.get("ok"):
        return outcome
    # Flag the logical replacement for callers even though prior versions
    # remain stored alongside the new chunks.
    outcome["updated"] = True
    outcome["replaced_existing"] = True
    return outcome
async def query_document_chunks(
self,
agent_id: str,
@@ -1412,12 +1791,20 @@ class MemoryRetrieval:
from qdrant_client.http import models as qmodels
must_conditions = []
if doc_id:
latest_ver = await self._latest_document_version_no(agent_id=agent_id, doc_id=doc_id)
must_conditions.append(
qmodels.FieldCondition(
key="doc_id",
match=qmodels.MatchValue(value=doc_id),
)
)
if latest_ver > 0:
must_conditions.append(
qmodels.FieldCondition(
key="version_no",
match=qmodels.MatchValue(value=int(latest_ver)),
)
)
if dao_id:
must_conditions.append(
qmodels.FieldCondition(
@@ -1455,6 +1842,7 @@ class MemoryRetrieval:
"file_name": payload.get("file_name"),
"chunk_index": payload.get("chunk_index"),
"chunks_total": payload.get("chunks_total"),
"version_no": payload.get("version_no"),
}
)