feat(docs): add versioned document update and versions APIs

This commit is contained in:
NODA1 System
2026-02-21 16:49:24 +01:00
parent 5d52cf81c4
commit f53e71a0f4
4 changed files with 764 additions and 4 deletions

View File

@@ -1256,6 +1256,18 @@ class DocumentQueryRequest(BaseModel):
limit: int = 5
class DocumentUpdateRequest(BaseModel):
    """Update existing document text and bump version."""

    # Owning agent; the endpoint strips and lowercases this before use.
    agent_id: str
    # Logical document identifier whose indexed chunks are being updated.
    doc_id: str
    # Optional new file name to store with the new version.
    file_name: Optional[str] = None
    # Full replacement text; re-chunked and re-indexed by the update path.
    text: str
    # Optional scoping identifiers carried through to the version record.
    dao_id: Optional[str] = None
    user_id: Optional[str] = None
    # Optional external storage pointer recorded alongside the version.
    storage_ref: Optional[str] = None
    # Free-form metadata merged into chunk payloads and the version row.
    metadata: Optional[Dict[str, Any]] = None
class SharedMemoryReviewRequest(BaseModel):
    """Review decision for a single shared-memory point."""

    # Identifier of the point under review.
    point_id: str
    # Review verdict flag — presumably True accepts and False rejects;
    # NOTE(review): confirm semantics against the handler using this model.
    approve: bool
@@ -2976,6 +2988,7 @@ async def documents_query(request: DocumentQueryRequest):
"doc_id": c_doc_id,
"file_name": c_file,
"chunk_index": c_idx,
"version_no": ch.get("version_no"),
"score": round(c_score, 4),
}
)
@@ -3031,6 +3044,76 @@ async def documents_query(request: DocumentQueryRequest):
}
@app.post("/v1/documents/update")
async def documents_update(request: DocumentUpdateRequest):
    """
    Re-index a document's text under the same doc_id and record a new version.

    Returns the ingest result on success, or an {"ok": False, ...} envelope
    with the failure reason when the update could not be applied.
    """
    if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval:
        raise HTTPException(status_code=503, detail="Memory retrieval not available")

    # Normalize inputs once, then reject any missing required field in
    # the same order the caller supplied them.
    agent_id = (request.agent_id or "").strip().lower()
    doc_id = (request.doc_id or "").strip()
    text = (request.text or "").strip()
    for value, message in (
        (agent_id, "agent_id is required"),
        (doc_id, "doc_id is required"),
        (text, "text is required"),
    ):
        if not value:
            raise HTTPException(status_code=400, detail=message)

    result = await memory_retrieval.update_document_chunks(
        agent_id=agent_id,
        doc_id=doc_id,
        file_name=request.file_name,
        text=text,
        dao_id=request.dao_id,
        user_id=request.user_id,
        metadata=request.metadata,
        storage_ref=request.storage_ref,
    )
    if result.get("ok"):
        return result
    return {
        "ok": False,
        "error": result.get("error", "update_failed"),
        "doc_id": doc_id,
        "collection": result.get("collection"),
    }
@app.get("/v1/documents/{doc_id}/versions")
async def documents_versions(doc_id: str, agent_id: str, limit: int = 20):
    """
    Return the stored version history for one document, newest first.
    """
    if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval:
        raise HTTPException(status_code=503, detail="Memory retrieval not available")

    # Agent ids are stored lowercased; document ids are used verbatim.
    normalized_agent = (agent_id or "").strip().lower()
    if not normalized_agent:
        raise HTTPException(status_code=400, detail="agent_id is required")

    normalized_doc = (doc_id or "").strip()
    if not normalized_doc:
        raise HTTPException(status_code=400, detail="doc_id is required")

    versions = await memory_retrieval.list_document_versions(
        agent_id=normalized_agent,
        doc_id=normalized_doc,
        limit=limit,
    )
    return {
        "ok": True,
        "agent_id": normalized_agent,
        "doc_id": normalized_doc,
        "total": len(versions),
        "items": versions,
    }
@app.get("/v1/models")
async def list_available_models():
"""List all available models across backends"""

View File

@@ -19,10 +19,10 @@ import os
import json
import logging
import re
import hashlib
from typing import Optional, Dict, Any, List
from dataclasses import dataclass, field
from datetime import datetime
import hashlib
import httpx
import asyncpg
@@ -40,6 +40,7 @@ NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "neo4j")
PENDING_QUESTIONS_LIMIT = int(os.getenv("AGENT_PENDING_QUESTIONS_LIMIT", "5"))
SHARED_AGRO_LIBRARY_ENABLED = os.getenv("AGROMATRIX_SHARED_LIBRARY_ENABLED", "true").lower() == "true"
SHARED_AGRO_LIBRARY_REQUIRE_REVIEW = os.getenv("AGROMATRIX_SHARED_LIBRARY_REQUIRE_REVIEW", "true").lower() == "true"
DOC_VERSION_PREVIEW_CHARS = int(os.getenv("DOC_VERSION_PREVIEW_CHARS", "240"))
@dataclass
@@ -245,6 +246,26 @@ class MemoryRetrieval:
ON agent_pending_questions (agent_id, channel, chat_id, user_id, status, created_at DESC);
CREATE UNIQUE INDEX IF NOT EXISTS idx_agent_pending_questions_unique_open
ON agent_pending_questions (agent_id, channel, chat_id, user_id, question_fingerprint, status);
CREATE TABLE IF NOT EXISTS agent_document_versions (
id BIGSERIAL PRIMARY KEY,
agent_id TEXT NOT NULL,
doc_id TEXT NOT NULL,
version_no INTEGER NOT NULL,
text_hash TEXT NOT NULL,
text_len INTEGER NOT NULL DEFAULT 0,
text_preview TEXT,
file_name TEXT,
dao_id TEXT,
user_id TEXT,
storage_ref TEXT,
source TEXT NOT NULL DEFAULT 'ingest',
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (agent_id, doc_id, version_no)
);
CREATE INDEX IF NOT EXISTS idx_agent_document_versions_latest
ON agent_document_versions (agent_id, doc_id, version_no DESC);
"""
)
except Exception as e:
@@ -1295,6 +1316,304 @@ class MemoryRetrieval:
_push_current()
return chunks
async def _next_document_version_no(
self,
agent_id: str,
doc_id: str,
) -> int:
if self.pg_pool:
try:
async with self.pg_pool.acquire() as conn:
value = await conn.fetchval(
"""
SELECT COALESCE(MAX(version_no), 0) + 1
FROM agent_document_versions
WHERE agent_id = $1
AND doc_id = $2
""",
(agent_id or "").lower(),
doc_id,
)
return max(1, int(value or 1))
except Exception as e:
logger.warning(f"next_document_version_no(pg) failed: {e}")
# Fallback: infer from existing chunk payloads in Qdrant.
if self.qdrant_client:
try:
from qdrant_client.http import models as qmodels
collection = f"{(agent_id or 'daarwizz').lower()}_docs"
points, _ = self.qdrant_client.scroll(
collection_name=collection,
scroll_filter=qmodels.Filter(
must=[
qmodels.FieldCondition(
key="doc_id",
match=qmodels.MatchValue(value=doc_id),
)
]
),
limit=256,
with_payload=True,
)
current_max = 0
for p in points or []:
payload = getattr(p, "payload", {}) or {}
ver = payload.get("version_no")
if isinstance(ver, int):
current_max = max(current_max, ver)
elif isinstance(ver, str) and ver.isdigit():
current_max = max(current_max, int(ver))
return current_max + 1 if current_max > 0 else 1
except Exception as e:
logger.debug(f"next_document_version_no(qdrant) fallback failed: {e}")
return 1
async def _latest_document_version_no(
self,
agent_id: str,
doc_id: str,
) -> int:
nxt = await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id)
return max(0, int(nxt) - 1)
async def _record_document_version(
    self,
    agent_id: str,
    doc_id: str,
    version_no: int,
    text: str,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    source: str = "ingest",
    storage_ref: Optional[str] = None,
) -> Dict[str, Any]:
    """Upsert one agent_document_versions row for (agent_id, doc_id, version_no).

    Persists a fingerprint of the text (sha256 hash, length, short preview)
    rather than the full body. Returns {"ok": True, "id": ..., "version_no": ...}
    on success and {"ok": False, "error": ...} on database failure. When no
    Postgres pool is configured the call is a no-op that still reports
    ok=True with id=None, keeping versioning best-effort.
    """
    text_body = (text or "").strip()
    # Fingerprint derives from the trimmed text so identical content always
    # hashes the same regardless of surrounding whitespace.
    text_hash = hashlib.sha256(text_body.encode("utf-8")).hexdigest() if text_body else ""
    text_len = len(text_body)
    preview = text_body[:DOC_VERSION_PREVIEW_CHARS] if text_body else ""
    # Non-dict metadata is discarded rather than stored raw.
    payload = metadata if isinstance(metadata, dict) else {}
    if not self.pg_pool:
        return {"ok": True, "version_no": int(version_no), "id": None}
    try:
        async with self.pg_pool.acquire() as conn:
            # Idempotent upsert: re-recording the same version overwrites
            # the stored fingerprint/attributes instead of failing the
            # (agent_id, doc_id, version_no) unique constraint.
            row = await conn.fetchrow(
                """
                INSERT INTO agent_document_versions
                    (agent_id, doc_id, version_no, text_hash, text_len, text_preview,
                     file_name, dao_id, user_id, storage_ref, source, metadata)
                VALUES
                    ($1, $2, $3, $4, $5, $6,
                     $7, $8, $9, $10, $11, $12::jsonb)
                ON CONFLICT (agent_id, doc_id, version_no)
                DO UPDATE SET
                    text_hash = EXCLUDED.text_hash,
                    text_len = EXCLUDED.text_len,
                    text_preview = EXCLUDED.text_preview,
                    file_name = EXCLUDED.file_name,
                    dao_id = EXCLUDED.dao_id,
                    user_id = EXCLUDED.user_id,
                    storage_ref = EXCLUDED.storage_ref,
                    source = EXCLUDED.source,
                    metadata = EXCLUDED.metadata
                RETURNING id, version_no
                """,
                (agent_id or "").lower(),
                doc_id,
                int(version_no),
                text_hash,
                int(text_len),
                preview,
                file_name,
                dao_id,
                user_id,
                storage_ref,
                source,
                json.dumps(payload),
            )
            return {
                "ok": True,
                "id": int(row["id"]) if row and row.get("id") is not None else None,
                "version_no": int(row["version_no"]) if row and row.get("version_no") is not None else int(version_no),
            }
    except Exception as e:
        # Version bookkeeping failure is logged but never raised; callers
        # inspect the "ok" flag.
        logger.warning(f"record_document_version failed: {e}")
        return {"ok": False, "error": str(e), "version_no": int(version_no)}
async def list_document_versions(
    self,
    agent_id: str,
    doc_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """List stored versions for a document, newest first.

    Primary source is the agent_document_versions table in Postgres. When
    Postgres is unavailable, falls back to aggregating distinct version_no
    values from chunk payloads in Qdrant; fallback rows carry no id, hash,
    length, or preview. limit is clamped to the 1..200 range. Returns an
    empty list when neither backend is reachable.
    """
    rows_out: List[Dict[str, Any]] = []
    if self.pg_pool:
        try:
            async with self.pg_pool.acquire() as conn:
                rows = await conn.fetch(
                    """
                    SELECT id, agent_id, doc_id, version_no, text_hash, text_len, text_preview,
                           file_name, dao_id, user_id, storage_ref, source, metadata, created_at
                    FROM agent_document_versions
                    WHERE agent_id = $1
                      AND doc_id = $2
                    ORDER BY version_no DESC
                    LIMIT $3
                    """,
                    (agent_id or "").lower(),
                    doc_id,
                    max(1, min(int(limit or 20), 200)),
                )
                for r in rows:
                    # metadata may arrive as a dict or as JSON text depending
                    # on the connection's codec setup; normalize to a dict,
                    # wrapping non-object JSON under a "raw" key.
                    meta_raw = r["metadata"]
                    if isinstance(meta_raw, dict):
                        meta_obj = meta_raw
                    elif isinstance(meta_raw, str):
                        try:
                            parsed = json.loads(meta_raw)
                            meta_obj = parsed if isinstance(parsed, dict) else {"raw": parsed}
                        except Exception:
                            meta_obj = {"raw": meta_raw}
                    else:
                        meta_obj = {}
                    rows_out.append(
                        {
                            "id": int(r["id"]),
                            "agent_id": r["agent_id"],
                            "doc_id": r["doc_id"],
                            "version_no": int(r["version_no"]),
                            "text_hash": r["text_hash"],
                            "text_len": int(r["text_len"] or 0),
                            "text_preview": r["text_preview"],
                            "file_name": r["file_name"],
                            "dao_id": r["dao_id"],
                            "user_id": r["user_id"],
                            "storage_ref": r["storage_ref"],
                            "source": r["source"],
                            "metadata": meta_obj,
                            # ISO-8601 string for JSON friendliness.
                            "created_at": r["created_at"].isoformat() if r["created_at"] else None,
                        }
                    )
                return rows_out
        except Exception as e:
            logger.warning(f"list_document_versions failed: {e}")
    # PG unavailable fallback: aggregate distinct versions from Qdrant payloads.
    if self.qdrant_client:
        try:
            from qdrant_client.http import models as qmodels
            collection = f"{(agent_id or 'daarwizz').lower()}_docs"
            offset = None
            seen: Dict[int, Dict[str, Any]] = {}
            # Cap total points scanned so a very large collection cannot
            # stall the request; scaled by the requested limit.
            max_points = max(64, min(int(limit or 20) * 80, 4096))
            fetched = 0
            while fetched < max_points:
                points, next_offset = self.qdrant_client.scroll(
                    collection_name=collection,
                    scroll_filter=qmodels.Filter(
                        must=[
                            qmodels.FieldCondition(
                                key="doc_id",
                                match=qmodels.MatchValue(value=doc_id),
                            )
                        ]
                    ),
                    offset=offset,
                    limit=256,
                    with_payload=True,
                )
                if not points:
                    break
                fetched += len(points)
                for p in points:
                    payload = getattr(p, "payload", {}) or {}
                    # version_no may be stored as int or numeric string;
                    # anything else is treated as version 1.
                    ver_raw = payload.get("version_no")
                    if isinstance(ver_raw, int):
                        ver = ver_raw
                    elif isinstance(ver_raw, str) and ver_raw.isdigit():
                        ver = int(ver_raw)
                    else:
                        ver = 1
                    existing = seen.get(ver)
                    ts = payload.get("timestamp")
                    # Keep the newest payload per version; ISO timestamps
                    # compare correctly as strings.
                    if not existing or (ts and str(ts) > str(existing.get("created_at") or "")):
                        seen[ver] = {
                            "id": None,
                            "agent_id": (agent_id or "").lower(),
                            "doc_id": doc_id,
                            "version_no": int(ver),
                            "text_hash": None,
                            "text_len": None,
                            "text_preview": None,
                            "file_name": payload.get("file_name"),
                            "dao_id": payload.get("dao_id"),
                            "user_id": payload.get("user_id"),
                            "storage_ref": payload.get("storage_ref"),
                            "source": payload.get("source") or "ingest",
                            "metadata": payload.get("metadata") or {},
                            "created_at": ts,
                        }
                if not next_offset:
                    break
                offset = next_offset
            rows_out = sorted(seen.values(), key=lambda x: int(x.get("version_no") or 0), reverse=True)[: max(1, min(int(limit or 20), 200))]
        except Exception:
            # Best-effort fallback: swallow Qdrant errors and return what we have.
            pass
    return rows_out
def _build_doc_filter(
    self,
    doc_id: str,
    dao_id: Optional[str] = None,
):
    """Build a Qdrant filter matching one doc_id, optionally scoped to a dao_id."""
    from qdrant_client.http import models as qmodels

    conditions = [
        qmodels.FieldCondition(
            key="doc_id",
            match=qmodels.MatchValue(value=doc_id),
        )
    ]
    if dao_id:
        conditions.append(
            qmodels.FieldCondition(
                key="dao_id",
                match=qmodels.MatchValue(value=dao_id),
            )
        )
    return qmodels.Filter(must=conditions)
def _delete_document_points(
self,
collection: str,
doc_id: str,
dao_id: Optional[str] = None,
) -> bool:
if not self.qdrant_client:
return False
try:
from qdrant_client.http import models as qmodels
self.qdrant_client.delete(
collection_name=collection,
points_selector=qmodels.FilterSelector(
filter=self._build_doc_filter(doc_id=doc_id, dao_id=dao_id)
),
)
return True
except Exception as e:
logger.warning(f"delete_document_points failed for {collection}/{doc_id}: {e}")
return False
async def ingest_document_chunks(
self,
agent_id: str,
@@ -1304,6 +1623,10 @@ class MemoryRetrieval:
dao_id: Optional[str] = None,
user_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
replace_existing: bool = False,
version_no: Optional[int] = None,
source: str = "ingest",
storage_ref: Optional[str] = None,
) -> Dict[str, Any]:
"""
Ingest normalized document chunks into {agent_id}_docs collection.
@@ -1341,6 +1664,7 @@ class MemoryRetrieval:
logger.info(f"✅ Created collection: {collection}")
total = len(chunks)
resolved_version_no = int(version_no or 0) or await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id)
for idx, chunk in enumerate(chunks):
emb = await self.get_embedding(chunk[:2000])
if not emb:
@@ -1355,6 +1679,9 @@ class MemoryRetrieval:
"chunk_index": idx,
"chunks_total": total,
"type": "document_chunk",
"version_no": int(resolved_version_no),
"source": source,
"storage_ref": storage_ref,
"timestamp": datetime.utcnow().isoformat(),
}
if isinstance(metadata, dict) and metadata:
@@ -1370,18 +1697,70 @@ class MemoryRetrieval:
if not stored_points:
return {"ok": False, "error": "embedding_failed"}
# Keep previous versions in the same collection when updating.
# Query path will select only the latest version_no for doc_id.
self.qdrant_client.upsert(collection_name=collection, points=stored_points)
version_row = await self._record_document_version(
agent_id=agent_id,
doc_id=doc_id,
version_no=resolved_version_no,
text=body,
file_name=file_name,
dao_id=dao_id,
user_id=user_id,
metadata=metadata,
source=source,
storage_ref=storage_ref,
)
return {
"ok": True,
"doc_id": doc_id,
"version_no": int(resolved_version_no),
"version_id": version_row.get("id"),
"chunks_total": len(chunks),
"chunks_stored": len(stored_points),
"replaced_existing": bool(replace_existing),
"collection": collection,
}
except Exception as e:
logger.warning(f"ingest_document_chunks failed for {collection}: {e}")
return {"ok": False, "error": str(e)}
async def update_document_chunks(
    self,
    agent_id: str,
    doc_id: str,
    file_name: Optional[str],
    text: str,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    storage_ref: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Re-ingest a document's text under the same logical doc_id with a
    bumped version number. Older chunk versions stay in the collection
    (replace_existing=False); query paths select the newest version_no.
    """
    bumped = await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id)
    outcome = await self.ingest_document_chunks(
        agent_id=agent_id,
        doc_id=doc_id,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        metadata=metadata,
        replace_existing=False,
        version_no=bumped,
        source="update",
        storage_ref=storage_ref,
    )
    if not outcome.get("ok"):
        return outcome
    # Flag the logical replacement for callers even though prior versions
    # remain stored alongside the new chunks.
    outcome["updated"] = True
    outcome["replaced_existing"] = True
    return outcome
async def query_document_chunks(
self,
agent_id: str,
@@ -1412,12 +1791,20 @@ class MemoryRetrieval:
from qdrant_client.http import models as qmodels
must_conditions = []
if doc_id:
latest_ver = await self._latest_document_version_no(agent_id=agent_id, doc_id=doc_id)
must_conditions.append(
qmodels.FieldCondition(
key="doc_id",
match=qmodels.MatchValue(value=doc_id),
)
)
if latest_ver > 0:
must_conditions.append(
qmodels.FieldCondition(
key="version_no",
match=qmodels.MatchValue(value=int(latest_ver)),
)
)
if dao_id:
must_conditions.append(
qmodels.FieldCondition(
@@ -1455,6 +1842,7 @@ class MemoryRetrieval:
"file_name": payload.get("file_name"),
"chunk_index": payload.get("chunk_index"),
"chunks_total": payload.get("chunks_total"),
"version_no": payload.get("version_no"),
}
)