feat(docs): add standard file processing and router document ingest/query
This commit is contained in:
@@ -1237,6 +1237,234 @@ class MemoryRetrieval:
|
||||
logger.warning(f"review_shared_pending_case failed: {e}")
|
||||
return {"ok": False, "error": str(e)}
|
||||
|
||||
def _chunk_document_text(
|
||||
self,
|
||||
text: str,
|
||||
chunk_chars: int = 1200,
|
||||
overlap_chars: int = 180,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Split document text into overlap-aware chunks for RAG indexing.
|
||||
Keeps paragraph structure when possible.
|
||||
"""
|
||||
raw = re.sub(r"\r\n?", "\n", text or "").strip()
|
||||
if not raw:
|
||||
return []
|
||||
|
||||
paragraphs = [p.strip() for p in re.split(r"\n{2,}", raw) if p and p.strip()]
|
||||
if not paragraphs:
|
||||
return []
|
||||
|
||||
chunks: List[str] = []
|
||||
current = ""
|
||||
max_hard = max(chunk_chars, 600)
|
||||
|
||||
def _push_current() -> None:
|
||||
nonlocal current
|
||||
if current and len(current.strip()) >= 20:
|
||||
chunks.append(current.strip())
|
||||
current = ""
|
||||
|
||||
for para in paragraphs:
|
||||
if len(para) > max_hard * 2:
|
||||
_push_current()
|
||||
i = 0
|
||||
step = max_hard - max(80, min(overlap_chars, max_hard // 2))
|
||||
while i < len(para):
|
||||
part = para[i : i + max_hard]
|
||||
if len(part.strip()) >= 20:
|
||||
chunks.append(part.strip())
|
||||
i += max(1, step)
|
||||
continue
|
||||
|
||||
candidate = f"{current}\n\n{para}".strip() if current else para
|
||||
if len(candidate) <= max_hard:
|
||||
current = candidate
|
||||
continue
|
||||
|
||||
_push_current()
|
||||
if overlap_chars > 0 and chunks:
|
||||
tail = chunks[-1][-overlap_chars:]
|
||||
current = f"{tail}\n\n{para}".strip()
|
||||
if len(current) > max_hard:
|
||||
_push_current()
|
||||
current = para
|
||||
else:
|
||||
current = para
|
||||
|
||||
_push_current()
|
||||
return chunks
|
||||
|
||||
async def ingest_document_chunks(
    self,
    agent_id: str,
    doc_id: str,
    file_name: Optional[str],
    text: str,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Ingest normalized document chunks into {agent_id}_docs collection.

    The document text is normalized, chunked, embedded chunk-by-chunk via
    ``get_embedding`` and upserted into Qdrant in a single batch.  Chunks
    whose embedding fails are skipped (best effort), which is why the
    result reports both ``chunks_total`` and ``chunks_stored``.

    Returns:
        Dict with ``ok`` plus either an ``error`` code or ingest stats
        (``doc_id``, ``chunks_total``, ``chunks_stored``, ``collection``).
    """
    # Hard prerequisites: vector store and embedding provider must be up.
    if not self.qdrant_client:
        return {"ok": False, "error": "qdrant_unavailable"}
    if not COHERE_API_KEY:
        return {"ok": False, "error": "cohere_unavailable"}

    normalized = (text or "").strip()
    if not normalized:
        return {"ok": False, "error": "empty_document"}

    pieces = self._chunk_document_text(normalized)
    if not pieces:
        return {"ok": False, "error": "no_chunks"}

    collection = f"{(agent_id or 'daarwizz').lower()}_docs"
    points = []

    try:
        import uuid

        from qdrant_client.http import models as qmodels

        # Lazily create the per-agent docs collection on first use.
        try:
            self.qdrant_client.get_collection(collection)
        except Exception:
            self.qdrant_client.create_collection(
                collection_name=collection,
                vectors_config=qmodels.VectorParams(
                    size=1024,
                    distance=qmodels.Distance.COSINE,
                ),
            )
            logger.info(f"✅ Created collection: {collection}")

        n_chunks = len(pieces)
        for index, piece in enumerate(pieces):
            # Embed a bounded prefix; skip this chunk if embedding fails.
            emb = await self.get_embedding(piece[:2000])
            if not emb:
                continue

            payload: Dict[str, Any] = {
                "text": piece[:6000],
                "doc_id": doc_id,
                "file_name": file_name,
                "agent_id": (agent_id or "").lower(),
                "dao_id": dao_id,
                "user_id": user_id,
                "chunk_index": index,
                "chunks_total": n_chunks,
                "type": "document_chunk",
                "timestamp": datetime.utcnow().isoformat(),
            }
            if isinstance(metadata, dict) and metadata:
                payload["metadata"] = metadata

            points.append(
                qmodels.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=emb,
                    payload=payload,
                )
            )

        if not points:
            return {"ok": False, "error": "embedding_failed"}

        # One batched upsert for the whole document.
        self.qdrant_client.upsert(collection_name=collection, points=points)
        return {
            "ok": True,
            "doc_id": doc_id,
            "chunks_total": len(pieces),
            "chunks_stored": len(points),
            "collection": collection,
        }
    except Exception as e:
        logger.warning(f"ingest_document_chunks failed for {collection}: {e}")
        return {"ok": False, "error": str(e)}
async def query_document_chunks(
    self,
    agent_id: str,
    question: str,
    doc_id: Optional[str] = None,
    dao_id: Optional[str] = None,
    limit: int = 5,
) -> Dict[str, Any]:
    """
    Retrieve top document chunks from {agent_id}_docs for a question.

    Embeds the question, runs a vector search optionally scoped to one
    ``doc_id`` and/or ``dao_id``, and keeps chunks scoring at least 0.30
    with non-trivial text, each carrying its source-document fields.

    Returns:
        Dict with ``ok`` (True iff any chunk matched), the matched
        ``chunks``, the queried ``collection`` and the ``doc_id`` scope;
        on failure ``ok`` is False with an ``error`` code.
    """
    # Hard prerequisites: vector store and embedding provider must be up.
    if not self.qdrant_client:
        return {"ok": False, "error": "qdrant_unavailable", "chunks": []}
    if not COHERE_API_KEY:
        return {"ok": False, "error": "cohere_unavailable", "chunks": []}

    query_text = (question or "").strip()
    if not query_text:
        return {"ok": False, "error": "empty_question", "chunks": []}

    vector = await self.get_embedding(query_text[:2000])
    if not vector:
        return {"ok": False, "error": "embedding_failed", "chunks": []}

    collection = f"{(agent_id or 'daarwizz').lower()}_docs"

    try:
        from qdrant_client.http import models as qmodels

        # Optional scoping filters: restrict to one document and/or DAO.
        conditions = [
            qmodels.FieldCondition(key=field, match=qmodels.MatchValue(value=value))
            for field, value in (("doc_id", doc_id), ("dao_id", dao_id))
            if value
        ]
        search_filter = qmodels.Filter(must=conditions) if conditions else None

        results = self.qdrant_client.search(
            collection_name=collection,
            query_vector=vector,
            query_filter=search_filter,
            limit=max(1, min(int(limit or 5), 12)),
            with_payload=True,
        )
    except Exception as e:
        logger.debug(f"query_document_chunks search failed for {collection}: {e}")
        return {"ok": False, "error": "search_failed", "chunks": [], "collection": collection}

    matches: List[Dict[str, Any]] = []
    for point in results or []:
        score = float(getattr(point, "score", 0.0) or 0.0)
        payload = getattr(point, "payload", {}) or {}
        chunk_text = str(payload.get("text") or "").strip()
        # Keep only confidently-relevant, non-trivial chunks.
        if score < 0.30 or len(chunk_text) < 10:
            continue
        matches.append(
            {
                "text": chunk_text,
                "score": score,
                "doc_id": payload.get("doc_id"),
                "file_name": payload.get("file_name"),
                "chunk_index": payload.get("chunk_index"),
                "chunks_total": payload.get("chunks_total"),
            }
        )

    return {
        "ok": bool(matches),
        "chunks": matches,
        "collection": collection,
        "doc_id": doc_id,
    }
async def store_interaction(
|
||||
self,
|
||||
channel: str,
|
||||
|
||||
Reference in New Issue
Block a user