feat(docs): add standard file processing and router document ingest/query

This commit is contained in:
NODA1 System
2026-02-21 14:02:59 +01:00
parent 3e3546ea89
commit 5d52cf81c4
7 changed files with 755 additions and 104 deletions

View File

@@ -1235,6 +1235,27 @@ class InferResponse(BaseModel):
file_mime: Optional[str] = None
class DocumentIngestRequest(BaseModel):
    """Ingest document text into agent-specific docs collection."""

    agent_id: str  # target agent; server lowercases it to pick the {agent_id}_docs collection
    doc_id: str  # caller-supplied document id (server derives a deterministic one when blank)
    file_name: Optional[str] = None  # original file name, stored in each chunk payload
    text: str  # full document text to be chunked and embedded
    dao_id: Optional[str] = None  # optional DAO scoping stored with each chunk
    user_id: Optional[str] = None  # optional uploader id stored with each chunk
    metadata: Optional[Dict[str, Any]] = None  # free-form extras attached to every chunk payload
class DocumentQueryRequest(BaseModel):
    """Query document context from agent-specific docs collection."""

    agent_id: str  # agent whose {agent_id}_docs collection is searched
    question: str  # natural-language question to answer from the ingested document
    doc_id: Optional[str] = None  # restrict retrieval to a single ingested document
    dao_id: Optional[str] = None  # restrict retrieval to chunks tagged with this DAO
    user_id: Optional[str] = None  # NOTE(review): accepted but not currently applied as a retrieval filter
    limit: int = 5  # max chunks to retrieve (retrieval layer clamps to 1..12)
class SharedMemoryReviewRequest(BaseModel):
    """Review decision for a pending shared-memory point."""

    point_id: str  # id of the pending shared-memory point under review
    approve: bool  # presumably True approves / False rejects — confirm against review handler
@@ -2867,6 +2888,149 @@ async def agent_infer(agent_id: str, request: InferRequest):
)
@app.post("/v1/documents/ingest")
async def documents_ingest(request: DocumentIngestRequest):
    """
    Ingest raw document text into Qdrant {agent_id}_docs.

    Validates the request, derives a deterministic doc_id when none is
    supplied, and delegates chunking/embedding/upsert to the memory
    retrieval layer. Returns the retrieval layer's result on success,
    or a normalized error envelope on failure.
    """
    if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval:
        raise HTTPException(status_code=503, detail="Memory retrieval not available")

    agent_id = (request.agent_id or "").strip().lower()
    text = (request.text or "").strip()
    doc_id = (request.doc_id or "").strip()

    if not agent_id:
        raise HTTPException(status_code=400, detail="agent_id is required")
    if not text:
        raise HTTPException(status_code=400, detail="text is required")

    if not doc_id:
        # Derived id must stay deterministic for the same text + file name.
        seed = f"{agent_id}:{request.file_name or ''}:{text[:400]}"
        doc_id = hashlib.md5(seed.encode("utf-8")).hexdigest()[:16]

    result = await memory_retrieval.ingest_document_chunks(
        agent_id=agent_id,
        doc_id=doc_id,
        file_name=request.file_name,
        text=text,
        dao_id=request.dao_id,
        user_id=request.user_id,
        metadata=request.metadata,
    )
    if result.get("ok"):
        return result
    return {
        "ok": False,
        "error": result.get("error", "ingest_failed"),
        "doc_id": doc_id,
        "collection": result.get("collection"),
    }
@app.post("/v1/documents/query")
async def documents_query(request: DocumentQueryRequest):
    """
    Query ingested document chunks and synthesize source-locked answer.

    Retrieves top chunks from the agent's docs collection, builds a
    numbered, size-capped context, asks the internal LLM to answer using
    ONLY that context, and falls back to quoting the top chunk when
    synthesis fails or returns nothing.
    """
    if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval:
        raise HTTPException(status_code=503, detail="Memory retrieval not available")
    agent_id = (request.agent_id or "").strip().lower()
    if not agent_id:
        raise HTTPException(status_code=400, detail="agent_id is required")
    question = (request.question or "").strip()
    if not question:
        raise HTTPException(status_code=400, detail="question is required")
    # NOTE(review): request.user_id is accepted by the model but is not
    # forwarded here as a retrieval filter — confirm whether intended.
    lookup = await memory_retrieval.query_document_chunks(
        agent_id=agent_id,
        question=question,
        doc_id=request.doc_id,
        dao_id=request.dao_id,
        limit=request.limit,
    )
    chunks = lookup.get("chunks") or []
    if not chunks:
        return {
            "ok": False,
            "error": lookup.get("error", "no_relevant_chunks"),
            "data": {
                "answer": None,
                "citations": [],
                "doc_id": request.doc_id,
            },
        }
    # Build numbered citations and matching context blocks; the citation
    # index [i] lines up with the [source: N] references the LLM emits.
    citations: List[Dict[str, Any]] = []
    context_blocks: List[str] = []
    for i, ch in enumerate(chunks, start=1):
        c_doc_id = ch.get("doc_id") or request.doc_id
        c_file = ch.get("file_name")
        c_idx = ch.get("chunk_index")
        c_score = float(ch.get("score", 0.0) or 0.0)
        citations.append(
            {
                "doc_id": c_doc_id,
                "file_name": c_file,
                "chunk_index": c_idx,
                "score": round(c_score, 4),
            }
        )
        src = []
        if c_file:
            src.append(f"file={c_file}")
        if c_idx is not None:
            # chunk_index is stored 0-based; display 1-based.
            src.append(f"chunk={int(c_idx) + 1}")
        src_label = ", ".join(src) if src else "chunk"
        # Cap each context block at 1400 chars to bound prompt size.
        context_blocks.append(f"[{i}] ({src_label}) {str(ch.get('text') or '').strip()[:1400]}")
    answer_text = ""
    try:
        # Ukrainian prompt instructing the model to answer strictly from
        # the provided context and cite sources as [source: N].
        llm_req = InternalLLMRequest(
            prompt=(
                "Питання користувача:\n"
                f"{question}\n\n"
                "Контекст із документа (дозволено використовувати ТІЛЬКИ його):\n"
                + "\n\n".join(context_blocks)
                + "\n\n"
                "Правила відповіді:\n"
                "1) Відповідай лише на основі наведеного контексту.\n"
                "2) Якщо даних недостатньо, прямо скажи: 'Недостатньо даних у документі'.\n"
                "3) В кінці додай коротке посилання на джерело у форматі [source: N].\n"
            ),
            llm_profile="reasoning",
            max_tokens=320,
            temperature=0.1,
            role_context="Document QA source-locked",
            metadata={"agent_id": agent_id, "mode": "documents_query"},
        )
        llm_resp = await internal_llm_complete(llm_req)
        answer_text = (llm_resp.text or "").strip()
    except Exception as e:
        # Best-effort synthesis: log and fall through to the quote fallback.
        logger.warning(f"documents_query LLM synthesis failed: {e}")
    if not answer_text:
        # Synthesis failed or returned nothing: quote the top-scoring chunk.
        top = chunks[0]
        answer_text = (
            "Знайшов релевантний фрагмент у документі, але не вдалося сформувати підсумок. "
            f"Ось ключовий уривок:\n{str(top.get('text') or '').strip()[:1200]}"
        )
    return {
        "ok": True,
        "data": {
            "answer": answer_text,
            "citations": citations,
            "doc_id": request.doc_id or chunks[0].get("doc_id"),
            "chunks_used": len(chunks),
            "collection": lookup.get("collection"),
        },
    }
@app.get("/v1/models")
async def list_available_models():
"""List all available models across backends"""

View File

@@ -1237,6 +1237,234 @@ class MemoryRetrieval:
logger.warning(f"review_shared_pending_case failed: {e}")
return {"ok": False, "error": str(e)}
def _chunk_document_text(
self,
text: str,
chunk_chars: int = 1200,
overlap_chars: int = 180,
) -> List[str]:
"""
Split document text into overlap-aware chunks for RAG indexing.
Keeps paragraph structure when possible.
"""
raw = re.sub(r"\r\n?", "\n", text or "").strip()
if not raw:
return []
paragraphs = [p.strip() for p in re.split(r"\n{2,}", raw) if p and p.strip()]
if not paragraphs:
return []
chunks: List[str] = []
current = ""
max_hard = max(chunk_chars, 600)
def _push_current() -> None:
nonlocal current
if current and len(current.strip()) >= 20:
chunks.append(current.strip())
current = ""
for para in paragraphs:
if len(para) > max_hard * 2:
_push_current()
i = 0
step = max_hard - max(80, min(overlap_chars, max_hard // 2))
while i < len(para):
part = para[i : i + max_hard]
if len(part.strip()) >= 20:
chunks.append(part.strip())
i += max(1, step)
continue
candidate = f"{current}\n\n{para}".strip() if current else para
if len(candidate) <= max_hard:
current = candidate
continue
_push_current()
if overlap_chars > 0 and chunks:
tail = chunks[-1][-overlap_chars:]
current = f"{tail}\n\n{para}".strip()
if len(current) > max_hard:
_push_current()
current = para
else:
current = para
_push_current()
return chunks
async def ingest_document_chunks(
    self,
    agent_id: str,
    doc_id: str,
    file_name: Optional[str],
    text: str,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Ingest normalized document chunks into {agent_id}_docs collection.

    Chunks the text, embeds each chunk via self.get_embedding (gated on
    COHERE_API_KEY being configured), and upserts the vectors into a
    per-agent Qdrant collection, creating it (1024-dim, cosine) on first
    use.

    Returns:
        {"ok": True, "doc_id", "chunks_total", "chunks_stored",
         "collection"} on success; {"ok": False, "error": ...} otherwise.
        chunks_stored may be lower than chunks_total when individual
        embeddings fail.
    """
    if not self.qdrant_client:
        return {"ok": False, "error": "qdrant_unavailable"}
    if not COHERE_API_KEY:
        return {"ok": False, "error": "cohere_unavailable"}
    body = (text or "").strip()
    if not body:
        return {"ok": False, "error": "empty_document"}
    chunks = self._chunk_document_text(body)
    if not chunks:
        return {"ok": False, "error": "no_chunks"}
    collection = f"{(agent_id or 'daarwizz').lower()}_docs"
    stored_points = []
    try:
        from qdrant_client.http import models as qmodels
        import uuid
        # Create the per-agent docs collection lazily on first ingest.
        try:
            self.qdrant_client.get_collection(collection)
        except Exception:
            self.qdrant_client.create_collection(
                collection_name=collection,
                vectors_config=qmodels.VectorParams(
                    size=1024,  # embedding dimension; must match get_embedding output
                    distance=qmodels.Distance.COSINE,
                ),
            )
            logger.info(f"✅ Created collection: {collection}")
        total = len(chunks)
        for idx, chunk in enumerate(chunks):
            emb = await self.get_embedding(chunk[:2000])
            if not emb:
                # Skip chunks that fail to embed; reflected in chunks_stored.
                continue
            payload: Dict[str, Any] = {
                "text": chunk[:6000],
                "doc_id": doc_id,
                "file_name": file_name,
                "agent_id": (agent_id or "").lower(),
                "dao_id": dao_id,
                "user_id": user_id,
                "chunk_index": idx,
                "chunks_total": total,
                "type": "document_chunk",
                "timestamp": datetime.utcnow().isoformat(),
            }
            if isinstance(metadata, dict) and metadata:
                payload["metadata"] = metadata
            stored_points.append(
                qmodels.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=emb,
                    payload=payload,
                )
            )
        if not stored_points:
            return {"ok": False, "error": "embedding_failed"}
        self.qdrant_client.upsert(collection_name=collection, points=stored_points)
        return {
            "ok": True,
            "doc_id": doc_id,
            "chunks_total": len(chunks),
            "chunks_stored": len(stored_points),
            "collection": collection,
        }
    except Exception as e:
        logger.warning(f"ingest_document_chunks failed for {collection}: {e}")
        return {"ok": False, "error": str(e)}
async def query_document_chunks(
    self,
    agent_id: str,
    question: str,
    doc_id: Optional[str] = None,
    dao_id: Optional[str] = None,
    limit: int = 5,
    user_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Retrieve top document chunks from {agent_id}_docs for a question.

    Embeds the question and runs a filtered vector search; hits below a
    0.30 similarity score or with fewer than 10 chars of text are dropped.

    Args:
        agent_id: Agent whose docs collection is searched.
        question: Natural-language query text.
        doc_id: Optional filter to a single ingested document.
        dao_id: Optional filter to chunks tagged with this DAO.
        limit: Max results (clamped to 1..12).
        user_id: Optional filter to chunks stored for this user
            (matches the user_id written by ingest_document_chunks).

    Returns:
        {"ok": bool, "chunks": [...], "collection": str, "doc_id": ...};
        on failure "error" names the reason and "chunks" is [].
    """
    if not self.qdrant_client:
        return {"ok": False, "error": "qdrant_unavailable", "chunks": []}
    if not COHERE_API_KEY:
        return {"ok": False, "error": "cohere_unavailable", "chunks": []}
    q = (question or "").strip()
    if not q:
        return {"ok": False, "error": "empty_question", "chunks": []}
    embedding = await self.get_embedding(q[:2000])
    if not embedding:
        return {"ok": False, "error": "embedding_failed", "chunks": []}
    collection = f"{(agent_id or 'daarwizz').lower()}_docs"
    try:
        from qdrant_client.http import models as qmodels
        # Build equality filters only for the scopes the caller provided.
        must_conditions = []
        for key, value in (("doc_id", doc_id), ("dao_id", dao_id), ("user_id", user_id)):
            if value:
                must_conditions.append(
                    qmodels.FieldCondition(
                        key=key,
                        match=qmodels.MatchValue(value=value),
                    )
                )
        query_filter = qmodels.Filter(must=must_conditions) if must_conditions else None
        rows = self.qdrant_client.search(
            collection_name=collection,
            query_vector=embedding,
            query_filter=query_filter,
            limit=max(1, min(int(limit or 5), 12)),
            with_payload=True,
        )
    except Exception as e:
        logger.debug(f"query_document_chunks search failed for {collection}: {e}")
        return {"ok": False, "error": "search_failed", "chunks": [], "collection": collection}
    hits: List[Dict[str, Any]] = []
    for row in rows or []:
        score = float(getattr(row, "score", 0.0) or 0.0)
        if score < 0.30:
            continue  # drop weak matches
        payload = getattr(row, "payload", {}) or {}
        text = str(payload.get("text") or "").strip()
        if len(text) < 10:
            continue  # drop degenerate / near-empty chunks
        hits.append(
            {
                "text": text,
                "score": score,
                "doc_id": payload.get("doc_id"),
                "file_name": payload.get("file_name"),
                "chunk_index": payload.get("chunk_index"),
                "chunks_total": payload.get("chunks_total"),
            }
        )
    return {
        "ok": bool(hits),
        "chunks": hits,
        "collection": collection,
        "doc_id": doc_id,
    }
async def store_interaction(
self,
channel: str,

View File

@@ -11,10 +11,13 @@ import os
import asyncio
import logging
import base64
import json
import re
from typing import Optional, Dict, List, Any, Union
from datetime import datetime, timedelta
from enum import Enum
from io import BytesIO
import xml.etree.ElementTree as ET
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
@@ -56,16 +59,34 @@ def _csv_to_markdown(content: bytes) -> str:
text = _decode_text_bytes(content)
reader = csv.reader(text.splitlines())
rows = list(reader)
return _rows_to_markdown(rows)
def _tsv_to_markdown(content: bytes) -> str:
    """Render TSV bytes as a markdown table (first row is the header)."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed)
def _rows_to_markdown(rows: List[List[Any]]) -> str:
if not rows:
return ""
header = rows[0]
body = rows[1:]
width = max(len(r) for r in rows)
norm_rows = []
for r in rows:
rr = [str(c) if c is not None else "" for c in r]
if len(rr) < width:
rr.extend([""] * (width - len(rr)))
norm_rows.append(rr)
header = norm_rows[0]
body = norm_rows[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(["---"] * len(header)) + " |",
]
for row in body:
lines.append("| " + " | ".join(row) + " |")
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
return "\n".join(lines)
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
return "\n".join(parts)
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy .xls workbook to markdown, one table per sheet."""
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
    workbook = xlrd.open_workbook(file_contents=content)
    sections = []
    for sheet in workbook.sheets():
        sections.append(f"## Sheet: {sheet.name}")
        grid = [
            [sheet.cell_value(row_idx, col_idx) for col_idx in range(sheet.ncols)]
            for row_idx in range(sheet.nrows)
        ]
        sections.append(_rows_to_markdown(grid) if grid else "_Empty sheet_")
    return "\n\n".join(sections)
def _ods_to_markdown(content: bytes) -> str:
    """
    Convert an OpenDocument spreadsheet (.ods) to markdown tables.

    Emits one "## Sheet: name" section per table. Cell text is gathered
    from the paragraph (text:p) nodes inside each cell; ODS column-repeat
    attributes are expanded (capped at 100) so row widths stay aligned.

    Raises:
        HTTPException: 500 when odfpy is missing, 400 for invalid files.
    """
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")
    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")
    parts = []
    for table in doc.spreadsheet.getElementsByType(Table):
        table_name = str(table.getAttribute("name") or "Sheet")
        parts.append(f"## Sheet: {table_name}")
        rows: List[List[str]] = []
        for row in table.getElementsByType(TableRow):
            cells_out: List[str] = []
            for cell in row.getElementsByType(TableCell):
                # Collect text from the paragraph child nodes of the cell.
                txt_parts = []
                for p in cell.getElementsByType(P):
                    txt_parts.extend(
                        [str(getattr(node, "data", "")).strip() for node in p.childNodes if getattr(node, "data", None)]
                    )
                cell_text = " ".join([t for t in txt_parts if t]).strip()
                # Expand the ODS column-repeat attribute so each logical
                # column gets its own entry in the row.
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                repeat = max(1, min(repeat, 100))  # cap runaway repeat counts
                for _ in range(repeat):
                    cells_out.append(cell_text)
            if cells_out:
                rows.append(cells_out)
        if not rows:
            parts.append("_Empty sheet_")
            continue
        parts.append(_rows_to_markdown(rows))
    return "\n\n".join(parts)
def _docx_to_text(content: bytes) -> str:
try:
from docx import Document
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
return "\n\n".join(text_content)
def _pptx_to_text(content: bytes) -> str:
    """Extract visible text from a .pptx deck, one section per slide."""
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")
    deck = Presentation(BytesIO(content))
    sections = []
    for slide_no, slide in enumerate(deck.slides, start=1):
        sections.append(f"## Slide {slide_no}")
        texts = []
        for shape in slide.shapes:
            raw = getattr(shape, "text", None)
            if raw and str(raw).strip():
                texts.append(str(raw).strip())
        sections.extend(texts or ["_No text on this slide_"])
    return "\n\n".join(sections)
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON bytes; fall back to the raw decoded text on parse errors."""
    raw = _decode_text_bytes(content)
    try:
        return json.dumps(json.loads(raw), ensure_ascii=False, indent=2)
    except Exception:
        return raw
def _yaml_to_text(content: bytes) -> str:
    """Normalize YAML bytes via parse/re-dump; fall back to raw text on errors."""
    raw = _decode_text_bytes(content)
    try:
        return yaml.safe_dump(yaml.safe_load(raw), allow_unicode=True, sort_keys=False)
    except Exception:
        return raw
def _xml_to_text(content: bytes) -> str:
    """Flatten XML bytes to their concatenated text content; raw text on errors."""
    raw = _decode_text_bytes(content)
    try:
        fragments = (t.strip() for t in ET.fromstring(raw).itertext() if t)
        flattened = " ".join(f for f in fragments if f)
        return flattened or raw
    except Exception:
        return raw
def _html_to_text(content: bytes) -> str:
    """Strip HTML to plain text with BeautifulSoup; crude regex fallback without bs4."""
    raw = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup

        stripped = BeautifulSoup(raw, "html.parser").get_text(separator="\n")
        stripped = re.sub(r"\n{3,}", "\n\n", stripped)
        return stripped.strip() or raw
    except Exception:
        # bs4 missing or parse failure: remove tags and collapse whitespace.
        stripped = re.sub(r"<[^>]+>", " ", raw)
        return re.sub(r"\s+", " ", stripped).strip()
def _rtf_to_text(content: bytes) -> str:
    """Convert RTF to plain text via striprtf; crude regex fallback otherwise."""
    raw = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text

        return rtf_to_text(raw)
    except Exception:
        # Fallback: drop hex escapes, control words, and group braces.
        cleaned = re.sub(r"\\'[0-9a-fA-F]{2}", " ", raw)
        cleaned = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", cleaned)
        cleaned = cleaned.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", cleaned).strip()
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """
    Route file bytes to the extractor matching the filename extension.

    Raises HTTPException 400 for unsupported extensions.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    if ext in {"txt", "md", "markdown"}:
        return _decode_text_bytes(content)
    dispatch = {
        "csv": _csv_to_markdown,
        "tsv": _tsv_to_markdown,
        "xlsx": _xlsx_to_markdown,
        "xlsm": _xlsx_to_markdown,
        "xls": _xls_to_markdown,
        "ods": _ods_to_markdown,
        "docx": _docx_to_text,
        "pdf": _pdf_to_text,
        "pptx": _pptx_to_text,
        "json": _json_to_text,
        "yaml": _yaml_to_text,
        "yml": _yaml_to_text,
        "xml": _xml_to_text,
        "html": _html_to_text,
        "htm": _html_to_text,
        "rtf": _rtf_to_text,
    }
    handler = dispatch.get(ext)
    if handler is None:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
    return handler(content)
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
if total_size > max_total_mb * 1024 * 1024:
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
parts = []
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
allowed_exts = {
"txt", "md", "markdown", "csv", "tsv",
"xls", "xlsx", "xlsm", "ods",
"docx", "pdf", "pptx",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
}
processed = []
skipped = []
for member in members:
@@ -1655,7 +1837,8 @@ async def document_endpoint(
- json: Structured JSON with document elements
- text: Plain text extraction
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
Supported files:
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
"""
try:
import time
@@ -1672,15 +1855,28 @@ async def document_endpoint(
filename = file.filename if file else "document"
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
# Handle text-based formats without Docling
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
# Handle deterministic extraction for standard office/text formats
if file_ext in [
"txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
"pptx", "zip",
]:
try:
if file_ext == "zip":
content = _zip_to_markdown(doc_data)
output_format = "markdown"
else:
content = _extract_text_by_ext(filename, doc_data)
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
output_format = (
"markdown"
if file_ext in {
"md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
}
else "text"
)
processing_time_ms = (time.time() - start_time) * 1000
return {
"success": True,
@@ -1764,22 +1960,27 @@ async def document_endpoint(
"device": swapper.device
}
# For DOCX, try python-docx
if file_ext == "docx":
# For common office/text formats, try deterministic extractors.
if file_ext in {
"docx", "txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
}:
try:
content = _docx_to_text(doc_data)
content = _extract_text_by_ext(filename, doc_data)
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
return {
"success": True,
"model": "python-docx (fallback)",
"output_format": "text",
"model": "text-extract (fallback)",
"output_format": out_fmt,
"result": content,
"filename": filename,
"processing_time_ms": (time.time() - start_time) * 1000,
"device": swapper.device
}
except Exception as e:
logger.error(f"DOCX fallback failed: {e}")
raise HTTPException(status_code=500, detail="DOCX extraction failed")
logger.error(f"Text fallback failed for .{file_ext}: {e}")
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
# For PDFs, try pdfplumber
if file_ext == "pdf":
@@ -1807,7 +2008,7 @@ async def document_endpoint(
# For other documents, return error
raise HTTPException(
status_code=503,
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
)
finally:
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
# Run the FastAPI app directly with uvicorn when executed as a script.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8890)

View File

@@ -4,6 +4,15 @@ httpx==0.25.2
pydantic==2.5.0
pyyaml==6.0.1
python-multipart==0.0.6
chardet>=5.2.0
openpyxl>=3.1.2
python-docx>=1.1.2
pdfplumber>=0.11.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26
# HuggingFace dependencies for OCR models
torch>=2.0.0
@@ -25,4 +34,4 @@ safetensors>=0.4.0
# Web Scraping & Search
trafilatura>=1.6.0
duckduckgo-search>=4.0.0
duckduckgo-search>=4.0.0

View File

@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
python-docx>=1.1.0
openpyxl>=3.1.2
chardet>=5.2.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26