# Source: microdao-daarion/gateway-bot/services/doc_service.py
# (Repository-viewer chrome and the ambiguous-Unicode warning banner removed;
#  note: some string literals below may contain non-ASCII spaces/characters.)
"""
Document Workflow Service
Channel-agnostic service for document parsing, ingestion, and RAG queries.
This service can be used by:
- Telegram bots
- Web applications
- Mobile apps
- Any other client
"""
import os
import logging
import hashlib
import base64
import json
import re
from typing import Optional, Dict, Any, List
from pydantic import BaseModel
from datetime import datetime
from io import BytesIO
from memory_client import memory_client
logger = logging.getLogger(__name__)
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service")
GATEWAY_PUBLIC_BASE_URL = os.getenv("GATEWAY_PUBLIC_BASE_URL", "").rstrip("/")
class QAItem(BaseModel):
    """Single Q&A pair extracted from a document."""
    # Question text as generated/extracted from the document.
    question: str
    # Corresponding answer text.
    answer: str
class ParsedResult(BaseModel):
    """Result of document parsing."""
    success: bool
    # Short hash-based id assigned by parse_document (None on failure).
    doc_id: Optional[str] = None
    # Q&A pairs; always None on the direct-parse path (would need an LLM pass).
    qa_pairs: Optional[List[QAItem]] = None
    # Parsed text: raw markdown, or plain text wrapped in a fenced code block.
    markdown: Optional[str] = None
    # Chunking summary; None when chunking was not performed.
    chunks_meta: Optional[Dict[str, Any]] = None
    # Raw parser/service response for diagnostics.
    raw: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
class IngestResult(BaseModel):
    """Result of document ingestion to RAG."""
    success: bool
    doc_id: Optional[str] = None
    # Number of chunks stored by the router (0 when unknown/failed).
    ingested_chunks: int = 0
    # "ingested" | "failed" | "unknown".
    status: str = "unknown"
    error: Optional[str] = None
class UpdateResult(BaseModel):
    """Result of document update with version bump."""
    success: bool
    doc_id: Optional[str] = None
    # Router-side version counters (None when the router omits them).
    version_no: Optional[int] = None
    version_id: Optional[int] = None
    updated_chunks: int = 0
    # "updated" | "updated_published" | "updated_publish_failed" | "failed" | "unknown".
    status: str = "unknown"
    # Set when the update succeeded but the optional artifact publish failed.
    publish_error: Optional[str] = None
    # Artifact write-back details (populated only when publish was requested).
    artifact_id: Optional[str] = None
    artifact_version_id: Optional[str] = None
    artifact_storage_key: Optional[str] = None
    artifact_mime: Optional[str] = None
    artifact_download_url: Optional[str] = None
    error: Optional[str] = None
class PublishResult(BaseModel):
    """Result of artifact write-back publish."""
    success: bool
    artifact_id: Optional[str] = None
    version_id: Optional[str] = None
    # Storage key returned by the artifact registry, if any.
    storage_key: Optional[str] = None
    mime: Optional[str] = None
    file_name: Optional[str] = None
    # Gateway download URL (absolute when GATEWAY_PUBLIC_BASE_URL is set).
    download_url: Optional[str] = None
    error: Optional[str] = None
class QAResult(BaseModel):
    """Result of RAG query about a document."""
    success: bool
    answer: Optional[str] = None
    doc_id: Optional[str] = None
    # Citations/sources returned by the router, or the deterministic Excel source.
    sources: Optional[List[Dict[str, Any]]] = None
    error: Optional[str] = None
class DocContext(BaseModel):
    """Document context stored in Memory Service (one per session)."""
    doc_id: str
    dao_id: Optional[str] = None
    user_id: Optional[str] = None
    doc_url: Optional[str] = None
    file_name: Optional[str] = None
    # ISO timestamp written by save_doc_context.
    saved_at: Optional[str] = None
class DocumentService:
    """
    Channel-agnostic service for document operations.

    Handles:
    - Document parsing (PDF, images)
    - Document ingestion to RAG
    - RAG queries about documents
    """
    def __init__(self):
        """Initialize document service with the shared Memory Service client."""
        self.memory_client = memory_client
async def _router_post_json(
self,
path: str,
payload: Dict[str, Any],
timeout: float = 45.0,
) -> Dict[str, Any]:
import httpx
base = ROUTER_URL.rstrip("/")
url = f"{base}{path}"
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(url, json=payload)
body = {}
try:
body = resp.json()
except Exception:
body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
if resp.status_code >= 400:
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
raise RuntimeError(f"Router error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
async def _router_get_json(
self,
path: str,
timeout: float = 30.0,
) -> Dict[str, Any]:
import httpx
base = ROUTER_URL.rstrip("/")
url = f"{base}{path}"
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.get(url)
body = {}
try:
body = resp.json()
except Exception:
body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
if resp.status_code >= 400:
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
raise RuntimeError(f"Router error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
async def _artifact_post_json(
self,
path: str,
payload: Dict[str, Any],
timeout: float = 45.0,
) -> Dict[str, Any]:
import httpx
base = ARTIFACT_REGISTRY_URL.rstrip("/")
url = f"{base}{path}"
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(url, json=payload)
body = {}
try:
body = resp.json()
except Exception:
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
if resp.status_code >= 400:
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
raise RuntimeError(f"Artifact registry error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
async def _artifact_get_json(
self,
path: str,
timeout: float = 30.0,
) -> Dict[str, Any]:
import httpx
base = ARTIFACT_REGISTRY_URL.rstrip("/")
url = f"{base}{path}"
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.get(url)
body = {}
try:
body = resp.json()
except Exception:
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
if resp.status_code >= 400:
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
raise RuntimeError(f"Artifact registry error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str:
fmt = (target_format or "").strip().lower().lstrip(".")
if fmt:
return fmt
if file_name and "." in file_name:
return file_name.rsplit(".", 1)[1].strip().lower()
return "txt"
def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str:
base = "document"
if file_name:
base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
if "." in base:
base = base.rsplit(".", 1)[0]
elif doc_id:
base = doc_id
safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document"
return f"{safe_base}.{fmt}"
def _gateway_artifact_download_path(self, artifact_id: str, version_id: str) -> str:
aid = (artifact_id or "").strip()
vid = (version_id or "").strip()
return f"/api/doc/artifacts/{aid}/versions/{vid}/download"
def _gateway_artifact_download_url(self, artifact_id: str, version_id: str) -> str:
path = self._gateway_artifact_download_path(artifact_id, version_id)
if GATEWAY_PUBLIC_BASE_URL:
return f"{GATEWAY_PUBLIC_BASE_URL}{path}"
return path
def _render_document_bytes(
self,
text: str,
file_name: Optional[str],
doc_id: str,
target_format: Optional[str] = None,
) -> Dict[str, Any]:
body = (text or "").strip()
if not body:
raise ValueError("Cannot render empty document text")
fmt = self._resolve_format(file_name=file_name, target_format=target_format)
output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt)
if fmt in {"txt"}:
payload = body.encode("utf-8")
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": output_name}
if fmt in {"md", "markdown"}:
payload = body.encode("utf-8")
return {"bytes": payload, "mime": "text/markdown; charset=utf-8", "file_name": output_name}
if fmt in {"json"}:
parsed: Any
try:
parsed = json.loads(body)
except Exception:
parsed = {"text": body}
payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8")
return {"bytes": payload, "mime": "application/json", "file_name": output_name}
if fmt in {"csv"}:
payload = body.encode("utf-8")
return {"bytes": payload, "mime": "text/csv; charset=utf-8", "file_name": output_name}
if fmt in {"xlsx", "xlsm", "xls"}:
try:
from openpyxl import Workbook
except Exception as e:
raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}")
wb = Workbook()
ws = wb.active
ws.title = "Document"
lines = [ln for ln in body.splitlines()] or [body]
for idx, line in enumerate(lines, start=1):
ws.cell(row=idx, column=1, value=line)
buf = BytesIO()
wb.save(buf)
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")}
if fmt in {"docx"}:
try:
from docx import Document
except Exception as e:
raise RuntimeError(f"python-docx is required for docx rendering: {e}")
doc = Document()
for line in body.splitlines():
doc.add_paragraph(line if line else " ")
buf = BytesIO()
doc.save(buf)
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")}
payload = body.encode("utf-8")
fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt")
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": fallback_name}
    async def _publish_text_artifact(
        self,
        text: str,
        doc_id: str,
        file_name: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> PublishResult:
        """
        Render ``text`` to bytes and store it as a new artifact version.

        When ``artifact_id`` is not supplied, a fresh artifact is registered
        first. Never raises: all failures are reported via PublishResult.error.

        Args:
            text: Document text to render (must be non-empty after stripping).
            doc_id: Logical document id recorded in the version metadata.
            file_name: Original file name used for format/name inference.
            dao_id: Project/ACL scope for a newly created artifact.
            user_id: Creator recorded on a newly created artifact.
            artifact_id: Existing artifact to append a version to, if any.
            target_format: Explicit render format override (e.g. "docx").
            label: Version label (defaults to "edited").
            metadata: Extra key/values merged into the version meta_json.
        """
        try:
            rendered = self._render_document_bytes(
                text=text,
                file_name=file_name,
                doc_id=doc_id,
                target_format=target_format,
            )
            content_bytes = rendered["bytes"]
            # Registry accepts content as base64 text.
            content_b64 = base64.b64encode(content_bytes).decode("ascii")
            effective_artifact_id = (artifact_id or "").strip()
            if not effective_artifact_id:
                # No target artifact given: register a new one first.
                create_resp = await self._artifact_post_json(
                    "/artifacts",
                    {
                        "type": "doc",
                        "title": file_name or doc_id,
                        "project_id": dao_id,
                        "acl_ref": dao_id,
                        "created_by": user_id or DOC_WRITEBACK_CREATED_BY,
                    },
                    timeout=30.0,
                )
                effective_artifact_id = str(create_resp.get("artifact_id") or "").strip()
                if not effective_artifact_id:
                    return PublishResult(success=False, error="Artifact create failed: empty artifact_id")
            # Base metadata; caller-provided keys override the defaults.
            meta = {"doc_id": doc_id, "source": "doc_update_publish"}
            if isinstance(metadata, dict):
                meta.update(metadata)
            version_resp = await self._artifact_post_json(
                f"/artifacts/{effective_artifact_id}/versions/from_base64",
                {
                    "content_base64": content_b64,
                    "mime": rendered["mime"],
                    "filename": rendered["file_name"],
                    "label": label or "edited",
                    "meta_json": meta,
                },
                timeout=45.0,
            )
            version_id = str(version_resp.get("version_id") or "").strip()
            storage_key = version_resp.get("storage_key")
            if not version_id:
                return PublishResult(
                    success=False,
                    artifact_id=effective_artifact_id,
                    error="Artifact version create failed: empty version_id",
                )
            download_url = self._gateway_artifact_download_url(
                artifact_id=effective_artifact_id,
                version_id=version_id,
            )
            return PublishResult(
                success=True,
                artifact_id=effective_artifact_id,
                version_id=version_id,
                storage_key=storage_key,
                mime=rendered["mime"],
                file_name=rendered["file_name"],
                download_url=download_url,
            )
        except Exception as e:
            logger.error(f"publish_text_artifact failed: {e}", exc_info=True)
            return PublishResult(success=False, error=str(e))
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
if not file_name:
return False
lower = file_name.lower()
return lower.endswith(".xlsx") or lower.endswith(".xls")
def _is_numeric_question(self, question: str) -> bool:
t = (question or "").lower()
if not t:
return False
markers = [
"скільки", "сума", "витрат", "добрив", "грн", "uah", "usd", "eur",
"сколько", "amount", "total", "spent", "cost", "value",
]
return any(m in t for m in markers)
def _extract_query_tokens(self, question: str) -> List[str]:
tokens = re.findall(r"[a-zA-Zа-яА-ЯіїєґІЇЄҐ0-9]{3,}", (question or "").lower())
stop = {
"яка", "який", "яке", "which", "what", "скільки", "сума", "була",
"витрачена", "write", "show", "give", "please", "мені", "будь", "ласка",
"тому", "цьому", "цей", "this", "that", "for", "and", "the",
}
return [t for t in tokens if t not in stop]
    async def _try_answer_excel_question(
        self,
        question: str,
        doc_url: Optional[str],
        file_name: Optional[str],
    ) -> Optional[str]:
        """
        Deterministically answer a numeric question from an Excel file.

        Downloads the workbook, scores each row's text label against the
        question tokens and returns a formatted (Ukrainian) answer citing
        sheet/row, or None whenever this shortcut does not apply or fails.
        """
        if not doc_url or not self._is_numeric_question(question):
            return None
        try:
            import httpx
            from io import BytesIO
            import openpyxl
        except Exception:
            # Optional dependencies missing: silently skip the shortcut.
            return None
        query_tokens = self._extract_query_tokens(question)
        if not query_tokens:
            # Default fertilizer-cost tokens (uk: "amount", "expenses", "fertilizers").
            query_tokens = ["сума", "витрати", "добрив"]
        try:
            async with httpx.AsyncClient(timeout=20.0) as client:
                resp = await client.get(doc_url)
                if resp.status_code != 200:
                    return None
                content = resp.content
            # data_only=True reads computed cell values instead of formulas.
            wb = openpyxl.load_workbook(BytesIO(content), data_only=True, read_only=True)
            best = None
            best_score = -1
            fallback = None
            for ws in wb.worksheets:
                for row_idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
                    label = ""
                    numeric_value = None
                    # First numeric cell is the candidate value; first string
                    # cell is the row label.
                    for cell in row:
                        if isinstance(cell, (int, float)) and numeric_value is None:
                            numeric_value = float(cell)
                        elif isinstance(cell, str) and not label:
                            label = cell.strip()
                    if numeric_value is None:
                        continue
                    label_low = label.lower()
                    # Score = number of query tokens found in the label.
                    score = sum(1 for t in query_tokens if t in label_low)
                    if score > best_score:
                        best_score = score
                        best = {
                            "sheet": ws.title,
                            "row": row_idx,
                            "label": label or "n/a",
                            "value": numeric_value,
                        }
                    # Remember the first fertilizer-related row as a fallback.
                    if fallback is None and any(m in label_low for m in ("добрив", "fertiliz", "удобр")):
                        fallback = {
                            "sheet": ws.title,
                            "row": row_idx,
                            "label": label or "n/a",
                            "value": numeric_value,
                        }
            # Only use `best` when it actually matched a token; otherwise fall
            # back to the fertilizer row, if one was seen.
            picked = best if best and best_score > 0 else fallback
            if not picked:
                return None
            value = picked["value"]
            # Integers: thousands separated by spaces. Floats: space thousands
            # separator and comma decimal mark (uk locale style).
            if abs(value - int(value)) < 1e-9:
                value_str = f"{int(value):,}".replace(",", " ")
            else:
                value_str = f"{value:,.2f}".replace(",", " ").replace(".", ",")
            # _is_numeric_question() already passed at the top of this method,
            # so the currency unit is always added here.
            unit = "грн" if self._is_numeric_question(question) else ""
            unit_part = f" {unit}" if unit else ""
            file_part = f' у файлі "{file_name}"' if file_name else ""
            return (
                f"За{file_part}: {value_str}{unit_part}. "
                f"Джерело: лист {picked['sheet']}, рядок {picked['row']} ({picked['label']})."
            )
        except Exception as e:
            logger.warning(f"Excel deterministic answer failed: {e}")
        return None
    async def save_doc_context(
        self,
        session_id: str,
        doc_id: str,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
    ) -> bool:
        """
        Save document context for a session.

        Uses Memory Service to persist document context across channels.

        Args:
            session_id: Session identifier (e.g., "telegram:123", "web:user456")
            doc_id: Document ID from parser
            doc_url: Optional document URL
            file_name: Optional file name
            dao_id: Optional DAO ID
            user_id: Optional user identifier stored alongside the context
        Returns:
            True if saved successfully
        """
        try:
            # Use stable synthetic user key per session, so context can be
            # retrieved later using only session_id (without caller user_id).
            fact_user_id = f"session:{session_id}"
            # Save as fact in Memory Service
            fact_key = f"doc_context:{session_id}"
            fact_value_json = {
                "doc_id": doc_id,
                "doc_url": doc_url,
                "file_name": file_name,
                "dao_id": dao_id,
                "user_id": user_id,
                # NOTE(review): utcnow() is naive (no tzinfo) and deprecated in
                # Python 3.12+; consider datetime.now(timezone.utc) instead.
                "saved_at": datetime.utcnow().isoformat()
            }
            result = await self.memory_client.upsert_fact(
                user_id=fact_user_id,
                fact_key=fact_key,
                fact_value_json=fact_value_json,
                # Keep doc context globally addressable for follow-up calls
                # that may not include dao_id/team_id in retrieval.
                team_id=None,
            )
            logger.info(f"Saved doc context for session {session_id}: doc_id={doc_id}")
            return result
        except Exception as e:
            logger.error(f"Failed to save doc context: {e}", exc_info=True)
            return False
async def get_doc_context(self, session_id: str) -> Optional[DocContext]:
"""
Get document context for a session.
Args:
session_id: Session identifier
Returns:
DocContext or None
"""
try:
user_id = f"session:{session_id}"
fact_key = f"doc_context:{session_id}"
# Get fact from Memory Service
fact = await self.memory_client.get_fact(
user_id=user_id,
fact_key=fact_key
)
if fact and fact.get("fact_value_json"):
logger.debug(f"Retrieved doc context for session {session_id}")
ctx_data = fact.get("fact_value_json")
if isinstance(ctx_data, str):
try:
ctx_data = json.loads(ctx_data)
except Exception:
logger.warning("doc_context fact_value_json is not valid JSON string")
return None
return DocContext(**ctx_data)
return None
except Exception as e:
logger.error(f"Failed to get doc context: {e}", exc_info=True)
return None
async def parse_document(
self,
session_id: str,
doc_url: str,
file_name: str,
dao_id: str,
user_id: str,
output_mode: str = "qa_pairs",
metadata: Optional[Dict[str, Any]] = None
) -> ParsedResult:
"""
Parse a document directly through Swapper service.
Args:
session_id: Session identifier (e.g., "telegram:123", "web:user456")
doc_url: URL to the document file
file_name: Name of the file
dao_id: DAO identifier
user_id: User identifier
output_mode: Output format ("qa_pairs", "markdown", "chunks", "text")
metadata: Optional additional metadata
Returns:
ParsedResult with parsed data
"""
import httpx
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
try:
logger.info(f"Parsing document: session={session_id}, file={file_name}, mode={output_mode}")
# Download the document first
async with httpx.AsyncClient(timeout=60.0) as client:
doc_response = await client.get(doc_url)
if doc_response.status_code != 200:
return ParsedResult(
success=False,
error=f"Failed to download document: {doc_response.status_code}"
)
doc_content = doc_response.content
# Send directly to Swapper /document endpoint
async with httpx.AsyncClient(timeout=30.0) as client:
# Map output_mode: qa_pairs -> text (Swapper doesn't support qa_pairs directly)
swapper_mode = "markdown" if output_mode in ["qa_pairs", "markdown"] else "text"
mime_type = "application/octet-stream"
if file_name:
import mimetypes
mime_type = mimetypes.guess_type(file_name)[0] or mime_type
files = {"file": (file_name, doc_content, mime_type)}
data = {"output_format": swapper_mode}
swapper_response = await client.post(
f"{SWAPPER_URL}/document",
files=files,
data=data
)
if swapper_response.status_code == 200:
response = {"ok": True, "data": swapper_response.json()}
else:
logger.error(f"Swapper document error: {swapper_response.status_code} - {swapper_response.text[:200]}")
return ParsedResult(
success=False,
error=f"Document parsing failed: {swapper_response.status_code}"
)
if not isinstance(response, dict):
return ParsedResult(
success=False,
error="Invalid response from Swapper"
)
data = response.get("data", {})
# Swapper returns: {success, model, output_format, result, filename, processing_time_ms}
parsed_text = data.get("result", "")
output_format = data.get("output_format", "text")
model_used = data.get("model", "unknown")
logger.info(f"Document parsed: {len(parsed_text)} chars using {model_used}")
# Generate a simple doc_id based on filename and timestamp
doc_id = hashlib.md5(f"{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
# Save document context for follow-up queries
await self.save_doc_context(
session_id=session_id,
doc_id=doc_id,
doc_url=doc_url,
file_name=file_name,
dao_id=dao_id,
user_id=user_id,
)
# Convert text to markdown format
markdown = parsed_text if output_format == "markdown" else f"```\n{parsed_text}\n```"
# No QA pairs from direct parsing - would need LLM for that
qa_pairs = None
chunks = []
chunks_meta = None
if chunks:
chunks_meta = {
"count": len(chunks),
"chunks": chunks[:3] if len(chunks) > 3 else chunks # Sample
}
return ParsedResult(
success=True,
doc_id=doc_id,
qa_pairs=qa_pairs,
markdown=markdown,
chunks_meta=chunks_meta,
raw=data,
error=None
)
except Exception as e:
logger.error(f"Document parsing via Swapper failed: {e}")
# === FALLBACK: Try PyPDF2 for PDF files ===
if file_name and file_name.lower().endswith(".pdf"):
try:
logger.info(f"Fallback: parsing PDF with PyPDF2: {file_name}")
import io
import PyPDF2
reader = PyPDF2.PdfReader(io.BytesIO(doc_content))
parsed_text = ""
for page in reader.pages:
text = page.extract_text() or ""
parsed_text += text + "\n"
parsed_text = parsed_text.strip()
if len(parsed_text) > 30:
logger.info(f"PyPDF2 fallback success: {len(parsed_text)} chars from {len(reader.pages)} pages")
doc_id = hashlib.md5(f"{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
await self.save_doc_context(
session_id=session_id,
doc_id=doc_id,
doc_url=doc_url,
file_name=file_name,
dao_id=dao_id,
user_id=user_id,
)
return ParsedResult(
success=True,
doc_id=doc_id,
qa_pairs=None,
markdown=parsed_text,
chunks_meta=None,
raw={"model": "PyPDF2-fallback", "pages": len(reader.pages)},
error=None
)
else:
logger.warning(f"PyPDF2 fallback: too little text ({len(parsed_text)} chars)")
except Exception as pdf_err:
logger.error(f"PyPDF2 fallback also failed: {pdf_err}")
# === END FALLBACK ===
return ParsedResult(
success=False,
error=str(e)
)
    async def ingest_document(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        agent_id: str = "daarwizz",
    ) -> IngestResult:
        """
        Ingest document chunks into RAG/Memory.

        Re-parses the document in markdown mode and posts the full text to the
        router's /v1/documents/ingest endpoint.

        Args:
            session_id: Session identifier
            doc_id: Document ID (if already parsed; otherwise taken from context)
            doc_url: Document URL (if need to parse first)
            file_name: File name
            dao_id: DAO identifier
            user_id: User identifier
            agent_id: Agent namespace for the router (defaults to "daarwizz")
        Returns:
            IngestResult with ingestion status
        """
        try:
            # If doc_id not provided, try to get from context
            if not doc_id:
                doc_context = await self.get_doc_context(session_id)
                if doc_context:
                    doc_id = doc_context.doc_id
                    doc_url = doc_url or doc_context.doc_url
                    file_name = file_name or doc_context.file_name
                    dao_id = dao_id or doc_context.dao_id
            if not doc_url:
                return IngestResult(
                    success=False,
                    error="No document URL available for ingest"
                )
            # Always re-parse the source so ingestion uses fresh markdown text.
            parsed = await self.parse_document(
                session_id=session_id,
                doc_url=doc_url,
                file_name=file_name or "document",
                dao_id=dao_id or "",
                user_id=user_id or "",
                output_mode="markdown",
                metadata={"source": self._extract_source(session_id), "mode": "ingest"},
            )
            if not parsed.success:
                return IngestResult(success=False, error=parsed.error or "Document parse failed")
            effective_doc_id = doc_id or parsed.doc_id
            if not effective_doc_id:
                # Last-resort synthetic id when neither caller nor parser supplied one.
                effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
            doc_text = (parsed.markdown or "").strip()
            if not doc_text:
                return IngestResult(success=False, error="No extractable text for ingestion")
            payload = {
                "agent_id": (agent_id or "daarwizz").lower(),
                "doc_id": effective_doc_id,
                "file_name": file_name or "document",
                "text": doc_text,
                "dao_id": dao_id,
                "user_id": user_id,
                "metadata": {
                    "session_id": session_id,
                    "source": self._extract_source(session_id),
                },
            }
            response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)
            if response.get("ok"):
                return IngestResult(
                    success=True,
                    doc_id=response.get("doc_id") or effective_doc_id,
                    ingested_chunks=int(response.get("chunks_stored", 0) or 0),
                    status="ingested",
                )
            return IngestResult(
                success=False,
                doc_id=effective_doc_id,
                status="failed",
                error=response.get("error", "Router ingest failed"),
            )
        except Exception as e:
            logger.error(f"Document ingestion failed: {e}", exc_info=True)
            return IngestResult(
                success=False,
                error=str(e)
            )
    async def update_document(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        text: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        agent_id: str = "daarwizz",
        storage_ref: Optional[str] = None,
        publish_artifact: bool = False,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        artifact_label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> UpdateResult:
        """
        Update existing document content and bump version in router memory.

        When ``text`` is empty, the content is re-parsed from ``doc_url``.
        With ``publish_artifact=True`` the updated text is also written back
        as a physical artifact version; a publish failure does not fail the
        update itself — it is reported via ``publish_error`` and ``status``.
        """
        try:
            # Backfill missing identifiers from the saved session context.
            context = await self.get_doc_context(session_id)
            if context:
                if not doc_id:
                    doc_id = context.doc_id
                if not doc_url:
                    doc_url = context.doc_url
                if not file_name:
                    file_name = context.file_name
                if not dao_id:
                    dao_id = context.dao_id
            if not doc_id:
                return UpdateResult(
                    success=False,
                    status="failed",
                    error="No document context found. Provide doc_id or parse/ingest first.",
                )
            effective_text = (text or "").strip()
            if not effective_text:
                # No inline text supplied: re-parse the source for its content.
                if not doc_url:
                    return UpdateResult(
                        success=False,
                        doc_id=doc_id,
                        status="failed",
                        error="No text or doc_url provided for update",
                    )
                parsed = await self.parse_document(
                    session_id=session_id,
                    doc_url=doc_url,
                    file_name=file_name or "document",
                    dao_id=dao_id or "",
                    user_id=user_id or "",
                    output_mode="markdown",
                    metadata={"source": self._extract_source(session_id), "mode": "update"},
                )
                if not parsed.success:
                    return UpdateResult(
                        success=False,
                        doc_id=doc_id,
                        status="failed",
                        error=parsed.error or "Document parse failed",
                    )
                effective_text = (parsed.markdown or "").strip()
            if not effective_text:
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error="No extractable text for update",
                )
            # Metadata attached to the router version; caller keys win.
            meta = {
                "session_id": session_id,
                "source": self._extract_source(session_id),
            }
            if isinstance(metadata, dict):
                meta.update(metadata)
            response = await self._router_post_json(
                "/v1/documents/update",
                {
                    "agent_id": (agent_id or "daarwizz").lower(),
                    "doc_id": doc_id,
                    "file_name": file_name,
                    "text": effective_text,
                    "dao_id": dao_id,
                    "user_id": user_id,
                    "storage_ref": storage_ref,
                    "metadata": meta,
                },
                timeout=90.0,
            )
            if not response.get("ok"):
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error=response.get("error", "Router update failed"),
                )
            # Refresh the session context so follow-up calls see this document.
            await self.save_doc_context(
                session_id=session_id,
                doc_id=doc_id,
                doc_url=doc_url,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
            )
            # Optional artifact write-back; default object means "not published".
            publish = PublishResult(success=False)
            if publish_artifact:
                publish = await self._publish_text_artifact(
                    text=effective_text,
                    doc_id=doc_id,
                    file_name=file_name,
                    dao_id=dao_id,
                    user_id=user_id,
                    artifact_id=artifact_id,
                    target_format=target_format,
                    label=artifact_label,
                    metadata=meta,
                )
            return UpdateResult(
                success=True,
                doc_id=response.get("doc_id") or doc_id,
                # int(... or 0) or None: zero/absent counters are reported as None.
                version_no=int(response.get("version_no", 0) or 0) or None,
                version_id=int(response.get("version_id", 0) or 0) or None,
                updated_chunks=int(response.get("chunks_stored", 0) or 0),
                status="updated_published" if publish_artifact and publish.success else ("updated_publish_failed" if publish_artifact else "updated"),
                publish_error=publish.error if publish_artifact and not publish.success else None,
                artifact_id=publish.artifact_id if publish_artifact else None,
                artifact_version_id=publish.version_id if publish_artifact else None,
                artifact_storage_key=publish.storage_key if publish_artifact else None,
                artifact_mime=publish.mime if publish_artifact else None,
                artifact_download_url=publish.download_url if publish_artifact else None,
            )
        except Exception as e:
            logger.error(f"Document update failed: {e}", exc_info=True)
            return UpdateResult(
                success=False,
                doc_id=doc_id,
                status="failed",
                error=str(e),
            )
async def list_document_versions(
self,
agent_id: str,
doc_id: str,
limit: int = 20,
) -> Dict[str, Any]:
aid = (agent_id or "daarwizz").lower()
did = (doc_id or "").strip()
if not did:
return {"ok": False, "error": "doc_id is required", "items": []}
try:
response = await self._router_get_json(
f"/v1/documents/{did}/versions?agent_id={aid}&limit={max(1, min(int(limit or 20), 200))}",
timeout=30.0,
)
return response if isinstance(response, dict) else {"ok": False, "error": "invalid_response", "items": []}
except Exception as e:
logger.error(f"list_document_versions failed: {e}")
return {"ok": False, "error": str(e), "items": []}
    async def publish_document_artifact(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        text: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        artifact_label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> PublishResult:
        """
        Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index.

        Missing identifiers are backfilled from the session context; when no
        ``text`` is given, the content is re-parsed from ``doc_url``.
        """
        try:
            # Backfill missing fields from the saved session context.
            context = await self.get_doc_context(session_id)
            if context:
                if not doc_id:
                    doc_id = context.doc_id
                if not doc_url:
                    doc_url = context.doc_url
                if not file_name:
                    file_name = context.file_name
                if not dao_id:
                    dao_id = context.dao_id
                if not user_id:
                    user_id = context.user_id
            if not doc_id:
                return PublishResult(success=False, error="doc_id is required")
            body = (text or "").strip()
            if not body:
                # No inline text: re-parse the source document for content.
                if not doc_url:
                    return PublishResult(success=False, error="text or doc_url is required")
                parsed = await self.parse_document(
                    session_id=session_id,
                    doc_url=doc_url,
                    file_name=file_name or "document",
                    dao_id=dao_id or "",
                    user_id=user_id or "",
                    output_mode="markdown",
                    metadata={"source": self._extract_source(session_id), "mode": "publish"},
                )
                if not parsed.success:
                    return PublishResult(success=False, error=parsed.error or "Document parse failed")
                body = (parsed.markdown or "").strip()
            if not body:
                return PublishResult(success=False, error="No text available for publish")
            return await self._publish_text_artifact(
                text=body,
                doc_id=doc_id,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
                artifact_id=artifact_id,
                target_format=target_format,
                label=artifact_label,
                metadata=metadata,
            )
        except Exception as e:
            logger.error(f"publish_document_artifact failed: {e}", exc_info=True)
            return PublishResult(success=False, error=str(e))
async def ask_about_document(
self,
session_id: str,
question: str,
doc_id: Optional[str] = None,
dao_id: Optional[str] = None,
user_id: Optional[str] = None,
agent_id: str = "daarwizz"
) -> QAResult:
"""
Ask a question about a document using RAG query.
Args:
session_id: Session identifier
question: Question text
doc_id: Document ID (if None, tries to get from context)
dao_id: DAO identifier
user_id: User identifier
Returns:
QAResult with answer and citations
"""
try:
# If doc_id not provided, try to get from context
doc_url = None
file_name = None
if not doc_id:
doc_context = await self.get_doc_context(session_id)
if doc_context:
doc_id = doc_context.doc_id
dao_id = dao_id or doc_context.dao_id
doc_url = doc_context.doc_url
file_name = doc_context.file_name
else:
doc_context = await self.get_doc_context(session_id)
if doc_context:
doc_url = doc_context.doc_url
file_name = doc_context.file_name
if not doc_id:
return QAResult(
success=False,
error="No document context found. Parse a document first."
)
# Extract user_id from session_id if not provided
if not user_id:
parts = session_id.split(":", 1)
user_id = parts[1] if len(parts) > 1 else session_id
# Shared deterministic Excel policy for top-level agrarian agents.
if (
(agent_id or "").lower() in SHARED_EXCEL_POLICY_AGENTS
and self._is_excel_filename(file_name)
):
deterministic = await self._try_answer_excel_question(
question=question,
doc_url=doc_url,
file_name=file_name,
)
if deterministic:
return QAResult(
success=True,
answer=deterministic,
doc_id=doc_id,
sources=[{
"type": "excel_deterministic",
"file_name": file_name,
}],
)
logger.info(
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
)
response = await self._router_post_json(
"/v1/documents/query",
{
"agent_id": (agent_id or "daarwizz").lower(),
"question": question,
"doc_id": doc_id,
"dao_id": dao_id,
"user_id": user_id,
"limit": 5,
},
timeout=60.0,
)
if isinstance(response, dict) and not response.get("ok", False):
return QAResult(
success=False,
error=response.get("error", "Document query failed"),
)
data = response.get("data", {}) if isinstance(response, dict) else {}
answer = data.get("answer") or data.get("text")
sources = data.get("citations", []) or data.get("sources", [])
if answer:
return QAResult(
success=True,
answer=answer,
doc_id=doc_id,
sources=sources if sources else None
)
else:
return QAResult(
success=False,
error="No answer from RAG query"
)
except Exception as e:
logger.error(f"RAG query failed: {e}", exc_info=True)
return QAResult(
success=False,
error=str(e)
)
def _extract_source(self, session_id: str) -> str:
"""Extract source channel from session_id"""
parts = session_id.split(":", 1)
return parts[0] if len(parts) > 1 else "unknown"
# Module-level singleton shared by all convenience wrapper functions in this
# module; channel clients (Telegram bot, web, etc.) import either this object
# or the wrappers directly.
doc_service = DocumentService()
# Export functions for convenience
async def parse_document(
    session_id: str,
    doc_url: str,
    file_name: str,
    dao_id: str,
    user_id: str,
    output_mode: str = "qa_pairs",
    metadata: Optional[Dict[str, Any]] = None
) -> ParsedResult:
    """Parse a document through DAGI Router.

    Module-level convenience wrapper; forwards every argument unchanged to
    the shared ``doc_service`` singleton.
    """
    forwarded = dict(
        session_id=session_id,
        doc_url=doc_url,
        file_name=file_name,
        dao_id=dao_id,
        user_id=user_id,
        output_mode=output_mode,
        metadata=metadata,
    )
    return await doc_service.parse_document(**forwarded)
async def ingest_document(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
) -> IngestResult:
    """Ingest document chunks into RAG/Memory.

    Convenience wrapper delegating to the shared ``doc_service`` singleton.
    """
    call_args = {
        "session_id": session_id,
        "doc_id": doc_id,
        "doc_url": doc_url,
        "file_name": file_name,
        "dao_id": dao_id,
        "user_id": user_id,
        "agent_id": agent_id,
    }
    return await doc_service.ingest_document(**call_args)
async def ask_about_document(
    session_id: str,
    question: str,
    doc_id: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz"
) -> QAResult:
    """Ask a question about a document using RAG query.

    Convenience wrapper delegating to the shared ``doc_service`` singleton.
    """
    forwarded = dict(
        session_id=session_id,
        question=question,
        doc_id=doc_id,
        dao_id=dao_id,
        user_id=user_id,
        agent_id=agent_id,
    )
    return await doc_service.ask_about_document(**forwarded)
async def update_document(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
    storage_ref: Optional[str] = None,
    publish_artifact: bool = False,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
    """Update document chunks and bump version.

    Thin module-level wrapper; every argument is forwarded unchanged to the
    shared ``doc_service`` singleton.
    """
    forwarded = {
        "session_id": session_id,
        "doc_id": doc_id,
        "doc_url": doc_url,
        "file_name": file_name,
        "text": text,
        "dao_id": dao_id,
        "user_id": user_id,
        "agent_id": agent_id,
        "storage_ref": storage_ref,
        "publish_artifact": publish_artifact,
        "artifact_id": artifact_id,
        "target_format": target_format,
        "artifact_label": artifact_label,
        "metadata": metadata,
    }
    return await doc_service.update_document(**forwarded)
async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) -> Dict[str, Any]:
    """List document versions from router.

    Convenience wrapper delegating to the shared ``doc_service`` singleton.
    """
    params = {"agent_id": agent_id, "doc_id": doc_id, "limit": limit}
    return await doc_service.list_document_versions(**params)
async def publish_document_artifact(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Publish physical artifact version for document text.

    Convenience wrapper delegating to the shared ``doc_service`` singleton.
    """
    forwarded = dict(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        artifact_id=artifact_id,
        target_format=target_format,
        artifact_label=artifact_label,
        metadata=metadata,
    )
    return await doc_service.publish_document_artifact(**forwarded)
async def save_doc_context(
    session_id: str,
    doc_id: str,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
) -> bool:
    """Save document context for a session.

    Convenience wrapper delegating to the shared ``doc_service`` singleton;
    returns the service's success flag.
    """
    context_fields = {
        "session_id": session_id,
        "doc_id": doc_id,
        "doc_url": doc_url,
        "file_name": file_name,
        "dao_id": dao_id,
        "user_id": user_id,
    }
    return await doc_service.save_doc_context(**context_fields)
async def get_doc_context(session_id: str) -> Optional[DocContext]:
    """Get document context for a session (``None`` when nothing is stored).

    Convenience wrapper delegating to the shared ``doc_service`` singleton.
    """
    context = await doc_service.get_doc_context(session_id)
    return context