1371 lines
50 KiB
Python
1371 lines
50 KiB
Python
"""
|
||
Document Workflow Service
|
||
Channel-agnostic service for document parsing, ingestion, and RAG queries.
|
||
|
||
This service can be used by:
|
||
- Telegram bots
|
||
- Web applications
|
||
- Mobile apps
|
||
- Any other client
|
||
"""
|
||
import os
|
||
import logging
|
||
import hashlib
|
||
import base64
|
||
import json
|
||
import re
|
||
from typing import Optional, Dict, Any, List
|
||
from pydantic import BaseModel
|
||
from datetime import datetime
|
||
from io import BytesIO
|
||
|
||
from memory_client import memory_client
|
||
|
||
logger = logging.getLogger(__name__)

# Agents that share the deterministic Excel-answer policy.
# NOTE(review): not referenced in the visible part of this module — presumably used elsewhere; verify.
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}

# Base URL of the router service (document ingest/update/versions API).
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
# Base URL of the artifact-registry service; trailing slash stripped for safe path joins.
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
# "created_by" attribution used when this service creates artifacts on behalf of no user.
DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service")
# Public gateway base URL used to build absolute artifact download links ("" -> relative paths).
GATEWAY_PUBLIC_BASE_URL = os.getenv("GATEWAY_PUBLIC_BASE_URL", "").rstrip("/")
|
||
|
||
|
||
class QAItem(BaseModel):
    """Single question/answer pair extracted from a document."""
    question: str
    answer: str
|
||
|
||
|
||
class ParsedResult(BaseModel):
    """Result of document parsing."""
    success: bool
    doc_id: Optional[str] = None  # short id minted for the parsed document
    qa_pairs: Optional[List[QAItem]] = None  # only populated when QA extraction runs
    markdown: Optional[str] = None  # parsed text (markdown, or plain text in a code fence)
    chunks_meta: Optional[Dict[str, Any]] = None  # chunk count + sample, when chunking was used
    raw: Optional[Dict[str, Any]] = None  # raw parser response for debugging
    error: Optional[str] = None  # failure reason when success is False
|
||
|
||
|
||
class IngestResult(BaseModel):
    """Result of document ingestion to RAG."""
    success: bool
    doc_id: Optional[str] = None  # id the document was ingested under
    ingested_chunks: int = 0  # number of chunks stored by the router
    status: str = "unknown"  # "ingested" | "failed" | "unknown"
    error: Optional[str] = None  # failure reason when success is False
|
||
|
||
|
||
class UpdateResult(BaseModel):
    """Result of document update with version bump."""
    success: bool
    doc_id: Optional[str] = None
    version_no: Optional[int] = None  # router-assigned version number (None when router sent 0)
    version_id: Optional[int] = None  # router-assigned version row id (None when router sent 0)
    updated_chunks: int = 0  # chunks re-stored by the router for the new version
    status: str = "unknown"  # "updated" | "updated_published" | "updated_publish_failed" | "failed"
    publish_error: Optional[str] = None  # set when the optional artifact publish step failed
    # Fields below are populated only when publish_artifact was requested.
    artifact_id: Optional[str] = None
    artifact_version_id: Optional[str] = None
    artifact_storage_key: Optional[str] = None
    artifact_mime: Optional[str] = None
    artifact_download_url: Optional[str] = None
    error: Optional[str] = None
|
||
|
||
|
||
class PublishResult(BaseModel):
    """Result of artifact write-back publish."""
    success: bool
    artifact_id: Optional[str] = None  # artifact the version was appended to (or newly created)
    version_id: Optional[str] = None  # id of the created version
    storage_key: Optional[str] = None  # registry storage key for the uploaded bytes
    mime: Optional[str] = None  # MIME type of the rendered payload
    file_name: Optional[str] = None  # sanitized download file name
    download_url: Optional[str] = None  # gateway download URL (absolute when configured)
    error: Optional[str] = None
|
||
|
||
|
||
class QAResult(BaseModel):
    """Result of RAG query about a document."""
    success: bool
    answer: Optional[str] = None
    doc_id: Optional[str] = None  # document the answer was grounded in
    sources: Optional[List[Dict[str, Any]]] = None  # retrieval sources backing the answer
    error: Optional[str] = None
|
||
|
||
|
||
class DocContext(BaseModel):
    """Document context stored in the Memory Service under "doc_context:<session_id>"."""
    doc_id: str
    dao_id: Optional[str] = None
    user_id: Optional[str] = None
    doc_url: Optional[str] = None
    file_name: Optional[str] = None
    saved_at: Optional[str] = None  # ISO timestamp written at save time
|
||
|
||
|
||
class DocumentService:
    """
    Channel-agnostic service for document operations.

    Handles:
    - Document parsing (PDF, images)
    - Document ingestion to RAG
    - RAG queries about documents
    """

    def __init__(self):
        """Initialize the service with the shared Memory Service client."""
        self.memory_client = memory_client
|
||
|
||
async def _router_post_json(
|
||
self,
|
||
path: str,
|
||
payload: Dict[str, Any],
|
||
timeout: float = 45.0,
|
||
) -> Dict[str, Any]:
|
||
import httpx
|
||
|
||
base = ROUTER_URL.rstrip("/")
|
||
url = f"{base}{path}"
|
||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||
resp = await client.post(url, json=payload)
|
||
body = {}
|
||
try:
|
||
body = resp.json()
|
||
except Exception:
|
||
body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
|
||
if resp.status_code >= 400:
|
||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||
raise RuntimeError(f"Router error on {path}: {err}")
|
||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
|
||
|
||
async def _router_get_json(
|
||
self,
|
||
path: str,
|
||
timeout: float = 30.0,
|
||
) -> Dict[str, Any]:
|
||
import httpx
|
||
|
||
base = ROUTER_URL.rstrip("/")
|
||
url = f"{base}{path}"
|
||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||
resp = await client.get(url)
|
||
body = {}
|
||
try:
|
||
body = resp.json()
|
||
except Exception:
|
||
body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
|
||
if resp.status_code >= 400:
|
||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||
raise RuntimeError(f"Router error on {path}: {err}")
|
||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
|
||
|
||
async def _artifact_post_json(
|
||
self,
|
||
path: str,
|
||
payload: Dict[str, Any],
|
||
timeout: float = 45.0,
|
||
) -> Dict[str, Any]:
|
||
import httpx
|
||
|
||
base = ARTIFACT_REGISTRY_URL.rstrip("/")
|
||
url = f"{base}{path}"
|
||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||
resp = await client.post(url, json=payload)
|
||
body = {}
|
||
try:
|
||
body = resp.json()
|
||
except Exception:
|
||
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
|
||
if resp.status_code >= 400:
|
||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||
raise RuntimeError(f"Artifact registry error on {path}: {err}")
|
||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
|
||
|
||
async def _artifact_get_json(
|
||
self,
|
||
path: str,
|
||
timeout: float = 30.0,
|
||
) -> Dict[str, Any]:
|
||
import httpx
|
||
|
||
base = ARTIFACT_REGISTRY_URL.rstrip("/")
|
||
url = f"{base}{path}"
|
||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||
resp = await client.get(url)
|
||
body = {}
|
||
try:
|
||
body = resp.json()
|
||
except Exception:
|
||
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
|
||
if resp.status_code >= 400:
|
||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||
raise RuntimeError(f"Artifact registry error on {path}: {err}")
|
||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
|
||
|
||
def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str:
|
||
fmt = (target_format or "").strip().lower().lstrip(".")
|
||
if fmt:
|
||
return fmt
|
||
if file_name and "." in file_name:
|
||
return file_name.rsplit(".", 1)[1].strip().lower()
|
||
return "txt"
|
||
|
||
def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str:
|
||
base = "document"
|
||
if file_name:
|
||
base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
|
||
if "." in base:
|
||
base = base.rsplit(".", 1)[0]
|
||
elif doc_id:
|
||
base = doc_id
|
||
safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document"
|
||
return f"{safe_base}.{fmt}"
|
||
|
||
def _gateway_artifact_download_path(self, artifact_id: str, version_id: str) -> str:
|
||
aid = (artifact_id or "").strip()
|
||
vid = (version_id or "").strip()
|
||
return f"/api/doc/artifacts/{aid}/versions/{vid}/download"
|
||
|
||
def _gateway_artifact_download_url(self, artifact_id: str, version_id: str) -> str:
|
||
path = self._gateway_artifact_download_path(artifact_id, version_id)
|
||
if GATEWAY_PUBLIC_BASE_URL:
|
||
return f"{GATEWAY_PUBLIC_BASE_URL}{path}"
|
||
return path
|
||
|
||
def _render_document_bytes(
|
||
self,
|
||
text: str,
|
||
file_name: Optional[str],
|
||
doc_id: str,
|
||
target_format: Optional[str] = None,
|
||
) -> Dict[str, Any]:
|
||
body = (text or "").strip()
|
||
if not body:
|
||
raise ValueError("Cannot render empty document text")
|
||
|
||
fmt = self._resolve_format(file_name=file_name, target_format=target_format)
|
||
output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt)
|
||
|
||
if fmt in {"txt"}:
|
||
payload = body.encode("utf-8")
|
||
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": output_name}
|
||
if fmt in {"md", "markdown"}:
|
||
payload = body.encode("utf-8")
|
||
return {"bytes": payload, "mime": "text/markdown; charset=utf-8", "file_name": output_name}
|
||
if fmt in {"json"}:
|
||
parsed: Any
|
||
try:
|
||
parsed = json.loads(body)
|
||
except Exception:
|
||
parsed = {"text": body}
|
||
payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8")
|
||
return {"bytes": payload, "mime": "application/json", "file_name": output_name}
|
||
if fmt in {"csv"}:
|
||
payload = body.encode("utf-8")
|
||
return {"bytes": payload, "mime": "text/csv; charset=utf-8", "file_name": output_name}
|
||
if fmt in {"xlsx", "xlsm", "xls"}:
|
||
try:
|
||
from openpyxl import Workbook
|
||
except Exception as e:
|
||
raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}")
|
||
wb = Workbook()
|
||
ws = wb.active
|
||
ws.title = "Document"
|
||
lines = [ln for ln in body.splitlines()] or [body]
|
||
for idx, line in enumerate(lines, start=1):
|
||
ws.cell(row=idx, column=1, value=line)
|
||
buf = BytesIO()
|
||
wb.save(buf)
|
||
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")}
|
||
if fmt in {"docx"}:
|
||
try:
|
||
from docx import Document
|
||
except Exception as e:
|
||
raise RuntimeError(f"python-docx is required for docx rendering: {e}")
|
||
doc = Document()
|
||
for line in body.splitlines():
|
||
doc.add_paragraph(line if line else " ")
|
||
buf = BytesIO()
|
||
doc.save(buf)
|
||
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")}
|
||
|
||
payload = body.encode("utf-8")
|
||
fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt")
|
||
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": fallback_name}
|
||
|
||
    async def _publish_text_artifact(
        self,
        text: str,
        doc_id: str,
        file_name: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> PublishResult:
        """Render `text` to bytes and publish it as a version in the artifact registry.

        When `artifact_id` is empty a new artifact is created first; otherwise a
        new version is appended to the existing artifact. Never raises — all
        failures are reported through PublishResult(success=False, error=...).
        """
        try:
            # Render the text into the requested output format (txt/md/json/xlsx/docx/...).
            rendered = self._render_document_bytes(
                text=text,
                file_name=file_name,
                doc_id=doc_id,
                target_format=target_format,
            )
            content_bytes = rendered["bytes"]
            content_b64 = base64.b64encode(content_bytes).decode("ascii")

            effective_artifact_id = (artifact_id or "").strip()
            if not effective_artifact_id:
                # No existing artifact — create one, attributed to the caller
                # (falls back to this service's own identity).
                create_resp = await self._artifact_post_json(
                    "/artifacts",
                    {
                        "type": "doc",
                        "title": file_name or doc_id,
                        "project_id": dao_id,
                        "acl_ref": dao_id,
                        "created_by": user_id or DOC_WRITEBACK_CREATED_BY,
                    },
                    timeout=30.0,
                )
                effective_artifact_id = str(create_resp.get("artifact_id") or "").strip()
                if not effective_artifact_id:
                    return PublishResult(success=False, error="Artifact create failed: empty artifact_id")

            # Caller metadata is merged over the defaults and may override them.
            meta = {"doc_id": doc_id, "source": "doc_update_publish"}
            if isinstance(metadata, dict):
                meta.update(metadata)

            version_resp = await self._artifact_post_json(
                f"/artifacts/{effective_artifact_id}/versions/from_base64",
                {
                    "content_base64": content_b64,
                    "mime": rendered["mime"],
                    "filename": rendered["file_name"],
                    "label": label or "edited",
                    "meta_json": meta,
                },
                timeout=45.0,
            )
            version_id = str(version_resp.get("version_id") or "").strip()
            storage_key = version_resp.get("storage_key")
            if not version_id:
                return PublishResult(
                    success=False,
                    artifact_id=effective_artifact_id,
                    error="Artifact version create failed: empty version_id",
                )

            download_url = self._gateway_artifact_download_url(
                artifact_id=effective_artifact_id,
                version_id=version_id,
            )

            return PublishResult(
                success=True,
                artifact_id=effective_artifact_id,
                version_id=version_id,
                storage_key=storage_key,
                mime=rendered["mime"],
                file_name=rendered["file_name"],
                download_url=download_url,
            )
        except Exception as e:
            logger.error(f"publish_text_artifact failed: {e}", exc_info=True)
            return PublishResult(success=False, error=str(e))
|
||
|
||
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
|
||
if not file_name:
|
||
return False
|
||
lower = file_name.lower()
|
||
return lower.endswith(".xlsx") or lower.endswith(".xls")
|
||
|
||
def _is_numeric_question(self, question: str) -> bool:
|
||
t = (question or "").lower()
|
||
if not t:
|
||
return False
|
||
markers = [
|
||
"скільки", "сума", "витрат", "добрив", "грн", "uah", "usd", "eur",
|
||
"сколько", "amount", "total", "spent", "cost", "value",
|
||
]
|
||
return any(m in t for m in markers)
|
||
|
||
def _extract_query_tokens(self, question: str) -> List[str]:
|
||
tokens = re.findall(r"[a-zA-Zа-яА-ЯіїєґІЇЄҐ0-9]{3,}", (question or "").lower())
|
||
stop = {
|
||
"яка", "який", "яке", "which", "what", "скільки", "сума", "була",
|
||
"витрачена", "write", "show", "give", "please", "мені", "будь", "ласка",
|
||
"тому", "цьому", "цей", "this", "that", "for", "and", "the",
|
||
}
|
||
return [t for t in tokens if t not in stop]
|
||
|
||
    async def _try_answer_excel_question(
        self,
        question: str,
        doc_url: Optional[str],
        file_name: Optional[str],
    ) -> Optional[str]:
        """Best-effort deterministic answer for numeric questions about an Excel file.

        Downloads the workbook from `doc_url` and scans each row for a text
        label plus a numeric cell, scoring rows by token overlap with the
        question. Returns a formatted Ukrainian answer string, or None when the
        question is not numeric, dependencies are unavailable, the download
        fails, or no matching row is found. Never raises.
        """
        if not doc_url or not self._is_numeric_question(question):
            return None
        try:
            import httpx
            from io import BytesIO
            import openpyxl
        except Exception:
            # Optional dependencies unavailable — silently skip this fast path.
            return None

        query_tokens = self._extract_query_tokens(question)
        if not query_tokens:
            # Default to fertilizer-expense keywords when the question yields no tokens.
            query_tokens = ["сума", "витрати", "добрив"]

        try:
            async with httpx.AsyncClient(timeout=20.0) as client:
                resp = await client.get(doc_url)
                if resp.status_code != 200:
                    return None
                content = resp.content

            # data_only=True reads cached formula results; read_only streams rows.
            wb = openpyxl.load_workbook(BytesIO(content), data_only=True, read_only=True)
            best = None
            best_score = -1
            fallback = None

            for ws in wb.worksheets:
                for row_idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
                    # First string cell becomes the label; first numeric cell the candidate value.
                    label = ""
                    numeric_value = None
                    for cell in row:
                        if isinstance(cell, (int, float)) and numeric_value is None:
                            numeric_value = float(cell)
                        elif isinstance(cell, str) and not label:
                            label = cell.strip()
                    if numeric_value is None:
                        continue
                    label_low = label.lower()
                    score = sum(1 for t in query_tokens if t in label_low)
                    if score > best_score:
                        best_score = score
                        best = {
                            "sheet": ws.title,
                            "row": row_idx,
                            "label": label or "n/a",
                            "value": numeric_value,
                        }
                    # Remember the first fertilizer-related row as a fallback.
                    if fallback is None and any(m in label_low for m in ("добрив", "fertiliz", "удобр")):
                        fallback = {
                            "sheet": ws.title,
                            "row": row_idx,
                            "label": label or "n/a",
                            "value": numeric_value,
                        }

            # Require at least one token hit for the best row; else fall back.
            picked = best if best and best_score > 0 else fallback
            if not picked:
                return None

            # Format with spaces as thousands separators; comma decimal mark for fractions.
            value = picked["value"]
            if abs(value - int(value)) < 1e-9:
                value_str = f"{int(value):,}".replace(",", " ")
            else:
                value_str = f"{value:,.2f}".replace(",", " ").replace(".", ",")

            # NOTE(review): the guard at the top already required
            # _is_numeric_question(question) to be True, so this condition is
            # always True and the unit is always "грн" — even for questions
            # mentioning USD/EUR. Confirm whether that is intended.
            unit = "грн" if self._is_numeric_question(question) else ""
            unit_part = f" {unit}" if unit else ""
            file_part = f' у файлі "{file_name}"' if file_name else ""
            return (
                f"За{file_part}: {value_str}{unit_part}. "
                f"Джерело: лист {picked['sheet']}, рядок {picked['row']} ({picked['label']})."
            )
        except Exception as e:
            logger.warning(f"Excel deterministic answer failed: {e}")
            return None
|
||
|
||
async def save_doc_context(
|
||
self,
|
||
session_id: str,
|
||
doc_id: str,
|
||
doc_url: Optional[str] = None,
|
||
file_name: Optional[str] = None,
|
||
dao_id: Optional[str] = None,
|
||
user_id: Optional[str] = None,
|
||
) -> bool:
|
||
"""
|
||
Save document context for a session.
|
||
|
||
Uses Memory Service to persist document context across channels.
|
||
|
||
Args:
|
||
session_id: Session identifier (e.g., "telegram:123", "web:user456")
|
||
doc_id: Document ID from parser
|
||
doc_url: Optional document URL
|
||
file_name: Optional file name
|
||
dao_id: Optional DAO ID
|
||
|
||
Returns:
|
||
True if saved successfully
|
||
"""
|
||
try:
|
||
# Use stable synthetic user key per session, so context can be
|
||
# retrieved later using only session_id (without caller user_id).
|
||
fact_user_id = f"session:{session_id}"
|
||
|
||
# Save as fact in Memory Service
|
||
fact_key = f"doc_context:{session_id}"
|
||
fact_value_json = {
|
||
"doc_id": doc_id,
|
||
"doc_url": doc_url,
|
||
"file_name": file_name,
|
||
"dao_id": dao_id,
|
||
"user_id": user_id,
|
||
"saved_at": datetime.utcnow().isoformat()
|
||
}
|
||
|
||
result = await self.memory_client.upsert_fact(
|
||
user_id=fact_user_id,
|
||
fact_key=fact_key,
|
||
fact_value_json=fact_value_json,
|
||
# Keep doc context globally addressable for follow-up calls
|
||
# that may not include dao_id/team_id in retrieval.
|
||
team_id=None,
|
||
)
|
||
|
||
logger.info(f"Saved doc context for session {session_id}: doc_id={doc_id}")
|
||
return result
|
||
|
||
except Exception as e:
|
||
logger.error(f"Failed to save doc context: {e}", exc_info=True)
|
||
return False
|
||
|
||
async def get_doc_context(self, session_id: str) -> Optional[DocContext]:
|
||
"""
|
||
Get document context for a session.
|
||
|
||
Args:
|
||
session_id: Session identifier
|
||
|
||
Returns:
|
||
DocContext or None
|
||
"""
|
||
try:
|
||
user_id = f"session:{session_id}"
|
||
|
||
fact_key = f"doc_context:{session_id}"
|
||
|
||
# Get fact from Memory Service
|
||
fact = await self.memory_client.get_fact(
|
||
user_id=user_id,
|
||
fact_key=fact_key
|
||
)
|
||
|
||
if fact and fact.get("fact_value_json"):
|
||
logger.debug(f"Retrieved doc context for session {session_id}")
|
||
ctx_data = fact.get("fact_value_json")
|
||
if isinstance(ctx_data, str):
|
||
try:
|
||
ctx_data = json.loads(ctx_data)
|
||
except Exception:
|
||
logger.warning("doc_context fact_value_json is not valid JSON string")
|
||
return None
|
||
return DocContext(**ctx_data)
|
||
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.error(f"Failed to get doc context: {e}", exc_info=True)
|
||
return None
|
||
|
||
async def parse_document(
|
||
self,
|
||
session_id: str,
|
||
doc_url: str,
|
||
file_name: str,
|
||
dao_id: str,
|
||
user_id: str,
|
||
output_mode: str = "qa_pairs",
|
||
metadata: Optional[Dict[str, Any]] = None
|
||
) -> ParsedResult:
|
||
"""
|
||
Parse a document directly through Swapper service.
|
||
|
||
Args:
|
||
session_id: Session identifier (e.g., "telegram:123", "web:user456")
|
||
doc_url: URL to the document file
|
||
file_name: Name of the file
|
||
dao_id: DAO identifier
|
||
user_id: User identifier
|
||
output_mode: Output format ("qa_pairs", "markdown", "chunks", "text")
|
||
metadata: Optional additional metadata
|
||
|
||
Returns:
|
||
ParsedResult with parsed data
|
||
"""
|
||
import httpx
|
||
|
||
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
||
|
||
try:
|
||
logger.info(f"Parsing document: session={session_id}, file={file_name}, mode={output_mode}")
|
||
|
||
# Download the document first
|
||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||
doc_response = await client.get(doc_url)
|
||
if doc_response.status_code != 200:
|
||
return ParsedResult(
|
||
success=False,
|
||
error=f"Failed to download document: {doc_response.status_code}"
|
||
)
|
||
doc_content = doc_response.content
|
||
|
||
# Send directly to Swapper /document endpoint
|
||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||
# Map output_mode: qa_pairs -> text (Swapper doesn't support qa_pairs directly)
|
||
swapper_mode = "markdown" if output_mode in ["qa_pairs", "markdown"] else "text"
|
||
|
||
mime_type = "application/octet-stream"
|
||
if file_name:
|
||
import mimetypes
|
||
mime_type = mimetypes.guess_type(file_name)[0] or mime_type
|
||
|
||
files = {"file": (file_name, doc_content, mime_type)}
|
||
data = {"output_format": swapper_mode}
|
||
|
||
swapper_response = await client.post(
|
||
f"{SWAPPER_URL}/document",
|
||
files=files,
|
||
data=data
|
||
)
|
||
|
||
if swapper_response.status_code == 200:
|
||
response = {"ok": True, "data": swapper_response.json()}
|
||
else:
|
||
logger.error(f"Swapper document error: {swapper_response.status_code} - {swapper_response.text[:200]}")
|
||
return ParsedResult(
|
||
success=False,
|
||
error=f"Document parsing failed: {swapper_response.status_code}"
|
||
)
|
||
|
||
if not isinstance(response, dict):
|
||
return ParsedResult(
|
||
success=False,
|
||
error="Invalid response from Swapper"
|
||
)
|
||
|
||
data = response.get("data", {})
|
||
|
||
# Swapper returns: {success, model, output_format, result, filename, processing_time_ms}
|
||
parsed_text = data.get("result", "")
|
||
output_format = data.get("output_format", "text")
|
||
model_used = data.get("model", "unknown")
|
||
|
||
logger.info(f"Document parsed: {len(parsed_text)} chars using {model_used}")
|
||
|
||
# Generate a simple doc_id based on filename and timestamp
|
||
doc_id = hashlib.md5(f"{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
|
||
|
||
# Save document context for follow-up queries
|
||
await self.save_doc_context(
|
||
session_id=session_id,
|
||
doc_id=doc_id,
|
||
doc_url=doc_url,
|
||
file_name=file_name,
|
||
dao_id=dao_id,
|
||
user_id=user_id,
|
||
)
|
||
|
||
# Convert text to markdown format
|
||
markdown = parsed_text if output_format == "markdown" else f"```\n{parsed_text}\n```"
|
||
|
||
# No QA pairs from direct parsing - would need LLM for that
|
||
qa_pairs = None
|
||
chunks = []
|
||
chunks_meta = None
|
||
if chunks:
|
||
chunks_meta = {
|
||
"count": len(chunks),
|
||
"chunks": chunks[:3] if len(chunks) > 3 else chunks # Sample
|
||
}
|
||
|
||
return ParsedResult(
|
||
success=True,
|
||
doc_id=doc_id,
|
||
qa_pairs=qa_pairs,
|
||
markdown=markdown,
|
||
chunks_meta=chunks_meta,
|
||
raw=data,
|
||
error=None
|
||
)
|
||
|
||
except Exception as e:
|
||
logger.error(f"Document parsing via Swapper failed: {e}")
|
||
|
||
# === FALLBACK: Try PyPDF2 for PDF files ===
|
||
if file_name and file_name.lower().endswith(".pdf"):
|
||
try:
|
||
logger.info(f"Fallback: parsing PDF with PyPDF2: {file_name}")
|
||
import io
|
||
import PyPDF2
|
||
|
||
reader = PyPDF2.PdfReader(io.BytesIO(doc_content))
|
||
parsed_text = ""
|
||
for page in reader.pages:
|
||
text = page.extract_text() or ""
|
||
parsed_text += text + "\n"
|
||
parsed_text = parsed_text.strip()
|
||
|
||
if len(parsed_text) > 30:
|
||
logger.info(f"PyPDF2 fallback success: {len(parsed_text)} chars from {len(reader.pages)} pages")
|
||
doc_id = hashlib.md5(f"{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
|
||
|
||
await self.save_doc_context(
|
||
session_id=session_id,
|
||
doc_id=doc_id,
|
||
doc_url=doc_url,
|
||
file_name=file_name,
|
||
dao_id=dao_id,
|
||
user_id=user_id,
|
||
)
|
||
|
||
return ParsedResult(
|
||
success=True,
|
||
doc_id=doc_id,
|
||
qa_pairs=None,
|
||
markdown=parsed_text,
|
||
chunks_meta=None,
|
||
raw={"model": "PyPDF2-fallback", "pages": len(reader.pages)},
|
||
error=None
|
||
)
|
||
else:
|
||
logger.warning(f"PyPDF2 fallback: too little text ({len(parsed_text)} chars)")
|
||
except Exception as pdf_err:
|
||
logger.error(f"PyPDF2 fallback also failed: {pdf_err}")
|
||
# === END FALLBACK ===
|
||
|
||
return ParsedResult(
|
||
success=False,
|
||
error=str(e)
|
||
)
|
||
|
||
    async def ingest_document(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        agent_id: str = "daarwizz",
    ) -> IngestResult:
        """
        Ingest document chunks into RAG/Memory via the router.

        The document is always (re-)parsed from `doc_url` to markdown before
        ingesting; missing arguments are filled from the stored session context.

        Args:
            session_id: Session identifier
            doc_id: Document ID (if already parsed)
            doc_url: Document URL (if need to parse first)
            file_name: File name
            dao_id: DAO identifier
            user_id: User identifier
            agent_id: Target agent namespace for the RAG index (lower-cased)

        Returns:
            IngestResult with ingestion status
        """
        try:
            # If doc_id not provided, try to get from context
            if not doc_id:
                doc_context = await self.get_doc_context(session_id)
                if doc_context:
                    doc_id = doc_context.doc_id
                    doc_url = doc_url or doc_context.doc_url
                    file_name = file_name or doc_context.file_name
                    dao_id = dao_id or doc_context.dao_id

            if not doc_url:
                return IngestResult(
                    success=False,
                    error="No document URL available for ingest"
                )

            # Parse to markdown text before handing it to the router.
            parsed = await self.parse_document(
                session_id=session_id,
                doc_url=doc_url,
                file_name=file_name or "document",
                dao_id=dao_id or "",
                user_id=user_id or "",
                output_mode="markdown",
                metadata={"source": self._extract_source(session_id), "mode": "ingest"},
            )
            if not parsed.success:
                return IngestResult(success=False, error=parsed.error or "Document parse failed")

            # Prefer the caller's doc_id, then the parser's, else mint a new one.
            effective_doc_id = doc_id or parsed.doc_id
            if not effective_doc_id:
                effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]

            doc_text = (parsed.markdown or "").strip()
            if not doc_text:
                return IngestResult(success=False, error="No extractable text for ingestion")

            payload = {
                "agent_id": (agent_id or "daarwizz").lower(),
                "doc_id": effective_doc_id,
                "file_name": file_name or "document",
                "text": doc_text,
                "dao_id": dao_id,
                "user_id": user_id,
                "metadata": {
                    "session_id": session_id,
                    "source": self._extract_source(session_id),
                },
            }
            response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)

            if response.get("ok"):
                return IngestResult(
                    success=True,
                    doc_id=response.get("doc_id") or effective_doc_id,
                    ingested_chunks=int(response.get("chunks_stored", 0) or 0),
                    status="ingested",
                )

            return IngestResult(
                success=False,
                doc_id=effective_doc_id,
                status="failed",
                error=response.get("error", "Router ingest failed"),
            )

        except Exception as e:
            logger.error(f"Document ingestion failed: {e}", exc_info=True)
            return IngestResult(
                success=False,
                error=str(e)
            )
|
||
|
||
    async def update_document(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        text: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        agent_id: str = "daarwizz",
        storage_ref: Optional[str] = None,
        publish_artifact: bool = False,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        artifact_label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> UpdateResult:
        """
        Update existing document content and bump version in router memory.

        New content comes from `text`, or is re-parsed from `doc_url` when no
        text is given. Missing identifiers are filled from the stored session
        context. When `publish_artifact` is True the updated text is also
        rendered and published to the artifact registry; a publish failure does
        not fail the update (see `status` / `publish_error` on the result).

        Args:
            session_id: Session identifier used for context lookup and refresh.
            doc_id: Document to update; falls back to session context.
            doc_url: Source URL used to re-parse when `text` is empty.
            file_name: File name for router metadata and artifact rendering.
            text: New document text; takes precedence over `doc_url`.
            dao_id: DAO identifier.
            user_id: User identifier.
            agent_id: Target agent namespace (lower-cased, defaults "daarwizz").
            storage_ref: Optional storage reference passed through to the router.
            publish_artifact: Also publish the text as an artifact version.
            artifact_id: Existing artifact to append to (new one created if empty).
            target_format: Output format for the published artifact.
            artifact_label: Version label for the published artifact.
            metadata: Extra metadata merged into router/publish metadata.

        Returns:
            UpdateResult describing the router update and optional publish step.
        """
        try:
            # Fill anything the caller omitted from the stored session context.
            context = await self.get_doc_context(session_id)
            if context:
                if not doc_id:
                    doc_id = context.doc_id
                if not doc_url:
                    doc_url = context.doc_url
                if not file_name:
                    file_name = context.file_name
                if not dao_id:
                    dao_id = context.dao_id

            if not doc_id:
                return UpdateResult(
                    success=False,
                    status="failed",
                    error="No document context found. Provide doc_id or parse/ingest first.",
                )

            effective_text = (text or "").strip()
            if not effective_text:
                # No inline text -- re-parse the source document to markdown.
                if not doc_url:
                    return UpdateResult(
                        success=False,
                        doc_id=doc_id,
                        status="failed",
                        error="No text or doc_url provided for update",
                    )
                parsed = await self.parse_document(
                    session_id=session_id,
                    doc_url=doc_url,
                    file_name=file_name or "document",
                    dao_id=dao_id or "",
                    user_id=user_id or "",
                    output_mode="markdown",
                    metadata={"source": self._extract_source(session_id), "mode": "update"},
                )
                if not parsed.success:
                    return UpdateResult(
                        success=False,
                        doc_id=doc_id,
                        status="failed",
                        error=parsed.error or "Document parse failed",
                    )
                effective_text = (parsed.markdown or "").strip()

            if not effective_text:
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error="No extractable text for update",
                )

            # Caller metadata may override the session/source defaults.
            meta = {
                "session_id": session_id,
                "source": self._extract_source(session_id),
            }
            if isinstance(metadata, dict):
                meta.update(metadata)

            response = await self._router_post_json(
                "/v1/documents/update",
                {
                    "agent_id": (agent_id or "daarwizz").lower(),
                    "doc_id": doc_id,
                    "file_name": file_name,
                    "text": effective_text,
                    "dao_id": dao_id,
                    "user_id": user_id,
                    "storage_ref": storage_ref,
                    "metadata": meta,
                },
                timeout=90.0,
            )

            if not response.get("ok"):
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error=response.get("error", "Router update failed"),
                )

            # Refresh the session context so follow-up calls see this document.
            await self.save_doc_context(
                session_id=session_id,
                doc_id=doc_id,
                doc_url=doc_url,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
            )

            # Optional write-back: render + publish the text as an artifact version.
            publish = PublishResult(success=False)
            if publish_artifact:
                publish = await self._publish_text_artifact(
                    text=effective_text,
                    doc_id=doc_id,
                    file_name=file_name,
                    dao_id=dao_id,
                    user_id=user_id,
                    artifact_id=artifact_id,
                    target_format=target_format,
                    label=artifact_label,
                    metadata=meta,
                )

            return UpdateResult(
                success=True,
                doc_id=response.get("doc_id") or doc_id,
                # Router version fields: 0/missing is normalized to None.
                version_no=int(response.get("version_no", 0) or 0) or None,
                version_id=int(response.get("version_id", 0) or 0) or None,
                updated_chunks=int(response.get("chunks_stored", 0) or 0),
                status="updated_published" if publish_artifact and publish.success else ("updated_publish_failed" if publish_artifact else "updated"),
                publish_error=publish.error if publish_artifact and not publish.success else None,
                artifact_id=publish.artifact_id if publish_artifact else None,
                artifact_version_id=publish.version_id if publish_artifact else None,
                artifact_storage_key=publish.storage_key if publish_artifact else None,
                artifact_mime=publish.mime if publish_artifact else None,
                artifact_download_url=publish.download_url if publish_artifact else None,
            )
        except Exception as e:
            logger.error(f"Document update failed: {e}", exc_info=True)
            return UpdateResult(
                success=False,
                doc_id=doc_id,
                status="failed",
                error=str(e),
            )
|
||
|
||
async def list_document_versions(
    self,
    agent_id: str,
    doc_id: str,
    limit: int = 20,
) -> Dict[str, Any]:
    """Fetch the version history of a document from the router.

    Args:
        agent_id: Agent namespace; defaults to "daarwizz" when empty.
        doc_id: Document identifier (required; whitespace-stripped).
        limit: Page size, clamped to the router-accepted window [1, 200].

    Returns:
        The router's JSON payload, or an ``{"ok": False, ..., "items": []}``
        dict when the doc_id is missing, the response is malformed, or the
        call raises.
    """
    normalized_agent = (agent_id or "daarwizz").lower()
    normalized_doc = (doc_id or "").strip()
    if not normalized_doc:
        return {"ok": False, "error": "doc_id is required", "items": []}
    try:
        clamped_limit = max(1, min(int(limit or 20), 200))
        payload = await self._router_get_json(
            f"/v1/documents/{normalized_doc}/versions"
            f"?agent_id={normalized_agent}&limit={clamped_limit}",
            timeout=30.0,
        )
        if isinstance(payload, dict):
            return payload
        return {"ok": False, "error": "invalid_response", "items": []}
    except Exception as e:
        logger.error(f"list_document_versions failed: {e}")
        return {"ok": False, "error": str(e), "items": []}
|
||
|
||
async def publish_document_artifact(
    self,
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """
    Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index.

    Missing identifiers are backfilled from the session's stored document
    context. When no inline ``text`` is supplied, the source document is
    re-parsed to markdown and that output is published instead.
    """
    try:
        # Backfill any missing identifiers from the saved session context.
        context = await self.get_doc_context(session_id)
        if context:
            doc_id = doc_id or context.doc_id
            doc_url = doc_url or context.doc_url
            file_name = file_name or context.file_name
            dao_id = dao_id or context.dao_id
            user_id = user_id or context.user_id

        if not doc_id:
            return PublishResult(success=False, error="doc_id is required")

        body = (text or "").strip()
        if not body:
            # No inline text: fall back to re-parsing the document itself.
            if not doc_url:
                return PublishResult(success=False, error="text or doc_url is required")
            parsed = await self.parse_document(
                session_id=session_id,
                doc_url=doc_url,
                file_name=file_name or "document",
                dao_id=dao_id or "",
                user_id=user_id or "",
                output_mode="markdown",
                metadata={"source": self._extract_source(session_id), "mode": "publish"},
            )
            if not parsed.success:
                return PublishResult(success=False, error=parsed.error or "Document parse failed")
            body = (parsed.markdown or "").strip()

        if not body:
            return PublishResult(success=False, error="No text available for publish")

        return await self._publish_text_artifact(
            text=body,
            doc_id=doc_id,
            file_name=file_name,
            dao_id=dao_id,
            user_id=user_id,
            artifact_id=artifact_id,
            target_format=target_format,
            label=artifact_label,
            metadata=metadata,
        )
    except Exception as e:
        logger.error(f"publish_document_artifact failed: {e}", exc_info=True)
        return PublishResult(success=False, error=str(e))
|
||
|
||
async def ask_about_document(
    self,
    session_id: str,
    question: str,
    doc_id: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz"
) -> QAResult:
    """
    Answer a question about a document via the router's RAG query endpoint.

    Args:
        session_id: Session identifier
        question: Question text
        doc_id: Document ID (if None, taken from the saved session context)
        dao_id: DAO identifier
        user_id: User identifier
        agent_id: Agent namespace for the RAG query

    Returns:
        QAResult with answer and citations
    """
    try:
        # The saved session context supplies doc_url/file_name and, when
        # the caller gave no doc_id, the document identity as well.
        doc_url: Optional[str] = None
        file_name: Optional[str] = None
        doc_context = await self.get_doc_context(session_id)
        if doc_context:
            doc_url = doc_context.doc_url
            file_name = doc_context.file_name
            if not doc_id:
                doc_id = doc_context.doc_id
                dao_id = dao_id or doc_context.dao_id

        if not doc_id:
            return QAResult(
                success=False,
                error="No document context found. Parse a document first."
            )

        # Derive user_id from the "<channel>:<user>" session format.
        if not user_id:
            _, sep, tail = session_id.partition(":")
            user_id = tail if sep else session_id

        # Shared deterministic Excel policy for top-level agrarian agents.
        shared_agent = (agent_id or "").lower() in SHARED_EXCEL_POLICY_AGENTS
        if shared_agent and self._is_excel_filename(file_name):
            deterministic = await self._try_answer_excel_question(
                question=question,
                doc_url=doc_url,
                file_name=file_name,
            )
            if deterministic:
                return QAResult(
                    success=True,
                    answer=deterministic,
                    doc_id=doc_id,
                    sources=[{
                        "type": "excel_deterministic",
                        "file_name": file_name,
                    }],
                )

        logger.info(
            f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
        )

        query_payload = {
            "agent_id": (agent_id or "daarwizz").lower(),
            "question": question,
            "doc_id": doc_id,
            "dao_id": dao_id,
            "user_id": user_id,
            "limit": 5,
        }
        response = await self._router_post_json(
            "/v1/documents/query",
            query_payload,
            timeout=60.0,
        )

        if isinstance(response, dict):
            if not response.get("ok", False):
                return QAResult(
                    success=False,
                    error=response.get("error", "Document query failed"),
                )
            data = response.get("data", {})
        else:
            data = {}

        answer = data.get("answer") or data.get("text")
        sources = data.get("citations", []) or data.get("sources", [])

        if not answer:
            return QAResult(
                success=False,
                error="No answer from RAG query"
            )
        return QAResult(
            success=True,
            answer=answer,
            doc_id=doc_id,
            sources=sources or None
        )

    except Exception as e:
        logger.error(f"RAG query failed: {e}", exc_info=True)
        return QAResult(
            success=False,
            error=str(e)
        )
|
||
|
||
def _extract_source(self, session_id: str) -> str:
|
||
"""Extract source channel from session_id"""
|
||
parts = session_id.split(":", 1)
|
||
return parts[0] if len(parts) > 1 else "unknown"
|
||
|
||
|
||
# Global instance
# Module-level singleton shared by the convenience wrapper functions below.
doc_service = DocumentService()
|
||
|
||
# Export functions for convenience
|
||
async def parse_document(
    session_id: str,
    doc_url: str,
    file_name: str,
    dao_id: str,
    user_id: str,
    output_mode: str = "qa_pairs",
    metadata: Optional[Dict[str, Any]] = None
) -> ParsedResult:
    """Parse a document through DAGI Router (delegates to the shared doc_service)."""
    call_kwargs = dict(
        session_id=session_id,
        doc_url=doc_url,
        file_name=file_name,
        dao_id=dao_id,
        user_id=user_id,
        output_mode=output_mode,
        metadata=metadata,
    )
    return await doc_service.parse_document(**call_kwargs)
|
||
|
||
|
||
async def ingest_document(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
) -> IngestResult:
    """Ingest document chunks into RAG/Memory (delegates to the shared doc_service)."""
    call_kwargs = dict(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        dao_id=dao_id,
        user_id=user_id,
        agent_id=agent_id,
    )
    return await doc_service.ingest_document(**call_kwargs)
|
||
|
||
|
||
async def ask_about_document(
    session_id: str,
    question: str,
    doc_id: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz"
) -> QAResult:
    """Ask a question about a document using RAG query (delegates to the shared doc_service)."""
    call_kwargs = dict(
        session_id=session_id,
        question=question,
        doc_id=doc_id,
        dao_id=dao_id,
        user_id=user_id,
        agent_id=agent_id,
    )
    return await doc_service.ask_about_document(**call_kwargs)
|
||
|
||
|
||
async def update_document(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
    storage_ref: Optional[str] = None,
    publish_artifact: bool = False,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
    """Update document chunks and bump version (delegates to the shared doc_service)."""
    call_kwargs = dict(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        agent_id=agent_id,
        storage_ref=storage_ref,
        publish_artifact=publish_artifact,
        artifact_id=artifact_id,
        target_format=target_format,
        artifact_label=artifact_label,
        metadata=metadata,
    )
    return await doc_service.update_document(**call_kwargs)
|
||
|
||
|
||
async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) -> Dict[str, Any]:
    """List document versions from router (delegates to the shared doc_service)."""
    return await doc_service.list_document_versions(agent_id=agent_id, doc_id=doc_id, limit=limit)
|
||
|
||
|
||
async def publish_document_artifact(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Publish physical artifact version for document text (delegates to the shared doc_service)."""
    call_kwargs = dict(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        artifact_id=artifact_id,
        target_format=target_format,
        artifact_label=artifact_label,
        metadata=metadata,
    )
    return await doc_service.publish_document_artifact(**call_kwargs)
|
||
|
||
|
||
async def save_doc_context(
    session_id: str,
    doc_id: str,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
) -> bool:
    """Save document context for a session (delegates to the shared doc_service)."""
    call_kwargs = dict(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        dao_id=dao_id,
        user_id=user_id,
    )
    return await doc_service.save_doc_context(**call_kwargs)
|
||
|
||
|
||
async def get_doc_context(session_id: str) -> Optional[DocContext]:
    """Get document context for a session (delegates to the shared doc_service)."""
    context = await doc_service.get_doc_context(session_id)
    return context
|