feat(docs): add document write-back publish pipeline

This commit is contained in:
NODA1 System
2026-02-21 17:02:55 +01:00
parent f53e71a0f4
commit cca16254e5
4 changed files with 569 additions and 4 deletions

View File

@@ -22,6 +22,7 @@ RUN pip install --no-cache-dir \
nats-py \
pandas \
openpyxl \
python-docx \
redis==5.0.1
# Copy gateway code and DAARWIZZ prompt

View File

@@ -7,6 +7,7 @@ Endpoints:
- POST /api/doc/ingest - Ingest document to RAG
- POST /api/doc/ask - Ask question about document
- POST /api/doc/update - Update existing document text (versioned)
- POST /api/doc/publish - Publish physical file version via artifact registry
- GET /api/doc/versions/{doc_id} - List document versions
"""
import logging
@@ -21,6 +22,7 @@ from services.doc_service import (
ask_about_document,
update_document,
list_document_versions,
publish_document_artifact,
get_doc_context,
ParsedResult,
IngestResult,
@@ -81,6 +83,25 @@ class UpdateDocumentRequest(BaseModel):
user_id: Optional[str] = None
agent_id: str = "daarwizz"
storage_ref: Optional[str] = None
publish_artifact: bool = False
artifact_id: Optional[str] = None
target_format: Optional[str] = None
artifact_label: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
class PublishDocumentRequest(BaseModel):
    """Request to publish document as physical artifact version.

    Fields left as ``None`` are backfilled server-side from the session's
    stored doc context; when ``text`` is omitted the document is re-parsed
    from ``doc_url``.
    """
    session_id: str  # chat/session key used to resolve stored doc context
    doc_id: Optional[str] = None  # logical document id (falls back to context)
    doc_url: Optional[str] = None  # source URL, re-parsed when text is missing
    file_name: Optional[str] = None  # original filename; its extension drives the output format
    text: Optional[str] = None  # explicit text to publish (preferred over re-parsing)
    dao_id: Optional[str] = None  # owning DAO/project scope (used as project_id/acl_ref)
    user_id: Optional[str] = None  # acting user, recorded as created_by on new artifacts
    artifact_id: Optional[str] = None  # existing artifact to append a version to; new one created if absent
    target_format: Optional[str] = None  # desired output extension (docx/xlsx/txt/...)
    artifact_label: Optional[str] = None  # version label; service defaults to "edited"
    metadata: Optional[Dict[str, Any]] = None  # extra meta_json merged into the version metadata
@@ -267,18 +288,29 @@ async def update_document_endpoint(request: UpdateDocumentRequest):
user_id=request.user_id,
agent_id=request.agent_id,
storage_ref=request.storage_ref,
publish_artifact=request.publish_artifact,
artifact_id=request.artifact_id,
target_format=request.target_format,
artifact_label=request.artifact_label,
metadata=request.metadata,
)
if not result.success:
raise HTTPException(status_code=400, detail=result.error)
return {
response = {
"ok": True,
"doc_id": result.doc_id,
"version_no": result.version_no,
"version_id": result.version_id,
"updated_chunks": result.updated_chunks,
"status": result.status,
"publish_error": result.publish_error,
"artifact_id": result.artifact_id,
"artifact_version_id": result.artifact_version_id,
"artifact_storage_key": result.artifact_storage_key,
"artifact_mime": result.artifact_mime,
"artifact_download_url": result.artifact_download_url,
}
return response
except HTTPException:
raise
except Exception as e:
@@ -286,6 +318,43 @@ async def update_document_endpoint(request: UpdateDocumentRequest):
raise HTTPException(status_code=500, detail=str(e))
@router.post("/api/doc/publish")
async def publish_document_endpoint(request: PublishDocumentRequest):
    """
    Publish current document text as physical file artifact version.

    Delegates to the doc service; a failed publish maps to HTTP 400 with the
    service error, unexpected exceptions map to HTTP 500.
    """
    try:
        publish = await publish_document_artifact(
            session_id=request.session_id,
            doc_id=request.doc_id,
            doc_url=request.doc_url,
            file_name=request.file_name,
            text=request.text,
            dao_id=request.dao_id,
            user_id=request.user_id,
            artifact_id=request.artifact_id,
            target_format=request.target_format,
            artifact_label=request.artifact_label,
            metadata=request.metadata,
        )
        if not publish.success:
            raise HTTPException(status_code=400, detail=publish.error)
        # Mirror the successful PublishResult fields into the JSON payload.
        payload = {"ok": True}
        for field in ("artifact_id", "version_id", "storage_key", "mime", "file_name", "download_url"):
            payload[field] = getattr(publish, field)
        return payload
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Publish document error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/api/doc/versions/{doc_id}")
async def list_document_versions_endpoint(doc_id: str, agent_id: str = "daarwizz", limit: int = 20):
"""

View File

@@ -11,11 +11,13 @@ This service can be used by:
import os
import logging
import hashlib
import base64
import json
import re
from typing import Optional, Dict, Any, List
from pydantic import BaseModel
from datetime import datetime
from io import BytesIO
from memory_client import memory_client
@@ -23,6 +25,8 @@ logger = logging.getLogger(__name__)
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service")
class QAItem(BaseModel):
@@ -59,6 +63,24 @@ class UpdateResult(BaseModel):
version_id: Optional[int] = None
updated_chunks: int = 0
status: str = "unknown"
publish_error: Optional[str] = None
artifact_id: Optional[str] = None
artifact_version_id: Optional[str] = None
artifact_storage_key: Optional[str] = None
artifact_mime: Optional[str] = None
artifact_download_url: Optional[str] = None
error: Optional[str] = None
class PublishResult(BaseModel):
    """Result of artifact write-back publish.

    ``success`` is the primary flag; on failure only ``error`` (and possibly
    ``artifact_id``, when the artifact was created before the version failed)
    is populated.
    """
    success: bool  # True when a version was stored in the registry
    artifact_id: Optional[str] = None  # registry artifact the version belongs to
    version_id: Optional[str] = None  # newly created version id
    storage_key: Optional[str] = None  # object-store key of the uploaded bytes
    mime: Optional[str] = None  # MIME type of the rendered file
    file_name: Optional[str] = None  # rendered output filename
    download_url: Optional[str] = None  # presigned URL; best-effort, may be None
    error: Optional[str] = None  # failure description when success is False
@@ -138,6 +160,217 @@ class DocumentService:
raise RuntimeError(f"Router error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
async def _artifact_post_json(
self,
path: str,
payload: Dict[str, Any],
timeout: float = 45.0,
) -> Dict[str, Any]:
import httpx
base = ARTIFACT_REGISTRY_URL.rstrip("/")
url = f"{base}{path}"
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(url, json=payload)
body = {}
try:
body = resp.json()
except Exception:
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
if resp.status_code >= 400:
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
raise RuntimeError(f"Artifact registry error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
async def _artifact_get_json(
self,
path: str,
timeout: float = 30.0,
) -> Dict[str, Any]:
import httpx
base = ARTIFACT_REGISTRY_URL.rstrip("/")
url = f"{base}{path}"
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.get(url)
body = {}
try:
body = resp.json()
except Exception:
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
if resp.status_code >= 400:
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
raise RuntimeError(f"Artifact registry error on {path}: {err}")
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str:
fmt = (target_format or "").strip().lower().lstrip(".")
if fmt:
return fmt
if file_name and "." in file_name:
return file_name.rsplit(".", 1)[1].strip().lower()
return "txt"
def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str:
base = "document"
if file_name:
base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
if "." in base:
base = base.rsplit(".", 1)[0]
elif doc_id:
base = doc_id
safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document"
return f"{safe_base}.{fmt}"
def _render_document_bytes(
self,
text: str,
file_name: Optional[str],
doc_id: str,
target_format: Optional[str] = None,
) -> Dict[str, Any]:
body = (text or "").strip()
if not body:
raise ValueError("Cannot render empty document text")
fmt = self._resolve_format(file_name=file_name, target_format=target_format)
output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt)
if fmt in {"txt"}:
payload = body.encode("utf-8")
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": output_name}
if fmt in {"md", "markdown"}:
payload = body.encode("utf-8")
return {"bytes": payload, "mime": "text/markdown; charset=utf-8", "file_name": output_name}
if fmt in {"json"}:
parsed: Any
try:
parsed = json.loads(body)
except Exception:
parsed = {"text": body}
payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8")
return {"bytes": payload, "mime": "application/json", "file_name": output_name}
if fmt in {"csv"}:
payload = body.encode("utf-8")
return {"bytes": payload, "mime": "text/csv; charset=utf-8", "file_name": output_name}
if fmt in {"xlsx", "xlsm", "xls"}:
try:
from openpyxl import Workbook
except Exception as e:
raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}")
wb = Workbook()
ws = wb.active
ws.title = "Document"
lines = [ln for ln in body.splitlines()] or [body]
for idx, line in enumerate(lines, start=1):
ws.cell(row=idx, column=1, value=line)
buf = BytesIO()
wb.save(buf)
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")}
if fmt in {"docx"}:
try:
from docx import Document
except Exception as e:
raise RuntimeError(f"python-docx is required for docx rendering: {e}")
doc = Document()
for line in body.splitlines():
doc.add_paragraph(line if line else " ")
buf = BytesIO()
doc.save(buf)
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")}
payload = body.encode("utf-8")
fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt")
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": fallback_name}
    async def _publish_text_artifact(
        self,
        text: str,
        doc_id: str,
        file_name: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> PublishResult:
        """Render text to bytes and publish it as an artifact version.

        When ``artifact_id`` is empty, a new "doc" artifact is created in the
        registry first; otherwise a version is appended to the existing one.
        Never raises: every failure is reported via ``PublishResult.error``.
        """
        try:
            # Render to the resolved output format (docx/xlsx/txt/... fallback).
            rendered = self._render_document_bytes(
                text=text,
                file_name=file_name,
                doc_id=doc_id,
                target_format=target_format,
            )
            content_bytes = rendered["bytes"]
            content_b64 = base64.b64encode(content_bytes).decode("ascii")
            effective_artifact_id = (artifact_id or "").strip()
            if not effective_artifact_id:
                # No target artifact supplied: create a fresh "doc" artifact.
                create_resp = await self._artifact_post_json(
                    "/artifacts",
                    {
                        "type": "doc",
                        "title": file_name or doc_id,
                        "project_id": dao_id,
                        "acl_ref": dao_id,
                        "created_by": user_id or DOC_WRITEBACK_CREATED_BY,
                    },
                    timeout=30.0,
                )
                effective_artifact_id = str(create_resp.get("artifact_id") or "").strip()
                if not effective_artifact_id:
                    return PublishResult(success=False, error="Artifact create failed: empty artifact_id")
            # Caller metadata is merged over the base tracking keys (and may
            # override them).
            meta = {"doc_id": doc_id, "source": "doc_update_publish"}
            if isinstance(metadata, dict):
                meta.update(metadata)
            version_resp = await self._artifact_post_json(
                f"/artifacts/{effective_artifact_id}/versions/from_base64",
                {
                    "content_base64": content_b64,
                    "mime": rendered["mime"],
                    "filename": rendered["file_name"],
                    "label": label or "edited",
                    "meta_json": meta,
                },
                timeout=45.0,
            )
            version_id = str(version_resp.get("version_id") or "").strip()
            storage_key = version_resp.get("storage_key")
            if not version_id:
                return PublishResult(
                    success=False,
                    artifact_id=effective_artifact_id,
                    error="Artifact version create failed: empty version_id",
                )
            # Download URL is best-effort: the publish still succeeds without it.
            download_url = None
            try:
                dl = await self._artifact_get_json(
                    f"/artifacts/{effective_artifact_id}/versions/{version_id}/download",
                    timeout=20.0,
                )
                download_url = dl.get("url")
            except Exception as e:
                logger.warning(f"version download url generation failed: {e}")
            return PublishResult(
                success=True,
                artifact_id=effective_artifact_id,
                version_id=version_id,
                storage_key=storage_key,
                mime=rendered["mime"],
                file_name=rendered["file_name"],
                download_url=download_url,
            )
        except Exception as e:
            logger.error(f"publish_text_artifact failed: {e}", exc_info=True)
            return PublishResult(success=False, error=str(e))
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
if not file_name:
return False
@@ -616,6 +849,10 @@ class DocumentService:
user_id: Optional[str] = None,
agent_id: str = "daarwizz",
storage_ref: Optional[str] = None,
publish_artifact: bool = False,
artifact_id: Optional[str] = None,
target_format: Optional[str] = None,
artifact_label: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
"""
@@ -714,13 +951,33 @@ class DocumentService:
user_id=user_id,
)
publish = PublishResult(success=False)
if publish_artifact:
publish = await self._publish_text_artifact(
text=effective_text,
doc_id=doc_id,
file_name=file_name,
dao_id=dao_id,
user_id=user_id,
artifact_id=artifact_id,
target_format=target_format,
label=artifact_label,
metadata=meta,
)
return UpdateResult(
success=True,
doc_id=response.get("doc_id") or doc_id,
version_no=int(response.get("version_no", 0) or 0) or None,
version_id=int(response.get("version_id", 0) or 0) or None,
updated_chunks=int(response.get("chunks_stored", 0) or 0),
status="updated",
status="updated_published" if publish_artifact and publish.success else ("updated_publish_failed" if publish_artifact else "updated"),
publish_error=publish.error if publish_artifact and not publish.success else None,
artifact_id=publish.artifact_id if publish_artifact else None,
artifact_version_id=publish.version_id if publish_artifact else None,
artifact_storage_key=publish.storage_key if publish_artifact else None,
artifact_mime=publish.mime if publish_artifact else None,
artifact_download_url=publish.download_url if publish_artifact else None,
)
except Exception as e:
logger.error(f"Document update failed: {e}", exc_info=True)
@@ -751,6 +1008,75 @@ class DocumentService:
logger.error(f"list_document_versions failed: {e}")
return {"ok": False, "error": str(e), "items": []}
    async def publish_document_artifact(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        text: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        artifact_label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> PublishResult:
        """
        Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index.

        Missing doc_id/doc_url/file_name/dao_id/user_id are backfilled from the
        session's stored doc context. When ``text`` is empty, the document is
        re-parsed from ``doc_url`` (markdown output) to obtain text. Never
        raises; failures are reported via ``PublishResult.error``.
        """
        try:
            # Backfill missing identifiers from the per-session doc context.
            context = await self.get_doc_context(session_id)
            if context:
                if not doc_id:
                    doc_id = context.doc_id
                if not doc_url:
                    doc_url = context.doc_url
                if not file_name:
                    file_name = context.file_name
                if not dao_id:
                    dao_id = context.dao_id
                if not user_id:
                    user_id = context.user_id
            if not doc_id:
                return PublishResult(success=False, error="doc_id is required")
            body = (text or "").strip()
            if not body:
                # No explicit text: fall back to re-parsing the source document.
                if not doc_url:
                    return PublishResult(success=False, error="text or doc_url is required")
                parsed = await self.parse_document(
                    session_id=session_id,
                    doc_url=doc_url,
                    file_name=file_name or "document",
                    dao_id=dao_id or "",
                    user_id=user_id or "",
                    output_mode="markdown",
                    metadata={"source": self._extract_source(session_id), "mode": "publish"},
                )
                if not parsed.success:
                    return PublishResult(success=False, error=parsed.error or "Document parse failed")
                body = (parsed.markdown or "").strip()
            if not body:
                return PublishResult(success=False, error="No text available for publish")
            return await self._publish_text_artifact(
                text=body,
                doc_id=doc_id,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
                artifact_id=artifact_id,
                target_format=target_format,
                label=artifact_label,
                metadata=metadata,
            )
        except Exception as e:
            logger.error(f"publish_document_artifact failed: {e}", exc_info=True)
            return PublishResult(success=False, error=str(e))
async def ask_about_document(
self,
session_id: str,
@@ -950,6 +1276,10 @@ async def update_document(
user_id: Optional[str] = None,
agent_id: str = "daarwizz",
storage_ref: Optional[str] = None,
publish_artifact: bool = False,
artifact_id: Optional[str] = None,
target_format: Optional[str] = None,
artifact_label: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
"""Update document chunks and bump version."""
@@ -963,6 +1293,10 @@ async def update_document(
user_id=user_id,
agent_id=agent_id,
storage_ref=storage_ref,
publish_artifact=publish_artifact,
artifact_id=artifact_id,
target_format=target_format,
artifact_label=artifact_label,
metadata=metadata,
)
@@ -976,6 +1310,35 @@ async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) ->
)
async def publish_document_artifact(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Publish physical artifact version for document text.

    Module-level convenience wrapper around the shared ``doc_service``
    singleton; forwards every argument unchanged.
    """
    forwarded = dict(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        artifact_id=artifact_id,
        target_format=target_format,
        artifact_label=artifact_label,
        metadata=metadata,
    )
    return await doc_service.publish_document_artifact(**forwarded)
async def save_doc_context(
session_id: str,
doc_id: str,

View File

@@ -6,13 +6,15 @@ Artifact Registry v0
"""
import asyncio
import base64
import hashlib
import json
import logging
import os
import re
import uuid
from io import BytesIO
from datetime import datetime
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
import asyncpg
@@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel):
meta_json: Optional[Dict[str, Any]] = None
class ArtifactVersionFromBase64Request(BaseModel):
    """Request body for creating an artifact version from base64 content."""
    content_base64: str  # raw base64; a "data:...," URI prefix is tolerated
    mime: str  # MIME type stored with the version and sent to object storage
    filename: Optional[str] = "source.bin"  # sanitized before use in the storage key
    label: Optional[str] = "source"  # version label recorded in the DB row
    meta_json: Optional[Dict[str, Any]] = None  # arbitrary metadata for the version row
class ArtifactVersionResponse(BaseModel):
version_id: str
storage_key: str
@@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]:
def _format_to_mime(fmt: str) -> str:
fmt = fmt.lower()
if "/" in fmt:
return fmt
if fmt == "pptx":
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
if fmt == "pdf":
return "application/pdf"
if fmt == "source":
return "application/json"
if fmt == "docx":
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
if fmt == "xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
if fmt == "txt":
return "text/plain; charset=utf-8"
if fmt == "md":
return "text/markdown; charset=utf-8"
if fmt == "json":
return "application/json"
if fmt == "csv":
return "text/csv; charset=utf-8"
return "application/octet-stream"
def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str:
raw = (name or fallback).strip() or fallback
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw)
cleaned = cleaned.strip("._")
if not cleaned:
return fallback
return cleaned[:120]
async def _download_bytes(url: str) -> bytes:
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.get(url)
@@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl
)
@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse)
async def add_version_from_base64(artifact_id: str, payload: ArtifactVersionFromBase64Request) -> ArtifactVersionResponse:
    """Store base64-encoded content as a new version of an existing artifact.

    Decodes the payload (a "data:" URI prefix is tolerated), uploads the bytes
    to MinIO under a fresh version storage key, then records the version row
    in Postgres.

    Raises:
        HTTPException: 400 for missing/invalid/empty base64; 500 when MinIO or
            the DB pool is unavailable; 502 on MinIO upload errors.
    """
    if not minio_client:
        raise HTTPException(status_code=500, detail="MinIO not available")
    if not pool:
        raise HTTPException(status_code=500, detail="DB not available")
    raw = (payload.content_base64 or "").strip()
    if not raw:
        raise HTTPException(status_code=400, detail="content_base64 is required")
    # Accept data URIs ("data:<mime>;base64,<payload>") by dropping the header.
    if raw.startswith("data:") and "," in raw:
        raw = raw.split(",", 1)[1]
    try:
        # validate=True rejects non-alphabet characters instead of ignoring them.
        content = base64.b64decode(raw, validate=True)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid base64 payload")
    if not content:
        raise HTTPException(status_code=400, detail="Decoded payload is empty")
    version_id = f"ver_{uuid.uuid4().hex}"
    filename = _safe_filename(payload.filename, fallback="source.bin")
    sha256 = _hash_bytes(content)
    storage_key = _storage_key(artifact_id, version_id, filename)
    try:
        minio_client.put_object(
            MINIO_BUCKET,
            storage_key,
            data=BytesIO(content),
            length=len(content),
            content_type=payload.mime,
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
    meta_json = _normalize_meta_json(payload.meta_json)
    # Preserve the sanitized filename in metadata unless the caller set one.
    if "file_name" not in meta_json:
        meta_json["file_name"] = filename
    async with pool.acquire() as conn:
        await conn.execute(
            """
            insert into artifact_versions
            (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json)
            values ($1, $2, $3, $4, $5, $6, $7, $8)
            """,
            version_id,
            artifact_id,
            payload.label or "source",
            sha256,
            payload.mime,
            len(content),
            storage_key,
            json.dumps(meta_json),
        )
    return ArtifactVersionResponse(
        version_id=version_id,
        storage_key=storage_key,
        sha256=sha256,
        size_bytes=len(content),
    )
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
if not pool:
@@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di
if not row:
raise HTTPException(status_code=404, detail="Version not found")
try:
url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800)
url = minio_client.presigned_get_object(
MINIO_BUCKET,
row["storage_key"],
expires=timedelta(seconds=1800),
)
except S3Error as e:
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
@app.get("/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]:
    """Return a presigned download URL for one specific artifact version.

    Raises:
        HTTPException: 500 when DB/MinIO are unavailable; 404 when the version
            does not exist; 502 on MinIO errors.
    """
    if not pool or not minio_client:
        raise HTTPException(status_code=500, detail="Service not available")
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            select * from artifact_versions
            where artifact_id=$1 and id=$2
            limit 1
            """,
            artifact_id,
            version_id,
        )
    if not row:
        raise HTTPException(status_code=404, detail="Version not found")
    try:
        # minio-py expects a timedelta for expires (30-minute link lifetime).
        url = minio_client.presigned_get_object(
            MINIO_BUCKET,
            row["storage_key"],
            expires=timedelta(seconds=1800),
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
    return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"], "version_id": row["id"]}