feat(docs): add document write-back publish pipeline
This commit is contained in:
@@ -22,6 +22,7 @@ RUN pip install --no-cache-dir \
|
|||||||
nats-py \
|
nats-py \
|
||||||
pandas \
|
pandas \
|
||||||
openpyxl \
|
openpyxl \
|
||||||
|
python-docx \
|
||||||
redis==5.0.1
|
redis==5.0.1
|
||||||
|
|
||||||
# Copy gateway code and DAARWIZZ prompt
|
# Copy gateway code and DAARWIZZ prompt
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ Endpoints:
|
|||||||
- POST /api/doc/ingest - Ingest document to RAG
|
- POST /api/doc/ingest - Ingest document to RAG
|
||||||
- POST /api/doc/ask - Ask question about document
|
- POST /api/doc/ask - Ask question about document
|
||||||
- POST /api/doc/update - Update existing document text (versioned)
|
- POST /api/doc/update - Update existing document text (versioned)
|
||||||
|
- POST /api/doc/publish - Publish physical file version via artifact registry
|
||||||
- GET /api/doc/versions/{doc_id} - List document versions
|
- GET /api/doc/versions/{doc_id} - List document versions
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
@@ -21,6 +22,7 @@ from services.doc_service import (
|
|||||||
ask_about_document,
|
ask_about_document,
|
||||||
update_document,
|
update_document,
|
||||||
list_document_versions,
|
list_document_versions,
|
||||||
|
publish_document_artifact,
|
||||||
get_doc_context,
|
get_doc_context,
|
||||||
ParsedResult,
|
ParsedResult,
|
||||||
IngestResult,
|
IngestResult,
|
||||||
@@ -81,6 +83,25 @@ class UpdateDocumentRequest(BaseModel):
|
|||||||
user_id: Optional[str] = None
|
user_id: Optional[str] = None
|
||||||
agent_id: str = "daarwizz"
|
agent_id: str = "daarwizz"
|
||||||
storage_ref: Optional[str] = None
|
storage_ref: Optional[str] = None
|
||||||
|
publish_artifact: bool = False
|
||||||
|
artifact_id: Optional[str] = None
|
||||||
|
target_format: Optional[str] = None
|
||||||
|
artifact_label: Optional[str] = None
|
||||||
|
metadata: Optional[Dict[str, Any]] = None
|
||||||
|
|
||||||
|
|
||||||
|
class PublishDocumentRequest(BaseModel):
|
||||||
|
"""Request to publish document as physical artifact version."""
|
||||||
|
session_id: str
|
||||||
|
doc_id: Optional[str] = None
|
||||||
|
doc_url: Optional[str] = None
|
||||||
|
file_name: Optional[str] = None
|
||||||
|
text: Optional[str] = None
|
||||||
|
dao_id: Optional[str] = None
|
||||||
|
user_id: Optional[str] = None
|
||||||
|
artifact_id: Optional[str] = None
|
||||||
|
target_format: Optional[str] = None
|
||||||
|
artifact_label: Optional[str] = None
|
||||||
metadata: Optional[Dict[str, Any]] = None
|
metadata: Optional[Dict[str, Any]] = None
|
||||||
|
|
||||||
|
|
||||||
@@ -267,18 +288,29 @@ async def update_document_endpoint(request: UpdateDocumentRequest):
|
|||||||
user_id=request.user_id,
|
user_id=request.user_id,
|
||||||
agent_id=request.agent_id,
|
agent_id=request.agent_id,
|
||||||
storage_ref=request.storage_ref,
|
storage_ref=request.storage_ref,
|
||||||
|
publish_artifact=request.publish_artifact,
|
||||||
|
artifact_id=request.artifact_id,
|
||||||
|
target_format=request.target_format,
|
||||||
|
artifact_label=request.artifact_label,
|
||||||
metadata=request.metadata,
|
metadata=request.metadata,
|
||||||
)
|
)
|
||||||
if not result.success:
|
if not result.success:
|
||||||
raise HTTPException(status_code=400, detail=result.error)
|
raise HTTPException(status_code=400, detail=result.error)
|
||||||
return {
|
response = {
|
||||||
"ok": True,
|
"ok": True,
|
||||||
"doc_id": result.doc_id,
|
"doc_id": result.doc_id,
|
||||||
"version_no": result.version_no,
|
"version_no": result.version_no,
|
||||||
"version_id": result.version_id,
|
"version_id": result.version_id,
|
||||||
"updated_chunks": result.updated_chunks,
|
"updated_chunks": result.updated_chunks,
|
||||||
"status": result.status,
|
"status": result.status,
|
||||||
|
"publish_error": result.publish_error,
|
||||||
|
"artifact_id": result.artifact_id,
|
||||||
|
"artifact_version_id": result.artifact_version_id,
|
||||||
|
"artifact_storage_key": result.artifact_storage_key,
|
||||||
|
"artifact_mime": result.artifact_mime,
|
||||||
|
"artifact_download_url": result.artifact_download_url,
|
||||||
}
|
}
|
||||||
|
return response
|
||||||
except HTTPException:
|
except HTTPException:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -286,6 +318,43 @@ async def update_document_endpoint(request: UpdateDocumentRequest):
|
|||||||
raise HTTPException(status_code=500, detail=str(e))
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/api/doc/publish")
async def publish_document_endpoint(request: PublishDocumentRequest):
    """
    Publish current document text as physical file artifact version.

    Delegates to the doc service; 400 on a failed publish, 500 on unexpected
    errors, and returns a flat JSON payload describing the stored version.
    """
    try:
        result = await publish_document_artifact(
            session_id=request.session_id,
            doc_id=request.doc_id,
            doc_url=request.doc_url,
            file_name=request.file_name,
            text=request.text,
            dao_id=request.dao_id,
            user_id=request.user_id,
            artifact_id=request.artifact_id,
            target_format=request.target_format,
            artifact_label=request.artifact_label,
            metadata=request.metadata,
        )
        if not result.success:
            raise HTTPException(status_code=400, detail=result.error)

        # Flatten the publish result fields into the wire response.
        response = {"ok": True}
        for field in ("artifact_id", "version_id", "storage_key", "mime", "file_name", "download_url"):
            response[field] = getattr(result, field)
        return response
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Publish document error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
|
||||||
@router.get("/api/doc/versions/{doc_id}")
|
@router.get("/api/doc/versions/{doc_id}")
|
||||||
async def list_document_versions_endpoint(doc_id: str, agent_id: str = "daarwizz", limit: int = 20):
|
async def list_document_versions_endpoint(doc_id: str, agent_id: str = "daarwizz", limit: int = 20):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -11,11 +11,13 @@ This service can be used by:
|
|||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import base64
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import Optional, Dict, Any, List
|
from typing import Optional, Dict, Any, List
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
from memory_client import memory_client
|
from memory_client import memory_client
|
||||||
|
|
||||||
@@ -23,6 +25,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
|
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
|
||||||
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
|
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
|
||||||
|
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
|
||||||
|
DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service")
|
||||||
|
|
||||||
|
|
||||||
class QAItem(BaseModel):
|
class QAItem(BaseModel):
|
||||||
@@ -59,6 +63,24 @@ class UpdateResult(BaseModel):
|
|||||||
version_id: Optional[int] = None
|
version_id: Optional[int] = None
|
||||||
updated_chunks: int = 0
|
updated_chunks: int = 0
|
||||||
status: str = "unknown"
|
status: str = "unknown"
|
||||||
|
publish_error: Optional[str] = None
|
||||||
|
artifact_id: Optional[str] = None
|
||||||
|
artifact_version_id: Optional[str] = None
|
||||||
|
artifact_storage_key: Optional[str] = None
|
||||||
|
artifact_mime: Optional[str] = None
|
||||||
|
artifact_download_url: Optional[str] = None
|
||||||
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class PublishResult(BaseModel):
|
||||||
|
"""Result of artifact write-back publish."""
|
||||||
|
success: bool
|
||||||
|
artifact_id: Optional[str] = None
|
||||||
|
version_id: Optional[str] = None
|
||||||
|
storage_key: Optional[str] = None
|
||||||
|
mime: Optional[str] = None
|
||||||
|
file_name: Optional[str] = None
|
||||||
|
download_url: Optional[str] = None
|
||||||
error: Optional[str] = None
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
@@ -138,6 +160,217 @@ class DocumentService:
|
|||||||
raise RuntimeError(f"Router error on {path}: {err}")
|
raise RuntimeError(f"Router error on {path}: {err}")
|
||||||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
|
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
|
||||||
|
|
||||||
|
async def _artifact_post_json(
    self,
    path: str,
    payload: Dict[str, Any],
    timeout: float = 45.0,
) -> Dict[str, Any]:
    """POST *payload* as JSON to the artifact registry and return the decoded body.

    Raises:
        RuntimeError: on any HTTP >= 400 response, surfacing the registry's
            'detail'/'error' field when present.
    """
    import httpx

    endpoint = f"{ARTIFACT_REGISTRY_URL.rstrip('/')}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(endpoint, json=payload)
        try:
            data = resp.json()
        except Exception:
            data = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
        if resp.status_code >= 400:
            detail = data.get("detail") or data.get("error") or f"HTTP {resp.status_code}"
            raise RuntimeError(f"Artifact registry error on {path}: {detail}")
    if not isinstance(data, dict):
        return {"ok": False, "error": "Invalid artifact response type"}
    return data
|
||||||
|
|
||||||
|
async def _artifact_get_json(
    self,
    path: str,
    timeout: float = 30.0,
) -> Dict[str, Any]:
    """GET *path* from the artifact registry and return the decoded JSON body.

    Raises:
        RuntimeError: on any HTTP >= 400 response, surfacing the registry's
            'detail'/'error' field when present.
    """
    import httpx

    endpoint = f"{ARTIFACT_REGISTRY_URL.rstrip('/')}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.get(endpoint)
        try:
            data = resp.json()
        except Exception:
            data = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
        if resp.status_code >= 400:
            detail = data.get("detail") or data.get("error") or f"HTTP {resp.status_code}"
            raise RuntimeError(f"Artifact registry error on {path}: {detail}")
    if not isinstance(data, dict):
        return {"ok": False, "error": "Invalid artifact response type"}
    return data
|
||||||
|
|
||||||
|
def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str:
|
||||||
|
fmt = (target_format or "").strip().lower().lstrip(".")
|
||||||
|
if fmt:
|
||||||
|
return fmt
|
||||||
|
if file_name and "." in file_name:
|
||||||
|
return file_name.rsplit(".", 1)[1].strip().lower()
|
||||||
|
return "txt"
|
||||||
|
|
||||||
|
def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str:
|
||||||
|
base = "document"
|
||||||
|
if file_name:
|
||||||
|
base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
|
||||||
|
if "." in base:
|
||||||
|
base = base.rsplit(".", 1)[0]
|
||||||
|
elif doc_id:
|
||||||
|
base = doc_id
|
||||||
|
safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document"
|
||||||
|
return f"{safe_base}.{fmt}"
|
||||||
|
|
||||||
|
def _render_document_bytes(
    self,
    text: str,
    file_name: Optional[str],
    doc_id: str,
    target_format: Optional[str] = None,
) -> Dict[str, Any]:
    """Render document text into file bytes for the requested output format.

    Returns a dict with keys:
        bytes     -- rendered file content,
        mime      -- MIME type to store/serve the content with,
        file_name -- sanitized output name carrying the right extension.

    Raises:
        ValueError: if *text* is empty after stripping.
        RuntimeError: if an optional renderer (openpyxl / python-docx) is missing.
    """
    body = (text or "").strip()
    if not body:
        raise ValueError("Cannot render empty document text")

    fmt = self._resolve_format(file_name=file_name, target_format=target_format)
    output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt)

    if fmt == "txt":
        return {"bytes": body.encode("utf-8"), "mime": "text/plain; charset=utf-8", "file_name": output_name}

    if fmt in {"md", "markdown"}:
        return {"bytes": body.encode("utf-8"), "mime": "text/markdown; charset=utf-8", "file_name": output_name}

    if fmt == "json":
        # Pretty-print valid JSON; wrap arbitrary text so the output is always valid JSON.
        parsed: Any
        try:
            parsed = json.loads(body)
        except Exception:
            parsed = {"text": body}
        payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8")
        return {"bytes": payload, "mime": "application/json", "file_name": output_name}

    if fmt == "csv":
        # Text is passed through verbatim — assumes it is already CSV-shaped.
        return {"bytes": body.encode("utf-8"), "mime": "text/csv; charset=utf-8", "file_name": output_name}

    if fmt in {"xlsx", "xlsm", "xls"}:
        try:
            from openpyxl import Workbook
        except Exception as e:
            raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}")
        wb = Workbook()
        ws = wb.active
        ws.title = "Document"
        # One text line per row in column A; legacy xls/xlsm requests are emitted as xlsx.
        lines = body.splitlines() or [body]
        for idx, line in enumerate(lines, start=1):
            ws.cell(row=idx, column=1, value=line)
        buf = BytesIO()
        wb.save(buf)
        mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")}

    if fmt == "docx":
        try:
            from docx import Document
        except Exception as e:
            raise RuntimeError(f"python-docx is required for docx rendering: {e}")
        doc = Document()
        for line in body.splitlines():
            # python-docx needs non-empty paragraph text to keep blank lines visible.
            doc.add_paragraph(line if line else " ")
        buf = BytesIO()
        doc.save(buf)
        mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")}

    # Unknown format: fall back to plain text with a .txt name.
    fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt")
    return {"bytes": body.encode("utf-8"), "mime": "text/plain; charset=utf-8", "file_name": fallback_name}
|
||||||
|
|
||||||
|
async def _publish_text_artifact(
    self,
    text: str,
    doc_id: str,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Render *text* and push it to the artifact registry as a new version.

    Creates a fresh artifact when *artifact_id* is not supplied. Never raises:
    all failures are reported via PublishResult(success=False, error=...).
    """
    try:
        rendered = self._render_document_bytes(
            text=text,
            file_name=file_name,
            doc_id=doc_id,
            target_format=target_format,
        )
        encoded = base64.b64encode(rendered["bytes"]).decode("ascii")

        target_artifact = (artifact_id or "").strip()
        if not target_artifact:
            # No existing artifact: register one before uploading the version.
            created = await self._artifact_post_json(
                "/artifacts",
                {
                    "type": "doc",
                    "title": file_name or doc_id,
                    "project_id": dao_id,
                    "acl_ref": dao_id,
                    "created_by": user_id or DOC_WRITEBACK_CREATED_BY,
                },
                timeout=30.0,
            )
            target_artifact = str(created.get("artifact_id") or "").strip()
            if not target_artifact:
                return PublishResult(success=False, error="Artifact create failed: empty artifact_id")

        meta = {"doc_id": doc_id, "source": "doc_update_publish"}
        if isinstance(metadata, dict):
            meta.update(metadata)

        uploaded = await self._artifact_post_json(
            f"/artifacts/{target_artifact}/versions/from_base64",
            {
                "content_base64": encoded,
                "mime": rendered["mime"],
                "filename": rendered["file_name"],
                "label": label or "edited",
                "meta_json": meta,
            },
            timeout=45.0,
        )
        new_version = str(uploaded.get("version_id") or "").strip()
        storage_key = uploaded.get("storage_key")
        if not new_version:
            return PublishResult(
                success=False,
                artifact_id=target_artifact,
                error="Artifact version create failed: empty version_id",
            )

        # Best-effort: a missing presigned URL must not fail the publish itself.
        download_url = None
        try:
            link = await self._artifact_get_json(
                f"/artifacts/{target_artifact}/versions/{new_version}/download",
                timeout=20.0,
            )
            download_url = link.get("url")
        except Exception as e:
            logger.warning(f"version download url generation failed: {e}")

        return PublishResult(
            success=True,
            artifact_id=target_artifact,
            version_id=new_version,
            storage_key=storage_key,
            mime=rendered["mime"],
            file_name=rendered["file_name"],
            download_url=download_url,
        )
    except Exception as e:
        logger.error(f"publish_text_artifact failed: {e}", exc_info=True)
        return PublishResult(success=False, error=str(e))
|
||||||
|
|
||||||
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
|
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
|
||||||
if not file_name:
|
if not file_name:
|
||||||
return False
|
return False
|
||||||
@@ -616,6 +849,10 @@ class DocumentService:
|
|||||||
user_id: Optional[str] = None,
|
user_id: Optional[str] = None,
|
||||||
agent_id: str = "daarwizz",
|
agent_id: str = "daarwizz",
|
||||||
storage_ref: Optional[str] = None,
|
storage_ref: Optional[str] = None,
|
||||||
|
publish_artifact: bool = False,
|
||||||
|
artifact_id: Optional[str] = None,
|
||||||
|
target_format: Optional[str] = None,
|
||||||
|
artifact_label: Optional[str] = None,
|
||||||
metadata: Optional[Dict[str, Any]] = None,
|
metadata: Optional[Dict[str, Any]] = None,
|
||||||
) -> UpdateResult:
|
) -> UpdateResult:
|
||||||
"""
|
"""
|
||||||
@@ -714,13 +951,33 @@ class DocumentService:
|
|||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
publish = PublishResult(success=False)
|
||||||
|
if publish_artifact:
|
||||||
|
publish = await self._publish_text_artifact(
|
||||||
|
text=effective_text,
|
||||||
|
doc_id=doc_id,
|
||||||
|
file_name=file_name,
|
||||||
|
dao_id=dao_id,
|
||||||
|
user_id=user_id,
|
||||||
|
artifact_id=artifact_id,
|
||||||
|
target_format=target_format,
|
||||||
|
label=artifact_label,
|
||||||
|
metadata=meta,
|
||||||
|
)
|
||||||
|
|
||||||
return UpdateResult(
|
return UpdateResult(
|
||||||
success=True,
|
success=True,
|
||||||
doc_id=response.get("doc_id") or doc_id,
|
doc_id=response.get("doc_id") or doc_id,
|
||||||
version_no=int(response.get("version_no", 0) or 0) or None,
|
version_no=int(response.get("version_no", 0) or 0) or None,
|
||||||
version_id=int(response.get("version_id", 0) or 0) or None,
|
version_id=int(response.get("version_id", 0) or 0) or None,
|
||||||
updated_chunks=int(response.get("chunks_stored", 0) or 0),
|
updated_chunks=int(response.get("chunks_stored", 0) or 0),
|
||||||
status="updated",
|
status="updated_published" if publish_artifact and publish.success else ("updated_publish_failed" if publish_artifact else "updated"),
|
||||||
|
publish_error=publish.error if publish_artifact and not publish.success else None,
|
||||||
|
artifact_id=publish.artifact_id if publish_artifact else None,
|
||||||
|
artifact_version_id=publish.version_id if publish_artifact else None,
|
||||||
|
artifact_storage_key=publish.storage_key if publish_artifact else None,
|
||||||
|
artifact_mime=publish.mime if publish_artifact else None,
|
||||||
|
artifact_download_url=publish.download_url if publish_artifact else None,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Document update failed: {e}", exc_info=True)
|
logger.error(f"Document update failed: {e}", exc_info=True)
|
||||||
@@ -751,6 +1008,75 @@ class DocumentService:
|
|||||||
logger.error(f"list_document_versions failed: {e}")
|
logger.error(f"list_document_versions failed: {e}")
|
||||||
return {"ok": False, "error": str(e), "items": []}
|
return {"ok": False, "error": str(e), "items": []}
|
||||||
|
|
||||||
|
async def publish_document_artifact(
    self,
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """
    Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index.

    Missing identifiers are backfilled from the session's stored doc context;
    when no explicit *text* is given, the document is re-parsed from *doc_url*.
    Never raises: failures come back as PublishResult(success=False, error=...).
    """
    try:
        # Backfill from the session context so callers may pass only session_id.
        context = await self.get_doc_context(session_id)
        if context:
            doc_id = doc_id or context.doc_id
            doc_url = doc_url or context.doc_url
            file_name = file_name or context.file_name
            dao_id = dao_id or context.dao_id
            user_id = user_id or context.user_id

        if not doc_id:
            return PublishResult(success=False, error="doc_id is required")

        body = (text or "").strip()
        if not body:
            if not doc_url:
                return PublishResult(success=False, error="text or doc_url is required")
            # No explicit text: recover the current content by re-parsing the source file.
            parsed = await self.parse_document(
                session_id=session_id,
                doc_url=doc_url,
                file_name=file_name or "document",
                dao_id=dao_id or "",
                user_id=user_id or "",
                output_mode="markdown",
                metadata={"source": self._extract_source(session_id), "mode": "publish"},
            )
            if not parsed.success:
                return PublishResult(success=False, error=parsed.error or "Document parse failed")
            body = (parsed.markdown or "").strip()

        if not body:
            return PublishResult(success=False, error="No text available for publish")

        return await self._publish_text_artifact(
            text=body,
            doc_id=doc_id,
            file_name=file_name,
            dao_id=dao_id,
            user_id=user_id,
            artifact_id=artifact_id,
            target_format=target_format,
            label=artifact_label,
            metadata=metadata,
        )
    except Exception as e:
        logger.error(f"publish_document_artifact failed: {e}", exc_info=True)
        return PublishResult(success=False, error=str(e))
|
||||||
|
|
||||||
async def ask_about_document(
|
async def ask_about_document(
|
||||||
self,
|
self,
|
||||||
session_id: str,
|
session_id: str,
|
||||||
@@ -950,6 +1276,10 @@ async def update_document(
|
|||||||
user_id: Optional[str] = None,
|
user_id: Optional[str] = None,
|
||||||
agent_id: str = "daarwizz",
|
agent_id: str = "daarwizz",
|
||||||
storage_ref: Optional[str] = None,
|
storage_ref: Optional[str] = None,
|
||||||
|
publish_artifact: bool = False,
|
||||||
|
artifact_id: Optional[str] = None,
|
||||||
|
target_format: Optional[str] = None,
|
||||||
|
artifact_label: Optional[str] = None,
|
||||||
metadata: Optional[Dict[str, Any]] = None,
|
metadata: Optional[Dict[str, Any]] = None,
|
||||||
) -> UpdateResult:
|
) -> UpdateResult:
|
||||||
"""Update document chunks and bump version."""
|
"""Update document chunks and bump version."""
|
||||||
@@ -963,6 +1293,10 @@ async def update_document(
|
|||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
agent_id=agent_id,
|
agent_id=agent_id,
|
||||||
storage_ref=storage_ref,
|
storage_ref=storage_ref,
|
||||||
|
publish_artifact=publish_artifact,
|
||||||
|
artifact_id=artifact_id,
|
||||||
|
target_format=target_format,
|
||||||
|
artifact_label=artifact_label,
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -976,6 +1310,35 @@ async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) ->
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_document_artifact(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Publish physical artifact version for document text.

    Thin module-level facade over the shared DocumentService instance.
    """
    forwarded = {
        "session_id": session_id,
        "doc_id": doc_id,
        "doc_url": doc_url,
        "file_name": file_name,
        "text": text,
        "dao_id": dao_id,
        "user_id": user_id,
        "artifact_id": artifact_id,
        "target_format": target_format,
        "artifact_label": artifact_label,
        "metadata": metadata,
    }
    return await doc_service.publish_document_artifact(**forwarded)
|
||||||
|
|
||||||
|
|
||||||
async def save_doc_context(
|
async def save_doc_context(
|
||||||
session_id: str,
|
session_id: str,
|
||||||
doc_id: str,
|
doc_id: str,
|
||||||
|
|||||||
@@ -6,13 +6,15 @@ Artifact Registry v0
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import base64
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
import asyncpg
|
import asyncpg
|
||||||
@@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel):
|
|||||||
meta_json: Optional[Dict[str, Any]] = None
|
meta_json: Optional[Dict[str, Any]] = None
|
||||||
|
|
||||||
|
|
||||||
|
class ArtifactVersionFromBase64Request(BaseModel):
|
||||||
|
content_base64: str
|
||||||
|
mime: str
|
||||||
|
filename: Optional[str] = "source.bin"
|
||||||
|
label: Optional[str] = "source"
|
||||||
|
meta_json: Optional[Dict[str, Any]] = None
|
||||||
|
|
||||||
|
|
||||||
class ArtifactVersionResponse(BaseModel):
|
class ArtifactVersionResponse(BaseModel):
|
||||||
version_id: str
|
version_id: str
|
||||||
storage_key: str
|
storage_key: str
|
||||||
@@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]:
|
|||||||
|
|
||||||
def _format_to_mime(fmt: str) -> str:
|
def _format_to_mime(fmt: str) -> str:
|
||||||
fmt = fmt.lower()
|
fmt = fmt.lower()
|
||||||
|
if "/" in fmt:
|
||||||
|
return fmt
|
||||||
if fmt == "pptx":
|
if fmt == "pptx":
|
||||||
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||||
if fmt == "pdf":
|
if fmt == "pdf":
|
||||||
return "application/pdf"
|
return "application/pdf"
|
||||||
if fmt == "source":
|
if fmt == "source":
|
||||||
return "application/json"
|
return "application/json"
|
||||||
|
if fmt == "docx":
|
||||||
|
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
if fmt == "xlsx":
|
||||||
|
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
if fmt == "txt":
|
||||||
|
return "text/plain; charset=utf-8"
|
||||||
|
if fmt == "md":
|
||||||
|
return "text/markdown; charset=utf-8"
|
||||||
|
if fmt == "json":
|
||||||
|
return "application/json"
|
||||||
|
if fmt == "csv":
|
||||||
|
return "text/csv; charset=utf-8"
|
||||||
return "application/octet-stream"
|
return "application/octet-stream"
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str:
|
||||||
|
raw = (name or fallback).strip() or fallback
|
||||||
|
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw)
|
||||||
|
cleaned = cleaned.strip("._")
|
||||||
|
if not cleaned:
|
||||||
|
return fallback
|
||||||
|
return cleaned[:120]
|
||||||
|
|
||||||
|
|
||||||
async def _download_bytes(url: str) -> bytes:
|
async def _download_bytes(url: str) -> bytes:
|
||||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||||
resp = await client.get(url)
|
resp = await client.get(url)
|
||||||
@@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse)
async def add_version_from_base64(artifact_id: str, payload: ArtifactVersionFromBase64Request) -> ArtifactVersionResponse:
    """Store a base64-encoded blob as a new version of *artifact_id*.

    Accepts optional 'data:...;base64,' URL prefixes. Uploads the decoded
    bytes to MinIO, then records the version row (id, hash, mime, size,
    storage key, metadata).

    Raises:
        HTTPException: 400 on bad/empty payload, 500 if MinIO/DB are
            unavailable, 502 on MinIO upload errors.
    """
    if not minio_client:
        raise HTTPException(status_code=500, detail="MinIO not available")
    if not pool:
        raise HTTPException(status_code=500, detail="DB not available")

    raw = (payload.content_base64 or "").strip()
    if not raw:
        raise HTTPException(status_code=400, detail="content_base64 is required")

    # Allow data-URL payloads: keep only the part after the comma.
    if raw.startswith("data:") and "," in raw:
        raw = raw.split(",", 1)[1]

    # MIME-style base64 wraps lines every 76 chars; b64decode(validate=True)
    # rejects those newlines, so drop all internal whitespace first.
    raw = "".join(raw.split())

    try:
        content = base64.b64decode(raw, validate=True)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid base64 payload")

    if not content:
        raise HTTPException(status_code=400, detail="Decoded payload is empty")

    version_id = f"ver_{uuid.uuid4().hex}"
    filename = _safe_filename(payload.filename, fallback="source.bin")
    sha256 = _hash_bytes(content)
    storage_key = _storage_key(artifact_id, version_id, filename)

    try:
        minio_client.put_object(
            MINIO_BUCKET,
            storage_key,
            data=BytesIO(content),
            length=len(content),
            content_type=payload.mime,
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")

    meta_json = _normalize_meta_json(payload.meta_json)
    if "file_name" not in meta_json:
        meta_json["file_name"] = filename

    async with pool.acquire() as conn:
        await conn.execute(
            """
            insert into artifact_versions
            (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json)
            values ($1, $2, $3, $4, $5, $6, $7, $8)
            """,
            version_id,
            artifact_id,
            payload.label or "source",
            sha256,
            payload.mime,
            len(content),
            storage_key,
            json.dumps(meta_json),
        )

    return ArtifactVersionResponse(
        version_id=version_id,
        storage_key=storage_key,
        sha256=sha256,
        size_bytes=len(content),
    )
|
||||||
|
|
||||||
|
|
||||||
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
|
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
|
||||||
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
|
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
|
||||||
if not pool:
|
if not pool:
|
||||||
@@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di
|
|||||||
if not row:
|
if not row:
|
||||||
raise HTTPException(status_code=404, detail="Version not found")
|
raise HTTPException(status_code=404, detail="Version not found")
|
||||||
try:
|
try:
|
||||||
url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800)
|
url = minio_client.presigned_get_object(
|
||||||
|
MINIO_BUCKET,
|
||||||
|
row["storage_key"],
|
||||||
|
expires=timedelta(seconds=1800),
|
||||||
|
)
|
||||||
except S3Error as e:
|
except S3Error as e:
|
||||||
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
|
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
|
||||||
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
|
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]:
    """Return a presigned MinIO URL (30 minutes) for one stored artifact version."""
    if not pool or not minio_client:
        raise HTTPException(status_code=500, detail="Service not available")

    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            """
            select * from artifact_versions
            where artifact_id=$1 and id=$2
            limit 1
            """,
            artifact_id,
            version_id,
        )
    if not record:
        raise HTTPException(status_code=404, detail="Version not found")

    expiry = timedelta(seconds=1800)
    try:
        presigned = minio_client.presigned_get_object(MINIO_BUCKET, record["storage_key"], expires=expiry)
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")

    return {
        "url": presigned,
        "storage_key": record["storage_key"],
        "mime": record["mime"],
        "version_id": record["id"],
    }
|
||||||
|
|||||||
Reference in New Issue
Block a user