Files
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

758 lines
27 KiB
Python

"""
sofiia-console — Projects, Documents, Sessions, Dialog Map endpoints.
All endpoints are mounted on the main FastAPI app in main.py via:
app.include_router(docs_router)
Features:
- File upload with sha256, mime detection, size limits
- Projects CRUD
- Documents per project with keyword search
- Sessions with persistence (aiosqlite)
- Messages with branching (parent_msg_id)
- Dialog map (nodes + edges JSON)
- Session fork
"""
import hashlib
import io
import json
import logging
import mimetypes
import os
import re
import uuid
from pathlib import Path
from typing import List, Optional
import httpx
from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, File
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel
from . import db as _db
logger = logging.getLogger(__name__)

# All routes below are mounted under /api on the main FastAPI app (see module docstring).
docs_router = APIRouter(prefix="/api", tags=["projects-docs-sessions"])

# ── Config ────────────────────────────────────────────────────────────────────
# Data root (container path by default); uploaded files live in a subdirectory.
_DATA_DIR = Path(os.getenv("SOFIIA_DATA_DIR", "/app/data"))
_UPLOADS_DIR = _DATA_DIR / "uploads"
# Base URL of the router service, used for OCR and embedding-ingest side calls.
_ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
# Upload size caps in megabytes, per broad MIME category (image / video / everything else).
_MAX_IMAGE_MB = int(os.getenv("UPLOAD_MAX_IMAGE_MB", "10"))
_MAX_VIDEO_MB = int(os.getenv("UPLOAD_MAX_VIDEO_MB", "200"))
_MAX_DOC_MB = int(os.getenv("UPLOAD_MAX_DOC_MB", "50"))
# Feature flags: image OCR via the router "fabric" capability, and embedding ingest.
_USE_FABRIC_OCR = os.getenv("USE_FABRIC_OCR", "false").lower() == "true"
_USE_EMBEDDINGS = os.getenv("USE_EMBEDDINGS", "false").lower() == "true"
# Whitelist of MIME types accepted by /files/upload (checked after byte sniffing).
_ALLOWED_MIMES = {
    # images
    "image/jpeg", "image/png", "image/gif", "image/webp", "image/bmp",
    # video
    "video/mp4", "video/mpeg", "video/webm", "video/quicktime",
    # documents
    "application/pdf",
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "text/plain", "text/markdown", "text/csv",
    "application/json",
    "application/zip",
}
def _safe_filename(name: str) -> str:
"""Remove path traversal attempts and dangerous chars."""
name = os.path.basename(name)
name = re.sub(r"[^\w\-_.()]", "_", name)
return name[:128] or "upload"
def _size_limit_mb(mime: str) -> int:
    """Return the upload size cap in MB for the given MIME type's category."""
    for prefix, cap in (("image/", _MAX_IMAGE_MB), ("video/", _MAX_VIDEO_MB)):
        if mime.startswith(prefix):
            return cap
    # Anything that is not an image or a video counts as a document.
    return _MAX_DOC_MB
def _detect_mime(filename: str, data: bytes) -> str:
"""Detect MIME by magic bytes first, fall back to extension."""
try:
import magic
return magic.from_buffer(data[:2048], mime=True)
except Exception:
pass
guessed, _ = mimetypes.guess_type(filename)
return guessed or "application/octet-stream"
def _extract_text_simple(filename: str, data: bytes, mime: str) -> str:
"""Best-effort text extraction without external services."""
try:
if mime == "text/plain" or filename.endswith((".txt", ".md", ".markdown")):
return data.decode("utf-8", errors="replace")[:4096]
if mime == "application/json":
return data.decode("utf-8", errors="replace")[:4096]
if mime == "application/pdf":
try:
import pypdf
reader = pypdf.PdfReader(io.BytesIO(data))
text = "\n".join(p.extract_text() or "" for p in reader.pages[:10])
return text[:4096]
except Exception:
pass
if mime in (
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
):
try:
import docx
doc = docx.Document(io.BytesIO(data))
return "\n".join(p.text for p in doc.paragraphs)[:4096]
except Exception:
pass
except Exception as e:
logger.debug("extract_text_simple failed: %s", e)
return ""
# ── Projects ──────────────────────────────────────────────────────────────────
class ProjectCreate(BaseModel):
    """Request body for POST /projects."""
    name: str
    description: str = ""


class ProjectUpdate(BaseModel):
    """Request body for PATCH /projects/{project_id}; None fields are left untouched."""
    name: Optional[str] = None
    description: Optional[str] = None
@docs_router.get("/projects")
async def list_projects():
    """Return every project known to the store."""
    projects = await _db.list_projects()
    return projects
# Strong references to fire-and-forget bootstrap tasks: without them the event
# loop may garbage-collect a pending task before it runs (documented asyncio pitfall).
_BOOTSTRAP_TASKS: set = set()


@docs_router.post("/projects", status_code=201)
async def create_project(body: ProjectCreate):
    """Create a project and asynchronously bootstrap its Portfolio graph.

    Raises 400 when the name is blank. The snapshot/signal computation is
    fire-and-forget and best-effort: it never blocks or fails the request.
    """
    if not body.name.strip():
        raise HTTPException(status_code=400, detail="name is required")
    result = await _db.create_project(body.name.strip(), body.description)

    async def _bootstrap_project(pid: str) -> None:
        # Best-effort: populate the Portfolio view; each step ignores failure
        # independently so a snapshot error does not skip the signal pass.
        try:
            await _db.compute_graph_snapshot(project_id=pid, window="7d")
        except Exception:
            pass
        try:
            await _db.recompute_graph_signals(project_id=pid, window="7d", dry_run=False)
        except Exception:
            pass

    project_id = result.get("project_id", "")
    if project_id:  # skip the bootstrap entirely when the DB layer returned no id
        import asyncio

        task = asyncio.ensure_future(_bootstrap_project(project_id))
        _BOOTSTRAP_TASKS.add(task)
        task.add_done_callback(_BOOTSTRAP_TASKS.discard)
    return result
@docs_router.get("/projects/{project_id}")
async def get_project(project_id: str):
    """Fetch a single project; 404 when it does not exist."""
    project = await _db.get_project(project_id)
    if project:
        return project
    raise HTTPException(status_code=404, detail="Project not found")


@docs_router.patch("/projects/{project_id}")
async def update_project(project_id: str, body: ProjectUpdate):
    """Apply a partial update; 404 when the project is missing or nothing changed."""
    updated = await _db.update_project(
        project_id, name=body.name, description=body.description
    )
    if not updated:
        raise HTTPException(status_code=404, detail="Project not found or no changes")
    return {"ok": True}
# ── File Upload ───────────────────────────────────────────────────────────────
async def _fabric_ocr(data: bytes, filename: str) -> str:
    """Best-effort OCR of an image via the router's capability endpoint.

    Returns extracted text capped at 4096 chars, or "" on any failure —
    the upload flow must never fail because OCR did.
    """
    try:
        import base64
        async with httpx.AsyncClient(timeout=30.0) as client:
            r = await client.post(
                f"{_ROUTER_URL}/v1/capability/ocr",
                json={"image_b64": base64.b64encode(data).decode(), "filename": filename},
            )
        if r.status_code == 200:
            return r.json().get("text", "")[:4096]
    except Exception as e:
        logger.debug("Fabric OCR failed (skipping): %s", e)
    return ""


async def _ingest_embeddings(doc_id: str, project_id: str, text: str,
                             filename: str, mime: str, tags: List[str]) -> None:
    """Best-effort push of extracted text to the router for Qdrant ingest."""
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            await client.post(f"{_ROUTER_URL}/v1/documents/ingest", json={
                "agent_id": "sofiia",
                "text": text,
                "doc_id": doc_id,
                "project_id": project_id,
                "filename": filename,
                "mime": mime,
                "tags": tags,
            })
    except Exception as e:
        logger.debug("Doc ingest (best-effort) failed: %s", e)


@docs_router.post("/files/upload")
async def upload_file(
    request: Request,
    project_id: str = Query("default"),
    title: str = Query(""),
    tags: str = Query(""),  # comma-separated
    file: UploadFile = File(...),
):
    """Upload a file: validate type/size, store content-addressed, extract text.

    Returns: {file_id, doc_id, sha256, mime, size_bytes, filename, preview_text,
    storage_path}. Raises 415 for disallowed MIME types and 413 over the size cap.
    Uses the module-level _ROUTER_URL for side calls (was previously re-read from
    the environment in two places).
    """
    raw_name = _safe_filename(file.filename or "upload")
    data = await file.read()
    # Trust the bytes, not the client-declared content type.
    mime = _detect_mime(raw_name, data)
    if mime not in _ALLOWED_MIMES:
        raise HTTPException(status_code=415, detail=f"Unsupported file type: {mime}")
    size_mb = len(data) / (1024 * 1024)
    limit_mb = _size_limit_mb(mime)
    if size_mb > limit_mb:
        raise HTTPException(
            status_code=413,
            detail=f"File too large: {size_mb:.1f}MB > {limit_mb}MB limit for {mime}",
        )
    # Content-addressed storage: two-char sha shard dir, dedup on identical bytes.
    sha = hashlib.sha256(data).hexdigest()
    _UPLOADS_DIR.mkdir(parents=True, exist_ok=True)
    dest = _UPLOADS_DIR / sha[:2] / f"{sha}_{raw_name}"
    dest.parent.mkdir(parents=True, exist_ok=True)
    if not dest.exists():
        dest.write_bytes(data)
    file_id = sha[:16]  # short reference used by /files/{file_id}/download
    extracted = _extract_text_simple(raw_name, data, mime)
    # OCR fallback for images when enabled and local extraction found nothing.
    if _USE_FABRIC_OCR and mime.startswith("image/") and not extracted:
        extracted = await _fabric_ocr(data, raw_name)
    tag_list = [t.strip() for t in tags.split(",") if t.strip()]
    # Unknown projects fall back to "default" rather than failing the upload.
    if not await _db.get_project(project_id):
        project_id = "default"
    doc = await _db.create_document(
        project_id=project_id,
        file_id=file_id,
        sha256=sha,
        mime=mime,
        size_bytes=len(data),
        filename=raw_name,
        title=title or raw_name,
        tags=tag_list,
        extracted_text=extracted,
    )
    # Async ingest to Qdrant via the router (best-effort, never raises).
    if _USE_EMBEDDINGS and extracted:
        await _ingest_embeddings(doc["doc_id"], project_id, extracted, raw_name, mime, tag_list)
    return {
        **doc,
        "preview_text": extracted[:300],
        "storage_path": str(dest.relative_to(_DATA_DIR)),
    }
# ── Documents ─────────────────────────────────────────────────────────────────
@docs_router.get("/projects/{project_id}/documents")
async def list_documents(project_id: str, limit: int = Query(50, ge=1, le=200)):
    """List up to *limit* documents belonging to a project."""
    docs = await _db.list_documents(project_id, limit=limit)
    return docs


@docs_router.get("/projects/{project_id}/documents/{doc_id}")
async def get_document(project_id: str, doc_id: str):
    """Fetch one document; 404 when missing or owned by another project."""
    found = await _db.get_document(doc_id)
    if not found or found["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Document not found")
    return found
@docs_router.post("/projects/{project_id}/search")
async def search_project(project_id: str, request: Request):
    """Keyword search over a project's documents.

    Body: {"query": str, "limit": int?}. Returns 400 on a missing query or a
    malformed JSON body (previously a malformed body bubbled up as a 500).
    The "sessions" list is a Phase 2 placeholder for semantic session search.
    """
    try:
        body = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON body")
    query = str(body.get("query", "")).strip()
    if not query:
        raise HTTPException(status_code=400, detail="query is required")
    # Clamp the caller-supplied limit to a sane range; fall back to 20 on junk.
    try:
        limit = max(1, min(int(body.get("limit", 20)), 200))
    except (TypeError, ValueError):
        limit = 20
    docs = await _db.search_documents(project_id, query, limit=limit)
    sessions = []  # Phase 2: semantic session search
    return {"query": query, "documents": docs, "sessions": sessions}
@docs_router.get("/files/{file_id}/download")
async def download_file(file_id: str):
    """Download a stored upload by file_id (first 16 hex chars of its sha256).

    The id is validated against the expected hex shape before touching the
    filesystem, so glob metacharacters (e.g. "*") from the URL can never widen
    the rglob search to arbitrary uploads.
    """
    if not re.fullmatch(r"[0-9a-f]{16}", file_id):
        raise HTTPException(status_code=404, detail="File not found")
    matches = list(_UPLOADS_DIR.rglob(f"{file_id}_*"))
    if not matches:
        raise HTTPException(status_code=404, detail="File not found")
    path = matches[0]
    return FileResponse(str(path), filename=path.name)
# ── Sessions ──────────────────────────────────────────────────────────────────
@docs_router.get("/sessions")
async def list_sessions(
    project_id: str = Query("default"),
    limit: int = Query(30, ge=1, le=100),
):
    """List sessions for a project, capped at *limit*."""
    return await _db.list_sessions(project_id, limit=limit)


@docs_router.get("/sessions/{session_id}")
async def get_session(session_id: str):
    """Fetch a single session; 404 when it does not exist."""
    session = await _db.get_session(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    return session


@docs_router.patch("/sessions/{session_id}/title")
async def update_session_title(session_id: str, request: Request):
    """Set a session's title from a raw JSON body: {"title": str}."""
    payload = await request.json()
    new_title = payload.get("title", "").strip()
    await _db.update_session_title(session_id, new_title)
    return {"ok": True}
# ── Chat History ──────────────────────────────────────────────────────────────
@docs_router.get("/chat/history")
async def get_chat_history(
    session_id: str = Query(...),
    limit: int = Query(50, ge=1, le=200),
    branch_label: Optional[str] = Query(None),
):
    """Return persisted messages for a session so the UI can restore on reload."""
    messages = await _db.list_messages(session_id, limit=limit, branch_label=branch_label)
    return {"session_id": session_id, "messages": messages, "count": len(messages)}
# ── Dialog Map ────────────────────────────────────────────────────────────────
@docs_router.get("/sessions/{session_id}/map")
async def get_dialog_map(session_id: str):
    """Nodes + edges JSON for the session's dialog-map visualization."""
    return await _db.get_dialog_map(session_id)


class ForkRequest(BaseModel):
    """Body for POST /sessions/{session_id}/fork."""
    from_msg_id: str
    new_title: str = ""
    project_id: str = "default"


@docs_router.post("/sessions/{session_id}/fork")
async def fork_session(session_id: str, body: ForkRequest):
    """Fork a session at a message: a new session carrying the ancestor messages."""
    forked = await _db.fork_session(
        source_session_id=session_id,
        from_msg_id=body.from_msg_id,
        new_title=body.new_title,
        project_id=body.project_id,
    )
    return forked
# ── Delete endpoints ───────────────────────────────────────────────────────────
@docs_router.delete("/projects/{project_id}")
async def delete_project(project_id: str):
    """Delete a project row. The 'default' project is protected (400).

    Now 404s for a nonexistent project instead of silently returning ok
    (matching every other endpoint in this module).
    NOTE(review): child rows (documents, sessions, tasks) are not cascaded
    here — presumably handled by FK constraints in the db layer; verify.
    """
    if project_id == "default":
        raise HTTPException(status_code=400, detail="Cannot delete default project")
    if not await _db.get_project(project_id):
        raise HTTPException(status_code=404, detail="Project not found")
    db = await _db.get_db()
    await db.execute("DELETE FROM projects WHERE project_id=?", (project_id,))
    await db.commit()
    return {"ok": True}


@docs_router.delete("/projects/{project_id}/documents/{doc_id}")
async def delete_document(project_id: str, doc_id: str):
    """Delete a document row (the content-addressed file stays on disk)."""
    doc = await _db.get_document(doc_id)
    if not doc or doc["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Document not found")
    db = await _db.get_db()
    await db.execute("DELETE FROM documents WHERE doc_id=?", (doc_id,))
    await db.commit()
    return {"ok": True}
# ── Tasks (Kanban) ─────────────────────────────────────────────────────────────
class TaskCreate(BaseModel):
    """Body for POST /projects/{project_id}/tasks."""
    title: str
    description: str = ""
    # Kanban status column (defaults to "backlog").
    status: str = "backlog"
    priority: str = "normal"
    labels: List[str] = []
    assignees: List[str] = []
    # Due timestamp as a string — presumably ISO-8601; TODO confirm against db layer.
    due_at: Optional[str] = None
    created_by: str = ""


class TaskUpdate(BaseModel):
    """Body for PATCH .../tasks/{task_id}; None fields are skipped (exclude_none)."""
    title: Optional[str] = None
    description: Optional[str] = None
    status: Optional[str] = None
    priority: Optional[str] = None
    labels: Optional[List[str]] = None
    assignees: Optional[List[str]] = None
    due_at: Optional[str] = None
    # Fractional ordering key — assumed to drive board positioning; verify in UI.
    sort_key: Optional[float] = None
@docs_router.get("/projects/{project_id}/tasks")
async def list_tasks(
    project_id: str,
    status: Optional[str] = Query(None),
    limit: int = Query(100, ge=1, le=500),
):
    """List a project's tasks, optionally restricted to one status column."""
    tasks = await _db.list_tasks(project_id, status=status, limit=limit)
    return tasks
@docs_router.post("/projects/{project_id}/tasks", status_code=201)
async def create_task(project_id: str, body: TaskCreate):
    """Create a task and mirror it as a 'task' node on the project dialog map.

    Raises 400 for a blank title and 404 for an unknown project.
    """
    clean_title = body.title.strip()
    if not clean_title:
        raise HTTPException(status_code=400, detail="title is required")
    if not await _db.get_project(project_id):
        raise HTTPException(status_code=404, detail="Project not found")
    task = await _db.create_task(
        project_id=project_id,
        title=clean_title,
        description=body.description,
        status=body.status,
        priority=body.priority,
        labels=body.labels,
        assignees=body.assignees,
        due_at=body.due_at,
        created_by=body.created_by,
    )
    # Keep the dialog map in sync with the newly created task.
    await _db.upsert_dialog_node(
        project_id=project_id,
        node_type="task",
        ref_id=task["task_id"],
        title=task["title"],
        summary=task["description"][:200],
        props={"status": task["status"], "priority": task["priority"]},
    )
    return task
@docs_router.get("/projects/{project_id}/tasks/{task_id}")
async def get_task(project_id: str, task_id: str):
    """Fetch one task; 404 when absent or belonging to another project."""
    found = await _db.get_task(task_id)
    if not found or found["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Task not found")
    return found


@docs_router.patch("/projects/{project_id}/tasks/{task_id}")
async def update_task(project_id: str, task_id: str, body: TaskUpdate):
    """Partially update a task; a status change is mirrored to the dialog map."""
    existing = await _db.get_task(task_id)
    if not existing or existing["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Task not found")
    changes = body.model_dump(exclude_none=True)
    ok = await _db.update_task(task_id, **changes)
    if ok and "status" in changes:
        # Only status changes are reflected on the map node.
        await _db.upsert_dialog_node(
            project_id=project_id,
            node_type="task",
            ref_id=task_id,
            title=existing["title"],
            props={"status": changes["status"]},
        )
    return {"ok": ok}


@docs_router.delete("/projects/{project_id}/tasks/{task_id}")
async def delete_task(project_id: str, task_id: str):
    """Delete a task after verifying project ownership."""
    existing = await _db.get_task(task_id)
    if not existing or existing["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Task not found")
    return {"ok": await _db.delete_task(task_id)}
# ── Meetings ───────────────────────────────────────────────────────────────────
class MeetingCreate(BaseModel):
    """Body for POST /projects/{project_id}/meetings."""
    title: str
    # Start time as a string — presumably ISO-8601; TODO confirm against db layer.
    starts_at: str
    agenda: str = ""
    duration_min: int = 30
    location: str = ""
    attendees: List[str] = []
    created_by: str = ""


class MeetingUpdate(BaseModel):
    """Body for PATCH .../meetings/{meeting_id}; None fields are skipped (exclude_none)."""
    title: Optional[str] = None
    agenda: Optional[str] = None
    starts_at: Optional[str] = None
    duration_min: Optional[int] = None
    location: Optional[str] = None
    attendees: Optional[List[str]] = None
@docs_router.get("/projects/{project_id}/meetings")
async def list_meetings(project_id: str, limit: int = Query(50, ge=1, le=200)):
    """List up to *limit* meetings for a project."""
    return await _db.list_meetings(project_id, limit=limit)


@docs_router.post("/projects/{project_id}/meetings", status_code=201)
async def create_meeting(project_id: str, body: MeetingCreate):
    """Create a meeting and mirror it as a 'meeting' node on the dialog map.

    Raises 400 for a blank title or missing starts_at, 404 for an unknown project.
    """
    clean_title = body.title.strip()
    if not clean_title:
        raise HTTPException(status_code=400, detail="title is required")
    if not body.starts_at:
        raise HTTPException(status_code=400, detail="starts_at is required")
    if not await _db.get_project(project_id):
        raise HTTPException(status_code=404, detail="Project not found")
    meeting = await _db.create_meeting(
        project_id=project_id,
        title=clean_title,
        starts_at=body.starts_at,
        agenda=body.agenda,
        duration_min=body.duration_min,
        location=body.location,
        attendees=body.attendees,
        created_by=body.created_by,
    )
    # Keep the project dialog map aware of scheduled meetings.
    await _db.upsert_dialog_node(
        project_id=project_id,
        node_type="meeting",
        ref_id=meeting["meeting_id"],
        title=meeting["title"],
        summary=meeting["agenda"][:200],
        props={"starts_at": meeting["starts_at"], "duration_min": meeting["duration_min"]},
    )
    return meeting
@docs_router.get("/projects/{project_id}/meetings/{meeting_id}")
async def get_meeting(project_id: str, meeting_id: str):
    """Fetch one meeting; 404 when absent or owned by another project."""
    meeting = await _db.get_meeting(meeting_id)
    if not meeting or meeting["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Meeting not found")
    return meeting


@docs_router.patch("/projects/{project_id}/meetings/{meeting_id}")
async def update_meeting(project_id: str, meeting_id: str, body: MeetingUpdate):
    """Partially update a meeting (only non-None body fields are applied)."""
    meeting = await _db.get_meeting(meeting_id)
    if not meeting or meeting["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Meeting not found")
    changes = body.model_dump(exclude_none=True)
    return {"ok": await _db.update_meeting(meeting_id, **changes)}


@docs_router.delete("/projects/{project_id}/meetings/{meeting_id}")
async def delete_meeting(project_id: str, meeting_id: str):
    """Delete a meeting after verifying project ownership."""
    meeting = await _db.get_meeting(meeting_id)
    if not meeting or meeting["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Meeting not found")
    return {"ok": await _db.delete_meeting(meeting_id)}
# ── Dialog Map (Project-level graph) ─────────────────────────────────────────
@docs_router.get("/projects/{project_id}/dialog-map")
async def get_project_dialog_map(project_id: str):
    """Canonical project-wide dialog graph: all entity nodes plus their edges."""
    graph = await _db.get_project_dialog_map(project_id)
    return graph
class LinkCreate(BaseModel):
    """Body for POST /projects/{project_id}/dialog/link."""
    # Entity reference on each side of the edge: type + id.
    from_type: str
    from_id: str
    to_type: str
    to_id: str
    edge_type: str = "references"
    # Arbitrary edge metadata stored alongside the relation.
    props: dict = {}
    created_by: str = ""
@docs_router.post("/projects/{project_id}/dialog/link", status_code=201)
async def create_dialog_link(project_id: str, body: LinkCreate):
    """Create a dialog edge between two entities (auto-resolves/creates nodes).

    Both endpoint nodes are upserted first (duplicated inline code replaced by
    one resolver), the graph edge is created, and the relation is additionally
    mirrored into the flat entity_link table. Raises 404 for an unknown project.
    """
    if not await _db.get_project(project_id):
        raise HTTPException(status_code=404, detail="Project not found")

    async def _resolve_node(node_type: str, ref_id: str):
        # Upsert guarantees the node exists; default title is "<type>:<id prefix>".
        return await _db.upsert_dialog_node(
            project_id=project_id,
            node_type=node_type,
            ref_id=ref_id,
            title=f"{node_type}:{ref_id[:8]}",
            created_by=body.created_by,
        )

    from_node = await _resolve_node(body.from_type, body.from_id)
    to_node = await _resolve_node(body.to_type, body.to_id)
    edge = await _db.create_dialog_edge(
        project_id=project_id,
        from_node_id=from_node["node_id"],
        to_node_id=to_node["node_id"],
        edge_type=body.edge_type,
        props=body.props,
        created_by=body.created_by,
    )
    # Also persist as entity_link for non-graph queries.
    await _db.create_entity_link(
        project_id=project_id,
        from_type=body.from_type, from_id=body.from_id,
        to_type=body.to_type, to_id=body.to_id,
        link_type=body.edge_type,
        created_by=body.created_by,
    )
    return {
        "ok": True,
        "from_node": from_node,
        "to_node": to_node,
        "edge": edge,
    }
@docs_router.get("/projects/{project_id}/dialog/views")
async def list_dialog_views(project_id: str):
    """List saved dialog-map views for a project."""
    return await _db.list_dialog_views(project_id)


class DialogViewSave(BaseModel):
    """Body for PUT .../dialog/views/{name}: filter + layout state to persist."""
    name: str
    filters: dict = {}
    layout: dict = {}


@docs_router.put("/projects/{project_id}/dialog/views/{name}")
async def save_dialog_view(project_id: str, name: str, body: DialogViewSave):
    """Create or overwrite a named dialog-map view (upsert semantics)."""
    saved = await _db.upsert_dialog_view(
        project_id=project_id,
        name=name,
        filters=body.filters,
        layout=body.layout,
    )
    return saved
# ── Doc Versions ──────────────────────────────────────────────────────────────
class DocUpdateRequest(BaseModel):
    """Body for POST .../documents/{doc_id}/update."""
    # Full replacement markdown content for the document.
    content_md: str
    author_id: str = "system"
    # Free-text audit note; echoed back in the response.
    reason: str = ""
    # When True, compute hash + diff preview without writing anything.
    dry_run: bool = False
@docs_router.post("/projects/{project_id}/documents/{doc_id}/update")
async def update_document_version(project_id: str, doc_id: str, body: DocUpdateRequest):
    """Update document text and create a new version (idempotent by content hash).

    With dry_run=True — or when the content is byte-identical to the latest
    stored version — nothing is written; the response carries the computed
    version_hash plus a unified-diff preview (capped at 80 lines) against the
    latest version. Raises 404 for an unknown/foreign document.
    """
    # Local import is fine; the redundant `import hashlib` that shadowed the
    # module-level import has been removed.
    import difflib

    doc = await _db.get_document(doc_id)
    if not doc or doc["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Document not found")
    content = body.content_md.strip()
    # The short content hash doubles as an idempotency key for callers.
    version_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
    # Latest stored version (if any) is the diff baseline.
    existing = await _db.list_doc_versions(doc_id, limit=1)
    prev_content = ""
    if existing:
        prev_content = (await _db.get_doc_version_content(existing[0]["version_id"])) or ""
    diff_lines = list(difflib.unified_diff(
        prev_content.splitlines(), content.splitlines(),
        fromfile="previous", tofile="updated", lineterm="", n=3,
    ))
    diff_text = "\n".join(diff_lines[:80])  # cap for response
    will_change = content != prev_content
    if body.dry_run or not will_change:
        return {
            "ok": True,
            "dry_run": body.dry_run,
            "will_change": will_change,
            "version_hash": version_hash,
            "diff_text": diff_text,
        }
    new_ver = await _db.save_doc_version(doc_id, content, author_id=body.author_id)
    return {
        "ok": True,
        "dry_run": False,
        "will_change": True,
        "version_hash": version_hash,
        "version_id": new_ver["version_id"],
        "created_at": new_ver["created_at"],
        "diff_text": diff_text,
        "reason": body.reason,
    }
@docs_router.get("/projects/{project_id}/documents/{doc_id}/versions")
async def list_doc_versions(project_id: str, doc_id: str, limit: int = Query(20)):
    """List stored versions of a document (ownership-checked, 404 otherwise)."""
    doc = await _db.get_document(doc_id)
    if not doc or doc["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Document not found")
    versions = await _db.list_doc_versions(doc_id, limit=limit)
    return versions
class DocVersionRestore(BaseModel):
    """Body for POST .../documents/{doc_id}/restore."""
    version_id: str
    author_id: str = "system"


@docs_router.post("/projects/{project_id}/documents/{doc_id}/restore")
async def restore_doc_version(project_id: str, doc_id: str, body: DocVersionRestore):
    """Restore an older version by saving its content as a brand-new version."""
    doc = await _db.get_document(doc_id)
    if not doc or doc["project_id"] != project_id:
        raise HTTPException(status_code=404, detail="Document not found")
    content = await _db.get_doc_version_content(body.version_id)
    if content is None:
        raise HTTPException(status_code=404, detail="Version not found")
    restored = await _db.save_doc_version(doc_id, content, author_id=body.author_id)
    return {"ok": True, "new_version": restored, "restored_from": body.version_id}