feat(docs): add standard file processing and router document ingest/query

This commit is contained in:
NODA1 System
2026-02-21 14:02:59 +01:00
parent 3e3546ea89
commit 5d52cf81c4
7 changed files with 755 additions and 104 deletions

View File

@@ -11,10 +11,13 @@ import os
import asyncio
import logging
import base64
import json
import re
from typing import Optional, Dict, List, Any, Union
from datetime import datetime, timedelta
from enum import Enum
from io import BytesIO
import xml.etree.ElementTree as ET
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
def _csv_to_markdown(content: bytes) -> str:
    """Render CSV bytes as a Markdown table (first row becomes the header)."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines()))
    return _rows_to_markdown(parsed)
def _tsv_to_markdown(content: bytes) -> str:
    """Render tab-separated bytes as a Markdown table (first row becomes the header)."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed)
def _rows_to_markdown(rows: List[List[Any]]) -> str:
if not rows:
return ""
header = rows[0]
body = rows[1:]
width = max(len(r) for r in rows)
norm_rows = []
for r in rows:
rr = [str(c) if c is not None else "" for c in r]
if len(rr) < width:
rr.extend([""] * (width - len(rr)))
norm_rows.append(rr)
header = norm_rows[0]
body = norm_rows[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(["---"] * len(header)) + " |",
]
for row in body:
lines.append("| " + " | ".join(row) + " |")
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
return "\n".join(lines)
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
return "\n".join(parts)
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy .xls workbook to Markdown, one "## Sheet:" section per sheet."""
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
    workbook = xlrd.open_workbook(file_contents=content)
    sections = []
    for sheet in workbook.sheets():
        sections.append(f"## Sheet: {sheet.name}")
        grid = [
            [sheet.cell_value(row_idx, col_idx) for col_idx in range(sheet.ncols)]
            for row_idx in range(sheet.nrows)
        ]
        if grid:
            sections.append(_rows_to_markdown(grid))
        else:
            sections.append("_Empty sheet_")
    return "\n\n".join(sections)
def _ods_to_markdown(content: bytes) -> str:
    """Convert an OpenDocument spreadsheet (.ods) to Markdown, one section per sheet."""
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")
    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")
    sections = []
    for sheet in doc.spreadsheet.getElementsByType(Table):
        sheet_name = str(sheet.getAttribute("name") or "Sheet")
        sections.append(f"## Sheet: {sheet_name}")
        grid: List[List[str]] = []
        for table_row in sheet.getElementsByType(TableRow):
            row_cells: List[str] = []
            for cell in table_row.getElementsByType(TableCell):
                # Gather the text of every paragraph node inside the cell.
                fragments = []
                for paragraph in cell.getElementsByType(P):
                    fragments.extend(
                        [str(getattr(node, "data", "")).strip()
                         for node in paragraph.childNodes
                         if getattr(node, "data", None)]
                    )
                cell_text = " ".join([f for f in fragments if f]).strip()
                # ODS compresses identical adjacent cells via a repeat count;
                # expand it, clamped so a malformed value cannot explode memory.
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                repeat = max(1, min(repeat, 100))
                row_cells.extend([cell_text] * repeat)
            if row_cells:
                grid.append(row_cells)
        if grid:
            sections.append(_rows_to_markdown(grid))
        else:
            sections.append("_Empty sheet_")
    return "\n\n".join(sections)
def _docx_to_text(content: bytes) -> str:
try:
from docx import Document
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
return "\n\n".join(text_content)
def _pptx_to_text(content: bytes) -> str:
    """Extract shape text from a .pptx deck as Markdown, one "## Slide N" section per slide."""
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")
    deck = Presentation(BytesIO(content))
    sections = []
    for slide_no, slide in enumerate(deck.slides, start=1):
        sections.append(f"## Slide {slide_no}")
        texts = []
        for shape in slide.shapes:
            shape_text = getattr(shape, "text", None)
            if shape_text and str(shape_text).strip():
                texts.append(str(shape_text).strip())
        sections.extend(texts if texts else ["_No text on this slide_"])
    return "\n\n".join(sections)
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON content; fall back to the raw text when it does not parse."""
    decoded = _decode_text_bytes(content)
    try:
        return json.dumps(json.loads(decoded), ensure_ascii=False, indent=2)
    except Exception:
        return decoded
def _yaml_to_text(content: bytes) -> str:
    """Normalize YAML via safe load/dump; fall back to the raw text on any error."""
    decoded = _decode_text_bytes(content)
    try:
        loaded = yaml.safe_load(decoded)
        return yaml.safe_dump(loaded, allow_unicode=True, sort_keys=False)
    except Exception:
        return decoded
def _xml_to_text(content: bytes) -> str:
    """Join all XML text nodes with spaces; fall back to the raw text on parse errors."""
    decoded = _decode_text_bytes(content)
    try:
        root = ET.fromstring(decoded)
        joined = " ".join(
            chunk.strip() for chunk in root.itertext() if chunk and chunk.strip()
        )
        return joined or decoded
    except Exception:
        return decoded
def _html_to_text(content: bytes) -> str:
    """Convert HTML to plain text, preferring BeautifulSoup when it is installed."""
    decoded = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup

        stripped = BeautifulSoup(decoded, "html.parser").get_text(separator="\n")
        stripped = re.sub(r"\n{3,}", "\n\n", stripped)
        return stripped.strip() or decoded
    except Exception:
        # Minimal fallback if bs4 is unavailable
        no_tags = re.sub(r"<[^>]+>", " ", decoded)
        return re.sub(r"\s+", " ", no_tags).strip()
def _rtf_to_text(content: bytes) -> str:
    """Convert RTF to plain text, preferring striprtf when it is installed."""
    decoded = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text

        return rtf_to_text(decoded)
    except Exception:
        # Basic fallback: strip common RTF control tokens
        cleaned = re.sub(r"\\'[0-9a-fA-F]{2}", " ", decoded)
        cleaned = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", cleaned)
        cleaned = cleaned.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", cleaned).strip()
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """Dispatch *content* to the extractor matching *filename*'s extension.

    Args:
        filename: Used only to derive the (case-insensitive) extension.
        content: Raw file bytes.

    Returns:
        Extracted plain text or Markdown, depending on the format.

    Raises:
        HTTPException: 400 for unsupported extensions; individual extractors
            may raise their own 4xx/5xx errors.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    if ext in ["txt", "md", "markdown"]:
        return _decode_text_bytes(content)
    if ext == "csv":
        return _csv_to_markdown(content)
    if ext == "tsv":
        return _tsv_to_markdown(content)
    if ext in {"xlsx", "xlsm"}:
        return _xlsx_to_markdown(content)
    if ext == "xls":
        return _xls_to_markdown(content)
    if ext == "ods":
        return _ods_to_markdown(content)
    if ext == "docx":
        return _docx_to_text(content)
    if ext == "pdf":
        return _pdf_to_text(content)
    if ext == "pptx":
        return _pptx_to_text(content)
    if ext == "json":
        return _json_to_text(content)
    if ext in {"yaml", "yml"}:
        return _yaml_to_text(content)
    if ext == "xml":
        return _xml_to_text(content)
    if ext in {"html", "htm"}:
        return _html_to_text(content)
    if ext == "rtf":
        return _rtf_to_text(content)
    raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
if total_size > max_total_mb * 1024 * 1024:
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
parts = []
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
allowed_exts = {
"txt", "md", "markdown", "csv", "tsv",
"xls", "xlsx", "xlsm", "ods",
"docx", "pdf", "pptx",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
}
processed = []
skipped = []
for member in members:
@@ -1655,7 +1837,8 @@ async def document_endpoint(
- json: Structured JSON with document elements
- text: Plain text extraction
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
Supported files:
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
"""
try:
import time
@@ -1672,15 +1855,28 @@ async def document_endpoint(
filename = file.filename if file else "document"
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
# Handle text-based formats without Docling
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
# Handle deterministic extraction for standard office/text formats
if file_ext in [
"txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
"pptx", "zip",
]:
try:
if file_ext == "zip":
content = _zip_to_markdown(doc_data)
output_format = "markdown"
else:
content = _extract_text_by_ext(filename, doc_data)
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
output_format = (
"markdown"
if file_ext in {
"md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
}
else "text"
)
processing_time_ms = (time.time() - start_time) * 1000
return {
"success": True,
@@ -1764,22 +1960,27 @@ async def document_endpoint(
"device": swapper.device
}
# For DOCX, try python-docx
if file_ext == "docx":
# For common office/text formats, try deterministic extractors.
if file_ext in {
"docx", "txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
}:
try:
content = _docx_to_text(doc_data)
content = _extract_text_by_ext(filename, doc_data)
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
return {
"success": True,
"model": "python-docx (fallback)",
"output_format": "text",
"model": "text-extract (fallback)",
"output_format": out_fmt,
"result": content,
"filename": filename,
"processing_time_ms": (time.time() - start_time) * 1000,
"device": swapper.device
}
except Exception as e:
logger.error(f"DOCX fallback failed: {e}")
raise HTTPException(status_code=500, detail="DOCX extraction failed")
logger.error(f"Text fallback failed for .{file_ext}: {e}")
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
# For PDFs, try pdfplumber
if file_ext == "pdf":
@@ -1807,7 +2008,7 @@ async def document_endpoint(
# For other documents, return error
raise HTTPException(
status_code=503,
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
)
finally:
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
# Script entry point: serve the FastAPI app on all interfaces, port 8890.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8890)

View File

@@ -4,6 +4,15 @@ httpx==0.25.2
pydantic==2.5.0
pyyaml==6.0.1
python-multipart==0.0.6
chardet>=5.2.0
openpyxl>=3.1.2
python-docx>=1.1.2
pdfplumber>=0.11.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26
# HuggingFace dependencies for OCR models
torch>=2.0.0
@@ -25,4 +34,4 @@ safetensors>=0.4.0
# Web Scraping & Search
trafilatura>=1.6.0
duckduckgo-search>=4.0.0

View File

@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
python-docx>=1.1.0
openpyxl>=3.1.2
chardet>=5.2.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26