feat(docs): add standard file processing and router document ingest/query
This commit is contained in:
@@ -11,10 +11,13 @@ import os
|
||||
import asyncio
|
||||
import logging
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
from typing import Optional, Dict, List, Any, Union
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
def _csv_to_markdown(content: bytes) -> str:
    """Convert raw CSV bytes into a Markdown table string."""
    decoded = _decode_text_bytes(content)
    parsed_rows = list(csv.reader(decoded.splitlines()))
    return _rows_to_markdown(parsed_rows)
|
||||
|
||||
|
||||
def _tsv_to_markdown(content: bytes) -> str:
    """Convert raw TSV (tab-separated) bytes into a Markdown table string."""
    decoded = _decode_text_bytes(content)
    parsed_rows = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed_rows)
|
||||
|
||||
|
||||
def _rows_to_markdown(rows: List[List[Any]]) -> str:
|
||||
if not rows:
|
||||
return ""
|
||||
header = rows[0]
|
||||
body = rows[1:]
|
||||
width = max(len(r) for r in rows)
|
||||
norm_rows = []
|
||||
for r in rows:
|
||||
rr = [str(c) if c is not None else "" for c in r]
|
||||
if len(rr) < width:
|
||||
rr.extend([""] * (width - len(rr)))
|
||||
norm_rows.append(rr)
|
||||
header = norm_rows[0]
|
||||
body = norm_rows[1:]
|
||||
lines = [
|
||||
"| " + " | ".join(header) + " |",
|
||||
"| " + " | ".join(["---"] * len(header)) + " |",
|
||||
]
|
||||
for row in body:
|
||||
lines.append("| " + " | ".join(row) + " |")
|
||||
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy .xls workbook to Markdown, one section per sheet."""
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
    workbook = xlrd.open_workbook(file_contents=content)
    sections = []
    for sheet in workbook.sheets():
        sections.append(f"## Sheet: {sheet.name}")
        sheet_rows = [
            [sheet.cell_value(r, c) for c in range(sheet.ncols)]
            for r in range(sheet.nrows)
        ]
        if not sheet_rows:
            sections.append("_Empty sheet_")
            continue
        sections.append(_rows_to_markdown(sheet_rows))
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
def _ods_to_markdown(content: bytes) -> str:
    """Convert an OpenDocument Spreadsheet (.ods) to Markdown, one section per sheet.

    Raises:
        HTTPException: 500 when odfpy is not installed; 400 when the bytes
            are not a loadable ODS document.
    """
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")

    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")

    parts = []
    for table in doc.spreadsheet.getElementsByType(Table):
        table_name = str(table.getAttribute("name") or "Sheet")
        parts.append(f"## Sheet: {table_name}")
        rows: List[List[str]] = []
        for row in table.getElementsByType(TableRow):
            cells_out: List[str] = []
            for cell in row.getElementsByType(TableCell):
                # Collect the text of every paragraph node inside the cell.
                txt_parts = []
                for p in cell.getElementsByType(P):
                    txt_parts.extend(
                        [str(getattr(node, "data", "")).strip() for node in p.childNodes if getattr(node, "data", None)]
                    )
                cell_text = " ".join([t for t in txt_parts if t]).strip()
                # ODS collapses identical adjacent cells via a repeat count;
                # expand it, but cap at 100 to avoid huge sparse rows.
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                repeat = max(1, min(repeat, 100))
                for _ in range(repeat):
                    cells_out.append(cell_text)
            if cells_out:
                rows.append(cells_out)
        if not rows:
            parts.append("_Empty sheet_")
            continue
        parts.append(_rows_to_markdown(rows))
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _docx_to_text(content: bytes) -> str:
|
||||
try:
|
||||
from docx import Document
|
||||
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
|
||||
return "\n\n".join(text_content)
|
||||
|
||||
|
||||
def _pptx_to_text(content: bytes) -> str:
    """Extract text from a PPTX deck, one "## Slide N" section per slide."""
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")
    deck = Presentation(BytesIO(content))
    sections = []
    for slide_no, slide in enumerate(deck.slides, start=1):
        sections.append(f"## Slide {slide_no}")
        shape_texts = []
        for shape in slide.shapes:
            raw = getattr(shape, "text", None)
            if raw and str(raw).strip():
                shape_texts.append(str(raw).strip())
        if shape_texts:
            sections.extend(shape_texts)
        else:
            sections.append("_No text on this slide_")
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON bytes; fall back to the raw decoded text if parsing fails."""
    raw = _decode_text_bytes(content)
    try:
        return json.dumps(json.loads(raw), ensure_ascii=False, indent=2)
    except Exception:
        return raw
|
||||
|
||||
|
||||
def _yaml_to_text(content: bytes) -> str:
    """Normalize YAML bytes via safe_load/safe_dump; return raw text on failure."""
    raw = _decode_text_bytes(content)
    try:
        loaded = yaml.safe_load(raw)
        return yaml.safe_dump(loaded, allow_unicode=True, sort_keys=False)
    except Exception:
        return raw
|
||||
|
||||
|
||||
def _xml_to_text(content: bytes) -> str:
    """Collapse an XML document to its text nodes; return raw text if parsing fails."""
    raw = _decode_text_bytes(content)
    try:
        root = ET.fromstring(raw)
        joined = " ".join(t.strip() for t in root.itertext() if t and t.strip())
        return joined or raw
    except Exception:
        return raw
|
||||
|
||||
|
||||
def _html_to_text(content: bytes) -> str:
    """Convert HTML bytes to plain text, preferring BeautifulSoup with a regex fallback."""
    raw = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup

        extracted = BeautifulSoup(raw, "html.parser").get_text(separator="\n")
        extracted = re.sub(r"\n{3,}", "\n\n", extracted)
        return extracted.strip() or raw
    except Exception:
        # Minimal fallback if bs4 is unavailable
        stripped = re.sub(r"<[^>]+>", " ", raw)
        stripped = re.sub(r"\s+", " ", stripped)
        return stripped.strip()
|
||||
|
||||
|
||||
def _rtf_to_text(content: bytes) -> str:
    """Extract plain text from RTF, using striprtf when present."""
    raw = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text

        return rtf_to_text(raw)
    except Exception:
        # Basic fallback: strip common RTF control tokens
        cleaned = re.sub(r"\\'[0-9a-fA-F]{2}", " ", raw)
        cleaned = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", cleaned)
        cleaned = cleaned.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", cleaned).strip()
|
||||
|
||||
|
||||
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """Dispatch *content* to the format-specific extractor chosen by file extension.

    Args:
        filename: Original file name; only the last dot-suffix is inspected.
        content: Raw file bytes.

    Returns:
        Extracted text or Markdown, depending on the extractor.

    Raises:
        HTTPException: 400 for unsupported extensions.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    # Diff residue left conflicting duplicate branches here (bare ["txt","md"]
    # and bare "xlsx" before their extended replacements); keep only the
    # extended, coherent dispatch.
    if ext in {"txt", "md", "markdown"}:
        return _decode_text_bytes(content)
    if ext == "csv":
        return _csv_to_markdown(content)
    if ext == "tsv":
        return _tsv_to_markdown(content)
    if ext in {"xlsx", "xlsm"}:
        return _xlsx_to_markdown(content)
    if ext == "xls":
        return _xls_to_markdown(content)
    if ext == "ods":
        return _ods_to_markdown(content)
    if ext == "docx":
        return _docx_to_text(content)
    if ext == "pdf":
        return _pdf_to_text(content)
    if ext == "pptx":
        return _pptx_to_text(content)
    if ext == "json":
        return _json_to_text(content)
    if ext in {"yaml", "yml"}:
        return _yaml_to_text(content)
    if ext == "xml":
        return _xml_to_text(content)
    if ext in {"html", "htm"}:
        return _html_to_text(content)
    if ext == "rtf":
        return _rtf_to_text(content)
    raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
|
||||
|
||||
|
||||
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
|
||||
if total_size > max_total_mb * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
|
||||
parts = []
|
||||
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
|
||||
allowed_exts = {
|
||||
"txt", "md", "markdown", "csv", "tsv",
|
||||
"xls", "xlsx", "xlsm", "ods",
|
||||
"docx", "pdf", "pptx",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
}
|
||||
processed = []
|
||||
skipped = []
|
||||
for member in members:
|
||||
@@ -1655,7 +1837,8 @@ async def document_endpoint(
|
||||
- json: Structured JSON with document elements
|
||||
- text: Plain text extraction
|
||||
|
||||
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
|
||||
Supported files:
|
||||
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
|
||||
"""
|
||||
try:
|
||||
import time
|
||||
@@ -1672,15 +1855,28 @@ async def document_endpoint(
|
||||
filename = file.filename if file else "document"
|
||||
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
|
||||
|
||||
# Handle text-based formats without Docling
|
||||
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
|
||||
# Handle deterministic extraction for standard office/text formats
|
||||
if file_ext in [
|
||||
"txt", "md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
"pptx", "zip",
|
||||
]:
|
||||
try:
|
||||
if file_ext == "zip":
|
||||
content = _zip_to_markdown(doc_data)
|
||||
output_format = "markdown"
|
||||
else:
|
||||
content = _extract_text_by_ext(filename, doc_data)
|
||||
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
|
||||
output_format = (
|
||||
"markdown"
|
||||
if file_ext in {
|
||||
"md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
|
||||
}
|
||||
else "text"
|
||||
)
|
||||
processing_time_ms = (time.time() - start_time) * 1000
|
||||
return {
|
||||
"success": True,
|
||||
@@ -1764,22 +1960,27 @@ async def document_endpoint(
|
||||
"device": swapper.device
|
||||
}
|
||||
|
||||
# For DOCX, try python-docx
|
||||
if file_ext == "docx":
|
||||
# For common office/text formats, try deterministic extractors.
|
||||
if file_ext in {
|
||||
"docx", "txt", "md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
}:
|
||||
try:
|
||||
content = _docx_to_text(doc_data)
|
||||
content = _extract_text_by_ext(filename, doc_data)
|
||||
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
|
||||
return {
|
||||
"success": True,
|
||||
"model": "python-docx (fallback)",
|
||||
"output_format": "text",
|
||||
"model": "text-extract (fallback)",
|
||||
"output_format": out_fmt,
|
||||
"result": content,
|
||||
"filename": filename,
|
||||
"processing_time_ms": (time.time() - start_time) * 1000,
|
||||
"device": swapper.device
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"DOCX fallback failed: {e}")
|
||||
raise HTTPException(status_code=500, detail="DOCX extraction failed")
|
||||
logger.error(f"Text fallback failed for .{file_ext}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
|
||||
|
||||
# For PDFs, try pdfplumber
|
||||
if file_ext == "pdf":
|
||||
@@ -1807,7 +2008,7 @@ async def document_endpoint(
|
||||
# For other documents, return error
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
|
||||
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
|
||||
)
|
||||
|
||||
finally:
|
||||
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
|
||||
if __name__ == "__main__":
    # Dev entry point: serve the FastAPI app directly on all interfaces, port 8890.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8890)
|
||||
|
||||
|
||||
@@ -4,6 +4,15 @@ httpx==0.25.2
|
||||
pydantic==2.5.0
|
||||
pyyaml==6.0.1
|
||||
python-multipart==0.0.6
|
||||
chardet>=5.2.0
|
||||
openpyxl>=3.1.2
|
||||
python-docx>=1.1.2
|
||||
pdfplumber>=0.11.0
|
||||
python-pptx>=0.6.23
|
||||
xlrd>=2.0.1
|
||||
odfpy>=1.4.1
|
||||
beautifulsoup4>=4.12.0
|
||||
striprtf>=0.0.26
|
||||
|
||||
# HuggingFace dependencies for OCR models
|
||||
torch>=2.0.0
|
||||
@@ -25,4 +34,4 @@ safetensors>=0.4.0
|
||||
|
||||
# Web Scraping & Search
|
||||
trafilatura>=1.6.0
duckduckgo-search>=4.0.0
|
||||
|
||||
@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
|
||||
python-docx>=1.1.0
|
||||
openpyxl>=3.1.2
|
||||
chardet>=5.2.0
|
||||
python-pptx>=0.6.23
|
||||
xlrd>=2.0.1
|
||||
odfpy>=1.4.1
|
||||
beautifulsoup4>=4.12.0
|
||||
striprtf>=0.0.26
|
||||
|
||||
Reference in New Issue
Block a user