feat(docs): add standard file processing and router document ingest/query

This commit is contained in:
NODA1 System
2026-02-21 14:02:59 +01:00
parent 3e3546ea89
commit 5d52cf81c4
7 changed files with 755 additions and 104 deletions

View File

@@ -11,10 +11,13 @@ import os
import asyncio
import logging
import base64
import json
import re
from typing import Optional, Dict, List, Any, Union
from datetime import datetime, timedelta
from enum import Enum
from io import BytesIO
import xml.etree.ElementTree as ET
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
def _csv_to_markdown(content: bytes) -> str:
    """Render CSV bytes as a Markdown table (first row becomes the header)."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines()))
    return _rows_to_markdown(parsed)
def _tsv_to_markdown(content: bytes) -> str:
    """Render tab-separated bytes as a Markdown table (first row becomes the header)."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed)
def _rows_to_markdown(rows: List[List[Any]]) -> str:
if not rows:
return ""
header = rows[0]
body = rows[1:]
width = max(len(r) for r in rows)
norm_rows = []
for r in rows:
rr = [str(c) if c is not None else "" for c in r]
if len(rr) < width:
rr.extend([""] * (width - len(rr)))
norm_rows.append(rr)
header = norm_rows[0]
body = norm_rows[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(["---"] * len(header)) + " |",
]
for row in body:
lines.append("| " + " | ".join(row) + " |")
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
return "\n".join(lines)
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
return "\n".join(parts)
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy .xls workbook to Markdown, one "## Sheet:" section per sheet."""
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
    workbook = xlrd.open_workbook(file_contents=content)
    sections = []
    for sheet in workbook.sheets():
        sections.append(f"## Sheet: {sheet.name}")
        grid = [
            [sheet.cell_value(row_idx, col_idx) for col_idx in range(sheet.ncols)]
            for row_idx in range(sheet.nrows)
        ]
        if grid:
            sections.append(_rows_to_markdown(grid))
        else:
            sections.append("_Empty sheet_")
    return "\n\n".join(sections)
def _ods_to_markdown(content: bytes) -> str:
    """Convert an OpenDocument spreadsheet (.ods) to Markdown, one section per sheet."""
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")
    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")
    sections = []
    for sheet in doc.spreadsheet.getElementsByType(Table):
        sheet_name = str(sheet.getAttribute("name") or "Sheet")
        sections.append(f"## Sheet: {sheet_name}")
        grid: List[List[str]] = []
        for table_row in sheet.getElementsByType(TableRow):
            row_cells: List[str] = []
            for cell in table_row.getElementsByType(TableCell):
                # Gather the text of every paragraph node inside the cell.
                fragments = []
                for paragraph in cell.getElementsByType(P):
                    fragments.extend(
                        [str(getattr(node, "data", "")).strip()
                         for node in paragraph.childNodes
                         if getattr(node, "data", None)]
                    )
                cell_text = " ".join([f for f in fragments if f]).strip()
                # ODS compresses identical adjacent cells via a repeat count;
                # expand it, clamped so a malformed value cannot explode memory.
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                repeat = max(1, min(repeat, 100))
                row_cells.extend([cell_text] * repeat)
            if row_cells:
                grid.append(row_cells)
        if grid:
            sections.append(_rows_to_markdown(grid))
        else:
            sections.append("_Empty sheet_")
    return "\n\n".join(sections)
def _docx_to_text(content: bytes) -> str:
try:
from docx import Document
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
return "\n\n".join(text_content)
def _pptx_to_text(content: bytes) -> str:
    """Extract shape text from a .pptx deck as Markdown, one "## Slide N" section per slide."""
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")
    deck = Presentation(BytesIO(content))
    sections = []
    for slide_no, slide in enumerate(deck.slides, start=1):
        sections.append(f"## Slide {slide_no}")
        texts = []
        for shape in slide.shapes:
            shape_text = getattr(shape, "text", None)
            if shape_text and str(shape_text).strip():
                texts.append(str(shape_text).strip())
        sections.extend(texts if texts else ["_No text on this slide_"])
    return "\n\n".join(sections)
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON content; fall back to the raw text when it does not parse."""
    decoded = _decode_text_bytes(content)
    try:
        return json.dumps(json.loads(decoded), ensure_ascii=False, indent=2)
    except Exception:
        return decoded
def _yaml_to_text(content: bytes) -> str:
    """Normalize YAML via safe load/dump; fall back to the raw text on any error."""
    decoded = _decode_text_bytes(content)
    try:
        loaded = yaml.safe_load(decoded)
        return yaml.safe_dump(loaded, allow_unicode=True, sort_keys=False)
    except Exception:
        return decoded
def _xml_to_text(content: bytes) -> str:
    """Join all XML text nodes with spaces; fall back to the raw text on parse errors."""
    decoded = _decode_text_bytes(content)
    try:
        root = ET.fromstring(decoded)
        joined = " ".join(
            chunk.strip() for chunk in root.itertext() if chunk and chunk.strip()
        )
        return joined or decoded
    except Exception:
        return decoded
def _html_to_text(content: bytes) -> str:
    """Convert HTML to plain text, preferring BeautifulSoup when it is installed."""
    decoded = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup

        stripped = BeautifulSoup(decoded, "html.parser").get_text(separator="\n")
        stripped = re.sub(r"\n{3,}", "\n\n", stripped)
        return stripped.strip() or decoded
    except Exception:
        # Minimal fallback if bs4 is unavailable
        no_tags = re.sub(r"<[^>]+>", " ", decoded)
        return re.sub(r"\s+", " ", no_tags).strip()
def _rtf_to_text(content: bytes) -> str:
    """Convert RTF to plain text, preferring striprtf when it is installed."""
    decoded = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text

        return rtf_to_text(decoded)
    except Exception:
        # Basic fallback: strip common RTF control tokens
        cleaned = re.sub(r"\\'[0-9a-fA-F]{2}", " ", decoded)
        cleaned = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", cleaned)
        cleaned = cleaned.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", cleaned).strip()
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """Dispatch *content* to the extractor matching *filename*'s extension.

    Args:
        filename: Used only to derive the (case-insensitive) extension.
        content: Raw file bytes.

    Returns:
        Extracted plain text or Markdown, depending on the format.

    Raises:
        HTTPException: 400 for unsupported extensions; individual extractors
            may raise their own 4xx/5xx errors.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    if ext in ["txt", "md", "markdown"]:
        return _decode_text_bytes(content)
    if ext == "csv":
        return _csv_to_markdown(content)
    if ext == "tsv":
        return _tsv_to_markdown(content)
    if ext in {"xlsx", "xlsm"}:
        return _xlsx_to_markdown(content)
    if ext == "xls":
        return _xls_to_markdown(content)
    if ext == "ods":
        return _ods_to_markdown(content)
    if ext == "docx":
        return _docx_to_text(content)
    if ext == "pdf":
        return _pdf_to_text(content)
    if ext == "pptx":
        return _pptx_to_text(content)
    if ext == "json":
        return _json_to_text(content)
    if ext in {"yaml", "yml"}:
        return _yaml_to_text(content)
    if ext == "xml":
        return _xml_to_text(content)
    if ext in {"html", "htm"}:
        return _html_to_text(content)
    if ext == "rtf":
        return _rtf_to_text(content)
    raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
if total_size > max_total_mb * 1024 * 1024:
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
parts = []
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
allowed_exts = {
"txt", "md", "markdown", "csv", "tsv",
"xls", "xlsx", "xlsm", "ods",
"docx", "pdf", "pptx",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
}
processed = []
skipped = []
for member in members:
@@ -1655,7 +1837,8 @@ async def document_endpoint(
- json: Structured JSON with document elements
- text: Plain text extraction
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
Supported files:
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
"""
try:
import time
@@ -1672,15 +1855,28 @@ async def document_endpoint(
filename = file.filename if file else "document"
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
# Handle text-based formats without Docling
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
# Handle deterministic extraction for standard office/text formats
if file_ext in [
"txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
"pptx", "zip",
]:
try:
if file_ext == "zip":
content = _zip_to_markdown(doc_data)
output_format = "markdown"
else:
content = _extract_text_by_ext(filename, doc_data)
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
output_format = (
"markdown"
if file_ext in {
"md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
}
else "text"
)
processing_time_ms = (time.time() - start_time) * 1000
return {
"success": True,
@@ -1764,22 +1960,27 @@ async def document_endpoint(
"device": swapper.device
}
# For DOCX, try python-docx
if file_ext == "docx":
# For common office/text formats, try deterministic extractors.
if file_ext in {
"docx", "txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
}:
try:
content = _docx_to_text(doc_data)
content = _extract_text_by_ext(filename, doc_data)
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
return {
"success": True,
"model": "python-docx (fallback)",
"output_format": "text",
"model": "text-extract (fallback)",
"output_format": out_fmt,
"result": content,
"filename": filename,
"processing_time_ms": (time.time() - start_time) * 1000,
"device": swapper.device
}
except Exception as e:
logger.error(f"DOCX fallback failed: {e}")
raise HTTPException(status_code=500, detail="DOCX extraction failed")
logger.error(f"Text fallback failed for .{file_ext}: {e}")
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
# For PDFs, try pdfplumber
if file_ext == "pdf":
@@ -1807,7 +2008,7 @@ async def document_endpoint(
# For other documents, return error
raise HTTPException(
status_code=503,
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
)
finally:
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
# Script entry point: serve the FastAPI app on all interfaces, port 8890.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8890)

View File

@@ -4,6 +4,15 @@ httpx==0.25.2
pydantic==2.5.0
pyyaml==6.0.1
python-multipart==0.0.6
chardet>=5.2.0
openpyxl>=3.1.2
python-docx>=1.1.2
pdfplumber>=0.11.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26
# HuggingFace dependencies for OCR models
torch>=2.0.0
@@ -25,4 +34,4 @@ safetensors>=0.4.0
# Web Scraping & Search
trafilatura>=1.6.0
duckduckgo-search>=4.0.0

View File

@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
python-docx>=1.1.0
openpyxl>=3.1.2
chardet>=5.2.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26