feat: implement PDF/image preprocessing, post-processing, and dots.ocr integration prep

G.2.3 - PDF/Image Support:
- Add preprocessing.py with PDF→images conversion (pdf2image)
- Add image loading and normalization
- Add file type detection and validation
- Support for PDF, PNG, JPEG, WebP, TIFF

G.2.4 - Pre/Post-processing:
- Add postprocessing.py with structured output builders
- build_chunks() - semantic chunks for RAG
- build_qa_pairs() - Q&A extraction
- build_markdown() - Markdown conversion
- Text normalization and chunking logic

G.1.3 - dots.ocr Integration Prep:
- Update model_loader.py with proper error handling
- Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags
- Update inference.py to work with images list
- Add parse_document_from_images() function
- Ready for actual model integration

Configuration:
- Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings
- Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK)

API Updates:
- Update endpoints to use new preprocessing pipeline
- Integrate post-processing for all output modes
- Remove temp file handling (work directly with bytes)
Commit metadata:
- Author: Apple
- Date: 2025-11-15 13:19:07 -08:00
- Parent: 0f6cfe046f
- Commit: 4befecc425
- 6 changed files with 762 additions and 122 deletions

View File

@@ -14,7 +14,13 @@ from app.schemas import (
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse
)
from app.core.config import settings
from app.runtime.inference import parse_document, dummy_parse_document
from app.runtime.inference import parse_document_from_images
from app.runtime.preprocessing import (
convert_pdf_to_images, load_image, detect_file_type, validate_file_size
)
from app.runtime.postprocessing import (
build_chunks, build_qa_pairs, build_markdown
)
logger = logging.getLogger(__name__)
@@ -50,31 +56,29 @@ async def parse_document_endpoint(
detail="Either 'file' or 'doc_url' must be provided"
)
# Determine document type
# Process file
if file:
doc_type = "image" # Will be determined from file extension
file_ext = Path(file.filename or "").suffix.lower()
if file_ext == ".pdf":
doc_type = "pdf"
# Read file content
content = await file.read()
# Check file size
max_size = settings.MAX_FILE_SIZE_MB * 1024 * 1024
if len(content) > max_size:
raise HTTPException(
status_code=413,
detail=f"File size exceeds maximum {settings.MAX_FILE_SIZE_MB}MB"
)
# Validate file size
try:
validate_file_size(content)
except ValueError as e:
raise HTTPException(status_code=413, detail=str(e))
# Save to temp file
temp_dir = Path(settings.TEMP_DIR)
temp_dir.mkdir(exist_ok=True, parents=True)
temp_file = temp_dir / f"{uuid.uuid4()}{file_ext}"
temp_file.write_bytes(content)
# Detect file type
try:
doc_type = detect_file_type(content, file.filename)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
input_path = str(temp_file)
# Convert to images
if doc_type == "pdf":
images = convert_pdf_to_images(content)
else:
image = load_image(content)
images = [image]
else:
# TODO: Download from doc_url
@@ -83,51 +87,31 @@ async def parse_document_endpoint(
detail="doc_url download not yet implemented"
)
# Parse document
logger.info(f"Parsing document: {input_path}, mode: {output_mode}")
# Parse document from images
logger.info(f"Parsing document: {len(images)} page(s), mode: {output_mode}")
# TODO: Replace with real parse_document when model is integrated
parsed_doc = dummy_parse_document(
input_path=input_path,
parsed_doc = parse_document_from_images(
images=images,
output_mode=output_mode,
doc_id=doc_id or str(uuid.uuid4()),
doc_type=doc_type
)
# Build response based on output_mode
response_data = {"metadata": {}}
response_data = {"metadata": {
"doc_id": parsed_doc.doc_id,
"doc_type": parsed_doc.doc_type,
"page_count": len(parsed_doc.pages)
}}
if output_mode == "raw_json":
response_data["document"] = parsed_doc
elif output_mode == "markdown":
# TODO: Convert to markdown
response_data["markdown"] = "# Document\n\n" + "\n\n".join(
block.text for page in parsed_doc.pages for block in page.blocks
)
response_data["markdown"] = build_markdown(parsed_doc)
elif output_mode == "qa_pairs":
# TODO: Extract QA pairs
response_data["qa_pairs"] = []
response_data["qa_pairs"] = build_qa_pairs(parsed_doc)
elif output_mode == "chunks":
# Convert blocks to chunks
chunks = []
for page in parsed_doc.pages:
for block in page.blocks:
chunks.append(ParsedChunk(
text=block.text,
page=page.page_num,
bbox=block.bbox,
section=block.type,
metadata={
"dao_id": dao_id,
"doc_id": parsed_doc.doc_id,
"block_type": block.type
}
))
response_data["chunks"] = chunks
# Cleanup temp file
if file and temp_file.exists():
temp_file.unlink()
response_data["chunks"] = build_chunks(parsed_doc, dao_id=dao_id)
return ParseResponse(**response_data)