feat: create PARSER service skeleton with FastAPI

- Create parser-service/ with full structure
- Add FastAPI app with endpoints (/parse, /parse_qa, /parse_markdown, /parse_chunks)
- Add Pydantic schemas (ParsedDocument, ParsedBlock, ParsedChunk, etc.)
- Add runtime module with model_loader and inference (with dummy parser)
- Add configuration, Dockerfile, requirements.txt
- Update TODO-PARSER-RAG.md with completed tasks
- Ready for dots.ocr model integration
This commit is contained in:
Apple
2025-11-15 13:15:08 -08:00
parent 2fc1894b26
commit 5e7cfc019e
11 changed files with 824 additions and 30 deletions

View File

@@ -0,0 +1,15 @@
"""
PARSER Runtime module
Handles model loading and inference for dots.ocr
"""
from app.runtime.inference import parse_document, dummy_parse_document
from app.runtime.model_loader import load_model, get_model
# Public API of the runtime package, re-exported from the submodules above
__all__ = [
    "parse_document",
    "dummy_parse_document",
    "load_model",
    "get_model"
]

View File

@@ -0,0 +1,112 @@
"""
Inference functions for document parsing
"""
import logging
from typing import Literal, Optional
from pathlib import Path
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
from app.runtime.model_loader import get_model
from app.core.config import settings
# Module-level logger named after this module, per standard logging convention
logger = logging.getLogger(__name__)
def parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse a document with the dots.ocr model.

    Currently a scaffold: real inference is not wired up yet, so every
    call delegates to the dummy parser regardless of whether a model
    instance is available.

    Args:
        input_path: Path to the document file (PDF or image).
        output_mode: Desired output format.
        doc_id: Optional document identifier.
        doc_type: Kind of input ("pdf" or "image").
    Returns:
        ParsedDocument with structured content.
    """
    if get_model() is None:
        logger.warning("Model not loaded, using dummy parser")
    # TODO: real dots.ocr inference once load_model() is implemented.
    # Sketch: open the input (pdf2image.convert_from_path for PDFs,
    # PIL.Image.open for images), run each page image through the
    # model/processor pair, decode the generated tokens, split the text
    # into ParsedBlock objects per page, and assemble a ParsedDocument.
    return dummy_parse_document(input_path, output_mode, doc_id, doc_type)
def dummy_parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Stand-in parser that returns fixed mock content for testing.

    Will be replaced by real dots.ocr inference; note that
    ``output_mode`` is accepted for signature parity but not used here.
    """
    logger.info(f"Dummy parsing: {input_path}")
    # Two hard-coded blocks: a heading followed by one paragraph.
    heading = ParsedBlock(
        type="heading",
        text="Document Title",
        bbox=BBox(x=0, y=0, width=800, height=50),
        reading_order=1,
        page_num=1,
    )
    paragraph = ParsedBlock(
        type="paragraph",
        text="This is a dummy parsed document. Replace this with actual dots.ocr inference.",
        bbox=BBox(x=0, y=60, width=800, height=100),
        reading_order=2,
        page_num=1,
    )
    page = ParsedPage(
        page_num=1,
        blocks=[heading, paragraph],
        width=800,
        height=1200,
    )
    return ParsedDocument(
        doc_id=doc_id or "dummy-doc-1",
        doc_type=doc_type,
        pages=[page],
        metadata={"parser": "dummy", "input_path": input_path},
    )

View File

@@ -0,0 +1,74 @@
"""
Model loader for dots.ocr
Handles lazy loading and GPU/CPU fallback
"""
import logging
from typing import Optional, Literal
from pathlib import Path
from app.core.config import settings
# Module-level logger named after this module, per standard logging convention
logger = logging.getLogger(__name__)
# Lazily-populated module-global model cache; stays None until
# load_model() is implemented and succeeds.
_model: Optional[object] = None
def load_model() -> Optional[object]:
    """
    Load the dots.ocr model, caching it in the module-global ``_model``.

    Fix: the return annotation was ``object`` but the current skeleton
    deliberately assigns and returns ``None`` (real loading is a TODO),
    so the accurate type is ``Optional[object]``.

    Returns:
        The cached model instance, or ``None`` while loading is
        unimplemented (callers then fall back to the dummy parser).

    Raises:
        Exception: re-raised if model initialization fails, so startup
        problems surface instead of being silently swallowed.
    """
    global _model
    # Return the cached instance if a previous call already loaded it.
    if _model is not None:
        return _model
    logger.info(f"Loading model: {settings.PARSER_MODEL_NAME}")
    logger.info(f"Device: {settings.PARSER_DEVICE}")
    try:
        # TODO: implement actual model loading, e.g. with transformers:
        #   from transformers import AutoModelForVision2Seq, AutoProcessor
        #   processor = AutoProcessor.from_pretrained(settings.PARSER_MODEL_NAME)
        #   model = AutoModelForVision2Seq.from_pretrained(
        #       settings.PARSER_MODEL_NAME, device_map=settings.PARSER_DEVICE)
        #   _model = {"model": model, "processor": processor}
        logger.warning("Model loading not yet implemented, using dummy parser")
        _model = None
    except Exception as e:
        logger.error(f"Failed to load model: {e}", exc_info=True)
        raise
    return _model
def get_model() -> Optional[object]:
    """Return the cached model instance, lazy-loading it on first use."""
    return _model if _model is not None else load_model()
def unload_model():
    """Drop the cached model reference so its memory can be reclaimed."""
    global _model
    if _model is None:
        return  # nothing loaded; stay silent
    # TODO: Proper cleanup (e.g. move weights off GPU before dropping)
    _model = None
    logger.info("Model unloaded")