feat: create PARSER service skeleton with FastAPI
- Create parser-service/ with full structure
- Add FastAPI app with endpoints (/parse, /parse_qa, /parse_markdown, /parse_chunks)
- Add Pydantic schemas (ParsedDocument, ParsedBlock, ParsedChunk, etc.)
- Add runtime module with model_loader and inference (with dummy parser)
- Add configuration, Dockerfile, requirements.txt
- Update TODO-PARSER-RAG.md with completed tasks
- Ready for dots.ocr model integration
This commit is contained in:
15
services/parser-service/app/runtime/__init__.py
Normal file
15
services/parser-service/app/runtime/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
PARSER Runtime module
|
||||
Handles model loading and inference for dots.ocr
|
||||
"""
|
||||
|
||||
from app.runtime.inference import parse_document, dummy_parse_document
|
||||
from app.runtime.model_loader import load_model, get_model
|
||||
|
||||
# Names re-exported as the public surface of the runtime package.
__all__ = [
    "parse_document",
    "dummy_parse_document",
    "load_model",
    "get_model",
]
|
||||
|
||||
112
services/parser-service/app/runtime/inference.py
Normal file
112
services/parser-service/app/runtime/inference.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Inference functions for document parsing
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Literal, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
|
||||
from app.runtime.model_loader import get_model
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse a document, intended to run through the dots.ocr model.

    Real inference is not wired in yet: whether or not a model is
    available, the call is currently delegated to ``dummy_parse_document``
    with the same arguments. A warning is logged when no model is loaded.

    Args:
        input_path: Path to document file (PDF or image)
        output_mode: Output format mode
        doc_id: Document ID
        doc_type: Document type (pdf or image)

    Returns:
        ParsedDocument with structured content
    """
    model = get_model()

    # No model available: say so before falling back to the placeholder.
    if model is None:
        logger.warning("Model not loaded, using dummy parser")

    # TODO: Implement actual inference with dots.ocr
    # Example:
    # from PIL import Image
    # import pdf2image  # for PDF
    #
    # if doc_type == "pdf":
    #     images = pdf2image.convert_from_path(input_path)
    # else:
    #     images = [Image.open(input_path)]
    #
    # pages = []
    # for idx, image in enumerate(images):
    #     # Process with model
    #     inputs = model["processor"](images=image, return_tensors="pt")
    #     outputs = model["model"].generate(**inputs)
    #     text = model["processor"].decode(outputs[0], skip_special_tokens=True)
    #
    #     # Parse output into blocks
    #     blocks = parse_model_output(text, idx + 1)
    #     pages.append(ParsedPage(...))
    #
    # return ParsedDocument(...)

    # Until the inference above exists, every path ends in the dummy parser.
    return dummy_parse_document(
        input_path=input_path,
        output_mode=output_mode,
        doc_id=doc_id,
        doc_type=doc_type,
    )
|
||||
|
||||
|
||||
def dummy_parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Return a fixed, mock ParsedDocument for testing.

    Placeholder until real dots.ocr inference is available. The result is a
    single 800x1200 page holding one heading block and one paragraph block.

    Args:
        input_path: Path of the document; only logged and echoed in metadata
        output_mode: Accepted for signature parity; not used here
        doc_id: Document ID; "dummy-doc-1" is used when it is empty or None
        doc_type: Document type (pdf or image), copied into the result

    Returns:
        ParsedDocument containing the mock page and ``parser: dummy`` metadata
    """
    logger.info(f"Dummy parsing: {input_path}")

    # Mock blocks, listed in reading order.
    title_block = ParsedBlock(
        type="heading",
        text="Document Title",
        bbox=BBox(x=0, y=0, width=800, height=50),
        reading_order=1,
        page_num=1,
    )
    body_block = ParsedBlock(
        type="paragraph",
        text="This is a dummy parsed document. Replace this with actual dots.ocr inference.",
        bbox=BBox(x=0, y=60, width=800, height=100),
        reading_order=2,
        page_num=1,
    )

    only_page = ParsedPage(
        page_num=1,
        blocks=[title_block, body_block],
        width=800,
        height=1200,
    )

    resolved_id = doc_id if doc_id else "dummy-doc-1"
    doc_metadata = {"parser": "dummy", "input_path": input_path}

    return ParsedDocument(
        doc_id=resolved_id,
        doc_type=doc_type,
        pages=[only_page],
        metadata=doc_metadata,
    )
|
||||
|
||||
74
services/parser-service/app/runtime/model_loader.py
Normal file
74
services/parser-service/app/runtime/model_loader.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Model loader for dots.ocr
|
||||
Handles lazy loading and GPU/CPU fallback
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional, Literal
|
||||
from pathlib import Path
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Global model instance
|
||||
_model: Optional[object] = None
|
||||
|
||||
|
||||
def load_model() -> object:
    """
    Load the dots.ocr model, reusing the cached instance when present.

    Actual loading is not implemented yet: the function logs the configured
    model name and device, emits a warning, and leaves the cache set to
    ``None`` so callers fall back to the dummy parser.

    Returns:
        The module-level model instance (currently always ``None`` unless
        something else has populated the cache)

    Raises:
        Exception: Any error raised inside the loading section is logged
            with its traceback and re-raised unchanged.
    """
    global _model

    # Only attempt a load when nothing is cached yet.
    if _model is None:
        logger.info(f"Loading model: {settings.PARSER_MODEL_NAME}")
        logger.info(f"Device: {settings.PARSER_DEVICE}")

        try:
            # TODO: Implement actual model loading
            # Example:
            # from transformers import AutoModelForVision2Seq, AutoProcessor
            #
            # processor = AutoProcessor.from_pretrained(settings.PARSER_MODEL_NAME)
            # model = AutoModelForVision2Seq.from_pretrained(
            #     settings.PARSER_MODEL_NAME,
            #     device_map=settings.PARSER_DEVICE
            # )
            #
            # _model = {
            #     "model": model,
            #     "processor": processor
            # }

            # Placeholder: keep the cache empty so the dummy parser is used.
            logger.warning("Model loading not yet implemented, using dummy parser")
            _model = None
        except Exception as e:
            # Same as logger.error(..., exc_info=True): message plus traceback.
            logger.exception(f"Failed to load model: {e}")
            raise

    return _model
|
||||
|
||||
|
||||
def get_model() -> Optional[object]:
    """Return the cached model, calling ``load_model`` when nothing is cached."""
    return _model if _model is not None else load_model()
|
||||
|
||||
|
||||
def unload_model():
    """Drop the cached model reference; does nothing when no model is cached."""
    global _model

    # Nothing cached: leave silently, without logging.
    if _model is None:
        return

    # TODO: Proper cleanup
    _model = None
    logger.info("Model unloaded")
|
||||
|
||||
Reference in New Issue
Block a user