feat: create PARSER service skeleton with FastAPI

- Create parser-service/ with full structure
- Add FastAPI app with endpoints (/parse, /parse_qa, /parse_markdown, /parse_chunks)
- Add Pydantic schemas (ParsedDocument, ParsedBlock, ParsedChunk, etc.)
- Add runtime module with model_loader and inference (with dummy parser)
- Add configuration, Dockerfile, requirements.txt
- Update TODO-PARSER-RAG.md with completed tasks
- Ready for dots.ocr model integration
This commit is contained in:
Apple
2025-11-15 13:15:08 -08:00
parent 2fc1894b26
commit 5e7cfc019e
11 changed files with 824 additions and 30 deletions

View File

@@ -0,0 +1,15 @@
"""
PARSER Runtime module
Handles model loading and inference for dots.ocr
"""
from app.runtime.inference import parse_document, dummy_parse_document
from app.runtime.model_loader import load_model, get_model
# Public API of the runtime package, re-exported from the submodules above
__all__ = [
    "parse_document",
    "dummy_parse_document",
    "load_model",
    "get_model"
]

View File

@@ -0,0 +1,112 @@
"""
Inference functions for document parsing
"""
import logging
from typing import Literal, Optional
from pathlib import Path
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
from app.runtime.model_loader import get_model
from app.core.config import settings
# Module-level logger named after this module, per standard logging convention
logger = logging.getLogger(__name__)
def parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse a document with the dots.ocr model.

    Currently a scaffold: real inference is not wired up yet, so every
    call delegates to the dummy parser regardless of whether a model
    instance is available.

    Args:
        input_path: Path to the document file (PDF or image).
        output_mode: Desired output format.
        doc_id: Optional document identifier.
        doc_type: Kind of input ("pdf" or "image").
    Returns:
        ParsedDocument with structured content.
    """
    if get_model() is None:
        logger.warning("Model not loaded, using dummy parser")
    # TODO: real dots.ocr inference once load_model() is implemented.
    # Sketch: open the input (pdf2image.convert_from_path for PDFs,
    # PIL.Image.open for images), run each page image through the
    # model/processor pair, decode the generated tokens, split the text
    # into ParsedBlock objects per page, and assemble a ParsedDocument.
    return dummy_parse_document(input_path, output_mode, doc_id, doc_type)
def dummy_parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Stand-in parser that returns fixed mock content for testing.

    Will be replaced by real dots.ocr inference; note that
    ``output_mode`` is accepted for signature parity but not used here.
    """
    logger.info(f"Dummy parsing: {input_path}")
    # Two hard-coded blocks: a heading followed by one paragraph.
    heading = ParsedBlock(
        type="heading",
        text="Document Title",
        bbox=BBox(x=0, y=0, width=800, height=50),
        reading_order=1,
        page_num=1,
    )
    paragraph = ParsedBlock(
        type="paragraph",
        text="This is a dummy parsed document. Replace this with actual dots.ocr inference.",
        bbox=BBox(x=0, y=60, width=800, height=100),
        reading_order=2,
        page_num=1,
    )
    page = ParsedPage(
        page_num=1,
        blocks=[heading, paragraph],
        width=800,
        height=1200,
    )
    return ParsedDocument(
        doc_id=doc_id or "dummy-doc-1",
        doc_type=doc_type,
        pages=[page],
        metadata={"parser": "dummy", "input_path": input_path},
    )

View File

@@ -0,0 +1,74 @@
"""
Model loader for dots.ocr
Handles lazy loading and GPU/CPU fallback
"""
import logging
from typing import Optional, Literal
from pathlib import Path
from app.core.config import settings
# Module-level logger named after this module, per standard logging convention
logger = logging.getLogger(__name__)
# Lazily-populated module-global model cache; stays None until
# load_model() is implemented and succeeds.
_model: Optional[object] = None
def load_model() -> Optional[object]:
    """
    Load the dots.ocr model, caching it in the module-global ``_model``.

    Fix: the return annotation was ``object`` but the current skeleton
    deliberately assigns and returns ``None`` (real loading is a TODO),
    so the accurate type is ``Optional[object]``.

    Returns:
        The cached model instance, or ``None`` while loading is
        unimplemented (callers then fall back to the dummy parser).

    Raises:
        Exception: re-raised if model initialization fails, so startup
        problems surface instead of being silently swallowed.
    """
    global _model
    # Return the cached instance if a previous call already loaded it.
    if _model is not None:
        return _model
    logger.info(f"Loading model: {settings.PARSER_MODEL_NAME}")
    logger.info(f"Device: {settings.PARSER_DEVICE}")
    try:
        # TODO: implement actual model loading, e.g. with transformers:
        #   from transformers import AutoModelForVision2Seq, AutoProcessor
        #   processor = AutoProcessor.from_pretrained(settings.PARSER_MODEL_NAME)
        #   model = AutoModelForVision2Seq.from_pretrained(
        #       settings.PARSER_MODEL_NAME, device_map=settings.PARSER_DEVICE)
        #   _model = {"model": model, "processor": processor}
        logger.warning("Model loading not yet implemented, using dummy parser")
        _model = None
    except Exception as e:
        logger.error(f"Failed to load model: {e}", exc_info=True)
        raise
    return _model
def get_model() -> Optional[object]:
    """Return the cached model instance, lazy-loading it on first use."""
    return _model if _model is not None else load_model()
def unload_model():
    """Drop the cached model reference so its memory can be reclaimed."""
    global _model
    if _model is None:
        return  # nothing loaded; stay silent
    # TODO: Proper cleanup (e.g. move weights off GPU before dropping)
    _model = None
    logger.info("Model unloaded")