feat: implement PDF/image preprocessing, post-processing, and dots.ocr integration prep

G.2.3 - PDF/Image Support:
- Add preprocessing.py with PDF→images conversion (pdf2image)
- Add image loading and normalization
- Add file type detection and validation
- Support for PDF, PNG, JPEG, WebP, TIFF

G.2.4 - Pre/Post-processing:
- Add postprocessing.py with structured output builders
- build_chunks() - semantic chunks for RAG
- build_qa_pairs() - Q&A extraction
- build_markdown() - Markdown conversion
- Text normalization and chunking logic

G.1.3 - dots.ocr Integration Prep:
- Update model_loader.py with proper error handling
- Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags
- Update inference.py to work with images list
- Add parse_document_from_images() function
- Ready for actual model integration

Configuration:
- Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings
- Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK)

API Updates:
- Update endpoints to use new preprocessing pipeline
- Integrate post-processing for all output modes
- Remove temp file handling (work directly with bytes)
This commit is contained in:
Apple
2025-11-15 13:19:07 -08:00
parent 0f6cfe046f
commit 4befecc425
6 changed files with 762 additions and 122 deletions

View File

@@ -0,0 +1,321 @@
"""
Post-processing functions to convert model output to structured formats
"""
import logging
import re
from typing import List, Dict, Any, Optional
from app.schemas import (
ParsedDocument, ParsedPage, ParsedBlock, ParsedChunk, QAPair, BBox
)
logger = logging.getLogger(__name__)
def normalize_text(text: str) -> str:
    """
    Normalize raw text for downstream processing.

    Strips control/invisible characters, collapses every run of
    whitespace into a single space, and trims the ends.

    Args:
        text: Raw text (falsy values are tolerated)

    Returns:
        Cleaned text, or "" for falsy input
    """
    if not text:
        return ""
    # Drop control characters: C0 (except \t, \n, \r), DEL, and the C1 range
    cleaned = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    # Collapse whitespace runs to single spaces, then trim the ends
    return re.sub(r'\s+', ' ', cleaned).strip()
def build_parsed_document(
    pages_data: List[Dict[str, Any]],
    doc_id: str,
    doc_type: str,
    metadata: Optional[Dict[str, Any]] = None
) -> ParsedDocument:
    """
    Build a ParsedDocument from raw model output.

    Blocks whose text normalizes to an empty string are dropped; the
    surviving blocks keep their relative order within the page.

    Args:
        pages_data: List of per-page dicts from the model.
            Each dict may contain: blocks, width, height.
        doc_id: Document ID
        doc_type: Document type ("pdf" or "image")
        metadata: Additional document-level metadata

    Returns:
        ParsedDocument with 1-based page numbers
    """
    pages = []
    for page_idx, page_data in enumerate(pages_data, start=1):
        blocks = []
        # "or []" also covers an explicit blocks=None in the payload
        for block_data in page_data.get('blocks') or []:
            # Drop blocks that are empty after normalization
            text = normalize_text(block_data.get('text', ''))
            if not text:
                continue
            # "or {}" tolerates both a missing and an explicitly-None bbox
            # (plain .get('bbox', {}) would crash on bbox=None below)
            bbox_data = block_data.get('bbox') or {}
            bbox = BBox(
                x=bbox_data.get('x', 0),
                y=bbox_data.get('y', 0),
                width=bbox_data.get('width', 0),
                height=bbox_data.get('height', 0)
            )
            block = ParsedBlock(
                type=block_data.get('type', 'paragraph'),
                text=text,
                bbox=bbox,
                # Default reading order: position among the *kept* blocks
                reading_order=block_data.get('reading_order', len(blocks) + 1),
                page_num=page_idx,
                metadata=block_data.get('metadata', {})
            )
            blocks.append(block)
        pages.append(ParsedPage(
            page_num=page_idx,
            blocks=blocks,
            width=page_data.get('width', 0),
            height=page_data.get('height', 0)
        ))
    return ParsedDocument(
        doc_id=doc_id,
        doc_type=doc_type,
        pages=pages,
        metadata=metadata or {}
    )
def _overlap_tail(words: List[str], overlap_chars: int) -> List[str]:
    """Return the trailing words of *words* totalling at most overlap_chars
    characters (spaces included). Always keeps at least one word when
    overlap_chars > 0 and words is non-empty."""
    if overlap_chars <= 0:
        return []
    tail: List[str] = []
    length = 0
    for word in reversed(words):
        length += len(word) + 1  # +1 for the joining space
        if length > overlap_chars and tail:
            break
        tail.append(word)
    tail.reverse()
    return tail


def build_chunks(
    parsed_doc: ParsedDocument,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    dao_id: Optional[str] = None
) -> List[ParsedChunk]:
    """
    Build semantic chunks from a ParsedDocument for RAG ingestion.

    Blocks are grouped into sections delimited by 'heading' blocks
    (the heading text opens its own section). Sections longer than
    chunk_size characters are split into overlapping, word-aligned
    chunks.

    Args:
        parsed_doc: Parsed document
        chunk_size: Target chunk size in characters
        chunk_overlap: Overlap between consecutive split chunks, in characters
        dao_id: Optional DAO ID recorded in chunk metadata

    Returns:
        List of ParsedChunk
    """
    chunks: List[ParsedChunk] = []
    for page in parsed_doc.pages:
        current_section = None
        current_text_parts: List[str] = []
        section_bbox = None  # bbox of the first block in the current section
        for block in page.blocks:
            if block.type == 'heading':
                # Flush the section accumulated so far before starting a new one
                text = ' '.join(current_text_parts)
                if text:
                    chunks.append(ParsedChunk(
                        text=text,
                        page=page.page_num,
                        # bbox of the flushed section's first block
                        # (not the new heading's, which belongs to the next section)
                        bbox=section_bbox,
                        section=current_section or "main",
                        metadata={
                            "dao_id": dao_id,
                            "doc_id": parsed_doc.doc_id,
                            "chunk_type": "section"
                        }
                    ))
                current_section = normalize_text(block.text)
                current_text_parts = []
                section_bbox = None
            # The heading's own text leads the section it opens
            if block.text:
                if section_bbox is None:
                    section_bbox = block.bbox
                current_text_parts.append(block.text)
        # Flush the trailing section on this page
        text = ' '.join(current_text_parts)
        if text:
            chunks.append(ParsedChunk(
                text=text,
                page=page.page_num,
                bbox=section_bbox,
                section=current_section or "main",
                metadata={
                    "dao_id": dao_id,
                    "doc_id": parsed_doc.doc_id,
                    "chunk_type": "section"
                }
            ))
    # Split sections that exceed chunk_size into overlapping pieces
    final_chunks: List[ParsedChunk] = []
    for chunk in chunks:
        if len(chunk.text) <= chunk_size:
            final_chunks.append(chunk)
            continue
        words = chunk.text.split()
        current_words: List[str] = []
        current_length = 0
        for word in words:
            word_length = len(word) + 1  # +1 for the joining space
            if current_length + word_length > chunk_size and current_words:
                final_chunks.append(ParsedChunk(
                    text=' '.join(current_words),
                    page=chunk.page,
                    bbox=chunk.bbox,
                    section=chunk.section,
                    metadata=chunk.metadata
                ))
                # Seed the next piece with ~chunk_overlap characters of context
                current_words = _overlap_tail(current_words, chunk_overlap) + [word]
                current_length = sum(len(w) + 1 for w in current_words)
            else:
                current_words.append(word)
                current_length += word_length
        # Save the final piece
        if current_words:
            final_chunks.append(ParsedChunk(
                text=' '.join(current_words),
                page=chunk.page,
                bbox=chunk.bbox,
                section=chunk.section,
                metadata=chunk.metadata
            ))
    logger.info(f"Created {len(final_chunks)} chunks from document")
    return final_chunks
def build_qa_pairs(
    parsed_doc: ParsedDocument,
    max_pairs: int = 10
) -> List[QAPair]:
    """
    Build Q&A pairs from a ParsedDocument.

    Simple heuristic: each heading becomes a question and the block that
    follows it on the same page becomes the answer. A heading immediately
    followed by another heading is skipped (there is no answer text).
    This is a baseline implementation; for production, consider using an
    LLM to generate better Q&A pairs.

    Args:
        parsed_doc: Parsed document
        max_pairs: Maximum number of Q&A pairs to generate

    Returns:
        List of QAPair (at most max_pairs)
    """
    qa_pairs: List[QAPair] = []
    for page in parsed_doc.pages:
        for i, block in enumerate(page.blocks):
            if len(qa_pairs) >= max_pairs:
                break
            # Need a heading with at least one following block on the page
            if block.type != 'heading' or i + 1 >= len(page.blocks):
                continue
            answer_block = page.blocks[i + 1]
            # A heading followed by another heading would pair two titles
            if answer_block.type == 'heading':
                continue
            answer = normalize_text(answer_block.text)
            if answer:
                qa_pairs.append(QAPair(
                    question=f"What is {normalize_text(block.text)}?",
                    answer=answer,
                    source_page=page.page_num,
                    source_bbox=block.bbox,
                    confidence=0.7  # Placeholder until model-backed scoring exists
                ))
        if len(qa_pairs) >= max_pairs:
            break
    logger.info(f"Generated {len(qa_pairs)} Q&A pairs")
    return qa_pairs
def build_markdown(parsed_doc: ParsedDocument) -> str:
    """
    Render a ParsedDocument as a Markdown string.

    Multi-page documents get a "## Page N" header per page. Headings,
    paragraphs, lists, and tables are formatted by block type; any
    other block type falls back to plain paragraph text.

    Args:
        parsed_doc: Parsed document

    Returns:
        Markdown string
    """
    parts = []
    multi_page = len(parsed_doc.pages) > 1
    for page in parsed_doc.pages:
        if multi_page:
            parts.append(f"\n## Page {page.page_num}\n")
        for block in page.blocks:
            text = normalize_text(block.text)
            if not text:
                continue
            if block.type == 'heading':
                # Short headings get a higher level (simple heuristic)
                level = "###" if len(text) < 50 else "####"
                parts.append(f"{level} {text}\n")
            elif block.type == 'paragraph':
                parts.append(f"{text}\n\n")
            elif block.type == 'list':
                # One bullet per non-blank line
                parts.extend(
                    f"- {line.strip()}\n"
                    for line in text.split('\n')
                    if line.strip()
                )
                parts.append("\n")
            elif block.type == 'table' and block.table_data:
                table = block.table_data
                if table.columns:
                    # Header row plus the Markdown separator row
                    parts.append("| " + " | ".join(table.columns) + " |\n")
                    parts.append("| " + " | ".join(["---"] * len(table.columns)) + " |\n")
                # Body rows
                for row in table.rows:
                    parts.append("| " + " | ".join(str(cell) for cell in row) + " |\n")
                parts.append("\n")
            else:
                # Fallback: emit as a plain paragraph
                parts.append(f"{text}\n\n")
    return ''.join(parts)