Files
microdao-daarion/services/parser-service/app/schemas.py
Apple 7251e519d6 feat: enhance model output parser and add integration guide
Model Output Parser:
- Support multiple dots.ocr output formats (JSON, structured text, plain text)
- Normalize all formats to standard ParsedBlock structure
- Handle JSON with blocks/pages arrays
- Parse markdown-like structured text
- Fallback to plain text parsing
- Better error handling and logging

Schemas:
- Document must-have fields for RAG (doc_id, pages, metadata.dao_id)
- ParsedChunk must-have fields (text, metadata.dao_id, metadata.doc_id)
- Add detailed field descriptions for RAG integration

Integration Guide:
- Create INTEGRATION.md with complete integration guide
- Document dots.ocr output formats
- Show ParsedDocument → Haystack Documents conversion
- Provide DAGI Router integration examples
- RAG pipeline integration with filters
- Complete workflow examples
- RBAC integration recommendations
2025-11-16 03:02:42 -08:00

141 lines
5.5 KiB
Python

"""
Pydantic schemas for PARSER Service
"""
from typing import Optional, List, Dict, Any, Literal
from pydantic import BaseModel, Field
from datetime import datetime
class BBox(BaseModel):
"""Bounding box coordinates"""
x: float = Field(..., description="X coordinate")
y: float = Field(..., description="Y coordinate")
width: float = Field(..., description="Width")
height: float = Field(..., description="Height")
class TableCell(BaseModel):
"""Table cell data"""
row: int
col: int
text: str
rowspan: Optional[int] = 1
colspan: Optional[int] = 1
class TableData(BaseModel):
"""Structured table data"""
rows: List[List[str]] = Field(..., description="Table rows")
columns: List[str] = Field(..., description="Column headers")
merged_cells: Optional[List[Dict[str, Any]]] = Field(None, description="Merged cells info")
class ParsedBlock(BaseModel):
"""Parsed document block"""
type: Literal["paragraph", "heading", "table", "formula", "figure_caption", "list"] = Field(
..., description="Block type"
)
text: str = Field(..., description="Block text content")
bbox: BBox = Field(..., description="Bounding box")
reading_order: int = Field(..., description="Reading order index")
page_num: int = Field(..., description="Page number")
table_data: Optional[TableData] = Field(None, description="Table data (if type=table)")
metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")
class ParsedPage(BaseModel):
"""Parsed document page"""
page_num: int = Field(..., description="Page number (1-indexed)")
blocks: List[ParsedBlock] = Field(..., description="Page blocks")
width: float = Field(..., description="Page width in pixels")
height: float = Field(..., description="Page height in pixels")
class ParsedChunk(BaseModel):
"""
Semantic chunk for RAG
Must-have fields for RAG indexing:
- text: Chunk text content (required)
- metadata.dao_id: DAO identifier (required for filtering)
- metadata.doc_id: Document identifier (required for citation)
Recommended fields:
- page: Page number (for citation)
- section: Section name (for context)
- metadata.block_type: Type of block (heading, paragraph, etc.)
- metadata.reading_order: Reading order (for sorting)
"""
text: str = Field(..., description="Chunk text (required for RAG)")
page: int = Field(..., description="Page number (for citation)")
bbox: Optional[BBox] = Field(None, description="Bounding box (for highlighting)")
section: Optional[str] = Field(None, description="Section name (for context)")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Metadata (must include dao_id, doc_id for RAG)"
)
class QAPair(BaseModel):
"""Question-Answer pair"""
question: str = Field(..., description="Question")
answer: str = Field(..., description="Answer")
source_page: int = Field(..., description="Source page number")
source_bbox: Optional[BBox] = Field(None, description="Source bounding box")
confidence: Optional[float] = Field(None, description="Confidence score")
class ParsedDocument(BaseModel):
"""
Complete parsed document
Must-have fields for RAG integration:
- doc_id: Unique document identifier (required for RAG indexing)
- pages: List of parsed pages with blocks (required for content)
- doc_type: Document type (required for processing)
Recommended fields for RAG:
- metadata.dao_id: DAO identifier (for filtering)
- metadata.user_id: User who uploaded (for access control)
- metadata.title: Document title (for display)
- metadata.created_at: Upload timestamp (for sorting)
"""
doc_id: str = Field(..., description="Document ID (required for RAG)")
doc_url: Optional[str] = Field(None, description="Document URL")
doc_type: Literal["pdf", "image"] = Field(..., description="Document type")
pages: List[ParsedPage] = Field(..., description="Parsed pages (required for RAG)")
metadata: Dict[str, Any] = Field(
default_factory=dict,
description="Document metadata (should include dao_id, user_id for RAG)"
)
created_at: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
class ParseRequest(BaseModel):
"""Parse request"""
doc_url: Optional[str] = Field(None, description="Document URL")
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = Field(
"raw_json", description="Output mode"
)
dao_id: Optional[str] = Field(None, description="DAO ID")
doc_id: Optional[str] = Field(None, description="Document ID")
class ParseResponse(BaseModel):
"""Parse response"""
document: Optional[ParsedDocument] = Field(None, description="Parsed document (raw_json mode)")
markdown: Optional[str] = Field(None, description="Markdown content (markdown mode)")
qa_pairs: Optional[List[QAPair]] = Field(None, description="QA pairs (qa_pairs mode)")
chunks: Optional[List[ParsedChunk]] = Field(None, description="Chunks (chunks mode)")
metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
class ChunksResponse(BaseModel):
"""Chunks response for RAG"""
chunks: List[ParsedChunk] = Field(..., description="Document chunks")
total_chunks: int = Field(..., description="Total number of chunks")
doc_id: str = Field(..., description="Document ID")
dao_id: str = Field(..., description="DAO ID")