feat: add Ollama runtime support and RAG implementation plan

Ollama Runtime:
- Add ollama_client.py for Ollama API integration
- Support for dots-ocr model via Ollama
- Add OLLAMA_BASE_URL configuration
- Update inference.py to support Ollama runtime (RUNTIME_TYPE=ollama)
- Update endpoints to handle async Ollama calls
- Alternative to local transformers model

RAG Implementation Plan:
- Create TODO-RAG.md with detailed Haystack integration plan
- Document Store setup (pgvector)
- Embedding model selection
- Ingest pipeline (PARSER → RAG)
- Query pipeline (RAG → LLM)
- Integration with DAGI Router
- Bot commands (/upload_doc, /ask_doc)
- Testing strategy

Now supports three runtime modes:
1. Local transformers (RUNTIME_TYPE=local)
2. Ollama (RUNTIME_TYPE=ollama)
3. Dummy (USE_DUMMY_PARSER=true)
This commit is contained in:
Apple
2025-11-16 02:56:36 -08:00
parent d56ff3493d
commit 00f9102e50
6 changed files with 607 additions and 9 deletions

View File

@@ -16,11 +16,94 @@ from app.runtime.preprocessing import (
)
from app.runtime.postprocessing import build_parsed_document
from app.runtime.model_output_parser import parse_model_output_to_blocks
from app.runtime.ollama_client import (
call_ollama_vision, parse_ollama_response, OutputMode as OllamaOutputMode
)
from app.core.config import settings
logger = logging.getLogger(__name__)
async def parse_document_with_ollama(
    images: List[Image.Image],
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse a document's page images via the Ollama vision API.

    Each page image is normalized to RGB, PNG-encoded, and sent to Ollama;
    the response is parsed into layout blocks. Processing is best-effort:
    a page that raises is logged and skipped, so the returned document may
    contain fewer pages than ``images``.

    Args:
        images: PIL images, one per page, in reading order.
        output_mode: Output format mode. "chunks" has no Ollama-side
            equivalent, so raw_json is requested and chunking is expected
            to happen downstream.
        doc_id: Document ID; defaults to "parsed-doc" when not given.
        doc_type: Source document type ("pdf" or "image").

    Returns:
        ParsedDocument assembled from the successfully processed pages.
    """
    import io

    # Map the public output_mode to the Ollama client's enum.
    ollama_mode_map = {
        "raw_json": OllamaOutputMode.raw_json,
        "markdown": OllamaOutputMode.markdown,
        "qa_pairs": OllamaOutputMode.qa_pairs,
        "chunks": OllamaOutputMode.raw_json  # Use raw_json for chunks, will be processed later
    }
    ollama_mode = ollama_mode_map.get(output_mode, OllamaOutputMode.raw_json)

    pages_data = []
    total = len(images)  # hoisted: loop-invariant
    for idx, image in enumerate(images, start=1):
        try:
            # Ollama expects an encoded image payload; normalize to RGB PNG.
            buf = io.BytesIO()
            image.convert("RGB").save(buf, format="PNG")
            png_bytes = buf.getvalue()

            # Call Ollama
            ollama_data = await call_ollama_vision(png_bytes, ollama_mode)
            raw_text, parsed_json = parse_ollama_response(ollama_data, ollama_mode)
            # Lazy %-args: slicing/formatting is skipped unless DEBUG is enabled.
            logger.debug("Ollama output for page %d: %s...", idx, raw_text[:100])

            # Parse into blocks
            if parsed_json and isinstance(parsed_json, dict):
                # Prefer the model's structured block list when present.
                blocks = parsed_json.get("blocks", [])
                if not blocks:
                    # Fallback: wrap the raw text in a single full-page block.
                    blocks = [{
                        "type": "paragraph",
                        "text": raw_text,
                        "bbox": {"x": 0, "y": 0, "width": image.width, "height": image.height},
                        "reading_order": 1
                    }]
            else:
                # Plain-text output (e.g. markdown): split heuristically into blocks.
                blocks = parse_model_output_to_blocks(raw_text, image.size, page_num=idx)

            pages_data.append({
                "blocks": blocks,
                "width": image.width,
                "height": image.height
            })
            logger.info("Processed page %d/%d via Ollama", idx, total)
        except Exception as e:
            # Best-effort: one bad page must not abort the whole document.
            logger.error("Error processing page %d with Ollama: %s", idx, e, exc_info=True)
            continue

    return build_parsed_document(
        pages_data=pages_data,
        doc_id=doc_id or "parsed-doc",
        doc_type=doc_type,
        metadata={"model": settings.PARSER_MODEL_NAME, "runtime": "ollama"}
    )
def parse_document_from_images(
images: List[Image.Image],
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
@@ -44,7 +127,12 @@ def parse_document_from_images(
logger.info("Using dummy parser (USE_DUMMY_PARSER=true)")
return dummy_parse_document_from_images(images, doc_id, doc_type)
# Try to get model
# Check if using Ollama runtime
if settings.RUNTIME_TYPE == "ollama":
logger.info("Using Ollama runtime")
return await parse_document_with_ollama(images, output_mode, doc_id, doc_type)
# Try to get local model
model = get_model()
if model is None: