feat: implement TTS, Document processing, and Memory Service /facts API

- TTS: xtts-v2 integration with voice cloning support
- Document: docling integration for PDF/DOCX/PPTX processing
- Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
- Added required dependencies (TTS, docling)
This commit is contained in:
Apple
2026-01-17 08:16:37 -08:00
parent a9fcadc6e2
commit 5290287058
121 changed files with 17071 additions and 436 deletions

View File

@@ -1,64 +1,220 @@
# Swapper Configuration for Node #1 (Production Server)
# Single-active LLM scheduler
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
#
# IMPORTANT: Embeddings go through external APIs:
# - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
# - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
# We do NOT use local embedding models!
# Scheduler settings for the Swapper service on this node.
swapper:
# NOTE(review): `mode` and `max_concurrent_models` each appear twice below
# (single-active / 1, then multi-active / 4). Duplicate keys are invalid YAML
# and most parsers silently keep the last value. This looks like the old and
# new lines of a diff shown together - confirm which pair is intended.
mode: single-active
max_concurrent_models: 1
mode: multi-active
# NOTE(review): with got-ocr2 as the OCR model, the size_gb values listed in
# this file for LLM + OCR + STT + TTS add up to roughly 17 GB, not 15 GB -
# confirm the estimate in the inline comment.
max_concurrent_models: 4 # LLM + OCR + STT + TTS (up to 15GB)
model_swap_timeout: 300
gpu_enabled: true
metal_acceleration: false # NVIDIA GPU, not Apple Silicon
# Model to load automatically at startup
# qwen3-8b - primary model (4.87 GB), fast response to the first request
# NOTE(review): `metal_acceleration` is repeated here with the same value.
metal_acceleration: false
default_model: qwen3-8b
lazy_load_ocr: true
lazy_load_audio: true
# Automatically unload models when VRAM runs low
auto_unload_on_oom: true
vram_threshold_gb: 18 # Start unloading at 18GB
# Model definitions. Each entry lists path, type, size_gb, priority,
# description and (for newer entries) capabilities.
models:
# Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
# ============================================
# LLM MODELS (Ollama) - qwen3 only
# ============================================
# Primary LLM - Qwen3 8B (includes math, coding, reasoning)
qwen3-8b:
path: ollama:qwen3:8b
type: llm
# NOTE(review): `size_gb` is listed twice (4.87, then 5.2). Duplicate keys are
# invalid YAML; most parsers keep the last value. Confirm the intended size.
size_gb: 4.87
size_gb: 5.2
priority: high
# NOTE(review): `description` is also listed twice; the second one is the one
# that names Qwen3 8B explicitly.
description: "Primary LLM for general tasks and conversations"
# Vision Model - Qwen3-VL 8B (High Priority) - For image processing
# NOTE(review): the comment above refers to the vision entry, not to this one.
description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
capabilities:
- chat
- math
- coding
- reasoning
- multilingual
# ============================================
# VISION MODELS (Ollama)
# ============================================
# Vision Model - Qwen3-VL 8B
qwen3-vl-8b:
path: ollama:qwen3-vl:8b
type: vision
# NOTE(review): `size_gb` is listed twice (5.72, then 6.1); most parsers keep
# the last value. Confirm the intended size.
size_gb: 5.72
size_gb: 6.1
priority: high
description: "Vision model for image understanding and processing"
# Qwen2.5 7B Instruct (High Priority)
# NOTE(review): the qwen2.5-7b-instruct key and its three fields sit between
# two descriptions of the vision model. The section header for LLMs in this
# file says "qwen3 only", so these lines are presumably leftovers from an
# older revision - confirm before relying on this entry.
qwen2.5-7b-instruct:
path: ollama:qwen2.5:7b-instruct-q4_K_M
type: llm
size_gb: 4.36
# The description and capabilities below describe Qwen3-VL 8B (see the text),
# not the Qwen2.5 entry directly above them.
description: "Qwen3-VL 8B for image understanding and visual reasoning"
capabilities:
- image_understanding
- visual_qa
- diagram_analysis
- ocr_basic
# ============================================
# OCR/DOCUMENT MODELS (HuggingFace)
# ============================================
# GOT-OCR2.0 - Best for documents, tables, formulas
got-ocr2:
path: huggingface:stepfun-ai/GOT-OCR2_0
type: ocr
# Largest of the OCR entries in this file by declared size.
size_gb: 7.0
priority: high
# NOTE(review): `description` is listed twice. The first value talks about
# Qwen2.5 7B Instruct and does not describe an OCR model; most parsers keep
# the second value. Confirm the first line can be dropped.
description: "Qwen2.5 7B Instruct model"
description: "Best OCR for documents, tables, formulas, handwriting"
capabilities:
- documents
- tables
- formulas
- handwriting
- multilingual
# Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
# NOTE(review): this Qwen2.5 3B entry has only path/type/size_gb here; its
# priority and description are not adjacent. The LLM section header in this
# file says "qwen3 only", so this is presumably a leftover from an older
# revision - confirm.
qwen2.5-3b-instruct:
path: ollama:qwen2.5:3b-instruct-q4_K_M
type: llm
size_gb: 1.80
# Donut - Document Understanding (no external OCR, 91% CORD)
donut-base:
path: huggingface:naver-clova-ix/donut-base
type: ocr
size_gb: 3.0
priority: high
description: "Document parsing without OCR engine (91% CORD accuracy)"
capabilities:
- document_parsing
- receipts
- forms
- invoices
# Donut fine-tuned for receipts/invoices (CORD dataset)
donut-cord:
path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
type: ocr
size_gb: 3.0
priority: medium
# NOTE(review): the description below talks about a lightweight LLM, not about
# Donut; a second `description` for this entry follows further down.
description: "Lightweight LLM for faster responses"
# Math Specialist - Qwen2 Math 7B (High Priority)
# NOTE(review): the qwen2-math-7b key and its fields are wedged between the
# two descriptions of donut-cord. The LLM section header in this file says
# "qwen3 only", so this is presumably a leftover from an older revision -
# confirm.
qwen2-math-7b:
path: ollama:qwen2-math:7b
type: math
size_gb: 4.13
# The description and capabilities below describe the Donut CORD model.
description: "Donut fine-tuned for receipts extraction"
capabilities:
- receipts
- invoices
- structured_extraction
# IBM Granite Docling - Document conversion with structure preservation
granite-docling:
# NOTE(review): confirm this repository id exists on HuggingFace under
# exactly this name.
path: huggingface:ds4sd/docling-ibm-granite-vision-1b
type: document
size_gb: 2.5
priority: high
# NOTE(review): `description` is listed twice. The first value is about
# mathematical tasks and does not match this document model; most parsers
# keep the second value.
description: "Specialized model for mathematical tasks"
description: "IBM Granite Docling for PDF/document structure extraction"
capabilities:
- pdf_conversion
- table_extraction
- formula_extraction
- layout_preservation
- doctags_format
# ============================================
# AUDIO MODELS - STT (Speech-to-Text)
# ============================================
# Faster Whisper Large-v3 - Best STT quality
# High-priority speech-to-text entry; the other STT entry in this file
# (whisper-small) is medium priority.
faster-whisper-large:
path: huggingface:Systran/faster-whisper-large-v3
type: stt
size_gb: 3.0
priority: high
description: "Faster Whisper Large-v3 - best quality, 99 languages"
capabilities:
- speech_recognition
- transcription
- multilingual
- timestamps
- ukrainian
# Whisper Small - Fast/lightweight for quick transcription
# Smallest entry in this file by declared size (0.5 GB); medium priority.
whisper-small:
path: huggingface:openai/whisper-small
type: stt
size_gb: 0.5
priority: medium
description: "Whisper Small for fast transcription"
capabilities:
- speech_recognition
- transcription
# ============================================
# AUDIO MODELS - TTS (Text-to-Speech)
# ============================================
# Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
# The only text-to-speech entry in this file.
xtts-v2:
path: huggingface:coqui/XTTS-v2
type: tts
size_gb: 2.0
priority: high
description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
capabilities:
- text_to_speech
- voice_cloning
- multilingual
- ukrainian
# Capability tag, not a number: the leading digits are followed by text.
- 17_languages
# ============================================
# IMAGE GENERATION MODELS (HuggingFace/Diffusers)
# ============================================
# FLUX.2 Klein 4B - High quality image generation with lazy loading
flux-klein-4b:
path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
type: image_generation
# NOTE(review): at 15.4 GB this is the largest entry in the file and is close
# to the 18 GB vram_threshold_gb set under `swapper`; presumably most other
# models must be unloaded before it fits - confirm.
size_gb: 15.4
priority: medium
description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
capabilities:
- text_to_image
- high_quality
# Plain string tag (resolution label), not a numeric value.
- 1024x1024
- artistic
# Default generation parameters for this model; width/height match the
# 1024x1024 capability tag above.
default_params:
num_inference_steps: 50
guidance_scale: 4.0
width: 1024
height: 1024
# Directory settings: model files, cache, swap area and the HuggingFace
# cache, judging by the key names.
storage:
models_dir: /app/models
cache_dir: /app/cache
swap_dir: /app/swap
# Lives under /root, unlike the /app paths above.
huggingface_cache: /root/.cache/huggingface
# Connection settings for the Ollama backend.
ollama:
# NOTE(review): `url` appears twice. Duplicate keys are invalid YAML and most
# parsers keep the last value. The first uses the `ollama` service hostname,
# the second a fixed IP address (172.18.0.1) - confirm which is intended; a
# hard-coded address breaks if the network layout changes.
url: http://ollama:11434 # From Docker container to Ollama service
url: http://172.18.0.1:11434
timeout: 300
# Loader settings for the HuggingFace-hosted models in this file.
huggingface:
device: cuda
torch_dtype: float16
# NOTE(review): presumably forwarded to the HuggingFace loaders, where
# trust_remote_code allows Python code shipped inside a model repository to
# run locally - confirm every `huggingface:` repo listed in this file is
# trusted, or scope this flag to the models that need it.
trust_remote_code: true
low_cpu_mem_usage: true
# ============================================
# EMBEDDING SERVICES (External APIs)
# NOT via Swapper - these are separate services!
# ============================================
#
# Text Embeddings:
# Service: Memory Service → Cohere API
# Model: embed-multilingual-v3.0
# Dimension: 1024
# Endpoint: Memory Service handles internally
#
# Image/Multimodal Embeddings:
# Service: Vision Encoder (port 8001)
# Model: OpenCLIP ViT-L/14
# Dimension: 768
# Endpoint: http://vision-encoder:8001/embed
#
# Vector Storage:
# Qdrant (port 6333) - separate collections for text vs image embeddings
# IMPORTANT: do NOT mix embedding spaces in a single collection!

View File

@@ -0,0 +1,63 @@
# Swapper Configuration for Node #3 (AI/ML Workstation)
# Single-active LLM scheduler
# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads
# Scheduler settings for Node #3. Child keys are nested under `swapper` so
# they are not parsed as unrelated top-level keys.
swapper:
  mode: single-active
  max_concurrent_models: 1
  model_swap_timeout: 300
  gpu_enabled: true
  metal_acceleration: false  # NVIDIA GPU, not Apple Silicon
  # Model to load automatically at startup.
  # qwen3-8b - primary model (4.87 GB), fast response to the first request.
  default_model: qwen3-8b
# Model definitions for Node #3. Every entry is nested under its own key so
# that path/type/size_gb/priority/description are per-model fields instead of
# repeated (duplicate) keys in one flat mapping.
models:
  # Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
  qwen3-8b:
    path: ollama:qwen3:8b
    type: llm
    size_gb: 4.87
    priority: high
    description: "Primary LLM for general tasks and conversations"

  # Vision Model - Qwen3-VL 8B (High Priority) - For image processing
  qwen3-vl-8b:
    path: ollama:qwen3-vl:8b
    type: vision
    size_gb: 5.72
    priority: high
    description: "Vision model for image understanding and processing"

  # Qwen2.5 7B Instruct (High Priority)
  qwen2.5-7b-instruct:
    path: ollama:qwen2.5:7b-instruct-q4_K_M
    type: llm
    size_gb: 4.36
    priority: high
    description: "Qwen2.5 7B Instruct model"

  # Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
  qwen2.5-3b-instruct:
    path: ollama:qwen2.5:3b-instruct-q4_K_M
    type: llm
    size_gb: 1.80
    priority: medium
    description: "Lightweight LLM for faster responses"

  # Math Specialist - Qwen2 Math 7B (High Priority)
  qwen2-math-7b:
    path: ollama:qwen2-math:7b
    type: math
    size_gb: 4.13
    priority: high
    description: "Specialized model for mathematical tasks"
# Directory settings (model files, cache and swap area, judging by the key
# names). Nested under `storage` so the paths belong to this section.
storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
# Connection settings for the Ollama backend. Nested under `ollama` so `url`
# and `timeout` are not read as top-level keys.
ollama:
  url: http://ollama:11434  # From Docker container to Ollama service
  timeout: 300