feat: implement RAG Service MVP with PARSER + Memory integration
RAG Service Implementation: - Create rag-service/ with full structure (config, document_store, embedding, pipelines) - Document Store: PostgreSQL + pgvector via Haystack - Embedding: BAAI/bge-m3 (multilingual, 1024 dim) - Ingest Pipeline: Convert ParsedDocument to Haystack Documents, embed, index - Query Pipeline: Retrieve documents, generate answers via DAGI Router - FastAPI endpoints: /ingest, /query, /health Tests: - Unit tests for ingest and query pipelines - E2E test with example parsed JSON - Test fixtures with real PARSER output example Router Integration: - Add mode='rag_query' routing rule in router-config.yml - Priority 7, uses local_qwen3_8b for RAG queries Docker: - Add rag-service to docker-compose.yml - Configure dependencies (router, city-db) - Add model cache volume Documentation: - Complete README with API examples - Integration guides for PARSER and Router
This commit is contained in:
82
services/rag-service/tests/test_ingest.py
Normal file
82
services/rag-service/tests/test_ingest.py
Normal file
@@ -0,0 +1,82 @@
"""
Tests for ingest pipeline
"""

import pytest

from app.ingest_pipeline import ingest_parsed_document, _parsed_json_to_documents


class TestIngestPipeline:
    """Unit tests for the parsed-JSON -> Haystack Document conversion step."""

    def test_parsed_json_to_documents(self):
        """A two-block page yields two Documents carrying per-block metadata."""
        # Blocks are named up front so the page structure below stays readable.
        heading_block = {
            "type": "heading",
            "text": "Test Document",
            "bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
            "reading_order": 1,
        }
        paragraph_block = {
            "type": "paragraph",
            "text": "This is test content.",
            "bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
            "reading_order": 2,
        }
        payload = {
            "doc_id": "test-doc",
            "doc_type": "pdf",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [heading_block, paragraph_block],
                    "width": 800,
                    "height": 600,
                }
            ],
            "metadata": {
                "dao_id": "test-dao",
                "title": "Test Document",
            },
        }

        documents = _parsed_json_to_documents(
            parsed_json=payload,
            dao_id="test-dao",
            doc_id="test-doc",
        )

        # One Document per non-empty block, in reading order.
        assert len(documents) == 2
        first = documents[0]
        assert first.content == "Test Document"
        assert first.meta["dao_id"] == "test-dao"
        assert first.meta["doc_id"] == "test-doc"
        assert first.meta["page"] == 1
        assert first.meta["block_type"] == "heading"

    def test_parsed_json_to_documents_empty_blocks(self):
        """Blocks whose text is empty or whitespace-only are skipped."""
        payload = {
            "doc_id": "test-doc",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [
                        {"type": "paragraph", "text": ""},
                        {"type": "paragraph", "text": " "},
                        {"type": "paragraph", "text": "Valid content"},
                    ],
                }
            ],
            "metadata": {},
        }

        documents = _parsed_json_to_documents(
            parsed_json=payload,
            dao_id="test-dao",
            doc_id="test-doc",
        )

        # Only the block with real text survives the conversion.
        assert len(documents) == 1
        assert documents[0].content == "Valid content"
Reference in New Issue
Block a user