# microdao-daarion/services/memory-service/app/ingestion.py

"""
Memory Ingestion Pipeline
Автоматичне витягування фактів/пам'яті з діалогів

Етапи:
1. PII Scrubber - виявлення та редакція персональних даних
2. Memory Candidate Extractor - класифікація та витягування
3. Dedup & Merge - дедуплікація схожих пам'ятей
4. Write - збереження в SQL + Vector + Graph
5. Audit Log - запис в аудит
"""

import re
import hashlib
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime, timedelta
from uuid import UUID, uuid4
from enum import Enum
import structlog
from pydantic import BaseModel

logger = structlog.get_logger()


class MemoryType(str, Enum):
    """Kind of memory a stored item represents (string-valued enum)."""

    EPISODIC = "episodic"      # Events/facts about an interaction
    SEMANTIC = "semantic"      # Stable preferences/profile
    PROCEDURAL = "procedural"  # How to do something


class MemoryCategory(str, Enum):
    """Topical category assigned to a memory candidate (string-valued enum)."""

    PREFERENCE = "preference"           # User preference
    FACT = "fact"                       # Fact about the user
    TOPIC_INTEREST = "topic_interest"   # Interest in a topic
    ROLE = "role"                       # Role (investor, engineer)
    INTERACTION = "interaction"         # Type of interaction
    FEEDBACK = "feedback"               # Feedback/rating
    OPT_OUT = "opt_out"                 # Request not to store memory


class PIIType(str, Enum):
    """Kinds of personally identifiable information (string-valued enum)."""

    PHONE = "phone"
    EMAIL = "email"
    ADDRESS = "address"
    PASSPORT = "passport"
    CARD_NUMBER = "card_number"
    NAME = "name"
    LOCATION = "location"


class MemoryCandidate(BaseModel):
    """Candidate item proposed for storage in memory."""
    content: str  # Text the candidate was built from
    summary: str  # Short description of the candidate
    memory_type: MemoryType
    category: MemoryCategory
    importance: float  # Expected range 0.0 - 1.0 (no validator is declared here)
    confidence: float  # Expected range 0.0 - 1.0 (no validator is declared here)
    ttl_days: Optional[int] = None  # Retention period in days; None when no TTL is set
    source_message_ids: List[str] = []  # IDs of the messages this candidate came from
    metadata: Dict[str, Any] = {}  # Free-form extra data attached to the candidate


class PIIDetection(BaseModel):
    """Result of a single PII detection."""
    pii_type: PIIType
    start: int  # Offset of the first matched character in the scanned text
    end: int  # Offset just past the last matched character
    original: str  # The matched text
    redacted: str  # Replacement text to substitute for ``original``


# =============================================================================
# 1. PII SCRUBBER
# =============================================================================

class PIIScrubber:
    """Detect and redact personally identifiable information (PII) in text.

    Detection is purely regex based; only the PII types listed in
    ``PATTERNS`` are recognised.
    """

    # Regular expressions per PII type (applied case-insensitively).
    PATTERNS = {
        PIIType.PHONE: [
            r'\+?38?\s?0?\d{2}[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}',  # UA phones
            r'\+?\d{1,3}[\s\-]?\(?\d{2,3}\)?[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}',
        ],
        PIIType.EMAIL: [
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
        ],
        PIIType.CARD_NUMBER: [
            r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b',
        ],
        PIIType.PASSPORT: [
            r'\b[A-Z]{2}\d{6}\b',  # UA passport
        ],
    }

    def detect(self, text: str) -> List[PIIDetection]:
        """Return every raw pattern match found in ``text``.

        Matches are reported per pattern, so the same span may appear more
        than once (for example a phone number matched by both phone
        patterns) and spans may overlap. ``scrub`` resolves such overlaps.
        """
        detections = []

        for pii_type, patterns in self.PATTERNS.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    detections.append(PIIDetection(
                        pii_type=pii_type,
                        start=match.start(),
                        end=match.end(),
                        original=match.group(),
                        redacted=self._redact(pii_type, match.group())
                    ))

        return detections

    def _redact(self, pii_type: PIIType, value: str) -> str:
        """Return the replacement text for a matched PII ``value``.

        Emails keep the first two characters of the local part and the
        domain, phones and card numbers keep their last four characters,
        every other type is replaced by an upper-case ``[TYPE]`` tag.
        """
        if pii_type == PIIType.EMAIL:
            parts = value.split('@')
            return f"{parts[0][:2]}***@{parts[1]}" if len(parts) == 2 else "[EMAIL]"
        elif pii_type == PIIType.PHONE:
            return f"***{value[-4:]}" if len(value) > 4 else "[PHONE]"
        elif pii_type == PIIType.CARD_NUMBER:
            return f"****{value[-4:]}"
        else:
            return f"[{pii_type.value.upper()}]"

    def _merge_overlaps(
        self,
        text: str,
        detections: List[PIIDetection]
    ) -> List[PIIDetection]:
        """Collapse overlapping detections into disjoint spans, left to right.

        A detection fully covered by an earlier one is dropped. A partial
        overlap widens the earlier span to the union of both, so that no
        matched character is left unredacted; the widened span takes the
        PII type of the longer of the two matches.
        """
        merged: List[PIIDetection] = []

        # At an equal start offset the longest match is visited first.
        for detection in sorted(detections, key=lambda d: (d.start, -d.end)):
            if not merged or detection.start >= merged[-1].end:
                merged.append(detection)
                continue

            previous = merged[-1]
            if detection.end <= previous.end:
                continue  # Already covered by the previous span

            if detection.end - detection.start > previous.end - previous.start:
                pii_type = detection.pii_type
            else:
                pii_type = previous.pii_type
            original = text[previous.start:detection.end]
            merged[-1] = PIIDetection(
                pii_type=pii_type,
                start=previous.start,
                end=detection.end,
                original=original,
                redacted=self._redact(pii_type, original)
            )

        return merged

    def scrub(self, text: str) -> Tuple[str, List[PIIDetection], bool]:
        """Replace detected PII in ``text`` with its redacted form.

        Overlapping matches are merged first: replacing them one by one with
        offsets taken from the original text would slice the already
        modified string at the wrong positions and corrupt the output.

        Returns:
            ``(cleaned_text, detections, has_pii)`` where ``detections`` are
            the disjoint spans that were actually replaced, ordered from the
            end of the text to its start.
        """
        detections = self.detect(text)

        if not detections:
            return text, [], False

        applied = self._merge_overlaps(text, detections)
        # Replace from the end so the offsets of earlier spans stay valid.
        applied.reverse()

        cleaned = text
        for detection in applied:
            cleaned = cleaned[:detection.start] + detection.redacted + cleaned[detection.end:]

        return cleaned, applied, True


# =============================================================================
# 2. MEMORY CANDIDATE EXTRACTOR
# =============================================================================

class MemoryExtractor:
    """Extract memory candidates from conversation messages.

    Classification is keyword/regex based: each user message is tested
    against ``CATEGORY_PATTERNS`` (case-insensitively).
    """

    # A non-negated "забудь" ("forget"). The lookbehind rejects "не забудь"
    # ("don't forget"), which asks for the opposite of an opt-out.
    _FORGET_WORD = r'(?<!\bне\s)забудь'

    # Key phrases per category
    CATEGORY_PATTERNS = {
        MemoryCategory.PREFERENCE: [
            r'я (хочу|бажаю|віддаю перевагу|люблю|не люблю)',
            r'мені (подобається|не подобається|зручніше)',
            r'(краще|гірше) для мене',
        ],
        MemoryCategory.ROLE: [
            r'я (інвестор|інженер|розробник|науковець|журналіст|модератор)',
            r'працюю (як|в галузі)',
            r'моя (роль|посада|професія)',
        ],
        MemoryCategory.TOPIC_INTEREST: [
            r'цікавить (мене )?(BioMiner|EcoMiner|токеноміка|governance|стейкінг)',
            r'хочу (дізнатися|розібратися) (в|з)',
            r'питання (про|щодо|стосовно)',
        ],
        MemoryCategory.OPT_OUT: [
            # The negation is mandatory: a bare "запам'ятовуй ..." is a request
            # TO remember and must not be treated as an opt-out. "\b" keeps the
            # "не" inside words such as "мене" from counting as a negation, and
            # up to two words may sit between the negation and the verb
            # ("не треба це запам'ятовувати").
            r"\bне\s+(?:\w+\s+){0,2}запам['ʼ]?ятов",
            _FORGET_WORD + r' (мене|це|все)',
            r'вимкни (пам[\'ʼ]ять|память)',
        ],
    }

    # Importance per category
    IMPORTANCE_WEIGHTS = {
        MemoryCategory.PREFERENCE: 0.7,
        MemoryCategory.ROLE: 0.8,
        MemoryCategory.TOPIC_INTEREST: 0.6,
        MemoryCategory.FACT: 0.5,
        MemoryCategory.OPT_OUT: 1.0,  # Highest importance
    }

    def extract(
        self,
        messages: List[Dict[str, Any]],
        context: Optional[Dict[str, Any]] = None
    ) -> List[MemoryCandidate]:
        """
        Extract memory candidates from messages.

        Only messages whose ``role`` is ``'user'`` are inspected. A message
        that contains an opt-out phrase yields a single OPT_OUT candidate and
        nothing else; otherwise it yields at most one candidate per matching
        category.

        Args:
            messages: List of messages [{role, content, message_id, ...}].
                A missing or ``None`` content is treated as empty text; a
                missing or ``None`` message_id is replaced by a fresh UUID.
            context: Extra context (group_id, user_id, etc.) copied into the
                metadata of every non-opt-out candidate.

        Returns:
            List of MemoryCandidate
        """
        candidates = []

        for msg in messages:
            if msg.get('role') != 'user':
                continue

            content = msg.get('content') or ''
            raw_id = msg.get('message_id')
            # IDs are stored as strings, whatever type the caller supplied.
            message_id = str(uuid4()) if raw_id is None else str(raw_id)

            # Opt-out phrases take precedence over every other category
            opt_out = self._check_opt_out(content)
            if opt_out:
                opt_out.source_message_ids = [message_id]
                candidates.append(opt_out)
                continue

            # Look for the remaining categories
            for category, patterns in self.CATEGORY_PATTERNS.items():
                if category == MemoryCategory.OPT_OUT:
                    continue

                if not any(re.search(p, content, re.IGNORECASE) for p in patterns):
                    continue

                candidate = self._create_candidate(
                    content=content,
                    category=category,
                    message_id=message_id,
                    context=context
                )
                if candidate:
                    candidates.append(candidate)

        return candidates

    def _check_opt_out(self, content: str) -> Optional[MemoryCandidate]:
        """Return an OPT_OUT candidate when ``content`` contains an opt-out phrase.

        The candidate's ``metadata['action']`` is ``'forget'`` when the text
        contains a non-negated "забудь", otherwise ``'disable'``. Returns
        ``None`` when no opt-out pattern matches.
        """
        patterns = self.CATEGORY_PATTERNS[MemoryCategory.OPT_OUT]
        if not any(re.search(p, content, re.IGNORECASE) for p in patterns):
            return None

        # Determine the kind of opt-out
        if re.search(self._FORGET_WORD, content, re.IGNORECASE):
            action = 'forget'
            summary = "Користувач просить видалити пам'ять"
        else:
            action = 'disable'
            summary = "Користувач просить не запам'ятовувати"

        return MemoryCandidate(
            content=content,
            summary=summary,
            memory_type=MemoryType.SEMANTIC,
            category=MemoryCategory.OPT_OUT,
            importance=1.0,
            confidence=0.95,
            metadata={'action': action}
        )

    def _create_candidate(
        self,
        content: str,
        category: MemoryCategory,
        message_id: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Optional[MemoryCandidate]:
        """Build a candidate of the given category from a message.

        PREFERENCE and ROLE become SEMANTIC memories without a TTL; every
        other category becomes an EPISODIC memory with a 90-day TTL.
        """
        # Determine the memory type
        if category in [MemoryCategory.PREFERENCE, MemoryCategory.ROLE]:
            memory_type = MemoryType.SEMANTIC
            ttl_days = None  # No expiry
        else:
            memory_type = MemoryType.EPISODIC
            ttl_days = 90  # 3 months

        # Build a short summary
        summary = self._generate_summary(content, category)

        return MemoryCandidate(
            content=content,
            summary=summary,
            memory_type=memory_type,
            category=category,
            importance=self.IMPORTANCE_WEIGHTS.get(category, 0.5),
            confidence=0.7,  # Baseline confidence; could be raised via an LLM
            ttl_days=ttl_days,
            source_message_ids=[message_id],
            metadata=context or {}
        )

    def _generate_summary(self, content: str, category: MemoryCategory) -> str:
        """Return ``"[<category>] <text>"`` with the text cut to 100 characters.

        An ellipsis is appended when the text was truncated.
        """
        # Simple variant - the first 100 characters.
        # In production an LLM should be used instead.
        summary = content[:100]
        if len(content) > 100:
            summary += "..."
        return f"[{category.value}] {summary}"


# =============================================================================
# 3. DEDUP & MERGE
# =============================================================================

class MemoryDeduplicator:
    """Deduplicate new memory candidates against already stored memories."""

    def __init__(self, similarity_threshold: float = 0.85):
        """Store the Jaccard similarity above which two summaries count as duplicates."""
        self.similarity_threshold = similarity_threshold

    def deduplicate(
        self,
        new_candidates: List[MemoryCandidate],
        existing_memories: List[Dict[str, Any]]
    ) -> Tuple[List[MemoryCandidate], List[Dict[str, Any]]]:
        """
        Deduplicate new candidates against existing memories.

        A candidate that resembles a stored memory becomes an update of that
        memory: content and summary come from the candidate, importance is
        the larger of the two, and the message IDs are merged (stored IDs
        first, duplicates removed, order preserved). When several candidates
        resemble the same memory they are folded into ONE update, so no
        message ID is lost to a later overwrite. ``None`` values for
        ``importance`` / ``source_message_ids`` in a stored memory are
        treated as ``0`` / ``[]``.

        Returns:
            (candidates_to_create, memories_to_update)
        """
        to_create = []
        # memory_id -> pending update; dicts keep first-seen order
        updates_by_id: Dict[Any, Dict[str, Any]] = {}

        for candidate in new_candidates:
            # Look for a similar stored memory
            similar = self._find_similar(candidate, existing_memories)

            if not similar:
                to_create.append(candidate)
                continue

            memory_id = similar['memory_id']
            pending = updates_by_id.get(memory_id)
            if pending is not None:
                # Build on the update already queued for this memory
                base_importance = pending['importance']
                base_ids = pending['source_message_ids']
            else:
                base_importance = similar.get('importance') or 0
                base_ids = similar.get('source_message_ids') or []

            updates_by_id[memory_id] = {
                'memory_id': memory_id,
                'content': candidate.content,
                'summary': candidate.summary,
                'importance': max(candidate.importance, base_importance),
                'source_message_ids': list(dict.fromkeys(
                    [*base_ids, *candidate.source_message_ids]
                ))
            }

        return to_create, list(updates_by_id.values())

    def _find_similar(
        self,
        candidate: MemoryCandidate,
        existing: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Return the first stored memory that duplicates ``candidate``, or ``None``.

        A memory matches when its normalised content hash equals the
        candidate's, or when it has the same category and its summary is
        more similar than ``similarity_threshold``.
        """
        candidate_hash = self._content_hash(candidate.content)

        for memory in existing:
            # Fast check by content hash (a None content is treated as empty)
            if self._content_hash(memory.get('content') or '') == candidate_hash:
                return memory

            # Check by category + summary
            if memory.get('category') != candidate.category.value:
                continue
            similarity = self._text_similarity(
                candidate.summary, memory.get('summary') or ''
            )
            if similarity > self.similarity_threshold:
                return memory

        return None

    def _content_hash(self, content: str) -> str:
        """Return the MD5 hex digest of the lower-cased, stripped content."""
        normalized = content.lower().strip()
        return hashlib.md5(normalized.encode()).hexdigest()

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Return the Jaccard similarity of the lower-cased word sets.

        Returns ``0.0`` when either text is empty.
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        intersection = len(words1 & words2)
        union = len(words1 | words2)

        return intersection / union if union > 0 else 0.0


# =============================================================================
# 4. MEMORY INGESTION PIPELINE
# =============================================================================

class MemoryIngestionPipeline:
    """
    Full pipeline that extracts memories from a conversation and stores them.

    Stages: PII scrubbing -> candidate extraction -> opt-out handling ->
    deduplication against stored memories -> writes -> audit log.
    """

    def __init__(self, db=None, vector_store=None, graph_store=None):
        """Wire up the storage backends and the pipeline stages.

        Args:
            db: Async database handle; the pipeline awaits
                ``db.execute(sql, *args)`` and ``db.fetch(sql, *args)``.
                When ``None`` nothing is persisted and writes are mocked.
            vector_store: Optional vector store; when set,
                ``_store_embedding`` is called for each created memory.
            graph_store: Optional graph store; when set,
                ``_store_graph_relation`` is called for each created memory.
        """
        self.db = db
        self.vector_store = vector_store
        self.graph_store = graph_store

        self.pii_scrubber = PIIScrubber()
        self.extractor = MemoryExtractor()
        self.deduplicator = MemoryDeduplicator()

    async def process_conversation(
        self,
        messages: List[Dict[str, Any]],
        user_id: Optional[str] = None,
        platform_user_id: Optional[str] = None,
        group_id: Optional[str] = None,
        conversation_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Process a conversation and extract memories from it.

        When any message contains an opt-out request the request is applied
        and nothing else is stored.

        Returns:
            {
                "memories_created": int,
                "memories_updated": int,
                "pii_detected": bool,
                "opt_out_requested": bool,
                "details": [...]
            }
        """
        result = {
            "memories_created": 0,
            "memories_updated": 0,
            "pii_detected": False,
            "opt_out_requested": False,
            "details": []
        }

        # 1. PII Scrubbing
        cleaned_messages, pii_found = self._scrub_messages(messages)
        result["pii_detected"] = pii_found

        # 2. Extract candidates
        context = {
            'user_id': user_id,
            'platform_user_id': platform_user_id,
            'group_id': group_id,
            'conversation_id': conversation_id
        }
        candidates = self.extractor.extract(cleaned_messages, context)

        # Check for opt-out requests
        for candidate in candidates:
            if candidate.category == MemoryCategory.OPT_OUT:
                result["opt_out_requested"] = True
                await self._handle_opt_out(candidate, context)
                result["details"].append({
                    "type": "opt_out",
                    "action": candidate.metadata.get('action'),
                    "summary": candidate.summary
                })

        # On opt-out, do not store any other memories.
        # NOTE(review): this early return also skips the audit log below -
        # confirm whether opt-out requests should be audited here.
        if result["opt_out_requested"]:
            return result

        # 3. Dedup against existing
        existing_memories = []
        if self.db:
            existing_memories = await self._get_existing_memories(
                user_id=user_id,
                platform_user_id=platform_user_id,
                group_id=group_id
            )

        to_create, to_update = self.deduplicator.deduplicate(candidates, existing_memories)

        # 4. Write to storage
        for candidate in to_create:
            memory_id = await self._create_memory(candidate, context)
            if memory_id:
                result["memories_created"] += 1
                result["details"].append({
                    "type": "created",
                    "memory_id": str(memory_id),
                    "category": candidate.category.value,
                    "summary": candidate.summary
                })

        for update in to_update:
            success = await self._update_memory(update)
            if success:
                result["memories_updated"] += 1
                result["details"].append({
                    "type": "updated",
                    # Stringified like the "created" entries so the result
                    # stays serialisable when the stored id is a UUID object.
                    "memory_id": str(update['memory_id']),
                    "summary": update.get('summary')
                })

        # 5. Audit log
        await self._log_ingestion(result, context)

        logger.info("ingestion_complete",
                   created=result["memories_created"],
                   updated=result["memories_updated"],
                   pii=result["pii_detected"],
                   opt_out=result["opt_out_requested"])

        return result

    def _scrub_messages(
        self,
        messages: List[Dict[str, Any]]
    ) -> Tuple[List[Dict[str, Any]], bool]:
        """Redact PII in user messages; other messages pass through unchanged.

        Returns:
            ``(cleaned_messages, pii_found)``. Each user message is copied
            with its content redacted and a ``_pii_detected`` flag added.
        """
        cleaned_messages = []
        pii_found = False

        for msg in messages:
            if msg.get('role') != 'user':
                cleaned_messages.append(msg)
                continue

            # A missing or None content is scrubbed as empty text
            cleaned, detections, has_pii = self.pii_scrubber.scrub(msg.get('content') or '')
            if has_pii:
                pii_found = True
                logger.info("pii_detected",
                           count=len(detections),
                           types=[d.pii_type.value for d in detections])
            cleaned_messages.append({**msg, 'content': cleaned, '_pii_detected': has_pii})

        return cleaned_messages, pii_found

    async def _handle_opt_out(
        self,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ):
        """Apply an opt-out request to the database.

        ``'forget'`` inside a group calls ``memory_forget_in_group``; every
        other case only switches memory off (per group when ``group_id`` is
        set, otherwise in ``memory_consent``). Nothing is done without a
        ``platform_user_id`` or without a database.
        """
        action = candidate.metadata.get('action', 'disable')
        group_id = context.get('group_id')
        platform_user_id = context.get('platform_user_id')

        if not platform_user_id:
            # Every statement below is keyed by platform_user_id; make the
            # dropped request visible instead of ignoring it silently.
            logger.warning("opt_out_skipped_no_platform_user_id",
                          action=action,
                          group_id=group_id)
            return

        if self.db:
            if action == 'forget' and group_id:
                # Full deletion within the group
                await self.db.execute(
                    "SELECT memory_forget_in_group($1::uuid, $2)",
                    group_id, platform_user_id
                )
            else:
                # Only disable further storage.
                # NOTE(review): a 'forget' request outside a group ends up
                # here and deletes nothing - confirm this is intended.
                if group_id:
                    await self.db.execute("""
                        UPDATE group_members
                        SET no_memory_in_group = TRUE
                        WHERE group_id = $1::uuid AND platform_user_id = $2
                    """, group_id, platform_user_id)
                else:
                    await self.db.execute("""
                        UPDATE memory_consent
                        SET memory_enabled = FALSE, updated_at = NOW()
                        WHERE platform_user_id = $1
                    """, platform_user_id)

    async def _get_existing_memories(
        self,
        user_id: Optional[str],
        platform_user_id: Optional[str],
        group_id: Optional[str]
    ) -> List[Dict[str, Any]]:
        """Fetch the active memories to deduplicate against.

        Scope, in order of precedence: the group (narrowed to the platform
        user when given), the user's non-group memories, the platform user's
        non-group memories. Returns ``[]`` without a database or without any
        identifier.
        """
        if not self.db:
            return []

        query = """
            SELECT memory_id, content, summary, category, importance, source_message_ids
            FROM memories
            WHERE is_active = TRUE
        """
        params = []

        if group_id:
            query += " AND group_id = $1::uuid"
            params.append(group_id)
            if platform_user_id:
                query += " AND platform_user_id = $2"
                params.append(platform_user_id)
        elif user_id:
            query += " AND user_id = $1::uuid AND group_id IS NULL"
            params.append(user_id)
        elif platform_user_id:
            query += " AND platform_user_id = $1 AND group_id IS NULL"
            params.append(platform_user_id)
        else:
            return []

        rows = await self.db.fetch(query, *params)
        return [dict(row) for row in rows]

    async def _create_memory(
        self,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ) -> Optional[UUID]:
        """Insert a new memory row and return its freshly generated id.

        Without a database a random UUID is returned and nothing is stored.
        """
        if not self.db:
            return uuid4()  # Mock for testing

        memory_id = uuid4()

        # Calculate expires_at (a falsy ttl_days, including 0, means no expiry).
        # NOTE(review): datetime.now() is naive local time - confirm it matches
        # the type of the expires_at column.
        expires_at = None
        if candidate.ttl_days:
            expires_at = datetime.now() + timedelta(days=candidate.ttl_days)

        # NOTE(review): candidate.metadata is passed as a dict - confirm the db
        # handle can encode it for the metadata column.
        await self.db.execute("""
            INSERT INTO memories (
                memory_id, user_id, platform_user_id, group_id,
                memory_type, category, content, summary,
                importance, confidence, ttl_days, expires_at,
                source_message_ids, extraction_method, metadata
            ) VALUES (
                $1, $2::uuid, $3, $4::uuid,
                $5, $6, $7, $8,
                $9, $10, $11, $12,
                $13, $14, $15
            )
        """,
            memory_id,
            context.get('user_id'),
            context.get('platform_user_id'),
            context.get('group_id'),
            candidate.memory_type.value,
            candidate.category.value,
            candidate.content,
            candidate.summary,
            candidate.importance,
            candidate.confidence,
            candidate.ttl_days,
            expires_at,
            candidate.source_message_ids,
            'pipeline',
            candidate.metadata
        )

        # Store the embedding when a vector store is configured
        if self.vector_store:
            await self._store_embedding(memory_id, candidate, context)

        # Store the relation when a graph store is configured
        if self.graph_store:
            await self._store_graph_relation(memory_id, candidate, context)

        return memory_id

    async def _update_memory(self, update: Dict[str, Any]) -> bool:
        """Overwrite content, summary, importance and message ids of a memory.

        Always returns ``True``; without a database nothing is written.
        """
        if not self.db:
            return True

        await self.db.execute("""
            UPDATE memories
            SET content = $2, summary = $3, importance = $4,
                source_message_ids = $5, updated_at = NOW()
            WHERE memory_id = $1::uuid
        """,
            update['memory_id'],
            update['content'],
            update['summary'],
            update['importance'],
            update['source_message_ids']
        )
        return True

    async def _store_embedding(
        self,
        memory_id: UUID,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ):
        """Store the memory's embedding in the vector store (not implemented)."""
        # Implementation depends on the vector store (Qdrant, pgvector)
        pass

    async def _store_graph_relation(
        self,
        memory_id: UUID,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ):
        """Store the memory's relation in the graph store (not implemented)."""
        # Implementation for Neo4j
        pass

    async def _log_ingestion(
        self,
        result: Dict[str, Any],
        context: Dict[str, Any]
    ):
        """Write one 'ingestion' audit event carrying the result summary.

        Does nothing without a database.
        """
        if not self.db:
            return

        # NOTE(review): result is passed as a dict - confirm the db handle can
        # encode it for the new_value column.
        await self.db.execute("""
            INSERT INTO memory_events (
                user_id, group_id, action, actor, new_value
            ) VALUES (
                $1::uuid, $2::uuid, 'ingestion', 'pipeline', $3
            )
        """,
            context.get('user_id'),
            context.get('group_id'),
            result
        )