feat: implement TTS, Document processing, and Memory Service /facts API
- TTS: xtts-v2 integration with voice cloning support
- Document: docling integration for PDF/DOCX/PPTX processing
- Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
- Added required dependencies (TTS, docling)
This commit is contained in:
@@ -6,7 +6,7 @@ import os
|
||||
import jwt
|
||||
import time
|
||||
from typing import Optional, Union
|
||||
from fastapi import HTTPException, Security
|
||||
from fastapi import HTTPException, Security, Depends
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from app.config import get_settings
|
||||
|
||||
@@ -18,6 +18,7 @@ JWT_ALGORITHM = settings.jwt_algorithm
|
||||
JWT_EXPIRATION = settings.jwt_expiration
|
||||
|
||||
security = HTTPBearer()
|
||||
security_optional = HTTPBearer(auto_error=False)
|
||||
|
||||
|
||||
def generate_jwt_token(service_name: str, permissions: list = None) -> str:
|
||||
@@ -43,7 +44,7 @@ def verify_jwt_token(token: str) -> dict:
|
||||
|
||||
|
||||
async def get_current_service_optional(
|
||||
credentials: Optional[HTTPAuthorizationCredentials] = Security(security, auto_error=False)
|
||||
credentials: Optional[HTTPAuthorizationCredentials] = Depends(security_optional)
|
||||
) -> Optional[dict]:
|
||||
"""Dependency для отримання поточного сервісу з JWT (опціонально)"""
|
||||
if not credentials:
|
||||
|
||||
@@ -406,6 +406,117 @@ class Database:
|
||||
""", thread_id)
|
||||
return dict(row) if row else None
|
||||
|
||||
# ========================================================================
|
||||
# FACTS (Simple Key-Value storage)
|
||||
# ========================================================================
|
||||
|
||||
async def ensure_facts_table(self):
    """Create the ``user_facts`` table and its indexes on first use.

    Called lazily from the /facts endpoints so no separate migration step
    is needed.

    NOTE(review): Postgres treats NULLs as distinct in unique constraints,
    so UNIQUE(user_id, team_id, fact_key) does NOT prevent duplicate rows
    when team_id is NULL — and the matching ON CONFLICT upsert will insert
    instead of update in that case.  Confirm whether team-less facts need a
    partial unique index on (user_id, fact_key) WHERE team_id IS NULL.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS user_facts (
            fact_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
            user_id TEXT NOT NULL,
            team_id TEXT,
            fact_key TEXT NOT NULL,
            fact_value TEXT,
            fact_value_json JSONB,
            created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
            updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
            UNIQUE(user_id, team_id, fact_key)
        );

        CREATE INDEX IF NOT EXISTS idx_user_facts_user_id ON user_facts(user_id);
        CREATE INDEX IF NOT EXISTS idx_user_facts_team_id ON user_facts(team_id);
    """
    async with self.pool.acquire() as conn:
        await conn.execute(ddl)
|
||||
|
||||
async def upsert_fact(
    self,
    user_id: str,
    fact_key: str,
    fact_value: Optional[str] = None,
    fact_value_json: Optional[dict] = None,
    team_id: Optional[str] = None
) -> Dict[str, Any]:
    """Insert a user fact, or update it in place when the key already exists.

    Returns the full row as a dict (empty dict if nothing was returned).

    NOTE(review): the ON CONFLICT target relies on UNIQUE(user_id, team_id,
    fact_key); with team_id=None Postgres treats NULLs as distinct, so the
    conflict may never fire and duplicates can accumulate — verify.
    NOTE(review): fact_value_json is bound as a plain dict; asyncpg's
    default jsonb codec expects a str unless a custom codec is registered
    on this pool — confirm.
    """
    sql = """
        INSERT INTO user_facts (user_id, team_id, fact_key, fact_value, fact_value_json)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (user_id, team_id, fact_key)
        DO UPDATE SET
            fact_value = EXCLUDED.fact_value,
            fact_value_json = EXCLUDED.fact_value_json,
            updated_at = NOW()
        RETURNING *
    """
    async with self.pool.acquire() as conn:
        record = await conn.fetchrow(
            sql, user_id, team_id, fact_key, fact_value, fact_value_json
        )
    return dict(record) if record else {}
|
||||
|
||||
async def get_fact(
    self,
    user_id: str,
    fact_key: str,
    team_id: Optional[str] = None
) -> Optional[Dict[str, Any]]:
    """Fetch a single fact by key, scoped to a team when *team_id* is given.

    Facts stored without a team have team_id NULL, which needs an explicit
    IS NULL predicate (NULL never matches ``=``).
    """
    if team_id:
        sql = """
            SELECT * FROM user_facts
            WHERE user_id = $1 AND fact_key = $2 AND team_id = $3
        """
        args = (user_id, fact_key, team_id)
    else:
        sql = """
            SELECT * FROM user_facts
            WHERE user_id = $1 AND fact_key = $2 AND team_id IS NULL
        """
        args = (user_id, fact_key)

    async with self.pool.acquire() as conn:
        record = await conn.fetchrow(sql, *args)
    return dict(record) if record else None
|
||||
|
||||
async def list_facts(
    self,
    user_id: str,
    team_id: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Return all facts for a user, ordered by fact_key.

    Note: when *team_id* is omitted this returns the user's facts across
    ALL teams (no team filter at all), mirroring the existing behaviour.
    """
    if team_id:
        sql = """
            SELECT * FROM user_facts
            WHERE user_id = $1 AND team_id = $2
            ORDER BY fact_key
        """
        args = (user_id, team_id)
    else:
        sql = """
            SELECT * FROM user_facts
            WHERE user_id = $1
            ORDER BY fact_key
        """
        args = (user_id,)

    async with self.pool.acquire() as conn:
        records = await conn.fetch(sql, *args)
    return [dict(r) for r in records]
|
||||
|
||||
async def delete_fact(
    self,
    user_id: str,
    fact_key: str,
    team_id: Optional[str] = None
) -> bool:
    """Delete a fact; return True only when a row was actually removed.

    Fix: the previous check was ``"DELETE 1" in result`` — a substring test
    against asyncpg's command tag, which is fragile (it also matches
    "DELETE 10", "DELETE 12", ...).  We now parse the affected-row count
    out of the tag, which is exact for any count.
    """
    if team_id:
        sql = """
            DELETE FROM user_facts
            WHERE user_id = $1 AND fact_key = $2 AND team_id = $3
        """
        args = (user_id, fact_key, team_id)
    else:
        sql = """
            DELETE FROM user_facts
            WHERE user_id = $1 AND fact_key = $2 AND team_id IS NULL
        """
        args = (user_id, fact_key)

    async with self.pool.acquire() as conn:
        status = await conn.execute(sql, *args)

    # asyncpg returns a command tag such as "DELETE <count>".
    count = status.rsplit(" ", 1)[-1]
    return count.isdigit() and int(count) > 0
|
||||
|
||||
# ========================================================================
|
||||
# STATS
|
||||
# ========================================================================
|
||||
@@ -418,11 +529,18 @@ class Database:
|
||||
memories = await conn.fetchval("SELECT COUNT(*) FROM long_term_memory_items WHERE valid_to IS NULL")
|
||||
summaries = await conn.fetchval("SELECT COUNT(*) FROM thread_summaries")
|
||||
|
||||
# Add facts count safely
|
||||
try:
|
||||
facts = await conn.fetchval("SELECT COUNT(*) FROM user_facts")
|
||||
except:
|
||||
facts = 0
|
||||
|
||||
return {
|
||||
"threads": threads,
|
||||
"events": events,
|
||||
"active_memories": memories,
|
||||
"summaries": summaries
|
||||
"summaries": summaries,
|
||||
"facts": facts
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -11,8 +11,20 @@ from .config import get_settings
|
||||
logger = structlog.get_logger()
|
||||
settings = get_settings()
|
||||
|
||||
# Initialize Cohere client
|
||||
co = cohere.Client(settings.cohere_api_key)
|
||||
# Cohere client will be initialized lazily
|
||||
_cohere_client = None
|
||||
|
||||
def get_cohere_client():
    """Return the process-wide Cohere client, creating it on first use.

    Returns None when no API key is configured, or when a previous creation
    attempt failed (failures are cached as ``False`` so we do not retry on
    every call).
    """
    global _cohere_client
    if _cohere_client is None and settings.cohere_api_key:
        try:
            _cohere_client = cohere.Client(settings.cohere_api_key)
        except Exception as exc:
            logger.warning("cohere_client_init_failed", error=str(exc))
            _cohere_client = False  # sentinel: do not retry
        else:
            logger.info("cohere_client_initialized")
    return _cohere_client or None
|
||||
|
||||
|
||||
@retry(
|
||||
@@ -36,9 +48,14 @@ async def get_embeddings(
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
co_client = get_cohere_client()
|
||||
if not co_client:
|
||||
logger.warning("cohere_not_configured", message="Cohere API key not set, returning empty embeddings")
|
||||
return [[] for _ in texts]
|
||||
|
||||
logger.info("generating_embeddings", count=len(texts), input_type=input_type)
|
||||
|
||||
response = co.embed(
|
||||
response = co_client.embed(
|
||||
texts=texts,
|
||||
model=settings.cohere_model,
|
||||
input_type=input_type,
|
||||
|
||||
698
services/memory-service/app/ingestion.py
Normal file
698
services/memory-service/app/ingestion.py
Normal file
@@ -0,0 +1,698 @@
|
||||
"""
|
||||
Memory Ingestion Pipeline
|
||||
Автоматичне витягування фактів/пам'яті з діалогів
|
||||
|
||||
Етапи:
|
||||
1. PII Scrubber - виявлення та редакція персональних даних
|
||||
2. Memory Candidate Extractor - класифікація та витягування
|
||||
3. Dedup & Merge - дедуплікація схожих пам'ятей
|
||||
4. Write - збереження в SQL + Vector + Graph
|
||||
5. Audit Log - запис в аудит
|
||||
"""
|
||||
|
||||
import re
|
||||
import hashlib
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from datetime import datetime, timedelta
|
||||
from uuid import UUID, uuid4
|
||||
from enum import Enum
|
||||
import structlog
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class MemoryType(str, Enum):
    """Kind of memory being stored."""

    # Events / facts tied to a specific interaction
    EPISODIC = "episodic"
    # Stable preferences and profile information
    SEMANTIC = "semantic"
    # Knowledge about how to do something
    PROCEDURAL = "procedural"
|
||||
|
||||
|
||||
class MemoryCategory(str, Enum):
    """What a memory candidate says about the user."""

    PREFERENCE = "preference"          # a user preference
    FACT = "fact"                      # a fact about the user
    TOPIC_INTEREST = "topic_interest"  # interest in a topic
    ROLE = "role"                      # a role (investor, engineer, ...)
    INTERACTION = "interaction"        # type of interaction
    FEEDBACK = "feedback"              # feedback / rating
    OPT_OUT = "opt_out"                # request to stop storing memory
|
||||
|
||||
|
||||
class PIIType(str, Enum):
    """Categories of personally identifiable information we can detect."""

    PHONE = "phone"
    EMAIL = "email"
    ADDRESS = "address"
    PASSPORT = "passport"
    CARD_NUMBER = "card_number"
    NAME = "name"
    LOCATION = "location"
|
||||
|
||||
|
||||
class MemoryCandidate(BaseModel):
    """A candidate item to be persisted into long-term memory."""

    content: str
    summary: str
    memory_type: MemoryType
    category: MemoryCategory
    importance: float  # 0.0 - 1.0
    confidence: float  # 0.0 - 1.0
    ttl_days: Optional[int] = None
    # Pydantic deep-copies mutable defaults per instance, so these are safe.
    source_message_ids: List[str] = []
    metadata: Dict[str, Any] = {}
|
||||
|
||||
|
||||
class PIIDetection(BaseModel):
    """One PII match found in a piece of text."""

    pii_type: PIIType
    start: int           # match start offset in the original text
    end: int             # match end offset (exclusive)
    original: str        # the matched substring
    redacted: str        # the masked replacement
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 1. PII SCRUBBER
|
||||
# =============================================================================
|
||||
|
||||
class PIIScrubber:
    """Detect and redact personally identifiable information (PII) in text."""

    # Regex patterns per PII type.  NOTE(review): the phone patterns are
    # broad and can produce matches that overlap card-number matches;
    # scrub() resolves overlaps before splicing.
    PATTERNS = {
        PIIType.PHONE: [
            r'\+?38?\s?0?\d{2}[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}',  # UA phones
            r'\+?\d{1,3}[\s\-]?\(?\d{2,3}\)?[\s\-]?\d{3}[\s\-]?\d{2}[\s\-]?\d{2}',
        ],
        PIIType.EMAIL: [
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
        ],
        PIIType.CARD_NUMBER: [
            r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b',
        ],
        PIIType.PASSPORT: [
            r'\b[A-Z]{2}\d{6}\b',  # UA passport
        ],
    }

    def detect(self, text: str) -> List[PIIDetection]:
        """Return every PII match in *text* (the list may contain overlapping spans)."""
        detections: List[PIIDetection] = []

        for pii_type, patterns in self.PATTERNS.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    detections.append(PIIDetection(
                        pii_type=pii_type,
                        start=match.start(),
                        end=match.end(),
                        original=match.group(),
                        redacted=self._redact(pii_type, match.group())
                    ))

        return detections

    def _redact(self, pii_type: PIIType, value: str) -> str:
        """Return a partially-masked replacement for *value*."""
        if pii_type == PIIType.EMAIL:
            parts = value.split('@')
            return f"{parts[0][:2]}***@{parts[1]}" if len(parts) == 2 else "[EMAIL]"
        elif pii_type == PIIType.PHONE:
            return f"***{value[-4:]}" if len(value) > 4 else "[PHONE]"
        elif pii_type == PIIType.CARD_NUMBER:
            return f"****{value[-4:]}"
        else:
            return f"[{pii_type.value.upper()}]"

    def scrub(self, text: str) -> Tuple[str, List[PIIDetection], bool]:
        """Replace all PII in *text* with redacted placeholders.

        Returns: (cleaned_text, applied_detections, has_pii)

        Fix: the previous implementation spliced EVERY detection by its
        original offsets; when two patterns matched overlapping spans (e.g.
        a card number also matched by a phone pattern) the second splice no
        longer lined up with the partially-rewritten string and the output
        was corrupted.  We now keep a non-overlapping subset (earliest start
        wins; longer match preferred on ties) before splicing from the end.
        """
        detections = self.detect(text)

        if not detections:
            return text, [], False

        # Select a non-overlapping subset, earliest-start-first.
        detections.sort(key=lambda d: (d.start, -(d.end - d.start)))
        kept: List[PIIDetection] = []
        last_end = -1
        for d in detections:
            if d.start >= last_end:
                kept.append(d)
                last_end = d.end

        # Replace from the end so earlier offsets remain valid.
        cleaned = text
        for d in reversed(kept):
            cleaned = cleaned[:d.start] + d.redacted + cleaned[d.end:]

        return cleaned, kept, True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 2. MEMORY CANDIDATE EXTRACTOR
|
||||
# =============================================================================
|
||||
|
||||
class MemoryExtractor:
    """Extract memory candidates from conversation messages via keyword patterns."""

    # Trigger phrases per category (Ukrainian-language heuristics).
    CATEGORY_PATTERNS = {
        MemoryCategory.PREFERENCE: [
            r'я (хочу|бажаю|віддаю перевагу|люблю|не люблю)',
            r'мені (подобається|не подобається|зручніше)',
            r'(краще|гірше) для мене',
        ],
        MemoryCategory.ROLE: [
            r'я (інвестор|інженер|розробник|науковець|журналіст|модератор)',
            r'працюю (як|в галузі)',
            r'моя (роль|посада|професія)',
        ],
        MemoryCategory.TOPIC_INTEREST: [
            r'цікавить (мене )?(BioMiner|EcoMiner|токеноміка|governance|стейкінг)',
            r'хочу (дізнатися|розібратися) (в|з)',
            r'питання (про|щодо|стосовно)',
        ],
        MemoryCategory.OPT_OUT: [
            r'(не |НЕ )?(запам[\'ʼ]ятов|запамʼятовуй|запамятовуй)',
            r'забудь (мене|це|все)',
            r'вимкни (пам[\'ʼ]ять|память)',
        ],
    }

    # Default importance score per category.
    IMPORTANCE_WEIGHTS = {
        MemoryCategory.PREFERENCE: 0.7,
        MemoryCategory.ROLE: 0.8,
        MemoryCategory.TOPIC_INTEREST: 0.6,
        MemoryCategory.FACT: 0.5,
        MemoryCategory.OPT_OUT: 1.0,  # highest priority
    }

    def extract(
        self,
        messages: List[Dict[str, Any]],
        context: Optional[Dict[str, Any]] = None
    ) -> List[MemoryCandidate]:
        """Scan *messages* and return the memory candidates found in user turns.

        Args:
            messages: message dicts of the form ``{role, content, message_id, ...}``.
            context: extra metadata attached to every candidate (group_id, user_id, ...).

        Returns:
            A (possibly empty) list of MemoryCandidate objects.
        """
        found: List[MemoryCandidate] = []

        for message in messages:
            if message.get('role') != 'user':
                continue  # only user turns carry memory-worthy statements

            text = message.get('content', '')
            msg_id = message.get('message_id', str(uuid4()))

            # An opt-out request short-circuits all other categories.
            opt_out = self._check_opt_out(text)
            if opt_out is not None:
                opt_out.source_message_ids = [msg_id]
                found.append(opt_out)
                continue

            for category, patterns in self.CATEGORY_PATTERNS.items():
                if category == MemoryCategory.OPT_OUT:
                    continue  # already handled above
                if any(re.search(p, text, re.IGNORECASE) for p in patterns):
                    candidate = self._create_candidate(
                        content=text,
                        category=category,
                        message_id=msg_id,
                        context=context
                    )
                    if candidate:
                        found.append(candidate)

        return found

    def _check_opt_out(self, content: str) -> Optional[MemoryCandidate]:
        """Return an OPT_OUT candidate if *content* asks us to stop or wipe memory."""
        lowered = content.lower()
        for pattern in self.CATEGORY_PATTERNS[MemoryCategory.OPT_OUT]:
            if not re.search(pattern, content, re.IGNORECASE):
                continue
            # 'забудь' ("forget") means delete; anything else just disables.
            if 'забудь' in lowered:
                action, summary = 'forget', "Користувач просить видалити пам'ять"
            else:
                action, summary = 'disable', "Користувач просить не запам'ятовувати"
            return MemoryCandidate(
                content=content,
                summary=summary,
                memory_type=MemoryType.SEMANTIC,
                category=MemoryCategory.OPT_OUT,
                importance=1.0,
                confidence=0.95,
                metadata={'action': action}
            )
        return None

    def _create_candidate(
        self,
        content: str,
        category: MemoryCategory,
        message_id: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Optional[MemoryCandidate]:
        """Build a MemoryCandidate for *content* classified as *category*."""
        # Stable profile data is semantic and never expires; everything else
        # is episodic with a 90-day TTL.
        if category in (MemoryCategory.PREFERENCE, MemoryCategory.ROLE):
            memory_type, ttl_days = MemoryType.SEMANTIC, None
        else:
            memory_type, ttl_days = MemoryType.EPISODIC, 90

        return MemoryCandidate(
            content=content,
            summary=self._generate_summary(content, category),
            memory_type=memory_type,
            category=category,
            importance=self.IMPORTANCE_WEIGHTS.get(category, 0.5),
            confidence=0.7,  # baseline confidence; an LLM pass could raise it
            ttl_days=ttl_days,
            source_message_ids=[message_id],
            metadata=context or {}
        )

    def _generate_summary(self, content: str, category: MemoryCategory) -> str:
        """Return a short '[category] …' summary (first 100 chars; LLM in production)."""
        head = content[:100] + ("..." if len(content) > 100 else "")
        return f"[{category.value}] {head}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 3. DEDUP & MERGE
|
||||
# =============================================================================
|
||||
|
||||
class MemoryDeduplicator:
    """Deduplicate new memory candidates against already-stored memories."""

    def __init__(self, similarity_threshold: float = 0.85):
        # Jaccard similarity above this value counts as "the same memory".
        self.similarity_threshold = similarity_threshold

    def deduplicate(
        self,
        new_candidates: List[MemoryCandidate],
        existing_memories: List[Dict[str, Any]]
    ) -> Tuple[List[MemoryCandidate], List[Dict[str, Any]]]:
        """Split *new_candidates* into brand-new items and updates to existing rows.

        Returns:
            (candidates_to_create, memories_to_update)
        """
        to_create: List[MemoryCandidate] = []
        to_update: List[Dict[str, Any]] = []

        for candidate in new_candidates:
            match = self._find_similar(candidate, existing_memories)
            if match is None:
                to_create.append(candidate)
                continue

            merged_sources = set(match.get('source_message_ids', []))
            merged_sources.update(candidate.source_message_ids)
            to_update.append({
                'memory_id': match['memory_id'],
                'content': candidate.content,
                'summary': candidate.summary,
                # Never lower the importance of an existing memory.
                'importance': max(candidate.importance, match.get('importance', 0)),
                'source_message_ids': list(merged_sources),
            })

        return to_create, to_update

    def _find_similar(
        self,
        candidate: MemoryCandidate,
        existing: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Return the first existing memory that matches *candidate*, if any."""
        candidate_hash = self._content_hash(candidate.content)

        for memory in existing:
            # Fast path: identical (normalized) content.
            if self._content_hash(memory.get('content', '')) == candidate_hash:
                return memory

            # Slow path: same category and a very similar summary.
            if (memory.get('category') == candidate.category.value
                    and self._text_similarity(candidate.summary,
                                              memory.get('summary', ''))
                        > self.similarity_threshold):
                return memory

        return None

    def _content_hash(self, content: str) -> str:
        """MD5 of the lower-cased, stripped content (fingerprinting, not security)."""
        normalized = content.lower().strip()
        return hashlib.md5(normalized.encode()).hexdigest()

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Jaccard similarity over whitespace-separated, lower-cased tokens."""
        if not text1 or not text2:
            return 0.0

        tokens1 = set(text1.lower().split())
        tokens2 = set(text2.lower().split())

        union = len(tokens1 | tokens2)
        return (len(tokens1 & tokens2) / union) if union else 0.0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 4. MEMORY INGESTION PIPELINE
|
||||
# =============================================================================
|
||||
|
||||
class MemoryIngestionPipeline:
    """End-to-end pipeline: extract memories from a conversation and store them.

    Stages: PII scrubbing -> candidate extraction -> dedup/merge -> write
    (SQL plus optional vector/graph stores) -> audit log.  Every storage
    backend is optional so the pipeline can run in a dry/mock mode.
    """

    def __init__(self, db=None, vector_store=None, graph_store=None):
        self.db = db
        self.vector_store = vector_store
        self.graph_store = graph_store

        self.pii_scrubber = PIIScrubber()
        self.extractor = MemoryExtractor()
        self.deduplicator = MemoryDeduplicator()

    async def process_conversation(
        self,
        messages: List[Dict[str, Any]],
        user_id: Optional[str] = None,
        platform_user_id: Optional[str] = None,
        group_id: Optional[str] = None,
        conversation_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run the full ingestion pipeline over one conversation.

        Returns a summary dict::

            {
                "memories_created": int,
                "memories_updated": int,
                "pii_detected": bool,
                "opt_out_requested": bool,
                "details": [...]
            }
        """
        result: Dict[str, Any] = {
            "memories_created": 0,
            "memories_updated": 0,
            "pii_detected": False,
            "opt_out_requested": False,
            "details": []
        }

        # Stage 1: scrub PII from user messages only.
        cleaned_messages = []
        for msg in messages:
            if msg.get('role') != 'user':
                cleaned_messages.append(msg)
                continue
            cleaned, detections, has_pii = self.pii_scrubber.scrub(msg.get('content', ''))
            if has_pii:
                result["pii_detected"] = True
                logger.info("pii_detected",
                            count=len(detections),
                            types=[d.pii_type.value for d in detections])
            cleaned_messages.append({**msg, 'content': cleaned, '_pii_detected': has_pii})

        # Stage 2: extract candidates from the scrubbed transcript.
        context = {
            'user_id': user_id,
            'platform_user_id': platform_user_id,
            'group_id': group_id,
            'conversation_id': conversation_id
        }
        candidates = self.extractor.extract(cleaned_messages, context)

        # Honour opt-out requests before anything is written.
        for candidate in candidates:
            if candidate.category == MemoryCategory.OPT_OUT:
                result["opt_out_requested"] = True
                await self._handle_opt_out(candidate, context)
                result["details"].append({
                    "type": "opt_out",
                    "action": candidate.metadata.get('action'),
                    "summary": candidate.summary
                })

        # When the user opted out we deliberately store nothing else
        # (the audit log is skipped too, matching prior behaviour).
        if result["opt_out_requested"]:
            return result

        # Stage 3: dedup against what is already stored.
        existing_memories: List[Dict[str, Any]] = []
        if self.db:
            existing_memories = await self._get_existing_memories(
                user_id=user_id,
                platform_user_id=platform_user_id,
                group_id=group_id
            )

        to_create, to_update = self.deduplicator.deduplicate(candidates, existing_memories)

        # Stage 4: persist.
        for candidate in to_create:
            memory_id = await self._create_memory(candidate, context)
            if memory_id:
                result["memories_created"] += 1
                result["details"].append({
                    "type": "created",
                    "memory_id": str(memory_id),
                    "category": candidate.category.value,
                    "summary": candidate.summary
                })

        for update in to_update:
            if await self._update_memory(update):
                result["memories_updated"] += 1
                result["details"].append({
                    "type": "updated",
                    "memory_id": update['memory_id'],
                    "summary": update.get('summary')
                })

        # Stage 5: audit trail.
        await self._log_ingestion(result, context)

        logger.info("ingestion_complete",
                    created=result["memories_created"],
                    updated=result["memories_updated"],
                    pii=result["pii_detected"],
                    opt_out=result["opt_out_requested"])

        return result

    async def _handle_opt_out(
        self,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ):
        """Apply an opt-out: wipe group memory or disable future collection."""
        action = candidate.metadata.get('action', 'disable')
        group_id = context.get('group_id')
        platform_user_id = context.get('platform_user_id')

        # Without a platform user id there is nothing we can act on.
        if not platform_user_id:
            return
        if not self.db:
            return

        if action == 'forget' and group_id:
            # Full wipe of this user's memory within the group.
            await self.db.execute(
                "SELECT memory_forget_in_group($1::uuid, $2)",
                group_id, platform_user_id
            )
        elif group_id:
            # Stop collecting memory in this group only.
            await self.db.execute("""
                UPDATE group_members
                SET no_memory_in_group = TRUE
                WHERE group_id = $1::uuid AND platform_user_id = $2
            """, group_id, platform_user_id)
        else:
            # Globally disable memory for this user.
            await self.db.execute("""
                UPDATE memory_consent
                SET memory_enabled = FALSE, updated_at = NOW()
                WHERE platform_user_id = $1
            """, platform_user_id)

    async def _get_existing_memories(
        self,
        user_id: Optional[str],
        platform_user_id: Optional[str],
        group_id: Optional[str]
    ) -> List[Dict[str, Any]]:
        """Fetch active memories scoped to the given group/user identifiers."""
        if not self.db:
            return []

        query = """
            SELECT memory_id, content, summary, category, importance, source_message_ids
            FROM memories
            WHERE is_active = TRUE
        """
        params: List[Any] = []

        if group_id:
            query += " AND group_id = $1::uuid"
            params.append(group_id)
            if platform_user_id:
                query += " AND platform_user_id = $2"
                params.append(platform_user_id)
        elif user_id:
            query += " AND user_id = $1::uuid AND group_id IS NULL"
            params.append(user_id)
        elif platform_user_id:
            query += " AND platform_user_id = $1 AND group_id IS NULL"
            params.append(platform_user_id)
        else:
            # No scoping identifiers at all -> nothing to compare against.
            return []

        rows = await self.db.fetch(query, *params)
        return [dict(row) for row in rows]

    async def _create_memory(
        self,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ) -> Optional[UUID]:
        """Insert *candidate* as a new memory row; returns its id (mock id without a db)."""
        if not self.db:
            return uuid4()  # mock id for tests / dry runs

        memory_id = uuid4()

        # TTL -> absolute expiry timestamp.
        # NOTE(review): datetime.now() is naive; other tables in this commit
        # use TIMESTAMP WITH TIME ZONE — confirm intended timezone handling.
        expires_at = None
        if candidate.ttl_days:
            expires_at = datetime.now() + timedelta(days=candidate.ttl_days)

        await self.db.execute("""
            INSERT INTO memories (
                memory_id, user_id, platform_user_id, group_id,
                memory_type, category, content, summary,
                importance, confidence, ttl_days, expires_at,
                source_message_ids, extraction_method, metadata
            ) VALUES (
                $1, $2::uuid, $3, $4::uuid,
                $5, $6, $7, $8,
                $9, $10, $11, $12,
                $13, $14, $15
            )
        """,
            memory_id,
            context.get('user_id'),
            context.get('platform_user_id'),
            context.get('group_id'),
            candidate.memory_type.value,
            candidate.category.value,
            candidate.content,
            candidate.summary,
            candidate.importance,
            candidate.confidence,
            candidate.ttl_days,
            expires_at,
            candidate.source_message_ids,
            'pipeline',
            candidate.metadata
        )

        # Optional secondary stores.
        if self.vector_store:
            await self._store_embedding(memory_id, candidate, context)
        if self.graph_store:
            await self._store_graph_relation(memory_id, candidate, context)

        return memory_id

    async def _update_memory(self, update: Dict[str, Any]) -> bool:
        """Overwrite content/summary/importance/sources of an existing memory."""
        if not self.db:
            return True

        await self.db.execute("""
            UPDATE memories
            SET content = $2, summary = $3, importance = $4,
                source_message_ids = $5, updated_at = NOW()
            WHERE memory_id = $1::uuid
        """,
            update['memory_id'],
            update['content'],
            update['summary'],
            update['importance'],
            update['source_message_ids']
        )
        return True

    async def _store_embedding(
        self,
        memory_id: UUID,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ):
        """Persist an embedding for the memory (backend-specific; not implemented)."""
        # Implementation depends on the vector store (Qdrant, pgvector).
        pass

    async def _store_graph_relation(
        self,
        memory_id: UUID,
        candidate: MemoryCandidate,
        context: Dict[str, Any]
    ):
        """Persist a graph relation for the memory (Neo4j; not implemented)."""
        pass

    async def _log_ingestion(
        self,
        result: Dict[str, Any],
        context: Dict[str, Any]
    ):
        """Append an 'ingestion' entry to the memory audit log."""
        if not self.db:
            return

        # NOTE(review): *result* is a plain dict bound to a (presumably JSONB)
        # column; asyncpg's default jsonb codec expects a str unless a custom
        # codec is registered on this pool — confirm.
        await self.db.execute("""
            INSERT INTO memory_events (
                user_id, group_id, action, actor, new_value
            ) VALUES (
                $1::uuid, $2::uuid, 'ingestion', 'pipeline', $3
            )
        """,
            context.get('user_id'),
            context.get('group_id'),
            result
        )
|
||||
@@ -477,6 +477,102 @@ async def get_context(
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# FACTS (Simple Key-Value storage for Gateway compatibility)
|
||||
# ============================================================================
|
||||
|
||||
from pydantic import BaseModel
|
||||
from typing import Any
|
||||
|
||||
class FactUpsertRequest(BaseModel):
    """Payload for POST /facts/upsert.

    Either ``fact_value`` (plain text) or ``fact_value_json`` (structured)
    may be supplied; ``team_id`` scopes the fact to a team when present.
    """

    user_id: str
    fact_key: str
    fact_value: Optional[str] = None
    fact_value_json: Optional[dict] = None
    team_id: Optional[str] = None
|
||||
|
||||
@app.post("/facts/upsert")
async def upsert_fact(request: FactUpsertRequest):
    """Create or update a user fact.

    A simple key-value store kept for Gateway compatibility; facts live in
    PostgreSQL without vector indexing.  Returns {"status": "ok", "fact_id": ...}.
    """
    try:
        # Lazily create the table on first call so no migration is required.
        await db.ensure_facts_table()

        result = await db.upsert_fact(
            user_id=request.user_id,
            fact_key=request.fact_key,
            fact_value=request.fact_value,
            fact_value_json=request.fact_value_json,
            team_id=request.team_id
        )

        # Fix (ruff F541): event names were f-strings with no placeholders.
        logger.info("fact_upserted", user_id=request.user_id, fact_key=request.fact_key)
        return {"status": "ok", "fact_id": result.get("fact_id") if result else None}

    except Exception as e:
        logger.error("fact_upsert_failed", error=str(e), user_id=request.user_id)
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/facts/{fact_key}")
async def get_fact(
    fact_key: str,
    user_id: str = Query(...),
    team_id: Optional[str] = None
):
    """Return a single fact for *user_id*; 404 when it does not exist."""
    try:
        fact = await db.get_fact(user_id=user_id, fact_key=fact_key, team_id=team_id)
        if not fact:
            raise HTTPException(status_code=404, detail="Fact not found")
        return fact
    except HTTPException:
        raise  # let FastAPI serve the 404 unchanged
    except Exception as e:
        # Fix (ruff F541): event name was an f-string with no placeholders.
        logger.error("fact_get_failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.get("/facts")
async def list_facts(
    user_id: str = Query(...),
    team_id: Optional[str] = None
):
    """List every fact stored for *user_id* (optionally scoped to a team)."""
    try:
        facts = await db.list_facts(user_id=user_id, team_id=team_id)
        return {"facts": facts}
    except Exception as e:
        # Fix (ruff F541): event name was an f-string with no placeholders.
        logger.error("facts_list_failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@app.delete("/facts/{fact_key}")
async def delete_fact(
    fact_key: str,
    user_id: str = Query(...),
    team_id: Optional[str] = None
):
    """Delete a fact; 404 when no matching row exists."""
    try:
        deleted = await db.delete_fact(user_id=user_id, fact_key=fact_key, team_id=team_id)
        if not deleted:
            raise HTTPException(status_code=404, detail="Fact not found")
        return {"status": "ok", "deleted": True}
    except HTTPException:
        raise  # let FastAPI serve the 404 unchanged
    except Exception as e:
        # Fix (ruff F541): event name was an f-string with no placeholders.
        logger.error("fact_delete_failed", error=str(e))
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# ADMIN
|
||||
# ============================================================================
|
||||
|
||||
Reference in New Issue
Block a user