doc-service: add shared deterministic excel answer contract

This commit is contained in:
Apple
2026-02-20 14:16:16 -08:00
parent e6c083a000
commit 7b5357228f

View File

@@ -12,6 +12,7 @@ import os
import logging import logging
import hashlib import hashlib
import json import json
import re
from typing import Optional, Dict, Any, List from typing import Optional, Dict, Any, List
from pydantic import BaseModel from pydantic import BaseModel
from datetime import datetime from datetime import datetime
@@ -21,6 +22,8 @@ from memory_client import memory_client
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Agent ids that share the deterministic Excel answering policy. Lookups are
# done with the lowercased agent id, so entries must stay lowercase.
SHARED_EXCEL_POLICY_AGENTS = {
    "agromatrix",
    "greenfood",
    "helion",
    "nutra",
}
class QAItem(BaseModel): class QAItem(BaseModel):
"""Single Q&A pair""" """Single Q&A pair"""
@@ -81,6 +84,112 @@ class DocumentService:
"""Initialize document service""" """Initialize document service"""
self.memory_client = memory_client self.memory_client = memory_client
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
if not file_name:
return False
lower = file_name.lower()
return lower.endswith(".xlsx") or lower.endswith(".xls")
def _is_numeric_question(self, question: str) -> bool:
t = (question or "").lower()
if not t:
return False
markers = [
"скільки", "сума", "витрат", "добрив", "грн", "uah", "usd", "eur",
"сколько", "amount", "total", "spent", "cost", "value",
]
return any(m in t for m in markers)
def _extract_query_tokens(self, question: str) -> List[str]:
tokens = re.findall(r"[a-zA-Zа-яА-ЯіїєґІЇЄҐ0-9]{3,}", (question or "").lower())
stop = {
"яка", "який", "яке", "which", "what", "скільки", "сума", "була",
"витрачена", "write", "show", "give", "please", "мені", "будь", "ласка",
"тому", "цьому", "цей", "this", "that", "for", "and", "the",
}
return [t for t in tokens if t not in stop]
async def _try_answer_excel_question(
self,
question: str,
doc_url: Optional[str],
file_name: Optional[str],
) -> Optional[str]:
if not doc_url or not self._is_numeric_question(question):
return None
try:
import httpx
from io import BytesIO
import openpyxl
except Exception:
return None
query_tokens = self._extract_query_tokens(question)
if not query_tokens:
query_tokens = ["сума", "витрати", "добрив"]
try:
async with httpx.AsyncClient(timeout=20.0) as client:
resp = await client.get(doc_url)
if resp.status_code != 200:
return None
content = resp.content
wb = openpyxl.load_workbook(BytesIO(content), data_only=True, read_only=True)
best = None
best_score = -1
fallback = None
for ws in wb.worksheets:
for row_idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
label = ""
numeric_value = None
for cell in row:
if isinstance(cell, (int, float)) and numeric_value is None:
numeric_value = float(cell)
elif isinstance(cell, str) and not label:
label = cell.strip()
if numeric_value is None:
continue
label_low = label.lower()
score = sum(1 for t in query_tokens if t in label_low)
if score > best_score:
best_score = score
best = {
"sheet": ws.title,
"row": row_idx,
"label": label or "n/a",
"value": numeric_value,
}
if fallback is None and any(m in label_low for m in ("добрив", "fertiliz", "удобр")):
fallback = {
"sheet": ws.title,
"row": row_idx,
"label": label or "n/a",
"value": numeric_value,
}
picked = best if best and best_score > 0 else fallback
if not picked:
return None
value = picked["value"]
if abs(value - int(value)) < 1e-9:
value_str = f"{int(value):,}".replace(",", " ")
else:
value_str = f"{value:,.2f}".replace(",", " ").replace(".", ",")
unit = "грн" if self._is_numeric_question(question) else ""
unit_part = f" {unit}" if unit else ""
file_part = f' у файлі "{file_name}"' if file_name else ""
return (
f"За{file_part}: {value_str}{unit_part}. "
f"Джерело: лист {picked['sheet']}, рядок {picked['row']} ({picked['label']})."
)
except Exception as e:
logger.warning(f"Excel deterministic answer failed: {e}")
return None
async def save_doc_context( async def save_doc_context(
self, self,
session_id: str, session_id: str,
@@ -451,7 +560,8 @@ class DocumentService:
question: str, question: str,
doc_id: Optional[str] = None, doc_id: Optional[str] = None,
dao_id: Optional[str] = None, dao_id: Optional[str] = None,
user_id: Optional[str] = None user_id: Optional[str] = None,
agent_id: str = "daarwizz"
) -> QAResult: ) -> QAResult:
""" """
Ask a question about a document using RAG query. Ask a question about a document using RAG query.
@@ -468,11 +578,20 @@ class DocumentService:
""" """
try: try:
# If doc_id not provided, try to get from context # If doc_id not provided, try to get from context
doc_url = None
file_name = None
if not doc_id: if not doc_id:
doc_context = await self.get_doc_context(session_id) doc_context = await self.get_doc_context(session_id)
if doc_context: if doc_context:
doc_id = doc_context.doc_id doc_id = doc_context.doc_id
dao_id = dao_id or doc_context.dao_id dao_id = dao_id or doc_context.dao_id
doc_url = doc_context.doc_url
file_name = doc_context.file_name
else:
doc_context = await self.get_doc_context(session_id)
if doc_context:
doc_url = doc_context.doc_url
file_name = doc_context.file_name
if not doc_id: if not doc_id:
return QAResult( return QAResult(
@@ -485,10 +604,31 @@ class DocumentService:
parts = session_id.split(":", 1) parts = session_id.split(":", 1)
user_id = parts[1] if len(parts) > 1 else session_id user_id = parts[1] if len(parts) > 1 else session_id
# Shared deterministic Excel policy for top-level agrarian agents.
if (
(agent_id or "").lower() in SHARED_EXCEL_POLICY_AGENTS
and self._is_excel_filename(file_name)
):
deterministic = await self._try_answer_excel_question(
question=question,
doc_url=doc_url,
file_name=file_name,
)
if deterministic:
return QAResult(
success=True,
answer=deterministic,
doc_id=doc_id,
sources=[{
"type": "excel_deterministic",
"file_name": file_name,
}],
)
# Build RAG query request # Build RAG query request
router_request = { router_request = {
"mode": "rag_query", "mode": "rag_query",
"agent": "daarwizz", "agent": agent_id,
"metadata": { "metadata": {
"source": self._extract_source(session_id), "source": self._extract_source(session_id),
"dao_id": dao_id, "dao_id": dao_id,
@@ -503,7 +643,9 @@ class DocumentService:
}, },
} }
logger.info(f"RAG query: session={session_id}, question={question[:50]}, doc_id={doc_id}") logger.info(
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
)
# Send to Router # Send to Router
response = await send_to_router(router_request) response = await send_to_router(router_request)
@@ -593,7 +735,8 @@ async def ask_about_document(
question: str, question: str,
doc_id: Optional[str] = None, doc_id: Optional[str] = None,
dao_id: Optional[str] = None, dao_id: Optional[str] = None,
user_id: Optional[str] = None user_id: Optional[str] = None,
agent_id: str = "daarwizz"
) -> QAResult: ) -> QAResult:
"""Ask a question about a document using RAG query""" """Ask a question about a document using RAG query"""
return await doc_service.ask_about_document( return await doc_service.ask_about_document(
@@ -601,7 +744,8 @@ async def ask_about_document(
question=question, question=question,
doc_id=doc_id, doc_id=doc_id,
dao_id=dao_id, dao_id=dao_id,
user_id=user_id user_id=user_id,
agent_id=agent_id
) )