doc-service: add shared deterministic excel answer contract
This commit is contained in:
@@ -12,6 +12,7 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from typing import Optional, Dict, Any, List
|
from typing import Optional, Dict, Any, List
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -21,6 +22,8 @@ from memory_client import memory_client
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Agent ids (compared in lowercase) for which ask_about_document first tries
# the deterministic Excel answer path before building the RAG query.
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
|
||||||
|
|
||||||
|
|
||||||
class QAItem(BaseModel):
|
class QAItem(BaseModel):
|
||||||
"""Single Q&A pair"""
|
"""Single Q&A pair"""
|
||||||
@@ -81,6 +84,112 @@ class DocumentService:
|
|||||||
"""Initialize document service"""
|
"""Initialize document service"""
|
||||||
self.memory_client = memory_client
|
self.memory_client = memory_client
|
||||||
|
|
||||||
|
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
|
||||||
|
if not file_name:
|
||||||
|
return False
|
||||||
|
lower = file_name.lower()
|
||||||
|
return lower.endswith(".xlsx") or lower.endswith(".xls")
|
||||||
|
|
||||||
|
def _is_numeric_question(self, question: str) -> bool:
|
||||||
|
t = (question or "").lower()
|
||||||
|
if not t:
|
||||||
|
return False
|
||||||
|
markers = [
|
||||||
|
"скільки", "сума", "витрат", "добрив", "грн", "uah", "usd", "eur",
|
||||||
|
"сколько", "amount", "total", "spent", "cost", "value",
|
||||||
|
]
|
||||||
|
return any(m in t for m in markers)
|
||||||
|
|
||||||
|
def _extract_query_tokens(self, question: str) -> List[str]:
|
||||||
|
tokens = re.findall(r"[a-zA-Zа-яА-ЯіїєґІЇЄҐ0-9]{3,}", (question or "").lower())
|
||||||
|
stop = {
|
||||||
|
"яка", "який", "яке", "which", "what", "скільки", "сума", "була",
|
||||||
|
"витрачена", "write", "show", "give", "please", "мені", "будь", "ласка",
|
||||||
|
"тому", "цьому", "цей", "this", "that", "for", "and", "the",
|
||||||
|
}
|
||||||
|
return [t for t in tokens if t not in stop]
|
||||||
|
|
||||||
|
async def _try_answer_excel_question(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
doc_url: Optional[str],
|
||||||
|
file_name: Optional[str],
|
||||||
|
) -> Optional[str]:
|
||||||
|
if not doc_url or not self._is_numeric_question(question):
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
import httpx
|
||||||
|
from io import BytesIO
|
||||||
|
import openpyxl
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
query_tokens = self._extract_query_tokens(question)
|
||||||
|
if not query_tokens:
|
||||||
|
query_tokens = ["сума", "витрати", "добрив"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=20.0) as client:
|
||||||
|
resp = await client.get(doc_url)
|
||||||
|
if resp.status_code != 200:
|
||||||
|
return None
|
||||||
|
content = resp.content
|
||||||
|
|
||||||
|
wb = openpyxl.load_workbook(BytesIO(content), data_only=True, read_only=True)
|
||||||
|
best = None
|
||||||
|
best_score = -1
|
||||||
|
fallback = None
|
||||||
|
|
||||||
|
for ws in wb.worksheets:
|
||||||
|
for row_idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
|
||||||
|
label = ""
|
||||||
|
numeric_value = None
|
||||||
|
for cell in row:
|
||||||
|
if isinstance(cell, (int, float)) and numeric_value is None:
|
||||||
|
numeric_value = float(cell)
|
||||||
|
elif isinstance(cell, str) and not label:
|
||||||
|
label = cell.strip()
|
||||||
|
if numeric_value is None:
|
||||||
|
continue
|
||||||
|
label_low = label.lower()
|
||||||
|
score = sum(1 for t in query_tokens if t in label_low)
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best = {
|
||||||
|
"sheet": ws.title,
|
||||||
|
"row": row_idx,
|
||||||
|
"label": label or "n/a",
|
||||||
|
"value": numeric_value,
|
||||||
|
}
|
||||||
|
if fallback is None and any(m in label_low for m in ("добрив", "fertiliz", "удобр")):
|
||||||
|
fallback = {
|
||||||
|
"sheet": ws.title,
|
||||||
|
"row": row_idx,
|
||||||
|
"label": label or "n/a",
|
||||||
|
"value": numeric_value,
|
||||||
|
}
|
||||||
|
|
||||||
|
picked = best if best and best_score > 0 else fallback
|
||||||
|
if not picked:
|
||||||
|
return None
|
||||||
|
|
||||||
|
value = picked["value"]
|
||||||
|
if abs(value - int(value)) < 1e-9:
|
||||||
|
value_str = f"{int(value):,}".replace(",", " ")
|
||||||
|
else:
|
||||||
|
value_str = f"{value:,.2f}".replace(",", " ").replace(".", ",")
|
||||||
|
|
||||||
|
unit = "грн" if self._is_numeric_question(question) else ""
|
||||||
|
unit_part = f" {unit}" if unit else ""
|
||||||
|
file_part = f' у файлі "{file_name}"' if file_name else ""
|
||||||
|
return (
|
||||||
|
f"За{file_part}: {value_str}{unit_part}. "
|
||||||
|
f"Джерело: лист {picked['sheet']}, рядок {picked['row']} ({picked['label']})."
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Excel deterministic answer failed: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
async def save_doc_context(
|
async def save_doc_context(
|
||||||
self,
|
self,
|
||||||
session_id: str,
|
session_id: str,
|
||||||
@@ -451,7 +560,8 @@ class DocumentService:
|
|||||||
question: str,
|
question: str,
|
||||||
doc_id: Optional[str] = None,
|
doc_id: Optional[str] = None,
|
||||||
dao_id: Optional[str] = None,
|
dao_id: Optional[str] = None,
|
||||||
user_id: Optional[str] = None
|
user_id: Optional[str] = None,
|
||||||
|
agent_id: str = "daarwizz"
|
||||||
) -> QAResult:
|
) -> QAResult:
|
||||||
"""
|
"""
|
||||||
Ask a question about a document using RAG query.
|
Ask a question about a document using RAG query.
|
||||||
@@ -468,11 +578,20 @@ class DocumentService:
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# If doc_id not provided, try to get from context
|
# If doc_id not provided, try to get from context
|
||||||
|
doc_url = None
|
||||||
|
file_name = None
|
||||||
if not doc_id:
|
if not doc_id:
|
||||||
doc_context = await self.get_doc_context(session_id)
|
doc_context = await self.get_doc_context(session_id)
|
||||||
if doc_context:
|
if doc_context:
|
||||||
doc_id = doc_context.doc_id
|
doc_id = doc_context.doc_id
|
||||||
dao_id = dao_id or doc_context.dao_id
|
dao_id = dao_id or doc_context.dao_id
|
||||||
|
doc_url = doc_context.doc_url
|
||||||
|
file_name = doc_context.file_name
|
||||||
|
else:
|
||||||
|
doc_context = await self.get_doc_context(session_id)
|
||||||
|
if doc_context:
|
||||||
|
doc_url = doc_context.doc_url
|
||||||
|
file_name = doc_context.file_name
|
||||||
|
|
||||||
if not doc_id:
|
if not doc_id:
|
||||||
return QAResult(
|
return QAResult(
|
||||||
@@ -485,10 +604,31 @@ class DocumentService:
|
|||||||
parts = session_id.split(":", 1)
|
parts = session_id.split(":", 1)
|
||||||
user_id = parts[1] if len(parts) > 1 else session_id
|
user_id = parts[1] if len(parts) > 1 else session_id
|
||||||
|
|
||||||
|
# Shared deterministic Excel policy for top-level agrarian agents.
|
||||||
|
if (
|
||||||
|
(agent_id or "").lower() in SHARED_EXCEL_POLICY_AGENTS
|
||||||
|
and self._is_excel_filename(file_name)
|
||||||
|
):
|
||||||
|
deterministic = await self._try_answer_excel_question(
|
||||||
|
question=question,
|
||||||
|
doc_url=doc_url,
|
||||||
|
file_name=file_name,
|
||||||
|
)
|
||||||
|
if deterministic:
|
||||||
|
return QAResult(
|
||||||
|
success=True,
|
||||||
|
answer=deterministic,
|
||||||
|
doc_id=doc_id,
|
||||||
|
sources=[{
|
||||||
|
"type": "excel_deterministic",
|
||||||
|
"file_name": file_name,
|
||||||
|
}],
|
||||||
|
)
|
||||||
|
|
||||||
# Build RAG query request
|
# Build RAG query request
|
||||||
router_request = {
|
router_request = {
|
||||||
"mode": "rag_query",
|
"mode": "rag_query",
|
||||||
"agent": "daarwizz",
|
"agent": agent_id,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"source": self._extract_source(session_id),
|
"source": self._extract_source(session_id),
|
||||||
"dao_id": dao_id,
|
"dao_id": dao_id,
|
||||||
@@ -503,7 +643,9 @@ class DocumentService:
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(f"RAG query: session={session_id}, question={question[:50]}, doc_id={doc_id}")
|
logger.info(
|
||||||
|
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
|
||||||
|
)
|
||||||
|
|
||||||
# Send to Router
|
# Send to Router
|
||||||
response = await send_to_router(router_request)
|
response = await send_to_router(router_request)
|
||||||
@@ -593,7 +735,8 @@ async def ask_about_document(
|
|||||||
question: str,
|
question: str,
|
||||||
doc_id: Optional[str] = None,
|
doc_id: Optional[str] = None,
|
||||||
dao_id: Optional[str] = None,
|
dao_id: Optional[str] = None,
|
||||||
user_id: Optional[str] = None
|
user_id: Optional[str] = None,
|
||||||
|
agent_id: str = "daarwizz"
|
||||||
) -> QAResult:
|
) -> QAResult:
|
||||||
"""Ask a question about a document using RAG query"""
|
"""Ask a question about a document using RAG query"""
|
||||||
return await doc_service.ask_about_document(
|
return await doc_service.ask_about_document(
|
||||||
@@ -601,7 +744,8 @@ async def ask_about_document(
|
|||||||
question=question,
|
question=question,
|
||||||
doc_id=doc_id,
|
doc_id=doc_id,
|
||||||
dao_id=dao_id,
|
dao_id=dao_id,
|
||||||
user_id=user_id
|
user_id=user_id,
|
||||||
|
agent_id=agent_id
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user