From 7b5357228fc6aa72955c2b76eceedb531f88392f Mon Sep 17 00:00:00 2001
From: Apple
Date: Fri, 20 Feb 2026 14:16:16 -0800
Subject: [PATCH] doc-service: add shared deterministic excel answer contract

---
 gateway-bot/services/doc_service.py | 174 +++++++++++++++++++++++++++-
 1 file changed, 169 insertions(+), 5 deletions(-)

diff --git a/gateway-bot/services/doc_service.py b/gateway-bot/services/doc_service.py
index c55f4e28..da5a6843 100644
--- a/gateway-bot/services/doc_service.py
+++ b/gateway-bot/services/doc_service.py
@@ -12,6 +12,7 @@ import os
 import logging
 import hashlib
 import json
+import re
 from typing import Optional, Dict, Any, List
 from pydantic import BaseModel
 from datetime import datetime
@@ -21,6 +22,8 @@ from memory_client import memory_client
 
 logger = logging.getLogger(__name__)
 
+SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
+
 
 class QAItem(BaseModel):
     """Single Q&A pair"""
@@ -80,6 +83,132 @@ class DocumentService:
     def __init__(self):
         """Initialize document service"""
         self.memory_client = memory_client
+
+    def _is_excel_filename(self, file_name: Optional[str]) -> bool:
+        """Return True if file_name ends with .xlsx or .xls (case-insensitive)."""
+        if not file_name:
+            return False
+        return file_name.lower().endswith((".xlsx", ".xls"))
+
+    def _is_numeric_question(self, question: str) -> bool:
+        """Return True if the question contains an amount or currency marker."""
+        t = (question or "").lower()
+        if not t:
+            return False
+        markers = [
+            "скільки", "сума", "витрат", "добрив", "грн", "uah", "usd", "eur",
+            "сколько", "amount", "total", "spent", "cost", "value",
+        ]
+        return any(m in t for m in markers)
+
+    def _extract_query_tokens(self, question: str) -> List[str]:
+        """Return lowercased 3+ character tokens of the question minus stop words."""
+        tokens = re.findall(r"[a-zA-Zа-яА-ЯіїєґІЇЄҐ0-9]{3,}", (question or "").lower())
+        stop = {
+            "яка", "який", "яке", "which", "what", "скільки", "сума", "була",
+            "витрачена", "write", "show", "give", "please", "мені", "будь", "ласка",
+            "тому", "цьому", "цей", "this", "that", "for", "and", "the",
+        }
+        return [t for t in tokens if t not in stop]
+
+    async def _try_answer_excel_question(
+        self,
+        question: str,
+        doc_url: Optional[str],
+        file_name: Optional[str],
+    ) -> Optional[str]:
+        """
+        Answer a numeric question directly from a spreadsheet, bypassing RAG.
+
+        Downloads doc_url and scans each row that has a numeric cell; the row
+        whose first text cell contains the most query tokens wins, with the
+        first fertilizer-labelled row as a fallback.
+
+        Returns:
+            The formatted answer, or None when the question is not numeric,
+            httpx/openpyxl are unavailable, the download fails, no row
+            matches, or any error occurs.
+        """
+        if not doc_url or not self._is_numeric_question(question):
+            return None
+        try:
+            import httpx
+            from io import BytesIO
+            import openpyxl
+        except Exception:
+            return None
+
+        query_tokens = self._extract_query_tokens(question)
+        if not query_tokens:
+            query_tokens = ["сума", "витрати", "добрив"]
+
+        try:
+            async with httpx.AsyncClient(timeout=20.0) as client:
+                resp = await client.get(doc_url)
+                if resp.status_code != 200:
+                    return None
+                content = resp.content
+
+            wb = openpyxl.load_workbook(BytesIO(content), data_only=True, read_only=True)
+            best = None
+            best_score = -1
+            fallback = None
+
+            try:
+                for ws in wb.worksheets:
+                    for row_idx, row in enumerate(ws.iter_rows(values_only=True), start=1):
+                        label = ""
+                        numeric_value = None
+                        for cell in row:
+                            # bool is a subclass of int; TRUE/FALSE cells are not amounts.
+                            is_number = isinstance(cell, (int, float)) and not isinstance(cell, bool)
+                            if is_number and numeric_value is None:
+                                numeric_value = float(cell)
+                            elif isinstance(cell, str) and not label:
+                                label = cell.strip()
+                        if numeric_value is None:
+                            continue
+                        label_low = label.lower()
+                        score = sum(1 for t in query_tokens if t in label_low)
+                        if score > best_score:
+                            best_score = score
+                            best = {
+                                "sheet": ws.title,
+                                "row": row_idx,
+                                "label": label or "n/a",
+                                "value": numeric_value,
+                            }
+                        if fallback is None and any(m in label_low for m in ("добрив", "fertiliz", "удобр")):
+                            fallback = {
+                                "sheet": ws.title,
+                                "row": row_idx,
+                                "label": label or "n/a",
+                                "value": numeric_value,
+                            }
+            finally:
+                # Read-only workbooks keep their source open until closed explicitly.
+                wb.close()
+
+            picked = best if best and best_score > 0 else fallback
+            if not picked:
+                return None
+
+            value = picked["value"]
+            if abs(value - int(value)) < 1e-9:
+                value_str = f"{int(value):,}".replace(",", " ")
+            else:
+                value_str = f"{value:,.2f}".replace(",", " ").replace(".", ",")
+
+            unit = "грн" if self._is_numeric_question(question) else ""
+            unit_part = f" {unit}" if unit else ""
+            file_part = f' у файлі "{file_name}"' if file_name else ""
+            return (
+                f"За{file_part}: {value_str}{unit_part}. "
+                f"Джерело: лист {picked['sheet']}, рядок {picked['row']} ({picked['label']})."
+            )
+        except Exception as e:
+            logger.warning(f"Excel deterministic answer failed: {e}")
+            return None
 
     async def save_doc_context(
         self,
@@ -451,7 +580,8 @@ class DocumentService:
         question: str,
         doc_id: Optional[str] = None,
         dao_id: Optional[str] = None,
-        user_id: Optional[str] = None
+        user_id: Optional[str] = None,
+        agent_id: str = "daarwizz"
     ) -> QAResult:
         """
         Ask a question about a document using RAG query.
@@ -468,11 +598,20 @@ class DocumentService:
         """
         try:
             # If doc_id not provided, try to get from context
+            doc_url = None
+            file_name = None
             if not doc_id:
                 doc_context = await self.get_doc_context(session_id)
                 if doc_context:
                     doc_id = doc_context.doc_id
                     dao_id = dao_id or doc_context.dao_id
+                    doc_url = doc_context.doc_url
+                    file_name = doc_context.file_name
+            else:
+                doc_context = await self.get_doc_context(session_id)
+                if doc_context:
+                    doc_url = doc_context.doc_url
+                    file_name = doc_context.file_name
 
             if not doc_id:
                 return QAResult(
@@ -484,11 +623,32 @@ class DocumentService:
             if not user_id:
                 parts = session_id.split(":", 1)
                 user_id = parts[1] if len(parts) > 1 else session_id
+
+            # Shared deterministic Excel policy for top-level agrarian agents.
+            if (
+                (agent_id or "").lower() in SHARED_EXCEL_POLICY_AGENTS
+                and self._is_excel_filename(file_name)
+            ):
+                deterministic = await self._try_answer_excel_question(
+                    question=question,
+                    doc_url=doc_url,
+                    file_name=file_name,
+                )
+                if deterministic:
+                    return QAResult(
+                        success=True,
+                        answer=deterministic,
+                        doc_id=doc_id,
+                        sources=[{
+                            "type": "excel_deterministic",
+                            "file_name": file_name,
+                        }],
+                    )
 
             # Build RAG query request
             router_request = {
                 "mode": "rag_query",
-                "agent": "daarwizz",
+                "agent": agent_id,
                 "metadata": {
                     "source": self._extract_source(session_id),
                     "dao_id": dao_id,
@@ -503,7 +663,9 @@ class DocumentService:
                 },
             }
 
-            logger.info(f"RAG query: session={session_id}, question={question[:50]}, doc_id={doc_id}")
+            logger.info(
+                f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
+            )
 
             # Send to Router
             response = await send_to_router(router_request)
@@ -593,7 +755,8 @@ async def ask_about_document(
     question: str,
     doc_id: Optional[str] = None,
     dao_id: Optional[str] = None,
-    user_id: Optional[str] = None
+    user_id: Optional[str] = None,
+    agent_id: str = "daarwizz"
 ) -> QAResult:
     """Ask a question about a document using RAG query"""
     return await doc_service.ask_about_document(
@@ -601,5 +764,6 @@ async def ask_about_document(
         question=question,
         doc_id=doc_id,
         dao_id=dao_id,
-        user_id=user_id
+        user_id=user_id,
+        agent_id=agent_id
     )