diff --git a/config/agent_registry.json b/config/agent_registry.json
index 8f9c2870..719d518a 100644
--- a/config/agent_registry.json
+++ b/config/agent_registry.json
@@ -19,7 +19,8 @@
       "onboarding",
       "ecosystem"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "city-core"
   },
   "helion": {
     "display_name": "Helion",
@@ -35,7 +36,8 @@
       "market_analysis",
       "biominer"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "helion"
   },
   "alateya": {
     "display_name": "Aletheia",
@@ -58,7 +60,8 @@
       "email": "alverjob@gmail.com",
       "site": "https://alverjob.xyz",
       "youtube": "https://www.youtube.com/@alverjob72"
-    }
+    },
+    "district_id": "alateya"
   },
   "druid": {
     "display_name": "DRUID",
@@ -76,7 +79,8 @@
       "inci",
       "safety_basics"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "druid"
   },
   "nutra": {
     "display_name": "NUTRA",
@@ -93,7 +97,8 @@
       "vitamins",
       "microbiome"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "nutra"
   },
   "agromatrix": {
     "display_name": "Степан Матрікс",
@@ -110,7 +115,8 @@
       "logistics",
       "farm_economics"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "agromatrix"
   },
   "greenfood": {
     "display_name": "GREENFOOD",
@@ -127,7 +133,8 @@
       "food_production",
       "sales"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "greenfood"
   },
   "clan": {
     "display_name": "CLAN",
@@ -143,7 +150,8 @@
       "culture",
       "facilitation"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "clan"
   },
   "eonarch": {
     "display_name": "EONARCH",
@@ -159,7 +167,8 @@
       "transformation",
       "spirituality"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "eonarch"
   },
   "yaromir": {
     "display_name": "YAROMIR",
@@ -175,7 +184,8 @@
       "code_review",
       "strategy"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "city-core"
   },
   "soul": {
     "display_name": "SOUL",
@@ -191,7 +201,24 @@
       "values",
       "wellbeing"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "soul"
+  },
+  "dario": {
+    "display_name": "DARIO",
+    "canonical_role": "Future DAARION Agent (planned, not launched)",
+    "prompt_file": "dario_prompt.txt",
+    "telegram_mode": "disabled",
+    "visibility": "private",
+    "status": "planned",
+    "district_id": "city-core",
+    "domains": [
+      "city_ops",
+      "coordination",
+      "support"
+    ],
+    "mentor": null,
+    "launch_state": "planned"
+  }
   }
 }
-}
\ No newline at end of file
+}
diff --git a/config/roles/agromatrix/agronomist.md b/config/roles/agromatrix/agronomist.md
new file mode 100644
index 00000000..e863d9cf
--- /dev/null
+++ b/config/roles/agromatrix/agronomist.md
@@ -0,0 +1,8 @@
+# Agronomist
+
+Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
+
+Правила відповіді:
+- Коротко і прикладно.
+- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
+- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.
diff --git a/config/roles/agromatrix/communicator.md b/config/roles/agromatrix/communicator.md
new file mode 100644
index 00000000..0f55a1ca
--- /dev/null
+++ b/config/roles/agromatrix/communicator.md
@@ -0,0 +1,8 @@
+# Communicator
+
+Фокус: людяна та зрозуміла комунікація фінальної відповіді.
+
+Правила:
+- Природна мова, без механістичного тону.
+- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
+- Завершуй конкретним корисним кроком.
diff --git a/config/roles/agromatrix/data_analyst.md b/config/roles/agromatrix/data_analyst.md
new file mode 100644
index 00000000..f2c617b2
--- /dev/null
+++ b/config/roles/agromatrix/data_analyst.md
@@ -0,0 +1,7 @@
+# Field Data Analyst
+
+Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
+
+Правила:
+- Пояснювати висновки простою мовою.
+- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.
diff --git a/config/roles/agromatrix/farm_ops.md b/config/roles/agromatrix/farm_ops.md
new file mode 100644
index 00000000..c50d06e8
--- /dev/null
+++ b/config/roles/agromatrix/farm_ops.md
@@ -0,0 +1,8 @@
+# Farm Ops Planner
+
+Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
+
+Правила:
+- Видавати практичний порядок дій.
+- За простого запиту: коротка відповідь.
+- Для операційних запитів: стислий план з відповідальними і дедлайном.
diff --git a/config/roles/agromatrix/orchestrator_synthesis.md b/config/roles/agromatrix/orchestrator_synthesis.md
new file mode 100644
index 00000000..7e70653f
--- /dev/null
+++ b/config/roles/agromatrix/orchestrator_synthesis.md
@@ -0,0 +1,10 @@
+# AgroMatrix Orchestrator Synthesis
+
+Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
+
+Правила:
+- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
+- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
+- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
+- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
+- Пояснюй по суті агропитання і давай 1 наступний практичний крок.
diff --git a/config/roles/agromatrix/risk_assessor.md b/config/roles/agromatrix/risk_assessor.md
new file mode 100644
index 00000000..c843d005
--- /dev/null
+++ b/config/roles/agromatrix/risk_assessor.md
@@ -0,0 +1,7 @@
+# Risk Assessor
+
+Фокус: агро-ризики, операційні ризики, наслідки рішень.
+
+Правила:
+- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
+- Без зайвої бюрократії у відповіді користувачу.
diff --git a/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md b/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md
index b5695d27..58773cfd 100644
--- a/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md
+++ b/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md
@@ -11,6 +11,10 @@
 - Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
 - Ніколи не логувати секрети/токени
 - Інші ролі НЕ спілкуються з користувачем напряму
+- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
+- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
 
 ## Формат відповіді:
-Структурована відповідь з чіткими рекомендаціями та наступними кроками.
+- За замовчуванням: природна коротка відповідь 1-3 речення.
+- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
+- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".
diff --git a/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md b/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md
index cac85d38..e192dbbb 100644
--- a/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md
+++ b/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md
@@ -7,3 +7,7 @@
 - Структурувати інформацію логічно
 - Включати конкретні наступні кроки
 - Позначати ризики якщо є
+- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
+- Для детальних запитів переходити у структурований режим.
+- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно.
+- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".
diff --git a/config/roles/agx/agx-plant-intel/agrovoc_normalizer.md b/config/roles/agx/agx-plant-intel/agrovoc_normalizer.md
new file mode 100644
index 00000000..f279f615
--- /dev/null
+++ b/config/roles/agx/agx-plant-intel/agrovoc_normalizer.md
@@ -0,0 +1,11 @@
+You are AGROVOC Normalizer.
+
+Responsibilities:
+- Normalize crop/disease terms using agrovoc_lookup.
+- Provide canonical term mapping for user-facing output.
+- Keep labels practical for agronomy context.
+
+Return format:
+- canonical_terms
+- term_mapping
+- notes_for_user
diff --git a/config/roles/agx/agx-plant-intel/orchestrator_synthesis.md b/config/roles/agx/agx-plant-intel/orchestrator_synthesis.md
new file mode 100644
index 00000000..f6151c0d
--- /dev/null
+++ b/config/roles/agx/agx-plant-intel/orchestrator_synthesis.md
@@ -0,0 +1,17 @@
+You are the synthesis role for AgroMatrix plant intelligence.
+
+Goal:
+- Aggregate candidate plant IDs from vision + PlantNet + GBIF + AGROVOC.
+- Return concise output with uncertainty, sources, and next-photo requirements.
+
+Output contract (strict):
+1) probable_taxon: one short line
+2) confidence: low/medium/high + one short reason
+3) alternatives: up to 3 entries
+4) sources: PlantNet/GBIF/AGROVOC/Web (only those actually used)
+5) next_photos_required: 1-3 concrete photo instructions
+
+Rules:
+- Never claim 100% certainty from a single weak source.
+- If evidence conflicts, say so and reduce confidence.
+- Keep default response concise.
diff --git a/config/roles/agx/agx-plant-intel/plant_identifier.md b/config/roles/agx/agx-plant-intel/plant_identifier.md
new file mode 100644
index 00000000..23a6d0b7
--- /dev/null
+++ b/config/roles/agx/agx-plant-intel/plant_identifier.md
@@ -0,0 +1,11 @@
+You are Plant Identifier.
+
+Responsibilities:
+- Parse visual cues from user description/photo context.
+- Build candidate crop/plant hypotheses.
+- Use plantnet_lookup first when image URL is available.
+- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
+
+Return format:
+- candidates: numbered list max 5, each with rationale.
+- required_data: what extra image/data is needed.
diff --git a/config/roles/agx/agx-plant-intel/taxonomy_validator.md b/config/roles/agx/agx-plant-intel/taxonomy_validator.md
new file mode 100644
index 00000000..54df47e4
--- /dev/null
+++ b/config/roles/agx/agx-plant-intel/taxonomy_validator.md
@@ -0,0 +1,11 @@
+You are Taxonomy Validator.
+
+Responsibilities:
+- Validate candidate names via gbif_species_lookup.
+- Remove invalid/synonym-conflicted names.
+- Keep accepted taxa and explain conflicts briefly.
+
+Return format:
+- accepted_candidates
+- rejected_candidates_with_reason
+- confidence_adjustment
diff --git a/docs/agromatrix-plant-intel-contract.md b/docs/agromatrix-plant-intel-contract.md
new file mode 100644
index 00000000..a6a6c9ed
--- /dev/null
+++ b/docs/agromatrix-plant-intel-contract.md
@@ -0,0 +1,43 @@
+# AgroMatrix Plant Intel Contract (Skeleton)
+
+## Purpose
+`agromatrix_plant_intel` is an internal CrewAI profile for Stepan (AgroMatrix orchestrator).
+It is used for plant/crop identification and normalization when confidence matters.
+
+## Call Path
+1. User asks Stepan.
+2. Stepan remains final speaker.
+3. When query matches plant-intel intent, CrewAI profile `plant_intel` is selected.
+4. Subteam runs:
+   - `plant_identifier`
+   - `taxonomy_validator`
+   - `agrovoc_normalizer`
+5. Synthesis returns compact evidence package to Stepan.
+
+## Tool Adapters
+- `nature_id_identify`
+  - input: `image_url`, `top_k?`
+  - output: local/open-source candidates
+  - note: requires self-hosted endpoint `NATURE_ID_URL`
+- `plantnet_lookup`
+  - input: `query?`, `image_url?`, `organ?`, `top_k?`
+  - output: candidate taxa + score
+  - note: if `PLANTNET_API_KEY` missing, fallback chain is `nature_id_identify` -> `gbif_species_lookup`
+- `gbif_species_lookup`
+  - input: `query`, `limit?`
+  - output: accepted taxa/rank/status
+- `agrovoc_lookup`
+  - input: `query`, `lang?`, `limit?`
+  - output: canonical AGROVOC concepts
+
+## Response Contract (to Stepan)
+- `probable_taxon`
+- `confidence` (`low|medium|high` + reason)
+- `alternatives` (up to 3)
+- `sources` (actual tools used)
+- `next_photos_required` (1-3 concrete instructions)
+
+## Safety
+- No categorical claim with weak evidence.
+- If sources conflict, confidence is downgraded.
+- Final user answer remains concise by default.
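For illustration, the evidence package that synthesis hands back to Stepan could look like the sketch below. The field names follow the response contract above; every value is an invented placeholder, not real tool output.

```python
# Hypothetical evidence package matching the contract above.
# Taxa, confidence, and photo instructions are illustrative placeholders.
evidence_package = {
    "probable_taxon": "Solanum lycopersicum (tomato)",
    "confidence": "medium",  # e.g. one strong PlantNet hit, GBIF accepted, no AGROVOC conflict
    "alternatives": [
        "Solanum melongena",
        "Capsicum annuum",
    ],
    "sources": ["PlantNet", "GBIF", "AGROVOC"],
    "next_photos_required": [
        "Close-up of one leaf, top side, in daylight",
        "Whole plant from about 1 m distance",
    ],
}
```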
diff --git a/gateway-bot/Dockerfile b/gateway-bot/Dockerfile
index f5e75293..fe07a230 100644
--- a/gateway-bot/Dockerfile
+++ b/gateway-bot/Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.11-slim
 
 LABEL maintainer="DAARION.city Team"
 LABEL description="Bot Gateway - Telegram/Discord webhook handler with DAARWIZZ"
-LABEL version="0.2.0"
+LABEL version="0.2.1"
 
 WORKDIR /app/gateway-bot
 
@@ -15,7 +15,15 @@
 RUN pip install --no-cache-dir \
     fastapi==0.109.0 \
     uvicorn==0.27.0 \
     httpx==0.26.0 \
     pydantic==2.5.3 \
-    python-multipart==0.0.6 prometheus-client>=0.20.0 PyPDF2>=3.0.0 crewai nats-py pandas openpyxl
+    python-multipart==0.0.6 \
+    prometheus-client==0.22.1 \
+    PyPDF2>=3.0.0 \
+    crewai \
+    nats-py \
+    pandas \
+    openpyxl \
+    python-docx \
+    redis==5.0.1
 
 # Copy gateway code and DAARWIZZ prompt
 COPY . .
diff --git a/gateway-bot/agent_registry.json b/gateway-bot/agent_registry.json
index cff52cc4..99edcbc5 100644
--- a/gateway-bot/agent_registry.json
+++ b/gateway-bot/agent_registry.json
@@ -19,7 +19,8 @@
       "onboarding",
       "ecosystem"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "city-core"
   },
   "helion": {
     "display_name": "Helion",
@@ -35,7 +36,8 @@
       "market_analysis",
       "biominer"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "helion"
   },
   "alateya": {
     "display_name": "Aletheia",
@@ -58,7 +60,8 @@
       "email": "alverjob@gmail.com",
       "site": "https://alverjob.xyz",
       "youtube": "https://www.youtube.com/@alverjob72"
-    }
+    },
+    "district_id": "alateya"
   },
   "druid": {
     "display_name": "DRUID",
@@ -76,7 +79,8 @@
       "inci",
       "safety_basics"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "druid"
   },
   "nutra": {
     "display_name": "NUTRA",
@@ -93,7 +97,8 @@
       "vitamins",
       "microbiome"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "nutra"
   },
   "agromatrix": {
     "display_name": "Степан Матрікс",
@@ -110,7 +115,8 @@
       "logistics",
       "farm_economics"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "agromatrix"
   },
   "greenfood": {
     "display_name": "GREENFOOD",
@@ -127,7 +133,8 @@
       "food_production",
       "sales"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "greenfood"
   },
   "clan": {
     "display_name": "CLAN",
@@ -143,7 +150,8 @@
       "culture",
       "facilitation"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "clan"
   },
   "eonarch": {
     "display_name": "EONARCH",
@@ -159,7 +167,8 @@
       "transformation",
       "spirituality"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "eonarch"
   },
   "yaromir": {
     "display_name": "YAROMIR",
@@ -175,7 +184,8 @@
       "code_review",
       "strategy"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "city-core"
   },
   "soul": {
     "display_name": "SOUL",
@@ -191,7 +201,8 @@
       "values",
       "wellbeing"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "soul"
   },
   "senpai": {
     "display_name": "SENPAI",
@@ -207,7 +218,8 @@
       "defi",
       "portfolio"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "senpai"
   },
   "oneok": {
     "display_name": "1OK",
@@ -227,7 +239,8 @@
     "mentor": {
       "name": "Ілля Титар",
       "telegram": "@Titar240581"
-    }
+    },
+    "district_id": "city-core"
   },
   "sofiia": {
     "display_name": "Sophia",
@@ -242,7 +255,24 @@
       "platform_evolution",
       "technical_leadership"
     ],
-    "mentor": null
+    "mentor": null,
+    "district_id": "city-core"
+  },
+  "dario": {
+    "display_name": "DARIO",
+    "canonical_role": "Future DAARION Agent (planned, not launched)",
+    "prompt_file": "dario_prompt.txt",
+    "telegram_mode": "disabled",
+    "visibility": "private",
+    "status": "planned",
+    "district_id": "city-core",
+    "domains": [
+      "city_ops",
+      "coordination",
+      "support"
+    ],
+    "mentor": null,
+    "launch_state": "planned"
+  }
   }
 }
-}
\ No newline at end of file
+}
diff --git a/gateway-bot/agromatrix_prompt.txt b/gateway-bot/agromatrix_prompt.txt
index e28454d6..a18c51ae 100644
--- a/gateway-bot/agromatrix_prompt.txt
+++ b/gateway-bot/agromatrix_prompt.txt
@@ -32,7 +32,9 @@
 
 ## B. SHORT-FIRST
 
-**За замовчуванням: структурована відповідь з 3-5 пунктів.**
+**За замовчуванням: жива коротка відповідь 1-3 речення природною мовою.**
+**Маркерні списки/шаблони 3-5 пунктів використовуй тільки коли користувач просить детально, план, чекліст або розрахунок.**
+**Перше повідомлення в новій темі — розмовне, без канцеляриту та без "робото-тону".**
 
 ЗАБОРОНЕНО:
 - "Радий допомогти", "Готовий до співпраці"
@@ -55,7 +57,9 @@
 
 **ВАЖЛИВО:**
 - Ніколи не кажи "я не можу слухати аудіо" — голосові повідомлення вже перетворені на текст!
-- Ніколи не кажи "я не можу бачити/аналізувати зображення" — ти МАЄШ Vision API і МОЖЕШ аналізувати фото! Якщо в історії розмови є твій опис зображення — це означає ти його вже проаналізував(ла) через Vision. Не заперечуй це.
+- Фото аналізуй по доступному поточному контексту: якщо зображення є у запиті або щойно надіслане — коментуй по суті.
+- Якщо для точного висновку бракує самого файлу чи чіткості, поясни це простою людською мовою і попроси надіслати фото повторно з уточненням, що саме перевірити.
+- Не використовуй службові фрази типу "text-only", "vision unavailable", "технічне обмеження моделі".
 
 Початковий режим: учень. Спочатку став уточнювальні питання і вчися у ментора. Публічна група: @agromatrix.
@@ -94,7 +98,8 @@
 - Мислиш далекоглядно: пропонуєш архітектуру рішення, а не латання симптомів.
 - Будь креативним, але не фантазуй дані: якщо фактів нема — позначай як припущення і пропонуй, що зібрати.
 - Спілкуйся українською (якщо користувач не перейшов на іншу мову).
-- Форматуй відповіді структуровано: заголовки, списки, короткі блоки, пріоритети.
+- Тримай розмовний тон: короткі природні фрази, без надмірної шаблонності.
+- Структурований формат (заголовки/списки/таблиці) вмикай лише для складних задач або коли це прямо запитали.
 
 ### 4) Принципи роботи з користувачем
 1. Спочатку контекст → потім рішення. Якщо контексту бракує — зроби мінімальний набір припущень і паралельно запропонуй, які дані уточнити.
@@ -113,6 +118,8 @@
 - “Підготуй текст/структуру сторінки/презентації для продукту AgroMatrix”
 
 ### 6) Як ти формуєш відповіді (стандартний шаблон)
+Використовуй цей шаблон ТІЛЬКИ для комплексних запитів (планування сезону, економіка, SOP, інтеграції, ТЗ).
+Для звичайних коротких питань відповідай в 1-3 речення органічно, без обов'язкових секцій.
 1. Ціль (1–2 речення)
 2. Вхідні дані (що відомо / які припущення)
 3. Рішення (план/алгоритм/кроки)
diff --git a/gateway-bot/app.py b/gateway-bot/app.py
index 06537244..53f63dad 100644
--- a/gateway-bot/app.py
+++ b/gateway-bot/app.py
@@ -1,12 +1,13 @@
-"""
-FastAPI app instance for Gateway Bot
-"""
+"""FastAPI app instance for Gateway Bot."""
 import logging
+
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
 from http_api import router as gateway_router
 from http_api_doc import router as doc_router
+from daarion_facade.invoke_api import router as invoke_router
+from daarion_facade.registry_api import router as registry_router
 
 logging.basicConfig(
     level=logging.INFO,
@@ -15,36 +16,47 @@
 
 app = FastAPI(
     title="Bot Gateway with DAARWIZZ",
-    version="1.0.0",
-    description="Gateway service for Telegram/Discord bots → DAGI Router"
+    version="1.1.0",
+    description="Gateway service for Telegram/Discord bots + DAARION public facade"
 )
 
-# CORS middleware
+# CORS for web UI clients (gateway only).
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
+    allow_origins=[
+        "https://daarion.city",
+        "https://www.daarion.city",
+        "http://localhost:3000",
+    ],
+    allow_origin_regex=r"https://.*\.lovable\.app",
     allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    allow_methods=["GET", "POST", "OPTIONS"],
+    allow_headers=["Authorization", "Content-Type"],
 )
 
-# Include gateway routes
+# Existing gateway routes.
 app.include_router(gateway_router, prefix="", tags=["gateway"])
 app.include_router(doc_router, prefix="", tags=["docs"])
 
+# Public facade routes for DAARION.city UI.
+app.include_router(registry_router)
+app.include_router(invoke_router)
+
+
 @app.get("/")
 async def root():
     return {
         "service": "bot-gateway",
-        "version": "1.0.0",
+        "version": "1.1.0",
         "agent": "DAARWIZZ",
         "endpoints": [
             "POST /telegram/webhook",
             "POST /discord/webhook",
-            "POST /api/doc/parse",
-            "POST /api/doc/ingest",
-            "POST /api/doc/ask",
-            "GET /api/doc/context/{session_id}",
-            "GET /health"
+            "GET /v1/registry/agents",
+            "GET /v1/registry/districts",
+            "GET /v1/metrics",
+            "POST /v1/invoke",
+            "GET /v1/jobs/{job_id}",
+            "GET /health",
         ]
     }
diff --git a/gateway-bot/daarion_facade/__init__.py b/gateway-bot/daarion_facade/__init__.py
new file mode 100644
index 00000000..2789eef5
--- /dev/null
+++ b/gateway-bot/daarion_facade/__init__.py
@@ -0,0 +1 @@
+"""DAARION public facade package."""
diff --git a/gateway-bot/daarion_facade/invoke_api.py b/gateway-bot/daarion_facade/invoke_api.py
new file mode 100644
index 00000000..6cab2b54
--- /dev/null
+++ b/gateway-bot/daarion_facade/invoke_api.py
@@ -0,0 +1,212 @@
+import asyncio
+from datetime import datetime, timezone
+import hmac
+import json
+import os
+import uuid
+from typing import Any, Dict, List
+
+import httpx
+from fastapi import APIRouter, HTTPException, Request, status
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, Field
+
+from .redis_jobs import create_job, enqueue_job, get_job
+from .registry_api import _load_registry
+
+router = APIRouter(prefix="/v1", tags=["daarion-facade"])
+
+EVENT_TERMINAL_STATUSES = {"done", "failed"}
+EVENT_KNOWN_STATUSES = {"queued", "running", "done", "failed"}
+EVENT_POLL_SECONDS = float(os.getenv("DAARION_JOB_EVENTS_POLL_SECONDS", "0.5"))
+ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000").rstrip("/")
+ROUTER_REVIEW_TIMEOUT = float(os.getenv("DAARION_ROUTER_REVIEW_TIMEOUT_SECONDS", "20"))
+AGROMATRIX_REVIEW_AUTH_MODE = os.getenv("AGROMATRIX_REVIEW_AUTH_MODE", "bearer").strip().lower()
+AGROMATRIX_REVIEW_BEARER_TOKENS = [
+    part.strip()
+    for part in os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").replace(";", ",").split(",")
+    if part.strip()
+]
+
+
+class InvokeInput(BaseModel):
+    prompt: str = Field(min_length=1)
+    images: List[str] = Field(default_factory=list)
+
+
+class InvokeRequest(BaseModel):
+    agent_id: str
+    input: InvokeInput
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
+class InvokeResponse(BaseModel):
+    job_id: str
+    status: str
+    status_url: str
+
+
+class SharedMemoryReviewRequest(BaseModel):
+    point_id: str
+    approve: bool
+    reviewer: str | None = None
+    note: str | None = None
+
+
+def _extract_bearer_token(request: Request) -> str:
+    auth_header = request.headers.get("Authorization", "")
+    if not auth_header.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Missing Bearer token")
+    token = auth_header[len("Bearer ") :].strip()
+    if not token:
+        raise HTTPException(status_code=401, detail="Empty Bearer token")
+    return token
+
+
+def _require_mentor_auth(request: Request) -> str:
+    mode = AGROMATRIX_REVIEW_AUTH_MODE
+    if mode in {"off", "none", "disabled"}:
+        return ""
+    if mode != "bearer":
+        raise HTTPException(status_code=500, detail=f"Unsupported AGROMATRIX_REVIEW_AUTH_MODE={mode}")
+    if not AGROMATRIX_REVIEW_BEARER_TOKENS:
+        raise HTTPException(status_code=503, detail="Review auth is not configured")
+    token = _extract_bearer_token(request)
+    if not any(hmac.compare_digest(token, candidate) for candidate in AGROMATRIX_REVIEW_BEARER_TOKENS):
+        raise HTTPException(status_code=403, detail="Invalid mentor token")
mentor token") + return token + + +async def _router_json( + method: str, + path: str, + *, + payload: Dict[str, Any] | None = None, + params: Dict[str, Any] | None = None, + authorization: str | None = None, +) -> Dict[str, Any]: + headers: Dict[str, str] = {} + if authorization: + headers["Authorization"] = authorization + url = f"{ROUTER_URL}{path}" + + try: + async with httpx.AsyncClient(timeout=ROUTER_REVIEW_TIMEOUT) as client: + resp = await client.request(method, url, json=payload, params=params, headers=headers) + except httpx.TimeoutException: + raise HTTPException(status_code=504, detail="Router timeout") + except Exception as e: + raise HTTPException(status_code=502, detail=f"Router unavailable: {e}") + + try: + body = resp.json() + except Exception: + body = {"raw": resp.text} + + if resp.status_code >= 400: + detail = body.get("detail") if isinstance(body, dict) else body + raise HTTPException(status_code=resp.status_code, detail=detail or f"Router error {resp.status_code}") + return body if isinstance(body, dict) else {"data": body} + + +def _sse_message(event: str, payload: Dict[str, Any]) -> str: + return f"event: {event}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n" + + +@router.post("/invoke", status_code=status.HTTP_202_ACCEPTED, response_model=InvokeResponse) +async def invoke(payload: InvokeRequest) -> InvokeResponse: + registry = _load_registry().get("agents", {}) + if payload.agent_id not in registry: + raise HTTPException(status_code=404, detail=f"Unknown agent_id: {payload.agent_id}") + + job_id = f"job_{uuid.uuid4().hex}" + now = datetime.now(timezone.utc).isoformat() + job_doc = { + "job_id": job_id, + "status": "queued", + "agent_id": payload.agent_id, + "input": payload.input.model_dump(), + "metadata": payload.metadata, + "result": None, + "error": None, + "created_at": now, + "updated_at": now, + "started_at": None, + "finished_at": None, + } + await create_job(job_id, job_doc) + await enqueue_job(job_id) + return InvokeResponse(job_id=job_id, status="queued", status_url=f"/v1/jobs/{job_id}") + + +@router.get("/jobs/{job_id}") +async def job_status(job_id: str) -> Dict[str, Any]: + job = await get_job(job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + return job + + +@router.get("/jobs/{job_id}/events") +async def job_events(job_id: str, request: Request) -> StreamingResponse: + existing = await get_job(job_id) + if not existing: + raise HTTPException(status_code=404, detail="Job not found") + + async def event_stream(): + last_state = None + yield "retry: 1000\n\n" + + while True: + if await request.is_disconnected(): + break + + job = await get_job(job_id) + if not job: + yield _sse_message("failed", {"job_id": job_id, "status": "failed", "error": {"message": "Job not found"}}) + break + + status_value = str(job.get("status", "unknown")) + updated_at = str(job.get("updated_at", "")) + state = (status_value, updated_at) + + if state != last_state: + event_name = status_value if status_value in EVENT_KNOWN_STATUSES else "status" + yield _sse_message(event_name, job) + last_state = state + + if status_value in EVENT_TERMINAL_STATUSES: + break + + await asyncio.sleep(EVENT_POLL_SECONDS) + + return StreamingResponse( + event_stream(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + +@router.get("/agromatrix/shared-memory/pending") +async def agromatrix_shared_pending(limit: int = 50) -> Dict[str, Any]: + return await 
+@router.get("/agromatrix/shared-memory/pending")
+async def agromatrix_shared_pending(limit: int = 50) -> Dict[str, Any]:
+    return await _router_json(
+        "GET",
+        "/v1/agromatrix/shared-memory/pending",
+        params={"limit": max(1, min(limit, 200))},
+    )
+
+
+@router.post("/agromatrix/shared-memory/review")
+async def agromatrix_shared_review(req: SharedMemoryReviewRequest, request: Request) -> Dict[str, Any]:
+    token = _require_mentor_auth(request)
+    auth_header = f"Bearer {token}" if token else None
+    return await _router_json(
+        "POST",
+        "/v1/agromatrix/shared-memory/review",
+        payload=req.model_dump(),
+        authorization=auth_header,
+    )
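The two review endpoints above proxy to the router, and the POST requires a Bearer token from AGROMATRIX_REVIEW_BEARER_TOKENS. A minimal mentor-side sketch; the gateway URL, token, and point_id are placeholders:

```python
import httpx

GATEWAY = "http://localhost:8080"          # placeholder gateway address
MENTOR_TOKEN = "configured-mentor-token"   # placeholder; must match AGROMATRIX_REVIEW_BEARER_TOKENS

# List pending shared-memory points (the GET proxy itself requires no auth).
pending = httpx.get(f"{GATEWAY}/v1/agromatrix/shared-memory/pending", params={"limit": 10}).json()

# Approve one point as a mentor.
httpx.post(
    f"{GATEWAY}/v1/agromatrix/shared-memory/review",
    headers={"Authorization": f"Bearer {MENTOR_TOKEN}"},
    json={"point_id": "example-point-id", "approve": True, "reviewer": "mentor"},
).raise_for_status()
```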
diff --git a/gateway-bot/daarion_facade/metrics_poller.py b/gateway-bot/daarion_facade/metrics_poller.py
new file mode 100644
index 00000000..26ba63c6
--- /dev/null
+++ b/gateway-bot/daarion_facade/metrics_poller.py
@@ -0,0 +1,287 @@
+import asyncio
+import json
+import logging
+import os
+import time
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional, Tuple
+
+import httpx
+from redis.asyncio import Redis
+
+from .registry_api import _load_crewai_roles, _load_district_registry, _load_registry
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
+logger = logging.getLogger("daarion-metrics-poller")
+
+REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+POLL_INTERVAL_SECONDS = int(os.getenv("DAARION_METRICS_POLL_INTERVAL_SECONDS", "10"))
+METRICS_TTL_SECONDS = int(os.getenv("DAARION_METRICS_TTL_SECONDS", "60"))
+HTTP_CONNECT_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS", "2"))
+HTTP_TOTAL_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS", "5"))
+NODES_TOTAL = int(os.getenv("DAARION_NODE_COUNT", "1"))
+MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
+
+DASHBOARD_KEY = "daarion:metrics:dashboard"
+DISTRICT_KEY_PREFIX = "daarion:metrics:district"
+
+_redis: Optional[Redis] = None
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _ensure_url(value: str) -> str:
+    value = (value or "").strip()
+    if not value:
+        return ""
+    if value.startswith("http://") or value.startswith("https://"):
+        return value
+    return f"https://{value}"
+
+
+def _health_candidates(district: Dict[str, Any]) -> List[str]:
+    base = _ensure_url(str(district.get("domain") or ""))
+    candidates: List[str] = []
+
+    explicit = str(district.get("health_url") or "").strip()
+    if explicit:
+        candidates.append(_ensure_url(explicit))
+
+    if base:
+        candidates.extend(
+            [
+                f"{base}/.well-known/daarion-health.json",
+                f"{base}/health",
+                f"{base}/v1/health",
+            ]
+        )
+
+    dedup: List[str] = []
+    seen = set()
+    for url in candidates:
+        if url and url not in seen:
+            dedup.append(url)
+            seen.add(url)
+    return dedup
+
+
+def _extract_agents_online(payload: Dict[str, Any], agents_total: int) -> Optional[int]:
+    raw = payload.get("agents_online")
+    if isinstance(raw, bool):
+        return agents_total if raw else 0
+    if isinstance(raw, int):
+        return max(0, min(raw, agents_total))
+
+    agents = payload.get("agents")
+    if isinstance(agents, list):
+        count = 0
+        for agent in agents:
+            if not isinstance(agent, dict):
+                continue
+            status = str(agent.get("status", "")).lower()
+            if status in {"online", "active", "ok"}:
+                count += 1
+        return min(count, agents_total)
+
+    return None
+
+
+async def redis_client() -> Redis:
+    global _redis
+    if _redis is None:
+        _redis = Redis.from_url(REDIS_URL, decode_responses=True)
+    return _redis
+
+
+async def close_redis() -> None:
+    global _redis
+    if _redis is not None:
+        await _redis.close()
+        _redis = None
+
+
+async def _fetch_json_with_latency(
+    client: httpx.AsyncClient,
+    url: str,
+) -> Tuple[bool, Optional[Dict[str, Any]], Optional[float], Optional[str]]:
+    started = time.perf_counter()
+    try:
+        response = await client.get(url)
+        latency_ms = round((time.perf_counter() - started) * 1000, 2)
+        if response.status_code >= 400:
+            return False, None, latency_ms, f"HTTP {response.status_code}"
+
+        data: Optional[Dict[str, Any]] = None
+        try:
+            parsed = response.json()
+            if isinstance(parsed, dict):
+                data = parsed
+        except Exception:
+            data = None
+
+        return True, data, latency_ms, None
+    except Exception as e:
+        latency_ms = round((time.perf_counter() - started) * 1000, 2)
+        return False, None, latency_ms, str(e)
+
+
+async def _read_memory_vectors(client: httpx.AsyncClient) -> int:
+    try:
+        ok, payload, _, _ = await _fetch_json_with_latency(client, f"{MEMORY_SERVICE_URL}/health")
+        if not ok or not payload:
+            return 0
+        return int(payload.get("vector_store", {}).get("memories", {}).get("vectors_count", 0) or 0)
+    except Exception:
+        return 0
+
+
+async def _registry_snapshot() -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], int, int]:
+    raw_districts = _load_district_registry().get("districts", [])
+    districts = [d for d in raw_districts if isinstance(d, dict) and d.get("district_id")]
+
+    agents_map = _load_registry().get("agents", {})
+    role_counts = await _load_crewai_roles()
+
+    by_district: Dict[str, List[Dict[str, Any]]] = {}
+    subagents_total = 0
+
+    for aid, cfg in agents_map.items():
+        if not isinstance(cfg, dict):
+            continue
+        aid_str = str(aid)
+        district_id = str(cfg.get("district_id") or "city-core")
+        subagents_total += int(role_counts.get(aid_str, 0))
+
+        by_district.setdefault(district_id, []).append(
+            {
+                "agent_id": aid_str,
+                "status": str(cfg.get("status", "active")),
+            }
+        )
+
+    return districts, by_district, len(agents_map), subagents_total
+
+
+async def build_dashboard() -> Dict[str, Any]:
+    districts, agents_by_district, agents_total, subagents_total = await _registry_snapshot()
+    timeout = httpx.Timeout(timeout=HTTP_TOTAL_TIMEOUT_SECONDS, connect=HTTP_CONNECT_TIMEOUT_SECONDS)
+
+    by_district: List[Dict[str, Any]] = []
+    districts_online = 0
+    agents_online_total = 0
+
+    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
+        memory_vectors = await _read_memory_vectors(client)
+
+        for district in districts:
+            district_id = str(district.get("district_id"))
+            title = district.get("title") or district_id
+            domain = str(district.get("domain") or "")
+            status = district.get("status") or "active"
+            members = agents_by_district.get(district_id, [])
+            agents_total_district = len(members)
+
+            sample = {
+                "district_id": district_id,
+                "title": title,
+                "domain": domain,
+                "status": status,
+                "ok": False,
+                "agents_total": agents_total_district,
+                "agents_online": 0,
+                "latency_ms": None,
+                "last_check_ts": _now_iso(),
+                "error": None,
+            }
+
+            last_error = "No health endpoint configured"
+            for candidate in _health_candidates(district):
+                ok, payload, latency_ms, error_message = await _fetch_json_with_latency(client, candidate)
+                sample["latency_ms"] = latency_ms
+                if ok:
+                    sample["ok"] = True
+                    sample["error"] = None
+                    inferred = _extract_agents_online(payload or {}, agents_total_district)
+                    sample["agents_online"] = inferred if inferred is not None else agents_total_district
+                    break
+                last_error = error_message or "health check failed"
+
+            if sample["ok"]:
+                districts_online += 1
+                agents_online_total += int(sample.get("agents_online") or 0)
+            else:
+                sample["error"] = {"message": last_error}
+
+            by_district.append(sample)
+
+    return {
+        "global": {
+            "nodes": NODES_TOTAL,
+            "districts": len(districts),
+            "agents": agents_total,
+            "subagents": subagents_total,
+            "memory_vectors": memory_vectors,
+            "districts_online": districts_online,
+            "agents_online": agents_online_total,
+        },
+        "by_district": by_district,
+        "updated_at": _now_iso(),
+    }
+
+
+async def publish_dashboard(dashboard: Dict[str, Any]) -> None:
+    redis = await redis_client()
+    payload = json.dumps(dashboard, ensure_ascii=False)
+    await redis.set(DASHBOARD_KEY, payload, ex=METRICS_TTL_SECONDS)
+
+    for row in dashboard.get("by_district", []):
+        district_id = row.get("district_id")
+        if not district_id:
+            continue
+        key = f"{DISTRICT_KEY_PREFIX}:{district_id}"
+        await redis.set(key, json.dumps(row, ensure_ascii=False), ex=METRICS_TTL_SECONDS)
+
+
+async def run_once() -> None:
+    dashboard = await build_dashboard()
+    await publish_dashboard(dashboard)
+    logger.info(
+        "dashboard_updated districts=%s districts_online=%s agents=%s agents_online=%s",
+        dashboard["global"].get("districts"),
+        dashboard["global"].get("districts_online"),
+        dashboard["global"].get("agents"),
+        dashboard["global"].get("agents_online"),
+    )
+
+
+async def worker_loop() -> None:
+    logger.info(
+        "metrics_poller_started interval=%ss ttl=%ss redis=%s",
+        POLL_INTERVAL_SECONDS,
+        METRICS_TTL_SECONDS,
+        REDIS_URL,
+    )
+    while True:
+        started = time.perf_counter()
+        try:
+            await run_once()
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            logger.exception("metrics_poller_cycle_failed")
+
+        elapsed = time.perf_counter() - started
+        sleep_for = max(1.0, POLL_INTERVAL_SECONDS - elapsed)
+        await asyncio.sleep(sleep_for)
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(worker_loop())
+    finally:
+        try:
+            asyncio.run(close_redis())
+        except Exception:
+            pass
diff --git a/gateway-bot/daarion_facade/redis_jobs.py b/gateway-bot/daarion_facade/redis_jobs.py
new file mode 100644
index 00000000..86c8174a
--- /dev/null
+++ b/gateway-bot/daarion_facade/redis_jobs.py
@@ -0,0 +1,84 @@
+import asyncio
+import json
+import os
+from typing import Any, Dict, Optional
+
+from redis.asyncio import Redis
+
+REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+JOB_KEY_PREFIX = "daarion:jobs"
+QUEUE_KEY = "daarion:jobs:queue"
+JOB_TTL_SECONDS = int(os.getenv("DAARION_JOB_TTL_SECONDS", str(72 * 3600)))
+
+_redis: Optional[Redis] = None
+
+
+def _job_key(job_id: str) -> str:
+    return f"{JOB_KEY_PREFIX}:{job_id}"
+
+
+async def redis_client() -> Redis:
+    global _redis
+    if _redis is None:
+        _redis = Redis.from_url(REDIS_URL, decode_responses=True)
+    return _redis
+
+
+async def close_redis() -> None:
+    global _redis
+    if _redis is not None:
+        await _redis.close()
+        _redis = None
+
+
+async def create_job(job_id: str, payload: Dict[str, Any]) -> None:
+    r = await redis_client()
+    key = _job_key(job_id)
+    await r.set(key, json.dumps(payload, ensure_ascii=False), ex=JOB_TTL_SECONDS)
+
+
+async def get_job(job_id: str) -> Optional[Dict[str, Any]]:
+    r = await redis_client()
+    raw = await r.get(_job_key(job_id))
+    if not raw:
+        return None
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        return None
+
+
+async def update_job(job_id: str, patch: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    current = await get_job(job_id)
+    if not current:
+        return None
+    current.update(patch)
+    await create_job(job_id, current)
+    return current
+
+
+async def enqueue_job(job_id: str) -> None:
+    r = await redis_client()
+    await r.lpush(QUEUE_KEY, job_id)
+
+
+async def dequeue_job(block_seconds: int = 5) -> Optional[str]:
+    r = await redis_client()
+    result = await r.brpop(QUEUE_KEY, timeout=block_seconds)
+    if not result:
+        return None
+    _, job_id = result
+    return job_id
+
+
+async def wait_for_redis(timeout_seconds: int = 30) -> None:
+    deadline = asyncio.get_running_loop().time() + timeout_seconds
+    while True:
+        try:
+            r = await redis_client()
+            await r.ping()
+            return
+        except Exception:
+            if asyncio.get_running_loop().time() >= deadline:
+                raise
+            await asyncio.sleep(1)
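The helpers above implement a simple list-based queue with one JSON document per job. A round-trip sketch against a local Redis; the job id and payload are illustrative:

```python
import asyncio

from daarion_facade.redis_jobs import (
    create_job, dequeue_job, enqueue_job, get_job, update_job,
)

async def demo() -> None:
    # Producer side: persist the job document, then push its id onto the queue.
    await create_job("job_demo", {"job_id": "job_demo", "status": "queued"})
    await enqueue_job("job_demo")

    # Consumer side: BRPOP blocks up to block_seconds; an empty queue yields None.
    job_id = await dequeue_job(block_seconds=5)
    if job_id:
        await update_job(job_id, {"status": "done"})
        print(await get_job(job_id))

asyncio.run(demo())
```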
diff --git a/gateway-bot/daarion_facade/registry_api.py b/gateway-bot/daarion_facade/registry_api.py
new file mode 100644
index 00000000..9a0e7f42
--- /dev/null
+++ b/gateway-bot/daarion_facade/registry_api.py
@@ -0,0 +1,268 @@
+import json
+import os
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import httpx
+from fastapi import APIRouter
+from redis.asyncio import Redis
+
+router = APIRouter(prefix="/v1", tags=["daarion-facade"])
+
+REGISTRY_CACHE_TTL = int(os.getenv("REGISTRY_CACHE_TTL", "30"))
+MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
+CREWAI_SERVICE_URL = os.getenv("CREWAI_SERVICE_URL", "http://dagi-staging-crewai-service:9010")
+REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+METRICS_DASHBOARD_KEY = "daarion:metrics:dashboard"
+
+_REGISTRY_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
+_DISTRICT_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
+_CREWAI_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": {}}
+_REDIS: Optional[Redis] = None
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _registry_paths() -> List[Path]:
+    return [
+        Path("/app/gateway-bot/agent_registry.json"),
+        Path("/opt/microdao-daarion/config/agent_registry.json"),
+        Path(__file__).resolve().parents[1] / "agent_registry.json",
+    ]
+
+
+def _district_paths() -> List[Path]:
+    return [
+        Path("/app/gateway-bot/district_registry.json"),
+        Path(__file__).resolve().parents[1] / "district_registry.json",
+    ]
+
+
+def _load_registry() -> Dict[str, Any]:
+    now = time.time()
+    if _REGISTRY_CACHE.get("data") and (now - _REGISTRY_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL):
+        return _REGISTRY_CACHE["data"]
+
+    for path in _registry_paths():
+        if path.exists():
+            with path.open("r", encoding="utf-8") as f:
+                data = json.load(f)
+            _REGISTRY_CACHE.update({"loaded_at": now, "data": data})
+            return data
+
+    data = {"agents": {}}
+    _REGISTRY_CACHE.update({"loaded_at": now, "data": data})
+    return data
+
+
+def _load_district_registry() -> Dict[str, Any]:
+    now = time.time()
+    if _DISTRICT_CACHE.get("data") and (now - _DISTRICT_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL):
+        return _DISTRICT_CACHE["data"]
+
+    for path in _district_paths():
+        if path.exists():
+            with path.open("r", encoding="utf-8") as f:
+                data = json.load(f)
+            _DISTRICT_CACHE.update({"loaded_at": now, "data": data})
+            return data
+
+    data = {"districts": []}
+    _DISTRICT_CACHE.update({"loaded_at": now, "data": data})
+    return data
+
+
+async def _redis_client() -> Redis:
+    global _REDIS
+    if _REDIS is None:
+        _REDIS = Redis.from_url(REDIS_URL, decode_responses=True)
+    return _REDIS
+
+
+async def _load_cached_dashboard() -> Optional[Dict[str, Any]]:
+    try:
+        r = await _redis_client()
+        raw = await r.get(METRICS_DASHBOARD_KEY)
+        if not raw:
+            return None
+        return json.loads(raw)
+    except Exception:
+        return None
+
+
+async def _load_crewai_roles() -> Dict[str, int]:
+    now = time.time()
+    if now - _CREWAI_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL:
+        return _CREWAI_CACHE.get("data", {})
+
+    out: Dict[str, int] = {}
+    try:
+        async with httpx.AsyncClient(timeout=8.0) as client:
+            resp = await client.get(f"{CREWAI_SERVICE_URL}/crew/agents")
+            if resp.status_code == 200:
+                payload = resp.json()
+                for aid, info in payload.items():
+                    default_roles = info.get("default_roles")
+                    out[str(aid)] = int(default_roles) if isinstance(default_roles, int) else 0
+    except Exception:
+        out = {}
+
+    _CREWAI_CACHE.update({"loaded_at": now, "data": out})
+    return out
+
+
+@router.get("/registry/agents")
+async def get_agents() -> Dict[str, Any]:
+    reg = _load_registry()
+    agents = reg.get("agents", {}) if isinstance(reg, dict) else {}
+    role_counts = await _load_crewai_roles()
+
+    items: List[Dict[str, Any]] = []
+    for agent_id, cfg in agents.items():
+        if not isinstance(cfg, dict):
+            continue
+        domains = cfg.get("domains") or []
+        district_id = cfg.get("district_id") or "city-core"
+        items.append(
+            {
+                "agent_id": agent_id,
+                "title": cfg.get("display_name") or agent_id,
+                "role": cfg.get("canonical_role") or "",
+                "domain_primary": domains[0] if domains else "general",
+                "domain_aliases": domains[1:] if len(domains) > 1 else [],
+                "visibility": cfg.get("visibility", "public"),
+                "status": cfg.get("status", "active"),
+                "team": {"subagents_total": role_counts.get(agent_id, 0)},
+                "district_id": district_id,
+                "avatar_url": cfg.get("avatar_url"),
+                "health_url": cfg.get("health_url"),
+            }
+        )
+
+    return {"items": items, "total": len(items)}
+
+
+@router.get("/registry/districts")
+async def get_districts() -> Dict[str, Any]:
+    agents_payload = await get_agents()
+    agents = agents_payload.get("items", [])
+    by_district: Dict[str, List[Dict[str, Any]]] = {}
+    for a in agents:
+        by_district.setdefault(a.get("district_id", "city-core"), []).append(a)
+
+    catalog = _load_district_registry().get("districts", [])
+    catalog_by_id: Dict[str, Dict[str, Any]] = {
+        str(d.get("district_id")): d for d in catalog if isinstance(d, dict) and d.get("district_id")
+    }
+
+    district_ids = sorted(set(catalog_by_id.keys()) | set(by_district.keys()))
+    items: List[Dict[str, Any]] = []
+
+    for district_id in district_ids:
+        members = by_district.get(district_id, [])
+        base = catalog_by_id.get(district_id, {})
+        domain = base.get("domain") or ("daarion.city" if district_id == "city-core" else f"{district_id}.daarion.city")
+
+        lead_agent_id = base.get("lead_agent_id")
+        if not lead_agent_id:
+            if district_id == "city-core" and any(m.get("agent_id") == "daarwizz" for m in members):
+                lead_agent_id = "daarwizz"
+            elif members:
+                lead_agent_id = members[0].get("agent_id")
+            else:
+                lead_agent_id = None
+
+        items.append(
+            {
+                "district_id": district_id,
+                "title": base.get("title") or district_id.replace("-", " ").title(),
+                "domain": domain,
+                "status": base.get("status", "active"),
+                "logo_url": base.get("logo_url"),
+                "health_url": base.get("health_url"),
+                "well_known": {
+                    "manifest": f"https://{domain}/.well-known/daarion-district.json",
+                    "health": f"https://{domain}/.well-known/daarion-health.json",
+                    "capabilities": f"https://{domain}/.well-known/daarion-capabilities.json",
+                },
+                "lead_agent_id": lead_agent_id,
+                "agents_total": len(members),
+            }
+        )
+
+    return {"items": items, "total": len(items)}
+
+
+@router.get("/metrics")
+async def get_metrics() -> Dict[str, Any]:
+    agents_payload = await get_agents()
+    districts_payload = await get_districts()
+    agents = agents_payload.get("items", [])
+
+    memory_vectors = 0
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            resp = await client.get(f"{MEMORY_SERVICE_URL}/health")
+            if resp.status_code == 200:
+                data = resp.json()
+                memory_vectors = int(
+                    data.get("vector_store", {})
+                    .get("memories", {})
+                    .get("vectors_count", 0)
+                )
+    except Exception:
+        memory_vectors = 0
+
+    return {
+        "nodes": 1,
+        "districts": districts_payload.get("total", 0),
+        "agents": len(agents),
+        "subagents": sum(int((a.get("team") or {}).get("subagents_total", 0)) for a in agents),
+        "memory_vectors": memory_vectors,
+    }
+
+
+@router.get("/metrics/dashboard")
+async def get_metrics_dashboard() -> Dict[str, Any]:
+    cached = await _load_cached_dashboard()
+    if cached:
+        return cached
+
+    metrics = await get_metrics()
+    districts_payload = await get_districts()
+    districts = districts_payload.get("items", [])
+
+    by_district = []
+    for d in districts:
+        by_district.append(
+            {
+                "district_id": d.get("district_id"),
+                "title": d.get("title"),
+                "domain": d.get("domain"),
+                "status": d.get("status"),
+                "ok": None,
+                "agents_total": d.get("agents_total", 0),
+                "agents_online": None,
+                "latency_ms": None,
+                "last_check_ts": None,
+            }
+        )
+
+    return {
+        "global": {
+            "nodes": metrics.get("nodes", 1),
+            "districts": metrics.get("districts", 0),
+            "agents": metrics.get("agents", 0),
+            "subagents": metrics.get("subagents", 0),
+            "memory_vectors": metrics.get("memory_vectors", 0),
+            "districts_online": 0,
+            "agents_online": 0,
+        },
+        "by_district": by_district,
+        "updated_at": _now_iso(),
+        "source": "fallback_registry",
+    }
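For reference, a small read-side sketch of the facade endpoints defined above (the base URL is a placeholder):

```python
import httpx

BASE = "http://localhost:8080"  # placeholder gateway address

agents = httpx.get(f"{BASE}/v1/registry/agents").json()
districts = httpx.get(f"{BASE}/v1/registry/districts").json()
dashboard = httpx.get(f"{BASE}/v1/metrics/dashboard").json()

print(agents["total"], "agents across", districts["total"], "districts")
for row in dashboard["by_district"]:
    # "ok" is None when the fallback (non-polled) dashboard is served.
    print(row["district_id"], "ok:", row["ok"], "latency_ms:", row["latency_ms"])
```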
item.get("reminder_id")) + return False + + if not chat_id or not reminder_text: + logger.warning("reminder_skip_invalid_payload reminder_id=%s", item.get("reminder_id")) + return False + + body = { + "chat_id": chat_id, + "text": f"⏰ Нагадування ({agent_id})\n\n{reminder_text}\n\n🕒 {due_at}", + } + + url = f"https://api.telegram.org/bot{token}/sendMessage" + async with httpx.AsyncClient(timeout=TELEGRAM_TIMEOUT) as client: + resp = await client.post(url, json=body) + if resp.status_code != 200: + logger.warning( + "reminder_send_failed reminder_id=%s status=%s body=%s", + item.get("reminder_id"), + resp.status_code, + resp.text[:300], + ) + return False + + logger.info("reminder_sent reminder_id=%s agent=%s chat=%s", item.get("reminder_id"), agent_id, chat_id) + return True + + +async def worker_loop() -> None: + logger.info("reminder_worker_started poll_seconds=%s", POLL_SECONDS) + while True: + try: + items = await pop_due_reminders(limit=20) + if items: + for item in items: + try: + await _send_reminder(item) + except Exception: + logger.exception("reminder_send_exception reminder_id=%s", item.get("reminder_id")) + except asyncio.CancelledError: + raise + except Exception: + logger.exception("reminder_worker_cycle_failed") + await asyncio.sleep(POLL_SECONDS) + + +if __name__ == "__main__": + try: + asyncio.run(worker_loop()) + finally: + try: + asyncio.run(close_redis()) + except Exception: + pass diff --git a/gateway-bot/daarion_facade/reminders.py b/gateway-bot/daarion_facade/reminders.py new file mode 100644 index 00000000..dd15ff8f --- /dev/null +++ b/gateway-bot/daarion_facade/reminders.py @@ -0,0 +1,154 @@ +import json +import os +import time +import uuid +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional + +from redis.asyncio import Redis + +REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0") +REMINDER_PREFIX = "daarion:reminders" +REMINDER_BY_ID = f"{REMINDER_PREFIX}:by_id" +REMINDER_SCHEDULE = f"{REMINDER_PREFIX}:schedule" +REMINDER_TTL_SECONDS = int(os.getenv("DAARION_REMINDER_TTL_SECONDS", str(30 * 24 * 3600))) + +_redis: Optional[Redis] = None + + +@dataclass +class Reminder: + reminder_id: str + agent_id: str + chat_id: str + user_id: str + text: str + due_ts: int + created_at: str + + def to_dict(self) -> Dict[str, Any]: + return { + "reminder_id": self.reminder_id, + "agent_id": self.agent_id, + "chat_id": self.chat_id, + "user_id": self.user_id, + "text": self.text, + "due_ts": self.due_ts, + "created_at": self.created_at, + } + + +async def redis_client() -> Redis: + global _redis + if _redis is None: + _redis = Redis.from_url(REDIS_URL, decode_responses=True) + return _redis + + +async def close_redis() -> None: + global _redis + if _redis is not None: + await _redis.close() + _redis = None + + +def _iso_now() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _iso_from_ts(ts: int) -> str: + return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() + + +async def create_reminder(agent_id: str, chat_id: str, user_id: str, text: str, due_ts: int) -> Dict[str, Any]: + reminder = Reminder( + reminder_id=f"rem_{uuid.uuid4().hex[:16]}", + agent_id=agent_id, + chat_id=str(chat_id), + user_id=str(user_id), + text=text.strip(), + due_ts=int(due_ts), + created_at=_iso_now(), + ) + + r = await redis_client() + key = f"{REMINDER_BY_ID}:{reminder.reminder_id}" + payload = json.dumps(reminder.to_dict(), ensure_ascii=False) + + await r.set(key, payload, ex=REMINDER_TTL_SECONDS) + await 
diff --git a/gateway-bot/daarion_facade/reminders.py b/gateway-bot/daarion_facade/reminders.py
new file mode 100644
index 00000000..dd15ff8f
--- /dev/null
+++ b/gateway-bot/daarion_facade/reminders.py
@@ -0,0 +1,154 @@
+import json
+import os
+import time
+import uuid
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+
+from redis.asyncio import Redis
+
+REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
+REMINDER_PREFIX = "daarion:reminders"
+REMINDER_BY_ID = f"{REMINDER_PREFIX}:by_id"
+REMINDER_SCHEDULE = f"{REMINDER_PREFIX}:schedule"
+REMINDER_TTL_SECONDS = int(os.getenv("DAARION_REMINDER_TTL_SECONDS", str(30 * 24 * 3600)))
+
+_redis: Optional[Redis] = None
+
+
+@dataclass
+class Reminder:
+    reminder_id: str
+    agent_id: str
+    chat_id: str
+    user_id: str
+    text: str
+    due_ts: int
+    created_at: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "reminder_id": self.reminder_id,
+            "agent_id": self.agent_id,
+            "chat_id": self.chat_id,
+            "user_id": self.user_id,
+            "text": self.text,
+            "due_ts": self.due_ts,
+            "created_at": self.created_at,
+        }
+
+
+async def redis_client() -> Redis:
+    global _redis
+    if _redis is None:
+        _redis = Redis.from_url(REDIS_URL, decode_responses=True)
+    return _redis
+
+
+async def close_redis() -> None:
+    global _redis
+    if _redis is not None:
+        await _redis.close()
+        _redis = None
+
+
+def _iso_now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def _iso_from_ts(ts: int) -> str:
+    return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
+
+
+async def create_reminder(agent_id: str, chat_id: str, user_id: str, text: str, due_ts: int) -> Dict[str, Any]:
+    reminder = Reminder(
+        reminder_id=f"rem_{uuid.uuid4().hex[:16]}",
+        agent_id=agent_id,
+        chat_id=str(chat_id),
+        user_id=str(user_id),
+        text=text.strip(),
+        due_ts=int(due_ts),
+        created_at=_iso_now(),
+    )
+
+    r = await redis_client()
+    key = f"{REMINDER_BY_ID}:{reminder.reminder_id}"
+    payload = json.dumps(reminder.to_dict(), ensure_ascii=False)
+
+    await r.set(key, payload, ex=REMINDER_TTL_SECONDS)
+    await r.zadd(REMINDER_SCHEDULE, {reminder.reminder_id: float(reminder.due_ts)})
+
+    result = reminder.to_dict()
+    result["due_at"] = _iso_from_ts(reminder.due_ts)
+    return result
+
+
+async def list_reminders(agent_id: str, chat_id: str, user_id: str, limit: int = 10) -> List[Dict[str, Any]]:
+    r = await redis_client()
+    now_ts = int(time.time())
+    ids = await r.zrangebyscore(REMINDER_SCHEDULE, min=now_ts - 365 * 24 * 3600, max="+inf", start=0, num=max(1, limit * 5))
+
+    out: List[Dict[str, Any]] = []
+    for reminder_id in ids:
+        raw = await r.get(f"{REMINDER_BY_ID}:{reminder_id}")
+        if not raw:
+            continue
+        try:
+            item = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        if item.get("agent_id") != agent_id:
+            continue
+        if str(item.get("chat_id")) != str(chat_id):
+            continue
+        if str(item.get("user_id")) != str(user_id):
+            continue
+        item["due_at"] = _iso_from_ts(int(item.get("due_ts", 0)))
+        out.append(item)
+        if len(out) >= limit:
+            break
+    return out
+
+
+async def cancel_reminder(reminder_id: str, agent_id: str, chat_id: str, user_id: str) -> bool:
+    r = await redis_client()
+    key = f"{REMINDER_BY_ID}:{reminder_id}"
+    raw = await r.get(key)
+    if not raw:
+        return False
+    try:
+        item = json.loads(raw)
+    except json.JSONDecodeError:
+        return False
+
+    if item.get("agent_id") != agent_id or str(item.get("chat_id")) != str(chat_id) or str(item.get("user_id")) != str(user_id):
+        return False
+
+    await r.delete(key)
+    await r.zrem(REMINDER_SCHEDULE, reminder_id)
+    return True
+
+
+async def pop_due_reminders(limit: int = 20) -> List[Dict[str, Any]]:
+    r = await redis_client()
+    now_ts = int(time.time())
+    ids = await r.zrangebyscore(REMINDER_SCHEDULE, min="-inf", max=now_ts, start=0, num=max(1, limit))
+    out: List[Dict[str, Any]] = []
+
+    for reminder_id in ids:
+        removed = await r.zrem(REMINDER_SCHEDULE, reminder_id)
+        if removed == 0:
+            continue
+        raw = await r.get(f"{REMINDER_BY_ID}:{reminder_id}")
+        if not raw:
+            continue
+        await r.delete(f"{REMINDER_BY_ID}:{reminder_id}")
+        try:
+            item = json.loads(raw)
+            item["due_at"] = _iso_from_ts(int(item.get("due_ts", now_ts)))
+            out.append(item)
+        except json.JSONDecodeError:
+            continue
+
+    return out
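A short sketch of scheduling and draining a reminder with the helpers above; the agent/chat/user ids and the text are illustrative:

```python
import asyncio
import time

from daarion_facade.reminders import create_reminder, pop_due_reminders

async def demo() -> None:
    # Schedule a reminder 60 seconds from now (ids are placeholders).
    created = await create_reminder(
        agent_id="agromatrix",
        chat_id="123456",
        user_id="78910",
        text="Перевірити вологість ґрунту",
        due_ts=int(time.time()) + 60,
    )
    print("scheduled:", created["reminder_id"], "due_at:", created["due_at"])

    # The worker drains only entries whose score is <= now,
    # so this stays empty until the reminder is due.
    print("due now:", await pop_due_reminders(limit=20))

asyncio.run(demo())
```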
diff --git a/gateway-bot/daarion_facade/worker.py b/gateway-bot/daarion_facade/worker.py
new file mode 100644
index 00000000..a06a8d64
--- /dev/null
+++ b/gateway-bot/daarion_facade/worker.py
@@ -0,0 +1,107 @@
+import asyncio
+from datetime import datetime, timezone
+import logging
+import os
+from typing import Any, Dict
+
+import httpx
+
+from .redis_jobs import close_redis, dequeue_job, get_job, update_job, wait_for_redis
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
+logger = logging.getLogger("daarion-gateway-worker")
+
+ROUTER_BASE_URL = os.getenv("ROUTER_BASE_URL", os.getenv("ROUTER_URL", "http://router:8000"))
+ROUTER_TIMEOUT_SECONDS = float(os.getenv("ROUTER_WORKER_TIMEOUT", "60"))
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+async def _call_router(agent_id: str, input_payload: Dict[str, Any], metadata: Dict[str, Any]) -> Dict[str, Any]:
+    body: Dict[str, Any] = {
+        "prompt": input_payload.get("prompt", ""),
+        "metadata": metadata or {},
+    }
+    images = input_payload.get("images") or []
+    if images:
+        body["images"] = images
+
+    url = f"{ROUTER_BASE_URL}/v1/agents/{agent_id}/infer"
+    async with httpx.AsyncClient(timeout=ROUTER_TIMEOUT_SECONDS) as client:
+        resp = await client.post(url, json=body)
+        resp.raise_for_status()
+        data = resp.json()
+
+    return {
+        "response": data.get("response", ""),
+        "model": data.get("model"),
+        "backend": data.get("backend"),
+        "tokens_used": data.get("tokens_used"),
+    }
+
+
+async def run_once(job_id: str) -> None:
+    job = await get_job(job_id)
+    if not job:
+        logger.warning("job_missing: %s", job_id)
+        return
+
+    await update_job(job_id, {"status": "running", "started_at": _now(), "updated_at": _now()})
+
+    agent_id = job.get("agent_id")
+    input_payload = job.get("input") or {}
+    metadata = job.get("metadata") or {}
+
+    try:
+        result = await _call_router(agent_id, input_payload, metadata)
+        await update_job(
+            job_id,
+            {
+                "status": "done",
+                "result": result,
+                "error": None,
+                "finished_at": _now(),
+                "updated_at": _now(),
+            },
+        )
+        logger.info("job_done: %s agent=%s", job_id, agent_id)
+    except Exception as e:
+        await update_job(
+            job_id,
+            {
+                "status": "failed",
+                "error": {"type": e.__class__.__name__, "message": str(e)},
+                "finished_at": _now(),
+                "updated_at": _now(),
+            },
+        )
+        logger.exception("job_failed: %s agent=%s", job_id, agent_id)
+
+
+async def worker_loop() -> None:
+    await wait_for_redis(60)
+    logger.info("worker_started router=%s", ROUTER_BASE_URL)
+
+    while True:
+        try:
+            job_id = await dequeue_job(block_seconds=10)
+            if not job_id:
+                continue
+            await run_once(job_id)
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            logger.exception("worker_loop_error")
+            await asyncio.sleep(1)
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(worker_loop())
+    finally:
+        try:
+            asyncio.run(close_redis())
+        except Exception:
+            pass
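The job worker above, the metrics poller, and the reminder worker are each runnable as standalone modules. If they were ever combined into one process instead, a minimal supervisor sketch (an assumption, not part of this change) could look like:

```python
import asyncio

# Assumes all three daarion_facade worker modules are importable in this process.
from daarion_facade import metrics_poller, reminder_worker, worker

async def main() -> None:
    # Run the three loops concurrently; if one raises, gather propagates it.
    await asyncio.gather(
        worker.worker_loop(),
        metrics_poller.worker_loop(),
        reminder_worker.worker_loop(),
    )

asyncio.run(main())
```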
"senpai", + "title": "Senpai District", + "domain": "senpai.daarion.city", + "status": "active", + "logo_url": null, + "lead_agent_id": "senpai" + } + ] +} diff --git a/gateway-bot/http_api.py b/gateway-bot/http_api.py index 8bb526d3..ca1c5fce 100644 --- a/gateway-bot/http_api.py +++ b/gateway-bot/http_api.py @@ -1871,23 +1871,53 @@ async def process_document( Dict з результатом обробки """ mime_type = document.get("mime_type", "") + mime_type_l = (mime_type or "").lower() file_name = document.get("file_name", "") file_id = document.get("file_id") file_name_lower = file_name.lower() - allowed_exts = {".pdf", ".docx", ".txt", ".md", ".csv", ".xlsx", ".zip"} + allowed_exts = { + ".pdf", ".doc", ".docx", ".rtf", ".odt", + ".txt", ".md", ".markdown", + ".csv", ".tsv", ".xls", ".xlsx", ".xlsm", ".ods", + ".ppt", ".pptx", ".odp", + ".json", ".yaml", ".yml", ".xml", ".html", ".htm", + ".zip", + ".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", + } is_allowed = any(file_name_lower.endswith(ext) for ext in allowed_exts) - if mime_type == "application/pdf": + if mime_type_l == "application/pdf": is_allowed = True - if mime_type in { + if mime_type_l in { + "application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/rtf", + "text/rtf", + "application/vnd.oasis.opendocument.text", + "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel.sheet.macroenabled.12", + "application/vnd.oasis.opendocument.spreadsheet", + "application/vnd.ms-powerpoint", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.oasis.opendocument.presentation", "text/plain", "text/markdown", "text/csv", + "text/tab-separated-values", + "application/json", + "application/yaml", + "application/x-yaml", + "text/yaml", + "application/xml", + "text/xml", + "text/html", "application/zip", + "application/x-zip-compressed", }: is_allowed = True + if mime_type_l.startswith("image/"): + is_allowed = True if is_allowed and file_id: logger.info(f"{agent_config.name}: Document from {username} (tg:{user_id}), file_id: {file_id}, file_name: {file_name}") @@ -2027,7 +2057,7 @@ async def process_document( telegram_token = agent_config.get_telegram_token() await send_telegram_message( chat_id, - "Наразі підтримуються формати: PDF, DOCX, TXT, MD, CSV, XLSX, ZIP.", + "Підтримуються формати: PDF/DOC/DOCX/RTF/ODT, TXT/MD/CSV/TSV, XLS/XLSX/XLSM/ODS, PPT/PPTX/ODP, JSON/YAML/XML/HTML, ZIP, зображення.", telegram_token, ) return {"ok": False, "error": "Unsupported document type"} @@ -3681,7 +3711,8 @@ async def _old_telegram_webhook(update: TelegramUpdate): doc_url=file_url, file_name=file_name, dao_id=dao_id, - user_id=f"tg:{user_id}" + user_id=f"tg:{user_id}", + agent_id=agent_config.agent_id, ) if result.success: @@ -3705,7 +3736,8 @@ async def _old_telegram_webhook(update: TelegramUpdate): result = await ingest_document( session_id=session_id, dao_id=dao_id, - user_id=f"tg:{user_id}" + user_id=f"tg:{user_id}", + agent_id=agent_config.agent_id, ) if result.success: diff --git a/gateway-bot/http_api_doc.py b/gateway-bot/http_api_doc.py index 57ef7f89..e8e56eae 100644 --- a/gateway-bot/http_api_doc.py +++ b/gateway-bot/http_api_doc.py @@ -6,20 +6,32 @@ Endpoints: - POST /api/doc/parse - Parse a document - POST /api/doc/ingest - Ingest document to RAG - POST /api/doc/ask - Ask question about document +- POST /api/doc/update - Update existing document text (versioned) +- POST 
/api/doc/publish - Publish physical file version via artifact registry +- GET /api/doc/versions/{doc_id} - List document versions +- GET /api/doc/artifacts/{artifact_id}/versions/{version_id}/download - Download via gateway proxy """ import logging +import os +import re from typing import Optional, Dict, Any from fastapi import APIRouter, HTTPException, UploadFile, File, Form +from fastapi.responses import Response from pydantic import BaseModel +import httpx from services.doc_service import ( doc_service, parse_document, ingest_document, ask_about_document, + update_document, + list_document_versions, + publish_document_artifact, get_doc_context, ParsedResult, IngestResult, + UpdateResult, QAResult, DocContext ) @@ -27,6 +39,8 @@ from services.doc_service import ( logger = logging.getLogger(__name__) router = APIRouter() +ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/") +DOC_DOWNLOAD_TIMEOUT_SECONDS = float(os.getenv("DOC_DOWNLOAD_TIMEOUT_SECONDS", "60")) # ======================================== @@ -52,6 +66,7 @@ class IngestDocumentRequest(BaseModel): file_name: Optional[str] = None dao_id: Optional[str] = None user_id: Optional[str] = None + agent_id: str = "daarwizz" class AskDocumentRequest(BaseModel): @@ -61,6 +76,40 @@ class AskDocumentRequest(BaseModel): doc_id: Optional[str] = None dao_id: Optional[str] = None user_id: Optional[str] = None + agent_id: str = "daarwizz" + + +class UpdateDocumentRequest(BaseModel): + """Request to update existing document content.""" + session_id: str + doc_id: Optional[str] = None + doc_url: Optional[str] = None + file_name: Optional[str] = None + text: Optional[str] = None + dao_id: Optional[str] = None + user_id: Optional[str] = None + agent_id: str = "daarwizz" + storage_ref: Optional[str] = None + publish_artifact: bool = False + artifact_id: Optional[str] = None + target_format: Optional[str] = None + artifact_label: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + +class PublishDocumentRequest(BaseModel): + """Request to publish document as physical artifact version.""" + session_id: str + doc_id: Optional[str] = None + doc_url: Optional[str] = None + file_name: Optional[str] = None + text: Optional[str] = None + dao_id: Optional[str] = None + user_id: Optional[str] = None + artifact_id: Optional[str] = None + target_format: Optional[str] = None + artifact_label: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None # ======================================== @@ -167,7 +216,8 @@ async def ingest_document_endpoint(request: IngestDocumentRequest): doc_url=request.doc_url, file_name=request.file_name, dao_id=request.dao_id, - user_id=request.user_id + user_id=request.user_id, + agent_id=request.agent_id, ) if not result.success: @@ -209,7 +259,8 @@ async def ask_about_document_endpoint(request: AskDocumentRequest): question=request.question, doc_id=doc_id, dao_id=request.dao_id, - user_id=request.user_id + user_id=request.user_id, + agent_id=request.agent_id, ) if not result.success: @@ -227,6 +278,107 @@ async def ask_about_document_endpoint(request: AskDocumentRequest): raise HTTPException(status_code=500, detail=str(e)) +@router.post("/api/doc/update") +async def update_document_endpoint(request: UpdateDocumentRequest): + """ + Update a document and bump its version. + If text is omitted and doc_url exists, text is re-parsed from the source document. 
+ """ + try: + result = await update_document( + session_id=request.session_id, + doc_id=request.doc_id, + doc_url=request.doc_url, + file_name=request.file_name, + text=request.text, + dao_id=request.dao_id, + user_id=request.user_id, + agent_id=request.agent_id, + storage_ref=request.storage_ref, + publish_artifact=request.publish_artifact, + artifact_id=request.artifact_id, + target_format=request.target_format, + artifact_label=request.artifact_label, + metadata=request.metadata, + ) + if not result.success: + raise HTTPException(status_code=400, detail=result.error) + response = { + "ok": True, + "doc_id": result.doc_id, + "version_no": result.version_no, + "version_id": result.version_id, + "updated_chunks": result.updated_chunks, + "status": result.status, + "publish_error": result.publish_error, + "artifact_id": result.artifact_id, + "artifact_version_id": result.artifact_version_id, + "artifact_storage_key": result.artifact_storage_key, + "artifact_mime": result.artifact_mime, + "artifact_download_url": result.artifact_download_url, + } + return response + except HTTPException: + raise + except Exception as e: + logger.error(f"Update document error: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/api/doc/publish") +async def publish_document_endpoint(request: PublishDocumentRequest): + """ + Publish current document text as physical file artifact version. + """ + try: + result = await publish_document_artifact( + session_id=request.session_id, + doc_id=request.doc_id, + doc_url=request.doc_url, + file_name=request.file_name, + text=request.text, + dao_id=request.dao_id, + user_id=request.user_id, + artifact_id=request.artifact_id, + target_format=request.target_format, + artifact_label=request.artifact_label, + metadata=request.metadata, + ) + if not result.success: + raise HTTPException(status_code=400, detail=result.error) + return { + "ok": True, + "artifact_id": result.artifact_id, + "version_id": result.version_id, + "storage_key": result.storage_key, + "mime": result.mime, + "file_name": result.file_name, + "download_url": result.download_url, + } + except HTTPException: + raise + except Exception as e: + logger.error(f"Publish document error: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + +@router.get("/api/doc/versions/{doc_id}") +async def list_document_versions_endpoint(doc_id: str, agent_id: str = "daarwizz", limit: int = 20): + """ + List document versions for agent/doc pair. + """ + try: + data = await list_document_versions(agent_id=agent_id, doc_id=doc_id, limit=limit) + if not data.get("ok"): + raise HTTPException(status_code=400, detail=data.get("error", "Failed to load versions")) + return data + except HTTPException: + raise + except Exception as e: + logger.error(f"List document versions error: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + + @router.get("/api/doc/context/{session_id}") async def get_document_context(session_id: str): """ @@ -258,3 +410,56 @@ async def get_document_context(session_id: str): logger.error(f"Get document context error: {e}", exc_info=True) raise HTTPException(status_code=500, detail=str(e)) + +@router.get("/api/doc/artifacts/{artifact_id}/versions/{version_id}/download") +async def download_artifact_version_via_gateway( + artifact_id: str, + version_id: str, + filename: Optional[str] = None, + inline: bool = False, +): + """ + Proxy download for artifact version to avoid exposing internal MinIO host to browser clients. 
+ """ + aid = (artifact_id or "").strip() + vid = (version_id or "").strip() + if not aid or not vid: + raise HTTPException(status_code=400, detail="artifact_id and version_id are required") + + try: + async with httpx.AsyncClient(timeout=DOC_DOWNLOAD_TIMEOUT_SECONDS) as client: + meta_resp = await client.get( + f"{ARTIFACT_REGISTRY_URL}/artifacts/{aid}/versions/{vid}/download" + ) + if meta_resp.status_code >= 400: + detail = "" + try: + detail = meta_resp.json().get("detail") # type: ignore[assignment] + except Exception: + detail = meta_resp.text[:200] + raise HTTPException(status_code=meta_resp.status_code, detail=detail or "Version download info failed") + meta = meta_resp.json() + signed_url = (meta.get("url") or "").strip() + if not signed_url: + raise HTTPException(status_code=502, detail="artifact-registry returned empty download URL") + + file_resp = await client.get(signed_url) + if file_resp.status_code >= 400: + raise HTTPException(status_code=502, detail=f"Artifact storage download failed: {file_resp.status_code}") + + mime = (meta.get("mime") or file_resp.headers.get("content-type") or "application/octet-stream").strip() + storage_key = str(meta.get("storage_key") or "") + inferred_name = storage_key.rsplit("/", 1)[-1] if "/" in storage_key else storage_key + out_name = (filename or inferred_name or f"{aid}_{vid}.bin").strip() + out_name = re.sub(r"[^A-Za-z0-9._-]+", "_", out_name).strip("._") or f"{aid}_{vid}.bin" + disposition = "inline" if inline else "attachment" + headers = { + "Content-Disposition": f'{disposition}; filename="{out_name}"', + "Cache-Control": "private, max-age=60", + } + return Response(content=file_resp.content, media_type=mime, headers=headers) + except HTTPException: + raise + except Exception as e: + logger.error(f"Artifact version proxy download failed: aid={aid}, vid={vid}, err={e}", exc_info=True) + raise HTTPException(status_code=500, detail="Artifact proxy download failed") diff --git a/gateway-bot/memory_client.py b/gateway-bot/memory_client.py index d22b9f85..a101432b 100644 --- a/gateway-bot/memory_client.py +++ b/gateway-bot/memory_client.py @@ -143,6 +143,10 @@ class MemoryClient: "body_text": e.get("content", ""), "kind": e.get("kind", "message"), "type": "user" if e.get("role") == "user" else "agent", + "role": e.get("role", "unknown"), + "timestamp": e.get("timestamp"), + "user_id": e.get("user_id"), + "sender_name": e.get("sender_name"), } for e in events if e.get("content") @@ -445,4 +449,3 @@ class MemoryClient: # Глобальний екземпляр клієнта memory_client = MemoryClient() - diff --git a/gateway-bot/services/doc_service.py b/gateway-bot/services/doc_service.py index da5a6843..4ad2a691 100644 --- a/gateway-bot/services/doc_service.py +++ b/gateway-bot/services/doc_service.py @@ -11,18 +11,23 @@ This service can be used by: import os import logging import hashlib +import base64 import json import re from typing import Optional, Dict, Any, List from pydantic import BaseModel from datetime import datetime +from io import BytesIO -from router_client import send_to_router from memory_client import memory_client logger = logging.getLogger(__name__) SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"} +ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000") +ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/") +DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service") +GATEWAY_PUBLIC_BASE_URL = os.getenv("GATEWAY_PUBLIC_BASE_URL", 
"").rstrip("/") class QAItem(BaseModel): @@ -51,6 +56,35 @@ class IngestResult(BaseModel): error: Optional[str] = None +class UpdateResult(BaseModel): + """Result of document update with version bump.""" + success: bool + doc_id: Optional[str] = None + version_no: Optional[int] = None + version_id: Optional[int] = None + updated_chunks: int = 0 + status: str = "unknown" + publish_error: Optional[str] = None + artifact_id: Optional[str] = None + artifact_version_id: Optional[str] = None + artifact_storage_key: Optional[str] = None + artifact_mime: Optional[str] = None + artifact_download_url: Optional[str] = None + error: Optional[str] = None + + +class PublishResult(BaseModel): + """Result of artifact write-back publish.""" + success: bool + artifact_id: Optional[str] = None + version_id: Optional[str] = None + storage_key: Optional[str] = None + mime: Optional[str] = None + file_name: Optional[str] = None + download_url: Optional[str] = None + error: Optional[str] = None + + class QAResult(BaseModel): """Result of RAG query about a document""" success: bool @@ -84,6 +118,266 @@ class DocumentService: """Initialize document service""" self.memory_client = memory_client + async def _router_post_json( + self, + path: str, + payload: Dict[str, Any], + timeout: float = 45.0, + ) -> Dict[str, Any]: + import httpx + + base = ROUTER_URL.rstrip("/") + url = f"{base}{path}" + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post(url, json=payload) + body = {} + try: + body = resp.json() + except Exception: + body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"} + if resp.status_code >= 400: + err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}" + raise RuntimeError(f"Router error on {path}: {err}") + return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"} + + async def _router_get_json( + self, + path: str, + timeout: float = 30.0, + ) -> Dict[str, Any]: + import httpx + + base = ROUTER_URL.rstrip("/") + url = f"{base}{path}" + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.get(url) + body = {} + try: + body = resp.json() + except Exception: + body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"} + if resp.status_code >= 400: + err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}" + raise RuntimeError(f"Router error on {path}: {err}") + return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"} + + async def _artifact_post_json( + self, + path: str, + payload: Dict[str, Any], + timeout: float = 45.0, + ) -> Dict[str, Any]: + import httpx + + base = ARTIFACT_REGISTRY_URL.rstrip("/") + url = f"{base}{path}" + async with httpx.AsyncClient(timeout=timeout) as client: + resp = await client.post(url, json=payload) + body = {} + try: + body = resp.json() + except Exception: + body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"} + if resp.status_code >= 400: + err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}" + raise RuntimeError(f"Artifact registry error on {path}: {err}") + return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"} + + async def _artifact_get_json( + self, + path: str, + timeout: float = 30.0, + ) -> Dict[str, Any]: + import httpx + + base = ARTIFACT_REGISTRY_URL.rstrip("/") + url = f"{base}{path}" + async with httpx.AsyncClient(timeout=timeout) as 
client: + resp = await client.get(url) + body = {} + try: + body = resp.json() + except Exception: + body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"} + if resp.status_code >= 400: + err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}" + raise RuntimeError(f"Artifact registry error on {path}: {err}") + return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"} + + def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str: + fmt = (target_format or "").strip().lower().lstrip(".") + if fmt: + return fmt + if file_name and "." in file_name: + return file_name.rsplit(".", 1)[1].strip().lower() + return "txt" + + def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str: + base = "document" + if file_name: + base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1] + if "." in base: + base = base.rsplit(".", 1)[0] + elif doc_id: + base = doc_id + safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document" + return f"{safe_base}.{fmt}" + + def _gateway_artifact_download_path(self, artifact_id: str, version_id: str) -> str: + aid = (artifact_id or "").strip() + vid = (version_id or "").strip() + return f"/api/doc/artifacts/{aid}/versions/{vid}/download" + + def _gateway_artifact_download_url(self, artifact_id: str, version_id: str) -> str: + path = self._gateway_artifact_download_path(artifact_id, version_id) + if GATEWAY_PUBLIC_BASE_URL: + return f"{GATEWAY_PUBLIC_BASE_URL}{path}" + return path + + def _render_document_bytes( + self, + text: str, + file_name: Optional[str], + doc_id: str, + target_format: Optional[str] = None, + ) -> Dict[str, Any]: + body = (text or "").strip() + if not body: + raise ValueError("Cannot render empty document text") + + fmt = self._resolve_format(file_name=file_name, target_format=target_format) + output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt) + + if fmt in {"txt"}: + payload = body.encode("utf-8") + return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": output_name} + if fmt in {"md", "markdown"}: + payload = body.encode("utf-8") + return {"bytes": payload, "mime": "text/markdown; charset=utf-8", "file_name": output_name} + if fmt in {"json"}: + parsed: Any + try: + parsed = json.loads(body) + except Exception: + parsed = {"text": body} + payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8") + return {"bytes": payload, "mime": "application/json", "file_name": output_name} + if fmt in {"csv"}: + payload = body.encode("utf-8") + return {"bytes": payload, "mime": "text/csv; charset=utf-8", "file_name": output_name} + if fmt in {"xlsx", "xlsm", "xls"}: + try: + from openpyxl import Workbook + except Exception as e: + raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}") + wb = Workbook() + ws = wb.active + ws.title = "Document" + lines = [ln for ln in body.splitlines()] or [body] + for idx, line in enumerate(lines, start=1): + ws.cell(row=idx, column=1, value=line) + buf = BytesIO() + wb.save(buf) + mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")} + if fmt in {"docx"}: + try: + from docx import Document + except Exception as e: + raise RuntimeError(f"python-docx is required for docx rendering: {e}") + doc = Document() + for line in body.splitlines(): + 
doc.add_paragraph(line if line else " ") + buf = BytesIO() + doc.save(buf) + mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")} + + payload = body.encode("utf-8") + fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt") + return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": fallback_name} + + async def _publish_text_artifact( + self, + text: str, + doc_id: str, + file_name: Optional[str] = None, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + artifact_id: Optional[str] = None, + target_format: Optional[str] = None, + label: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> PublishResult: + try: + rendered = self._render_document_bytes( + text=text, + file_name=file_name, + doc_id=doc_id, + target_format=target_format, + ) + content_bytes = rendered["bytes"] + content_b64 = base64.b64encode(content_bytes).decode("ascii") + + effective_artifact_id = (artifact_id or "").strip() + if not effective_artifact_id: + create_resp = await self._artifact_post_json( + "/artifacts", + { + "type": "doc", + "title": file_name or doc_id, + "project_id": dao_id, + "acl_ref": dao_id, + "created_by": user_id or DOC_WRITEBACK_CREATED_BY, + }, + timeout=30.0, + ) + effective_artifact_id = str(create_resp.get("artifact_id") or "").strip() + if not effective_artifact_id: + return PublishResult(success=False, error="Artifact create failed: empty artifact_id") + + meta = {"doc_id": doc_id, "source": "doc_update_publish"} + if isinstance(metadata, dict): + meta.update(metadata) + + version_resp = await self._artifact_post_json( + f"/artifacts/{effective_artifact_id}/versions/from_base64", + { + "content_base64": content_b64, + "mime": rendered["mime"], + "filename": rendered["file_name"], + "label": label or "edited", + "meta_json": meta, + }, + timeout=45.0, + ) + version_id = str(version_resp.get("version_id") or "").strip() + storage_key = version_resp.get("storage_key") + if not version_id: + return PublishResult( + success=False, + artifact_id=effective_artifact_id, + error="Artifact version create failed: empty version_id", + ) + + download_url = self._gateway_artifact_download_url( + artifact_id=effective_artifact_id, + version_id=version_id, + ) + + return PublishResult( + success=True, + artifact_id=effective_artifact_id, + version_id=version_id, + storage_key=storage_key, + mime=rendered["mime"], + file_name=rendered["file_name"], + download_url=download_url, + ) + except Exception as e: + logger.error(f"publish_text_artifact failed: {e}", exc_info=True) + return PublishResult(success=False, error=str(e)) + def _is_excel_filename(self, file_name: Optional[str]) -> bool: if not file_name: return False @@ -462,7 +756,8 @@ class DocumentService: doc_url: Optional[str] = None, file_name: Optional[str] = None, dao_id: str = None, - user_id: str = None + user_id: str = None, + agent_id: str = "daarwizz", ) -> IngestResult: """ Ingest document chunks into RAG/Memory. 
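Reviewer note: a minimal end-to-end sketch of how the update-and-publish flow introduced above is meant to be exercised through the gateway (`/api/doc/update` plus the artifact proxy download). The base URL, session_id, doc_id, and text are illustrative assumptions, not values from this patch:

```python
# Sketch only: GATEWAY, session_id, doc_id and text are illustrative assumptions.
import asyncio
import httpx

GATEWAY = "http://localhost:9300"  # assumed gateway address

async def demo() -> None:
    async with httpx.AsyncClient(timeout=60) as client:
        # Update the document text, bump its version, and publish a .docx artifact.
        resp = await client.post(f"{GATEWAY}/api/doc/update", json={
            "session_id": "tg:12345",
            "doc_id": "abc123def456",
            "text": "Updated field report body...",
            "agent_id": "agromatrix",
            "publish_artifact": True,
            "target_format": "docx",
        })
        data = resp.json()
        print("version:", data.get("version_no"), "status:", data.get("status"))

        # artifact_download_url is a gateway-relative path unless
        # GATEWAY_PUBLIC_BASE_URL is set, in which case it is absolute.
        url = data.get("artifact_download_url")
        if url and url.startswith("/"):
            url = f"{GATEWAY}{url}"
        if url:
            file_resp = await client.get(url)
            with open("report.docx", "wb") as fh:
                fh.write(file_resp.content)

asyncio.run(demo())
```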
@@ -488,64 +783,60 @@ class DocumentService: file_name = file_name or doc_context.file_name dao_id = dao_id or doc_context.dao_id - if not doc_id and not doc_url: + if not doc_url: return IngestResult( success=False, - error="No document ID or URL provided" + error="No document URL available for ingest" ) - - # Build request to Router with ingest flag - router_request = { - "mode": "doc_parse", - "agent": "parser", + + parsed = await self.parse_document( + session_id=session_id, + doc_url=doc_url, + file_name=file_name or "document", + dao_id=dao_id or "", + user_id=user_id or "", + output_mode="markdown", + metadata={"source": self._extract_source(session_id), "mode": "ingest"}, + ) + if not parsed.success: + return IngestResult(success=False, error=parsed.error or "Document parse failed") + + effective_doc_id = doc_id or parsed.doc_id + if not effective_doc_id: + effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12] + + doc_text = (parsed.markdown or "").strip() + if not doc_text: + return IngestResult(success=False, error="No extractable text for ingestion") + + payload = { + "agent_id": (agent_id or "daarwizz").lower(), + "doc_id": effective_doc_id, + "file_name": file_name or "document", + "text": doc_text, + "dao_id": dao_id, + "user_id": user_id, "metadata": { - "source": self._extract_source(session_id), - "dao_id": dao_id, - "user_id": user_id, "session_id": session_id, - }, - "payload": { - "output_mode": "chunks", # Use chunks for RAG ingestion - "dao_id": dao_id, - "user_id": user_id, - "ingest": True, # Flag for ingestion + "source": self._extract_source(session_id), }, } - - if doc_url: - router_request["payload"]["doc_url"] = doc_url - router_request["payload"]["file_name"] = file_name or "document.pdf" - - if doc_id: - router_request["payload"]["doc_id"] = doc_id - - logger.info(f"Ingesting document: session={session_id}, doc_id={doc_id}") - - # Send to Router - response = await send_to_router(router_request) - - if not isinstance(response, dict): - return IngestResult( - success=False, - error="Invalid response from router" - ) - - data = response.get("data", {}) - chunks = data.get("chunks", []) - - if chunks: + response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0) + + if response.get("ok"): return IngestResult( success=True, - doc_id=doc_id or data.get("doc_id"), - ingested_chunks=len(chunks), - status="ingested" - ) - else: - return IngestResult( - success=False, - status="failed", - error="No chunks to ingest" + doc_id=response.get("doc_id") or effective_doc_id, + ingested_chunks=int(response.get("chunks_stored", 0) or 0), + status="ingested", ) + + return IngestResult( + success=False, + doc_id=effective_doc_id, + status="failed", + error=response.get("error", "Router ingest failed"), + ) except Exception as e: logger.error(f"Document ingestion failed: {e}", exc_info=True) @@ -553,6 +844,245 @@ class DocumentService: success=False, error=str(e) ) + + async def update_document( + self, + session_id: str, + doc_id: Optional[str] = None, + doc_url: Optional[str] = None, + file_name: Optional[str] = None, + text: Optional[str] = None, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + agent_id: str = "daarwizz", + storage_ref: Optional[str] = None, + publish_artifact: bool = False, + artifact_id: Optional[str] = None, + target_format: Optional[str] = None, + artifact_label: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> UpdateResult: 
+ """ + Update existing document content and bump version in router memory. + """ + try: + context = await self.get_doc_context(session_id) + if context: + if not doc_id: + doc_id = context.doc_id + if not doc_url: + doc_url = context.doc_url + if not file_name: + file_name = context.file_name + if not dao_id: + dao_id = context.dao_id + + if not doc_id: + return UpdateResult( + success=False, + status="failed", + error="No document context found. Provide doc_id or parse/ingest first.", + ) + + effective_text = (text or "").strip() + if not effective_text: + if not doc_url: + return UpdateResult( + success=False, + doc_id=doc_id, + status="failed", + error="No text or doc_url provided for update", + ) + parsed = await self.parse_document( + session_id=session_id, + doc_url=doc_url, + file_name=file_name or "document", + dao_id=dao_id or "", + user_id=user_id or "", + output_mode="markdown", + metadata={"source": self._extract_source(session_id), "mode": "update"}, + ) + if not parsed.success: + return UpdateResult( + success=False, + doc_id=doc_id, + status="failed", + error=parsed.error or "Document parse failed", + ) + effective_text = (parsed.markdown or "").strip() + + if not effective_text: + return UpdateResult( + success=False, + doc_id=doc_id, + status="failed", + error="No extractable text for update", + ) + + meta = { + "session_id": session_id, + "source": self._extract_source(session_id), + } + if isinstance(metadata, dict): + meta.update(metadata) + + response = await self._router_post_json( + "/v1/documents/update", + { + "agent_id": (agent_id or "daarwizz").lower(), + "doc_id": doc_id, + "file_name": file_name, + "text": effective_text, + "dao_id": dao_id, + "user_id": user_id, + "storage_ref": storage_ref, + "metadata": meta, + }, + timeout=90.0, + ) + + if not response.get("ok"): + return UpdateResult( + success=False, + doc_id=doc_id, + status="failed", + error=response.get("error", "Router update failed"), + ) + + await self.save_doc_context( + session_id=session_id, + doc_id=doc_id, + doc_url=doc_url, + file_name=file_name, + dao_id=dao_id, + user_id=user_id, + ) + + publish = PublishResult(success=False) + if publish_artifact: + publish = await self._publish_text_artifact( + text=effective_text, + doc_id=doc_id, + file_name=file_name, + dao_id=dao_id, + user_id=user_id, + artifact_id=artifact_id, + target_format=target_format, + label=artifact_label, + metadata=meta, + ) + + return UpdateResult( + success=True, + doc_id=response.get("doc_id") or doc_id, + version_no=int(response.get("version_no", 0) or 0) or None, + version_id=int(response.get("version_id", 0) or 0) or None, + updated_chunks=int(response.get("chunks_stored", 0) or 0), + status="updated_published" if publish_artifact and publish.success else ("updated_publish_failed" if publish_artifact else "updated"), + publish_error=publish.error if publish_artifact and not publish.success else None, + artifact_id=publish.artifact_id if publish_artifact else None, + artifact_version_id=publish.version_id if publish_artifact else None, + artifact_storage_key=publish.storage_key if publish_artifact else None, + artifact_mime=publish.mime if publish_artifact else None, + artifact_download_url=publish.download_url if publish_artifact else None, + ) + except Exception as e: + logger.error(f"Document update failed: {e}", exc_info=True) + return UpdateResult( + success=False, + doc_id=doc_id, + status="failed", + error=str(e), + ) + + async def list_document_versions( + self, + agent_id: str, + doc_id: str, + limit: int = 
20, + ) -> Dict[str, Any]: + aid = (agent_id or "daarwizz").lower() + did = (doc_id or "").strip() + if not did: + return {"ok": False, "error": "doc_id is required", "items": []} + try: + response = await self._router_get_json( + f"/v1/documents/{did}/versions?agent_id={aid}&limit={max(1, min(int(limit or 20), 200))}", + timeout=30.0, + ) + return response if isinstance(response, dict) else {"ok": False, "error": "invalid_response", "items": []} + except Exception as e: + logger.error(f"list_document_versions failed: {e}") + return {"ok": False, "error": str(e), "items": []} + + async def publish_document_artifact( + self, + session_id: str, + doc_id: Optional[str] = None, + doc_url: Optional[str] = None, + file_name: Optional[str] = None, + text: Optional[str] = None, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + artifact_id: Optional[str] = None, + target_format: Optional[str] = None, + artifact_label: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> PublishResult: + """ + Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index. + """ + try: + context = await self.get_doc_context(session_id) + if context: + if not doc_id: + doc_id = context.doc_id + if not doc_url: + doc_url = context.doc_url + if not file_name: + file_name = context.file_name + if not dao_id: + dao_id = context.dao_id + if not user_id: + user_id = context.user_id + + if not doc_id: + return PublishResult(success=False, error="doc_id is required") + + body = (text or "").strip() + if not body: + if not doc_url: + return PublishResult(success=False, error="text or doc_url is required") + parsed = await self.parse_document( + session_id=session_id, + doc_url=doc_url, + file_name=file_name or "document", + dao_id=dao_id or "", + user_id=user_id or "", + output_mode="markdown", + metadata={"source": self._extract_source(session_id), "mode": "publish"}, + ) + if not parsed.success: + return PublishResult(success=False, error=parsed.error or "Document parse failed") + body = (parsed.markdown or "").strip() + + if not body: + return PublishResult(success=False, error="No text available for publish") + + return await self._publish_text_artifact( + text=body, + doc_id=doc_id, + file_name=file_name, + dao_id=dao_id, + user_id=user_id, + artifact_id=artifact_id, + target_format=target_format, + label=artifact_label, + metadata=metadata, + ) + except Exception as e: + logger.error(f"publish_document_artifact failed: {e}", exc_info=True) + return PublishResult(success=False, error=str(e)) async def ask_about_document( self, @@ -625,38 +1155,30 @@ class DocumentService: }], ) - # Build RAG query request - router_request = { - "mode": "rag_query", - "agent": agent_id, - "metadata": { - "source": self._extract_source(session_id), - "dao_id": dao_id, - "user_id": user_id, - "session_id": session_id, - }, - "payload": { - "question": question, - "dao_id": dao_id, - "user_id": user_id, - "doc_id": doc_id, - }, - } - logger.info( f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}" ) - - # Send to Router - response = await send_to_router(router_request) - - if not isinstance(response, dict): + + response = await self._router_post_json( + "/v1/documents/query", + { + "agent_id": (agent_id or "daarwizz").lower(), + "question": question, + "doc_id": doc_id, + "dao_id": dao_id, + "user_id": user_id, + "limit": 5, + }, + timeout=60.0, + ) + + if isinstance(response, dict) and not response.get("ok", False): return 
QAResult( success=False, - error="Invalid response from router" + error=response.get("error", "Document query failed"), ) - - data = response.get("data", {}) + + data = response.get("data", {}) if isinstance(response, dict) else {} answer = data.get("answer") or data.get("text") sources = data.get("citations", []) or data.get("sources", []) @@ -717,7 +1239,8 @@ async def ingest_document( doc_url: Optional[str] = None, file_name: Optional[str] = None, dao_id: Optional[str] = None, - user_id: Optional[str] = None + user_id: Optional[str] = None, + agent_id: str = "daarwizz", ) -> IngestResult: """Ingest document chunks into RAG/Memory""" return await doc_service.ingest_document( @@ -726,7 +1249,8 @@ async def ingest_document( doc_url=doc_url, file_name=file_name, dao_id=dao_id, - user_id=user_id + user_id=user_id, + agent_id=agent_id, ) @@ -749,6 +1273,79 @@ async def ask_about_document( ) +async def update_document( + session_id: str, + doc_id: Optional[str] = None, + doc_url: Optional[str] = None, + file_name: Optional[str] = None, + text: Optional[str] = None, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + agent_id: str = "daarwizz", + storage_ref: Optional[str] = None, + publish_artifact: bool = False, + artifact_id: Optional[str] = None, + target_format: Optional[str] = None, + artifact_label: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, +) -> UpdateResult: + """Update document chunks and bump version.""" + return await doc_service.update_document( + session_id=session_id, + doc_id=doc_id, + doc_url=doc_url, + file_name=file_name, + text=text, + dao_id=dao_id, + user_id=user_id, + agent_id=agent_id, + storage_ref=storage_ref, + publish_artifact=publish_artifact, + artifact_id=artifact_id, + target_format=target_format, + artifact_label=artifact_label, + metadata=metadata, + ) + + +async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) -> Dict[str, Any]: + """List document versions from router.""" + return await doc_service.list_document_versions( + agent_id=agent_id, + doc_id=doc_id, + limit=limit, + ) + + +async def publish_document_artifact( + session_id: str, + doc_id: Optional[str] = None, + doc_url: Optional[str] = None, + file_name: Optional[str] = None, + text: Optional[str] = None, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + artifact_id: Optional[str] = None, + target_format: Optional[str] = None, + artifact_label: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, +) -> PublishResult: + """Publish physical artifact version for document text.""" + return await doc_service.publish_document_artifact( + session_id=session_id, + doc_id=doc_id, + doc_url=doc_url, + file_name=file_name, + text=text, + dao_id=dao_id, + user_id=user_id, + artifact_id=artifact_id, + target_format=target_format, + artifact_label=artifact_label, + metadata=metadata, + ) + + async def save_doc_context( session_id: str, doc_id: str, diff --git a/ops/monitor_notify_sofiia.sh b/ops/monitor_notify_sofiia.sh old mode 100644 new mode 100755 index cb9d0c88..6fca6bf9 --- a/ops/monitor_notify_sofiia.sh +++ b/ops/monitor_notify_sofiia.sh @@ -7,6 +7,7 @@ ROUTER_URL="${ROUTER_URL:-http://127.0.0.1:9102}" REPORT_ENABLED="${SOFIIA_REPORTS_ENABLED:-true}" REPORT_MODE="${SOFIIA_REPORT_MODE:-fail_only}" # fail_only | always REPORT_TIMEOUT="${SOFIIA_REPORT_TIMEOUT:-180}" +REPORT_MAX_TOKENS="${SOFIIA_REPORT_MAX_TOKENS:-900}" REPORT_CHAT_ID="${SOFIIA_REPORT_CHAT_ID:-ops-monitor-sofiia}" 
REPORT_USER_ID="${SOFIIA_REPORT_USER_ID:-ops-monitor-agent}" REPORT_USERNAME="${SOFIIA_REPORT_USERNAME:-monitor-agent}" @@ -23,7 +24,7 @@ if [[ ! -f "$STATUS_JSON" ]]; then exit 0 fi -python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY' +python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_MAX_TOKENS" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY' import json import sys from pathlib import Path @@ -35,11 +36,12 @@ root = Path(sys.argv[2]) router_url = sys.argv[3].rstrip('/') report_mode = sys.argv[4] timeout_s = int(sys.argv[5]) -chat_id = sys.argv[6] -user_id = sys.argv[7] -username = sys.argv[8] -tg_chat_id = sys.argv[9].strip() -tg_token = sys.argv[10].strip() +max_tokens = int(sys.argv[6]) +chat_id = sys.argv[7] +user_id = sys.argv[8] +username = sys.argv[9] +tg_chat_id = sys.argv[10].strip() +tg_token = sys.argv[11].strip() payload = json.loads(status_json.read_text(encoding='utf-8')) status = str(payload.get('status', 'unknown')).lower() @@ -70,7 +72,7 @@ prompt = ( body = { 'prompt': prompt, - 'max_tokens': 400, + 'max_tokens': max_tokens, 'temperature': 0.1, 'metadata': { 'source': 'ops-monitor-canary', @@ -99,26 +101,42 @@ try: print(f"[OK] sofiia report sent: backend={data.get('backend')} model={data.get('model')} preview={short!r}") if tg_chat_id and tg_token and text: - msg = ( + def chunk_text(value: str, limit: int = 3500): + chunks = [] + remaining = value + while remaining: + if len(remaining) <= limit: + chunks.append(remaining) + break + split_at = remaining.rfind('\n', 0, limit) + if split_at < max(1, limit // 2): + split_at = limit + chunks.append(remaining[:split_at].rstrip()) + remaining = remaining[split_at:].lstrip() + return chunks or [value] + + header = ( "[NODE1 Monitor]\n" f"status={payload.get('status')} exit_code={payload.get('exit_code')}\n\n" - f"{text[:3500]}" ) - tg_req = urlreq.Request( - url=f"https://api.telegram.org/bot{tg_token}/sendMessage", - data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'), - headers={'Content-Type': 'application/json'}, - method='POST', - ) - try: + parts = chunk_text(text, 3500 - len("(99/99)\n")) + total = len(parts) + delivered = 0 + for idx, part in enumerate(parts, start=1): + prefix = f"({idx}/{total})\n" if total > 1 else "" + msg = f"{header}{prefix}{part}" if idx == 1 else f"{prefix}{part}" + tg_req = urlreq.Request( + url=f"https://api.telegram.org/bot{tg_token}/sendMessage", + data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'), + headers={'Content-Type': 'application/json'}, + method='POST', + ) with urlreq.urlopen(tg_req, timeout=20) as tg_resp: tg_data = json.loads(tg_resp.read().decode('utf-8', errors='ignore')) - if tg_data.get('ok'): - print(f"[OK] telegram report delivered: chat_id={tg_chat_id}") - else: - print(f"[WARN] telegram send not ok: {tg_data}") - except Exception as tg_e: - print(f"[WARN] telegram send failed: {tg_e}") + if not tg_data.get('ok'): + raise RuntimeError(f"telegram send not ok: {tg_data}") + delivered += 1 + print(f"[OK] telegram report delivered: chat_id={tg_chat_id} parts={delivered}") else: print('[INFO] telegram delivery skipped (missing SOFIIA_REPORT_TELEGRAM_CHAT_ID or token or empty text)') except HTTPError as e: diff --git a/scripts/node1/agromatrix_regression_smoke.py 
b/scripts/node1/agromatrix_regression_smoke.py new file mode 100755 index 00000000..2e205f27 --- /dev/null +++ b/scripts/node1/agromatrix_regression_smoke.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +import sys +import urllib.error +import urllib.request + + +TINY_PNG_DATA_URL = ( + "data:image/png;base64," + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAoMBgQhP2YkAAAAASUVORK5CYII=" +) + + +def http_json(method: str, url: str, payload=None, headers=None): + data = None + req_headers = dict(headers or {}) + if payload is not None: + data = json.dumps(payload).encode("utf-8") + req_headers.setdefault("Content-Type", "application/json") + req = urllib.request.Request(url, data=data, headers=req_headers, method=method) + try: + with urllib.request.urlopen(req, timeout=60) as resp: + body = resp.read().decode("utf-8", errors="replace") + return resp.status, json.loads(body) if body else {} + except urllib.error.HTTPError as e: + body = e.read().decode("utf-8", errors="replace") + try: + parsed = json.loads(body) if body else {} + except Exception: + parsed = {"raw": body} + return e.code, parsed + + +def check(cond: bool, label: str, details: str = "") -> bool: + prefix = "PASS" if cond else "FAIL" + tail = f" :: {details}" if details else "" + print(f"[{prefix}] {label}{tail}") + return cond + + +def main() -> int: + parser = argparse.ArgumentParser(description="AgroMatrix regression smoke checks") + parser.add_argument("--base-url", default="http://127.0.0.1:9102") + parser.add_argument("--agent-id", default="agromatrix") + parser.add_argument("--chat-id", default="smoke-agromatrix") + parser.add_argument("--user-id", default="smoke-user") + parser.add_argument("--skip-review-404", action="store_true") + parser.add_argument( + "--mentor-token", + default=( + os.getenv("AGROMATRIX_REVIEW_BEARER_TOKEN") + or (os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").split(",")[0].strip()) + or "" + ), + ) + args = parser.parse_args() + + ok_all = True + + status, health = http_json("GET", f"{args.base_url}/health") + ok_all &= check(status == 200 and health.get("status") == "ok", "health", str(health)) + + numeric_payload = { + "prompt": "напиши мені яка сума була витрачена на добрива", + "metadata": { + "channel": "telegram", + "chat_id": args.chat_id, + "user_id": args.user_id, + "user_name": "smoke", + }, + } + status, infer_num = http_json("POST", f"{args.base_url}/v1/agents/{args.agent_id}/infer", numeric_payload) + resp_text = str(infer_num.get("response") or "") + numeric_guard = ( + "Не можу підтвердити точне число" in resp_text + or "value + unit + source" in resp_text + or "source(sheet,row)" in resp_text + ) + ok_all &= check(status == 200 and numeric_guard, "numeric_contract_guard", resp_text[:180]) + + plant_payload = { + "prompt": "Що це за рослина на фото?", + "images": [TINY_PNG_DATA_URL], + "metadata": { + "channel": "telegram", + "chat_id": args.chat_id, + "user_id": args.user_id, + "user_name": "smoke", + }, + } + status, infer_plant = http_json("POST", f"{args.base_url}/v1/agents/{args.agent_id}/infer", plant_payload) + plant_text = str(infer_plant.get("response") or "") + plant_ok = ( + "Не впевнений" in plant_text + or "Надішли" in plant_text + or "канд" in plant_text.lower() + ) + ok_all &= check(status == 200 and plant_ok, "deterministic_plant_response", plant_text[:180]) + + status, pending = http_json("GET", f"{args.base_url}/v1/agromatrix/shared-memory/pending") + pending_shape = isinstance(pending, dict) and 
isinstance(pending.get("items"), list) + ok_all &= check(status == 200 and pending_shape, "shared_pending_endpoint", f"total={pending.get('total')}") + + if not args.skip_review_404: + req_headers = {} + if args.mentor_token: + req_headers["Authorization"] = f"Bearer {args.mentor_token}" + status, review = http_json( + "POST", + f"{args.base_url}/v1/agromatrix/shared-memory/review", + { + "point_id": "11111111-1111-1111-1111-111111111111", + "approve": False, + "reviewer": "smoke", + "note": "nonexistent id check", + }, + headers=req_headers, + ) + expected = 404 if args.mentor_token else 401 + ok_all &= check(status == expected, "shared_review_not_found_contract", str(review)) + + return 0 if ok_all else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/services/agent-e2e-prober/main.py b/services/agent-e2e-prober/main.py index c57d864c..1ac11eae 100644 --- a/services/agent-e2e-prober/main.py +++ b/services/agent-e2e-prober/main.py @@ -16,9 +16,16 @@ logger = logging.getLogger(__name__) # Configuration GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300") +ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000") PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds +SEMANTIC_TIMEOUT = int(os.getenv("SEMANTIC_TIMEOUT", "45")) # seconds METRICS_PORT = int(os.getenv("METRICS_PORT", "9108")) +SEMANTIC_PROBE_ENABLED = os.getenv("SEMANTIC_PROBE_ENABLED", "true").lower() == "true" +SEMANTIC_AGENTS = [a.strip() for a in os.getenv("SEMANTIC_AGENTS", "clan,sofiia,monitor").split(",") if a.strip()] +SEMANTIC_PROMPT = os.getenv("SEMANTIC_PROMPT", "Коротко: хто такий DAARWIZZ?") +SEMANTIC_EXPECT_KEYWORD = os.getenv("SEMANTIC_EXPECT_KEYWORD", "daarwizz").lower() +MONITOR_EXPECT_LOCAL = os.getenv("MONITOR_EXPECT_LOCAL", "true").lower() == "true" # Prometheus metrics agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target']) @@ -42,7 +49,7 @@ async def probe_gateway_health() -> tuple[bool, float, str]: async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client: resp = await client.get(f"{GATEWAY_URL}/health") latency = time.time() - start - + if resp.status_code == 200: data = resp.json() if data.get("status") == "healthy": @@ -67,7 +74,7 @@ async def probe_agent_ping() -> tuple[bool, float, str]: json={"probe": True, "timestamp": datetime.utcnow().isoformat()} ) latency = time.time() - start - + if resp.status_code == 200: data = resp.json() if data.get("success"): @@ -100,7 +107,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]: "text": "/health" # Simple health check command } } - + async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client: # Use helion webhook as it's the most tested resp = await client.post( @@ -108,7 +115,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]: json=test_update ) latency = time.time() - start - + if resp.status_code == 200: return True, latency, "" else: @@ -119,53 +126,102 @@ async def probe_webhook_echo() -> tuple[bool, float, str]: return False, time.time() - start, f"error: {str(e)[:50]}" +async def probe_agent_semantic(agent_id: str) -> tuple[bool, float, str]: + """Probe semantic response via router infer and assert DAARWIZZ awareness.""" + start = time.time() + try: + payload = { + "prompt": SEMANTIC_PROMPT, + "max_tokens": 180, + "temperature": 0.1, + "metadata": { + "agent_id": agent_id, + "user_id": "tg:0", + "chat_id": "0", + "username": "e2e-prober", + "raw_user_text": SEMANTIC_PROMPT, + }, 
+ } + async with httpx.AsyncClient(timeout=SEMANTIC_TIMEOUT) as client: + resp = await client.post(f"{ROUTER_URL}/v1/agents/{agent_id}/infer", json=payload) + latency = time.time() - start + if resp.status_code != 200: + return False, latency, f"http_{resp.status_code}" + + data = resp.json() + answer = str(data.get("response") or "") + backend = str(data.get("backend") or "") + model = str(data.get("model") or "") + + answer_lc = answer.lower() + if SEMANTIC_EXPECT_KEYWORD not in answer_lc and "даар" not in answer_lc: + return False, latency, "no_daarwizz_in_answer" + + if MONITOR_EXPECT_LOCAL and agent_id == "monitor": + local_ok = ("ollama" in backend.lower()) or model.lower().startswith("qwen") + if not local_ok: + return False, latency, f"monitor_nonlocal_backend:{backend}:{model}" + + return True, latency, "" + except httpx.TimeoutException: + return False, time.time() - start, "timeout" + except Exception as e: + return False, time.time() - start, f"error: {str(e)[:50]}" + + +def record_probe(target: str, success: bool, latency: float, reason: str): + """Record probe metrics and log line.""" + agent_e2e_runs_total.labels(target=target).inc() + agent_e2e_success.labels(target=target).set(1 if success else 0) + agent_e2e_latency.labels(target=target).set(latency) + agent_e2e_latency_histogram.labels(target=target).observe(latency) + if not success: + agent_e2e_failures_total.labels(target=target, reason=reason).inc() + logger.info(f"{target}: success={success}, latency={latency:.3f}s, reason={reason}") + + async def run_probes(): """Run all probes and update metrics""" # Probe 1: Gateway health success, latency, reason = await probe_gateway_health() - agent_e2e_runs_total.labels(target="gateway_health").inc() - agent_e2e_success.labels(target="gateway_health").set(1 if success else 0) - agent_e2e_latency.labels(target="gateway_health").set(latency) - agent_e2e_latency_histogram.labels(target="gateway_health").observe(latency) - if not success: - agent_e2e_failures_total.labels(target="gateway_health", reason=reason).inc() - logger.info(f"gateway_health: success={success}, latency={latency:.3f}s, reason={reason}") - + record_probe("gateway_health", success, latency, reason) + # Probe 2: Agent ping (if endpoint exists) success, latency, reason = await probe_agent_ping() - agent_e2e_runs_total.labels(target="agent_ping").inc() - agent_e2e_success.labels(target="agent_ping").set(1 if success else 0) - agent_e2e_latency.labels(target="agent_ping").set(latency) - agent_e2e_latency_histogram.labels(target="agent_ping").observe(latency) - if not success: - agent_e2e_failures_total.labels(target="agent_ping", reason=reason).inc() - logger.info(f"agent_ping: success={success}, latency={latency:.3f}s, reason={reason}") - + record_probe("agent_ping", success, latency, reason) + # Probe 3: Webhook E2E (full path test) success, latency, reason = await probe_webhook_echo() - agent_e2e_runs_total.labels(target="webhook_e2e").inc() - agent_e2e_success.labels(target="webhook_e2e").set(1 if success else 0) - agent_e2e_latency.labels(target="webhook_e2e").set(latency) - agent_e2e_latency_histogram.labels(target="webhook_e2e").observe(latency) - if not success: - agent_e2e_failures_total.labels(target="webhook_e2e", reason=reason).inc() - logger.info(f"webhook_e2e: success={success}, latency={latency:.3f}s, reason={reason}") + record_probe("webhook_e2e", success, latency, reason) + + # Probe 4+: semantic checks for selected agents (parallel) + if SEMANTIC_PROBE_ENABLED and SEMANTIC_AGENTS: + results = 
await asyncio.gather(*(probe_agent_semantic(agent_id) for agent_id in SEMANTIC_AGENTS)) + matrix = [] + for agent_id, (success, latency, reason) in zip(SEMANTIC_AGENTS, results): + record_probe(f"semantic_{agent_id}", success, latency, reason) + matrix.append(f"{agent_id}:{'PASS' if success else 'FAIL'}") + logger.info("semantic_matrix: " + " | ".join(matrix)) async def main(): - logger.info(f"Starting E2E Agent Prober") + logger.info("Starting E2E Agent Prober") logger.info(f" GATEWAY_URL: {GATEWAY_URL}") + logger.info(f" ROUTER_URL: {ROUTER_URL}") logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s") logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s") logger.info(f" METRICS_PORT: {METRICS_PORT}") - + logger.info(f" SEMANTIC_TIMEOUT: {SEMANTIC_TIMEOUT}s") + logger.info(f" SEMANTIC_PROBE_ENABLED: {SEMANTIC_PROBE_ENABLED}") + logger.info(f" SEMANTIC_AGENTS: {','.join(SEMANTIC_AGENTS)}") + # Start Prometheus metrics server start_http_server(METRICS_PORT) logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics") - + # Initial probe await run_probes() - + # Continuous probing while True: await asyncio.sleep(PROBE_INTERVAL) diff --git a/services/artifact-registry/app/main.py b/services/artifact-registry/app/main.py index b7781da8..b96d6988 100644 --- a/services/artifact-registry/app/main.py +++ b/services/artifact-registry/app/main.py @@ -6,13 +6,15 @@ Artifact Registry v0 """ import asyncio +import base64 import hashlib import json import logging import os +import re import uuid from io import BytesIO -from datetime import datetime +from datetime import datetime, timedelta from typing import Any, Dict, List, Optional import asyncpg @@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel): meta_json: Optional[Dict[str, Any]] = None +class ArtifactVersionFromBase64Request(BaseModel): + content_base64: str + mime: str + filename: Optional[str] = "source.bin" + label: Optional[str] = "source" + meta_json: Optional[Dict[str, Any]] = None + + class ArtifactVersionResponse(BaseModel): version_id: str storage_key: str @@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]: def _format_to_mime(fmt: str) -> str: fmt = fmt.lower() + if "/" in fmt: + return fmt if fmt == "pptx": return "application/vnd.openxmlformats-officedocument.presentationml.presentation" if fmt == "pdf": return "application/pdf" if fmt == "source": return "application/json" + if fmt == "docx": + return "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + if fmt == "xlsx": + return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + if fmt == "txt": + return "text/plain; charset=utf-8" + if fmt == "md": + return "text/markdown; charset=utf-8" + if fmt == "json": + return "application/json" + if fmt == "csv": + return "text/csv; charset=utf-8" return "application/octet-stream" +def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str: + raw = (name or fallback).strip() or fallback + cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw) + cleaned = cleaned.strip("._") + if not cleaned: + return fallback + return cleaned[:120] + + async def _download_bytes(url: str) -> bytes: async with httpx.AsyncClient(timeout=60.0) as client: resp = await client.get(url) @@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl ) +@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse) +async def add_version_from_base64(artifact_id: str, payload: 
ArtifactVersionFromBase64Request) -> ArtifactVersionResponse: + if not minio_client: + raise HTTPException(status_code=500, detail="MinIO not available") + if not pool: + raise HTTPException(status_code=500, detail="DB not available") + + raw = (payload.content_base64 or "").strip() + if not raw: + raise HTTPException(status_code=400, detail="content_base64 is required") + + if raw.startswith("data:") and "," in raw: + raw = raw.split(",", 1)[1] + + try: + content = base64.b64decode(raw, validate=True) + except Exception: + raise HTTPException(status_code=400, detail="Invalid base64 payload") + + if not content: + raise HTTPException(status_code=400, detail="Decoded payload is empty") + + version_id = f"ver_{uuid.uuid4().hex}" + filename = _safe_filename(payload.filename, fallback="source.bin") + sha256 = _hash_bytes(content) + storage_key = _storage_key(artifact_id, version_id, filename) + + try: + minio_client.put_object( + MINIO_BUCKET, + storage_key, + data=BytesIO(content), + length=len(content), + content_type=payload.mime, + ) + except S3Error as e: + raise HTTPException(status_code=502, detail=f"MinIO error: {e}") + + meta_json = _normalize_meta_json(payload.meta_json) + if "file_name" not in meta_json: + meta_json["file_name"] = filename + + async with pool.acquire() as conn: + await conn.execute( + """ + insert into artifact_versions + (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json) + values ($1, $2, $3, $4, $5, $6, $7, $8) + """, + version_id, + artifact_id, + payload.label or "source", + sha256, + payload.mime, + len(content), + storage_key, + json.dumps(meta_json), + ) + + return ArtifactVersionResponse( + version_id=version_id, + storage_key=storage_key, + sha256=sha256, + size_bytes=len(content), + ) + + @app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse) async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse: if not pool: @@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di if not row: raise HTTPException(status_code=404, detail="Version not found") try: - url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800) + url = minio_client.presigned_get_object( + MINIO_BUCKET, + row["storage_key"], + expires=timedelta(seconds=1800), + ) except S3Error as e: raise HTTPException(status_code=502, detail=f"MinIO error: {e}") return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]} + + +@app.get("/artifacts/{artifact_id}/versions/{version_id}/download") +async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]: + if not pool or not minio_client: + raise HTTPException(status_code=500, detail="Service not available") + + async with pool.acquire() as conn: + row = await conn.fetchrow( + """ + select * from artifact_versions + where artifact_id=$1 and id=$2 + limit 1 + """, + artifact_id, + version_id, + ) + if not row: + raise HTTPException(status_code=404, detail="Version not found") + try: + url = minio_client.presigned_get_object( + MINIO_BUCKET, + row["storage_key"], + expires=timedelta(seconds=1800), + ) + except S3Error as e: + raise HTTPException(status_code=502, detail=f"MinIO error: {e}") + return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"], "version_id": row["id"]} diff --git a/services/crewai-service/app/config/crewai_teams.yml b/services/crewai-service/app/config/crewai_teams.yml index c469d8d0..a6a7bf1e 100644 
--- a/services/crewai-service/app/config/crewai_teams.yml +++ b/services/crewai-service/app/config/crewai_teams.yml @@ -361,6 +361,29 @@ agromatrix: llm_profile: reasoning delegation: enabled: false + plant_intel: + team_name: AgroMatrix Plant Intelligence + parallel_roles: true + max_concurrency: 3 + synthesis: + role_context: Plant Intelligence Synthesis + system_prompt_ref: roles/agx/agx-plant-intel/orchestrator_synthesis.md + llm_profile: reasoning + team: + - id: plant_identifier + role_context: Plant Identifier + system_prompt_ref: roles/agx/agx-plant-intel/plant_identifier.md + llm_profile: science + - id: taxonomy_validator + role_context: Taxonomy Validator + system_prompt_ref: roles/agx/agx-plant-intel/taxonomy_validator.md + llm_profile: reasoning + - id: agrovoc_normalizer + role_context: AGROVOC Normalizer + system_prompt_ref: roles/agx/agx-plant-intel/agrovoc_normalizer.md + llm_profile: fast + delegation: + enabled: false cadastre_geo: team_name: AgroMatrix Cadastre/Geo parallel_roles: true @@ -614,6 +637,16 @@ agromatrix: - Stepan - координація - план + plant_intel: + - plant + - рослина + - культура + - leaf + - disease + - хвороба + - identify + - ідентифікуй + - що за рослина cadastre_geo: - cadastre - geo diff --git a/services/crewai-service/app/config/roles/agromatrix/agronomist.md b/services/crewai-service/app/config/roles/agromatrix/agronomist.md new file mode 100644 index 00000000..e863d9cf --- /dev/null +++ b/services/crewai-service/app/config/roles/agromatrix/agronomist.md @@ -0,0 +1,8 @@ +# Agronomist + +Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів. + +Правила відповіді: +- Коротко і прикладно. +- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення. +- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно. diff --git a/services/crewai-service/app/config/roles/agromatrix/communicator.md b/services/crewai-service/app/config/roles/agromatrix/communicator.md new file mode 100644 index 00000000..0f55a1ca --- /dev/null +++ b/services/crewai-service/app/config/roles/agromatrix/communicator.md @@ -0,0 +1,8 @@ +# Communicator + +Фокус: людяна та зрозуміла комунікація фінальної відповіді. + +Правила: +- Природна мова, без механістичного тону. +- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача. +- Завершуй конкретним корисним кроком. diff --git a/services/crewai-service/app/config/roles/agromatrix/data_analyst.md b/services/crewai-service/app/config/roles/agromatrix/data_analyst.md new file mode 100644 index 00000000..f2c617b2 --- /dev/null +++ b/services/crewai-service/app/config/roles/agromatrix/data_analyst.md @@ -0,0 +1,7 @@ +# Field Data Analyst + +Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв. + +Правила: +- Пояснювати висновки простою мовою. +- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку. diff --git a/services/crewai-service/app/config/roles/agromatrix/farm_ops.md b/services/crewai-service/app/config/roles/agromatrix/farm_ops.md new file mode 100644 index 00000000..c50d06e8 --- /dev/null +++ b/services/crewai-service/app/config/roles/agromatrix/farm_ops.md @@ -0,0 +1,8 @@ +# Farm Ops Planner + +Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги. + +Правила: +- Видавати практичний порядок дій. +- За простого запиту: коротка відповідь. +- Для операційних запитів: стислий план з відповідальними і дедлайном. 
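Note on the plant_intel wiring above: crewai_teams.yml pairs every team with a routing keyword list, which implies plain substring matching over the incoming user message. A minimal sketch of that selection step, assuming lowercase substring matching against the loaded YAML (the function name and config shape are illustrative, not the crewai-service's actual API):

    from typing import Dict, List, Optional

    def pick_team(prompt: str, routing_keywords: Dict[str, List[str]]) -> Optional[str]:
        """Return the first team whose keyword list matches the prompt text."""
        p = prompt.lower()
        for team, keywords in routing_keywords.items():
            if any(kw.lower() in p for kw in keywords):
                return team
        return None  # no match: stay on the default orchestrator flow

    # pick_team("що за рослина на фото?", {"plant_intel": ["рослина", "plant"]})
    # -> "plant_intel"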
diff --git a/services/crewai-service/app/config/roles/agromatrix/orchestrator_synthesis.md b/services/crewai-service/app/config/roles/agromatrix/orchestrator_synthesis.md new file mode 100644 index 00000000..7e70653f --- /dev/null +++ b/services/crewai-service/app/config/roles/agromatrix/orchestrator_synthesis.md @@ -0,0 +1,10 @@ +# AgroMatrix Orchestrator Synthesis + +Ти синтезуєш відповіді ролей у фінальну відповідь Степана. + +Правила: +- За замовчуванням: 1-3 природні речення без шаблонної канцелярії. +- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок". +- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно. +- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль". +- Пояснюй по суті агропитання і давай 1 наступний практичний крок. diff --git a/services/crewai-service/app/config/roles/agromatrix/risk_assessor.md b/services/crewai-service/app/config/roles/agromatrix/risk_assessor.md new file mode 100644 index 00000000..c843d005 --- /dev/null +++ b/services/crewai-service/app/config/roles/agromatrix/risk_assessor.md @@ -0,0 +1,7 @@ +# Risk Assessor + +Фокус: агро-ризики, операційні ризики, наслідки рішень. + +Правила: +- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик. +- Без зайвої бюрократії у відповіді користувачу. diff --git a/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md b/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md index b5695d27..58773cfd 100644 --- a/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md +++ b/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_stepan.md @@ -11,6 +11,10 @@ - Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup - Ніколи не логувати секрети/токени - Інші ролі НЕ спілкуються з користувачем напряму +- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи. +- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань. ## Формат відповіді: -Структурована відповідь з чіткими рекомендаціями та наступними кроками. +- За замовчуванням: природна коротка відповідь 1-3 речення. +- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками. +- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі". diff --git a/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md b/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md index cac85d38..e192dbbb 100644 --- a/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md +++ b/services/crewai-service/app/config/roles/agx/agx-orchestrator-stepan/orchestrator_synthesis.md @@ -7,3 +7,7 @@ - Структурувати інформацію логічно - Включати конкретні наступні кроки - Позначати ризики якщо є +- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії. +- Для детальних запитів переходити у структурований режим. +- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно. +- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable". 
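The synthesis rules above gate the detailed format on explicit user markers ("детально", "план", "чекліст", "розрахунок"). A minimal sketch of that check, assuming a plain keyword scan (the helper name is illustrative):

    DETAIL_MARKERS = ("детально", "план", "чекліст", "розрахунок")

    def wants_detailed_format(user_text: str) -> bool:
        """True when the user explicitly asks for a structured answer."""
        text = user_text.lower()
        return any(marker in text for marker in DETAIL_MARKERS)

Anything else stays in the default mode of 1-3 natural sentences.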
diff --git a/services/crewai-service/app/config/roles/agx/agx-plant-intel/agrovoc_normalizer.md b/services/crewai-service/app/config/roles/agx/agx-plant-intel/agrovoc_normalizer.md new file mode 100644 index 00000000..f279f615 --- /dev/null +++ b/services/crewai-service/app/config/roles/agx/agx-plant-intel/agrovoc_normalizer.md @@ -0,0 +1,11 @@ +You are AGROVOC Normalizer. + +Responsibilities: +- Normalize crop/disease terms using agrovoc_lookup. +- Provide canonical term mapping for user-facing output. +- Keep labels practical for agronomy context. + +Return format: +- canonical_terms +- term_mapping +- notes_for_user diff --git a/services/crewai-service/app/config/roles/agx/agx-plant-intel/orchestrator_synthesis.md b/services/crewai-service/app/config/roles/agx/agx-plant-intel/orchestrator_synthesis.md new file mode 100644 index 00000000..17ae88c3 --- /dev/null +++ b/services/crewai-service/app/config/roles/agx/agx-plant-intel/orchestrator_synthesis.md @@ -0,0 +1,24 @@ +Ти — Plant Intel Agent у DAARION.city. +Відповідай природно, коротко й по-людськи українською, 1–3 речення за замовчуванням. + +НАЙГОЛОВНІШЕ: +- Дані з [PLANT_VISION_PREPROCESSED] (або context.plant_vision) — єдиний source-of-truth для ідентифікації рослини. +- Для follow-up без нового фото використовуй [PREVIOUS_PLANT_IDENTIFICATION] (або context.last_plant / memory.last_plant). + +Правило впевненості (обов'язково): +- Якщо recommend_fallback == true або confidence < 0.65: + "Ймовірно , але впевненість низька. Перевірив через GBIF — найближчі збіги: . Краще нове фото при нормальному світлі." +- Інакше: + "Я бачу з впевненістю %." + +Правила синтезу: +- Не ігноруй результати pre-vision, якщо вони присутні. +- Не стверджуй "фото не надано", якщо у контексті є pre-vision або previous plant data. +- Уникай шаблонних списків, якщо користувач не просить детальний формат. +- Якщо дані суперечливі: коротко познач невизначеність і попроси 1 конкретне додаткове фото. +- Якщо top_k порожній, явно вкажи, що ідентифікація непевна, але все одно надай GBIF-орієнтир, якщо він є в контексті. + +Формат відповіді: +- 1–3 речення за замовчуванням. +- Без технічного шуму, без внутрішніх JSON/міток у відповіді користувачу. +- За запитом користувача можна розгорнути відповідь і дати короткі поради з догляду. diff --git a/services/crewai-service/app/config/roles/agx/agx-plant-intel/plant_identifier.md b/services/crewai-service/app/config/roles/agx/agx-plant-intel/plant_identifier.md new file mode 100644 index 00000000..23a6d0b7 --- /dev/null +++ b/services/crewai-service/app/config/roles/agx/agx-plant-intel/plant_identifier.md @@ -0,0 +1,11 @@ +You are Plant Identifier. + +Responsibilities: +- Parse visual cues from user description/photo context. +- Build candidate crop/plant hypotheses. +- Use plantnet_lookup first when image URL is available. +- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty. + +Return format: +- candidates: numbered list max 5, each with rationale. +- required_data: what extra image/data is needed. diff --git a/services/crewai-service/app/config/roles/agx/agx-plant-intel/taxonomy_validator.md b/services/crewai-service/app/config/roles/agx/agx-plant-intel/taxonomy_validator.md new file mode 100644 index 00000000..54df47e4 --- /dev/null +++ b/services/crewai-service/app/config/roles/agx/agx-plant-intel/taxonomy_validator.md @@ -0,0 +1,11 @@ +You are Taxonomy Validator. + +Responsibilities: +- Validate candidate names via gbif_species_lookup. +- Remove invalid/synonym-conflicted names. 
+- Keep accepted taxa and explain conflicts briefly. + +Return format: +- accepted_candidates +- rejected_candidates_with_reason +- confidence_adjustment diff --git a/services/plant-vision-node1/Dockerfile b/services/plant-vision-node1/Dockerfile new file mode 100644 index 00000000..fcf80226 --- /dev/null +++ b/services/plant-vision-node1/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY main.py . + +EXPOSE 8085 + +HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8085/health')" + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8085"] diff --git a/services/plant-vision-node1/main.py b/services/plant-vision-node1/main.py new file mode 100644 index 00000000..568b7df9 --- /dev/null +++ b/services/plant-vision-node1/main.py @@ -0,0 +1,238 @@ +import json +import os +import re +import shlex +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional + +import httpx +from fastapi import FastAPI, File, HTTPException, UploadFile +from fastapi.exceptions import RequestValidationError +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field + +app = FastAPI(title="plant-vision-node1", version="0.1.1") + + +class IdentifyRequest(BaseModel): + image_url: Optional[str] = None + top_k: int = Field(default=3, ge=1, le=10) + + +def _normalize_predictions(raw: Any, top_k: int) -> List[Dict[str, Any]]: + preds: List[Dict[str, Any]] = [] + if isinstance(raw, dict): + for key in ("predictions", "results", "candidates"): + if isinstance(raw.get(key), list): + raw = raw[key] + break + if isinstance(raw, list): + for item in raw[:top_k]: + if not isinstance(item, dict): + continue + name = ( + item.get("scientific_name") + or item.get("scientificName") + or item.get("label") + or item.get("name") + or "unknown" + ) + common = item.get("common_name") or item.get("commonName") or item.get("common") or "-" + score = item.get("score", item.get("confidence", 0.0)) + try: + score_f = float(score) + except Exception: + score_f = 0.0 + preds.append({"scientific_name": str(name), "common_name": str(common), "score": score_f}) + return preds[:top_k] + + +def _parse_text_output(text: str, top_k: int) -> List[Dict[str, Any]]: + """ + Parse only model score lines, e.g.: + 97.6% Persicaria amphibia + 86.1% Canada Goldenrod (Solidago canadensis) + Ignore service lines like "Read ..." or "Classification of ...". + """ + preds: List[Dict[str, Any]] = [] + for raw_line in (text or "").splitlines(): + line = raw_line.strip() + if not line or "%" not in line: + continue + + m = re.match(r"^\s*(\d+(?:\.\d+)?)%\s+(.+)$", line) + if not m: + continue + + score_str, name_part = m.groups() + try: + score = float(score_str) + except ValueError: + continue + + name = name_part.strip() + if not name: + continue + + common_name = "-" + scientific_name = name + + # If output is "Common Name (Scientific name)", preserve both. 
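+        # Illustrative examples (assumed CLI output shapes, mirroring the docstring):
+        #   "86.1% Canada Goldenrod (Solidago canadensis)"
+        #     -> common_name="Canada Goldenrod", scientific_name="Solidago canadensis"
+        #   "97.6% Persicaria amphibia"
+        #     -> common_name="-", scientific_name="Persicaria amphibia"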
+ paren = re.match(r"^(.*?)\s*\(([^()]+)\)\s*$", name) + if paren: + common, scientific = paren.groups() + common = common.strip() + scientific = scientific.strip() + if common: + common_name = common + if scientific: + scientific_name = scientific + + preds.append( + { + "scientific_name": scientific_name, + "common_name": common_name, + "score": score, + } + ) + + preds.sort(key=lambda x: float(x.get("score", 0.0)), reverse=True) + return preds[:top_k] + + +def _extract_inference_time(stdout: str) -> Optional[float]: + m = re.search(r"took\s+(\d+(?:\.\d+)?)\s+secs", stdout or "") + if not m: + return None + try: + return float(m.group(1)) + except Exception: + return None + + +def _run_nature_id_cli(image_path: str, top_k: int) -> Dict[str, Any]: + cmd_tmpl = (os.getenv("NATURE_ID_CMD") or "").strip() + timeout_s = int(os.getenv("NATURE_ID_TIMEOUT", "40")) + + if not cmd_tmpl: + raise RuntimeError("NATURE_ID_CMD is not configured") + + cmd = cmd_tmpl.replace("{image_path}", image_path) + proc = subprocess.run( + shlex.split(cmd), + capture_output=True, + text=True, + timeout=timeout_s, + check=False, + ) + if proc.returncode != 0: + raise RuntimeError(f"nature-id cli failed rc={proc.returncode}: {proc.stderr.strip()[:240]}") + + out = (proc.stdout or "").strip() + inference_time_sec = _extract_inference_time(out) + if not out: + return {"predictions": [], "inference_time_sec": inference_time_sec} + + try: + parsed = json.loads(out) + preds = _normalize_predictions(parsed, top_k) + except Exception: + preds = _parse_text_output(out, top_k) + + return {"predictions": preds, "inference_time_sec": inference_time_sec} + + +async def _download_image(image_url: str) -> str: + timeout_s = float(os.getenv("DOWNLOAD_TIMEOUT", "20")) + async with httpx.AsyncClient(timeout=timeout_s) as client: + resp = await client.get(image_url) + resp.raise_for_status() + data = resp.content + + with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f: + f.write(data) + return f.name + + +def _response_payload(result: Dict[str, Any]) -> Dict[str, Any]: + preds = result.get("predictions") or [] + top_k = [ + { + "confidence": float(p.get("score", 0.0)), + "name": str((p.get("common_name") if p.get("common_name") not in (None, "", "-") else p.get("scientific_name")) or "unknown"), + "scientific_name": str(p.get("scientific_name") or "unknown"), + } + for p in preds + ] + return { + "status": "success", + "model": "aiy_plants_V1", + "source": "nature-id-cli", + "count": len(preds), + "inference_time_sec": result.get("inference_time_sec"), + "predictions": preds, + "top_k": top_k, + } + + +@app.exception_handler(RequestValidationError) +async def validation_exception_handler(_, exc: RequestValidationError): + # Avoid leaking raw multipart bytes in validation responses. 
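+    # Pydantic v2 error dicts can carry an "input" echo of the offending value;
+    # for /identify-file that echo may be the raw upload body, so only
+    # loc/msg/type are passed through.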
+ errs: List[Dict[str, Any]] = [] + for e in exc.errors() or []: + errs.append({"loc": e.get("loc"), "msg": e.get("msg"), "type": e.get("type")}) + return JSONResponse(status_code=422, content={"detail": errs}) + + +@app.get("/health") +def health() -> Dict[str, Any]: + cmd = (os.getenv("NATURE_ID_CMD") or "").strip() + return { + "status": "healthy", + "nature_id_cmd_configured": bool(cmd), + "nature_id_cmd": cmd, + } + + +@app.post("/identify") +async def identify(payload: IdentifyRequest) -> Dict[str, Any]: + if not payload.image_url: + raise HTTPException(status_code=400, detail="image_url is required") + + tmp_path = "" + try: + tmp_path = await _download_image(payload.image_url) + result = _run_nature_id_cli(tmp_path, payload.top_k) + return _response_payload(result) + except HTTPException: + raise + except Exception as e: + raise HTTPException(status_code=503, detail=f"identify_failed: {e}") + finally: + if tmp_path: + try: + Path(tmp_path).unlink(missing_ok=True) + except Exception: + pass + + +@app.post("/identify-file") +async def identify_file(file: UploadFile = File(...), top_k: int = 3) -> Dict[str, Any]: + top_k = max(1, min(top_k, 10)) + tmp_path = "" + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f: + f.write(await file.read()) + tmp_path = f.name + result = _run_nature_id_cli(tmp_path, top_k) + return _response_payload(result) + except Exception as e: + raise HTTPException(status_code=503, detail=f"identify_failed: {e}") + finally: + if tmp_path: + try: + Path(tmp_path).unlink(missing_ok=True) + except Exception: + pass diff --git a/services/plant-vision-node1/requirements.txt b/services/plant-vision-node1/requirements.txt new file mode 100644 index 00000000..dccabe92 --- /dev/null +++ b/services/plant-vision-node1/requirements.txt @@ -0,0 +1,8 @@ +fastapi==0.115.5 +uvicorn[standard]==0.32.1 +httpx==0.28.1 +python-multipart==0.0.17 +Pillow==11.1.0 +requests==2.32.3 +tflite-runtime==2.14.0 +numpy==1.26.4 diff --git a/services/router/agent_tools_config.py b/services/router/agent_tools_config.py index de9bb085..f958cb3e 100644 --- a/services/router/agent_tools_config.py +++ b/services/router/agent_tools_config.py @@ -46,8 +46,15 @@ AGENT_SPECIALIZED_TOOLS = { "nutra": ['comfy_generate_image', 'comfy_generate_video'], # AgroMatrix - Agriculture - # Specialized: crop analysis, weather integration, field mapping - "agromatrix": ['comfy_generate_image', 'comfy_generate_video'], + # Specialized: crop analysis, weather integration, field mapping + plant intelligence + "agromatrix": [ + 'comfy_generate_image', + 'comfy_generate_video', + 'plantnet_lookup', + 'nature_id_identify', + 'gbif_species_lookup', + 'agrovoc_lookup', + ], # GreenFood - Food & Eco # Specialized: recipe analysis, eco-scoring diff --git a/services/router/main.py b/services/router/main.py index 3e74d7d5..e8f3d2cb 100644 --- a/services/router/main.py +++ b/services/router/main.py @@ -10,7 +10,9 @@ import yaml import httpx import logging import hashlib +import hmac import time # For latency metrics +from difflib import SequenceMatcher # CrewAI Integration try: @@ -142,13 +144,248 @@ def _vision_answer_uncertain(answer: str) -> bool: return any(m in a for m in uncertain_markers) +def _is_plant_identification_request(prompt: str) -> bool: + if not prompt: + return False + p = prompt.lower() + markers = [ + "що за рослина", "що це за рослина", "яка це рослина", "яка рослина", + "що за культура", "яка культура", "визнач рослину", "визнач культуру", + "ідентифікуй рослину", "впізнай рослину", 
+ "what plant", "identify plant", "identify crop", "what crop", + "что за растение", "какое растение", "определи растение", + ] + return any(m in p for m in markers) + + +def _build_plant_web_queries(prompt: str, vision_text: str) -> List[str]: + base = _build_vision_web_query(prompt, vision_text) + vt = _extract_vision_search_facts(vision_text, max_chars=180) + queries: List[str] = [] + if base: + queries.append(base) + if vt: + queries.append(f"{vt} crop seedling identification") + queries.append(f"{vt} identification by leaves field photo") + queries.append(f"{vt} визначення культури за листком") + # Deduplicate while preserving order. + seen = set() + out: List[str] = [] + for q in queries: + qq = re.sub(r"\s+", " ", q).strip() + if qq and qq not in seen: + seen.add(qq) + out.append(qq) + return out[:3] + + +def _build_cautious_plant_response(base_text: str, source_count: int) -> str: + concise_base = _sanitize_vision_text_for_user(base_text) or "По цьому фото поки не можу надійно визначити культуру." + parts = [seg.strip() for seg in re.split(r"(?<=[.!?])\s+", concise_base) if seg.strip()] + if len(parts) > 2: + concise_base = " ".join(parts[:2]).strip() + return ( + f"{concise_base}\n\n" + f"Зараз це попередня оцінка (перевірених джерел: {source_count}). " + "Щоб дати точну ідентифікацію, надішли 2-3 фото: загальний план, крупний план листка " + "і фото стебла/вузла росту." + ) + + +def _extract_image_inputs_for_plant_tools(images: Optional[List[str]], metadata: Dict[str, Any]) -> Dict[str, str]: + out: Dict[str, str] = {} + file_url = str((metadata or {}).get("file_url") or "").strip() + if file_url.startswith("http://") or file_url.startswith("https://"): + out["image_url"] = file_url + if images and isinstance(images, list): + first = images[0] + if isinstance(first, str): + s = first.strip() + if s.startswith("data:image/") and ";base64," in s: + out["image_data"] = s + elif not out.get("image_url") and (s.startswith("http://") or s.startswith("https://")): + out["image_url"] = s + return out + + +def _parse_tool_result_json(payload: Any) -> Dict[str, Any]: + if isinstance(payload, dict): + return payload + if isinstance(payload, str): + s = payload.strip() + if s.startswith("{") or s.startswith("["): + try: + parsed = json.loads(s) + return parsed if isinstance(parsed, dict) else {} + except Exception: + return {} + return {} + + +def _extract_top_candidates(tool_json: Dict[str, Any], limit: int = 3) -> List[Dict[str, Any]]: + rows = tool_json.get("top_k") if isinstance(tool_json, dict) else None + if not isinstance(rows, list): + return [] + out: List[Dict[str, Any]] = [] + for row in rows[:limit]: + if not isinstance(row, dict): + continue + try: + conf = float(row.get("confidence", 0.0)) + except Exception: + conf = 0.0 + if conf > 1.0 and conf <= 100.0: + conf = conf / 100.0 + if conf < 0: + conf = 0.0 + if conf > 1.0: + conf = 1.0 + name = str(row.get("name") or row.get("scientific_name") or "unknown").strip() + sci = str(row.get("scientific_name") or name or "unknown").strip() + out.append({"confidence": conf, "name": name, "scientific_name": sci}) + return out + + +def _build_agromatrix_not_sure_response(candidates: List[Dict[str, Any]], threshold: float) -> str: + if not candidates: + return ( + "Не впевнений у точній ідентифікації по цьому фото. " + "Надішли, будь ласка, 2-3 чіткі фото: загальний план рослини, листок крупним планом і стебло/вузол росту." 
+ ) + lines: List[str] = [] + for i, c in enumerate(candidates[:2], 1): + conf_pct = int(round(float(c.get("confidence", 0.0)) * 100)) + lines.append(f"{i}) {c.get('name')} ({c.get('scientific_name')}), confidence ~{conf_pct}%") + return ( + f"Не впевнений у точній ідентифікації (поріг надійності: {int(round(threshold * 100))}%).\n" + f"Найближчі варіанти:\n" + "\n".join(lines) + "\n" + "Щоб підтвердити вид, надішли чіткі фото листка (верх/низ), стебла та загального вигляду." + ) + + +def _build_agromatrix_deterministic_fallback(candidates: List[Dict[str, Any]]) -> str: + if not candidates: + return ( + "Не впевнений у точній ідентифікації по цьому фото. " + "Надішли чіткіші фото листка, стебла і загального вигляду рослини." + ) + top = candidates[0] + conf_pct = int(round(float(top.get("confidence", 0.0)) * 100)) + return ( + f"Ймовірна ідентифікація: {top.get('name')} ({top.get('scientific_name')}), confidence ~{conf_pct}%. " + "Це результат автоматичної класифікації; для підтвердження бажано ще 1-2 фото з інших ракурсів." + ) + + EMPTY_ANSWER_GUARD_AGENTS = {"devtools", "monitor"} +DETERMINISTIC_PLANT_POLICY_AGENTS = { + part.strip().lower() + for part in os.getenv( + "DETERMINISTIC_PLANT_POLICY_AGENTS", + "agromatrix,greenfood,nutra", + ).split(",") + if part.strip() +} +REPEAT_FINGERPRINT_MIN_SIMILARITY = float(os.getenv("AGENT_REPEAT_FINGERPRINT_MIN_SIMILARITY", "0.92")) def _normalize_text_response(text: str) -> str: return re.sub(r"\s+", " ", str(text or "")).strip() +def _response_fingerprint(text: str) -> str: + normalized = _normalize_text_response(text).lower() + normalized = re.sub(r"[^a-zа-яіїєґ0-9%./:;,+\- ]+", " ", normalized) + normalized = re.sub(r"\s+", " ", normalized).strip() + return normalized[:240] + + +def _fingerprint_similarity(a: str, b: str) -> float: + if not a or not b: + return 0.0 + return SequenceMatcher(None, a, b).ratio() + + +def _looks_like_user_question(text: str) -> bool: + t = (text or "").strip().lower() + if not t: + return False + if "?" 
in t: + return True + starters = ( + "що", "як", "чому", "коли", "де", "скільки", "яка", "який", "які", + "what", "how", "why", "when", "where", "which", "can you", + "что", "как", "почему", "когда", "где", "сколько", + ) + return any(t.startswith(s + " ") for s in starters) + + +def _looks_like_negative_feedback(text: str) -> bool: + t = (text or "").lower() + markers = ( + "не вірно", "невірно", "неправильно", "помилка", "знову не так", + "це не так", "не релевантно", "повтор", "ти знову", "мимо", + "wrong", "incorrect", "not relevant", "repeat", "again wrong", + "неверно", "неправильно", "это ошибка", "снова не так", + ) + return any(m in t for m in markers) + + +def _looks_like_numeric_request(text: str) -> bool: + t = (text or "").lower() + markers = ( + "скільки", "сума", "витра", "cost", "total", "amount", "ціна", + "вартість", "дохід", "прибут", "маржа", "баланс", "unit cost", + "сколько", "сумма", "затрат", "стоимость", "расход", + ) + return any(m in t for m in markers) + + +def _numeric_contract_present(text: str) -> bool: + t = _normalize_text_response(text) + low = t.lower() + if not re.search(r"\d", low): + return False + has_value_with_unit = re.search( + r"\b\d[\d\s.,]*\s*(грн|uah|usd|eur|kg|кг|т|л|га|шт|%|тон|літр|hectare|ha)\b", + low, + ) is not None + has_explicit_source = any( + re.search(pattern, low) is not None + for pattern in ( + r"\bsheet(?:\s*[:#]\s*[a-z0-9_]+|\s+[a-z0-9_]+![a-z]+\d+)", + r"\brow\s*[:#]\s*\d+", + r"\bрядок\s*[:#]\s*\d+", + r"\bлист(?:\s*[:#]\s*[a-zа-я0-9_]+|\s+[a-zа-я0-9_]+![a-zа-я]+\d+)", + r"\bcell\s*[:#]\s*[a-z]+\d+", + r"\bкомірк[а-я]*\s*[:#]\s*[a-zа-я]+\d+", + r"\bsource\s*[:#]", + r"\bджерел[оа]\s*[:#]", + ) + ) + return bool(has_value_with_unit and has_explicit_source) + + +def _build_numeric_contract_uncertain_response() -> str: + return ( + "Не можу підтвердити точне число без джерела. " + "Щоб дати коректну відповідь, надішли таблицю/файл або уточни лист і діапазон. " + "Формат відповіді дам строго як: value + unit + source(sheet,row)." + ) + + +def _response_is_uncertain_or_incomplete(text: str) -> bool: + low = _normalize_text_response(text).lower() + if not low: + return True + markers = ( + "не впевнений", "не можу", "надішли", "уточни", "уточніть", + "потрібно більше", "insufficient", "need more", "please send", + "не уверен", "не могу", "уточни", "нужно больше", + ) + return any(m in low for m in markers) + + def _needs_empty_answer_recovery(text: str) -> bool: normalized = _normalize_text_response(text) if not normalized: @@ -202,6 +439,32 @@ def _build_image_fallback_response(agent_id: str, prompt: str = "") -> str: +def _looks_like_image_question(prompt: str) -> bool: + if not prompt: + return False + p = str(prompt).lower().strip() + # If it's too short (e.g. "що це?", "что это?"), it might be a follow-up to an image. + # But we should only trigger the guard if the user EXPLICITLY mentions an image + # and we don't have one. + markers = ( + "що на фото", "що на цьому фото", "що на зображ", "опиши фото", "проаналізуй фото", + "what is in the image", "what is on this photo", "describe the photo", "analyze image", + "что на фото", "что на этом фото", "опиши фото", "проанализируй фото", + ) + if any(m in p for m in markers): + return True + + # Refined regex: must contain 'what|what is|how' and 'photo|image' + # but avoid generic "can you..." 
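+    # Matches e.g. "що зображено на фото" or "подивись на це фото": a trigger word
+    # followed within 24 characters by a photo/image token.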
+ if re.search(r"(що|what|что|опиши|проаналізуй|подивись).{1,24}(фото|зображ|image|photo|світлин)", p): + # Exclude common meta-questions that might contain these words but aren't about an image + meta_exclude = ["канал", "чат", "бот", "нормально"] + if not any(ex in p for ex in meta_exclude): + return True + + return False + + def _sanitize_vision_text_for_user(text: str) -> str: if not text: return "" @@ -640,6 +903,12 @@ CLAN_RUNTIME_CONSENT_EVENT_SCHEMA_PATH = os.getenv( NEO4J_URI = os.getenv("NEO4J_BOLT_URL", "bolt://neo4j:7687") NEO4J_USER = os.getenv("NEO4J_USER", "neo4j") NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "DaarionNeo4j2026!") +AGROMATRIX_REVIEW_AUTH_MODE = os.getenv("AGROMATRIX_REVIEW_AUTH_MODE", "bearer").strip().lower() +AGROMATRIX_REVIEW_BEARER_TOKENS = [ + part.strip() + for part in os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").replace(";", ",").split(",") + if part.strip() +] # HTTP client for backend services http_client: Optional[httpx.AsyncClient] = None @@ -1151,6 +1420,68 @@ class ToolExecuteRequest(BaseModel): metadata: Optional[Dict[str, Any]] = None +class DocumentIngestRequest(BaseModel): + """Ingest document text into agent-specific docs collection.""" + agent_id: str + doc_id: str + file_name: Optional[str] = None + text: str + dao_id: Optional[str] = None + user_id: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + +class DocumentQueryRequest(BaseModel): + """Query document context from agent-specific docs collection.""" + agent_id: str + question: str + doc_id: Optional[str] = None + dao_id: Optional[str] = None + user_id: Optional[str] = None + limit: int = 5 + + +class DocumentUpdateRequest(BaseModel): + """Update existing document text and bump version.""" + agent_id: str + doc_id: str + file_name: Optional[str] = None + text: str + dao_id: Optional[str] = None + user_id: Optional[str] = None + storage_ref: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + +class SharedMemoryReviewRequest(BaseModel): + point_id: str + approve: bool + reviewer: Optional[str] = None + note: Optional[str] = None + + +def _require_agromatrix_review_auth(request: Request) -> None: + mode = AGROMATRIX_REVIEW_AUTH_MODE + if mode in {"off", "none", "disabled"}: + return + + if mode != "bearer": + raise HTTPException(status_code=500, detail=f"Unsupported AGROMATRIX_REVIEW_AUTH_MODE={mode}") + + if not AGROMATRIX_REVIEW_BEARER_TOKENS: + logger.error("AGROMATRIX_REVIEW_AUTH_MODE=bearer but AGROMATRIX_REVIEW_BEARER_TOKENS is empty") + raise HTTPException(status_code=503, detail="Review auth is not configured") + + auth_header = request.headers.get("Authorization", "") + if not auth_header.startswith("Bearer "): + raise HTTPException(status_code=401, detail="Missing Bearer token") + + token = auth_header[len("Bearer ") :].strip() + if not token: + raise HTTPException(status_code=401, detail="Empty Bearer token") + + if not any(hmac.compare_digest(token, candidate) for candidate in AGROMATRIX_REVIEW_BEARER_TOKENS): + raise HTTPException(status_code=403, detail="Invalid mentor token") # ========================================================================= @@ -1402,6 +1733,8 @@ async def agent_infer(agent_id: str, request: InferRequest): # MEMORY RETRIEVAL (v4.0 - Universal for all agents) # ========================================================================= memory_brief_text = "" + brief: Optional[MemoryBrief] = None + session_state = None # Extract metadata once for both retrieval and storage metadata = request.metadata or {} 
channel = "telegram" # Default @@ -1410,6 +1743,47 @@ async def agent_infer(agent_id: str, request: InferRequest): username = metadata.get("username") # Get agent_id from metadata or URL parameter request_agent_id = metadata.get("agent_id", agent_id).lower() + + # Safety guard: avoid text-only handling for image questions without image payload. + # IMPORTANT: inspect only the latest user text when provided by gateway, + # not the full context-augmented prompt. + raw_user_text = str(metadata.get("raw_user_text", "") or "").strip() + incoming_user_text = raw_user_text if raw_user_text else request.prompt + image_guard_text = incoming_user_text + track_pending_question = _looks_like_user_question(incoming_user_text) + + if ( + MEMORY_RETRIEVAL_AVAILABLE + and memory_retrieval + and chat_id + and user_id + and track_pending_question + ): + try: + await memory_retrieval.register_pending_question( + channel=channel, + chat_id=chat_id, + user_id=user_id, + agent_id=request_agent_id, + question_text=incoming_user_text, + metadata={ + "source": "router_infer", + "has_images": bool(request.images), + }, + ) + except Exception as e: + logger.debug(f"Pending question register skipped: {e}") + + if (not request.images) and _looks_like_image_question(image_guard_text): + return InferResponse( + response=( + "Бачу запит про фото/зображення, але в запиті немає самого файлу. " + "Надішліть фото ще раз (або прикріпіть файл із підписом), і я одразу проаналізую." + ), + model="vision-context-required", + backend="router-guard", + tokens_used=0, + ) if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval: try: @@ -1422,6 +1796,7 @@ async def agent_infer(agent_id: str, request: InferRequest): username=username, message=request.prompt ) + session_state = brief.session_state if brief else None memory_brief_text = brief.to_text(max_lines=10) if memory_brief_text: logger.info(f"🧠 Memory brief for {request_agent_id}: {len(memory_brief_text)} chars") @@ -1473,6 +1848,63 @@ async def agent_infer(agent_id: str, request: InferRequest): f"🧩 Prompt meta for {agent_id}: source={system_prompt_source}, " f"version={effective_metadata['system_prompt_version']}, hash={system_prompt_hash}" ) + + async def _finalize_response_text(text: str, backend_tag: str) -> str: + final_text = _normalize_text_response(text) + if not final_text: + return final_text + + # Agro numeric contract: no numbers without unit + source marker. + if request_agent_id == "agromatrix" and _looks_like_numeric_request(incoming_user_text): + if not _numeric_contract_present(final_text): + final_text = _build_numeric_contract_uncertain_response() + + # Anti-repeat guard: if user reports wrong answer and new answer is near-identical + # to previous one, force non-repetitive recovery text. + prev_fp = "" + if session_state and getattr(session_state, "last_answer_fingerprint", None): + prev_fp = str(session_state.last_answer_fingerprint or "") + new_fp = _response_fingerprint(final_text) + if prev_fp and new_fp: + similarity = _fingerprint_similarity(prev_fp, new_fp) + if similarity >= REPEAT_FINGERPRINT_MIN_SIMILARITY and _looks_like_negative_feedback(incoming_user_text): + final_text = ( + "Прийняв, попередня відповідь була не по суті. Не повторюю її. " + "Переформулюю коротко і по ділу: надішли 1 конкретне питання або файл/фото, " + "і я дам перевірену відповідь із джерелом." 
+ ) + new_fp = _response_fingerprint(final_text) + logger.warning( + f"🔁 Repeat guard fired for {request_agent_id}: similarity={similarity:.3f}, backend={backend_tag}" + ) + + # Resolve oldest pending question only when answer is not uncertain. + if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id: + try: + if track_pending_question and not _response_is_uncertain_or_incomplete(final_text): + await memory_retrieval.resolve_pending_question( + channel=channel, + chat_id=chat_id, + user_id=user_id, + agent_id=request_agent_id, + answer_text=final_text, + reason="answered", + ) + except Exception as e: + logger.debug(f"Pending question resolve skipped: {e}") + + try: + if session_state and getattr(session_state, "conversation_id", None): + await memory_retrieval.update_session_state( + session_state.conversation_id, + last_answer_fingerprint=new_fp[:240], + last_user_id=user_id, + last_user_nick=username, + ) + except Exception as e: + logger.debug(f"Session fingerprint update skipped: {e}") + + return final_text # Determine which backend to use # Use router config to get default model for agent, fallback to qwen3:8b @@ -1535,9 +1967,12 @@ async def agent_infer(agent_id: str, request: InferRequest): # Get agent CrewAI config from registry (or router_config fallback) crewai_cfg = agent_config.get("crewai", {}) + # CrewAI trigger should inspect the latest user message first (if provided by gateway), + # otherwise it can overreact to long context history. + decision_prompt = str(effective_metadata.get("raw_user_text", "") or "").strip() or request.prompt use_crewai, crewai_reason = should_use_crewai( agent_id=agent_id, - prompt=request.prompt, + prompt=decision_prompt, agent_config=agent_config, metadata=effective_metadata, force_crewai=effective_metadata.get("force_crewai", False), @@ -1617,6 +2052,8 @@ async def agent_infer(agent_id: str, request: InferRequest): parts = re.split(r"(?<=[.!?])\s+", final_response_text.strip()) if len(parts) > 3: final_response_text = " ".join(parts[:3]).strip() + + final_response_text = await _finalize_response_text(final_response_text, "crewai") # Store interaction in memory if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id: @@ -1800,6 +2237,156 @@ async def agent_infer(agent_id: str, request: InferRequest): # ========================================================================= if request.images and len(request.images) > 0: logger.info(f"🖼️ Vision request: {len(request.images)} image(s)") + plant_intent = _is_plant_identification_request(request.prompt) + + # Deterministic AgroMatrix policy: + # 1) run plant classifiers first (nature-id / plantnet) + # 2) apply confidence threshold + # 3) LLM only explains classifier result, no new guessing + if request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and plant_intent and TOOL_MANAGER_AVAILABLE and tool_manager: + try: + image_inputs = _extract_image_inputs_for_plant_tools(request.images, metadata) + if image_inputs: + threshold = float( + os.getenv( + "AGROMATRIX_PLANT_CONFIDENCE_MIN", + os.getenv("NATURE_ID_MIN_CONFIDENCE", "0.65"), + ) + ) + nature_args: Dict[str, Any] = {"top_k": 5, "min_confidence": threshold} + nature_args.update(image_inputs) + nature_res = await tool_manager.execute_tool( + "nature_id_identify", + nature_args, + agent_id=request_agent_id, + chat_id=chat_id, + user_id=user_id, + ) + nature_json = _parse_tool_result_json(nature_res.result) if nature_res and nature_res.success else {} + candidates = _extract_top_candidates(nature_json, 
limit=3) + + plantnet_key = (os.getenv("PLANTNET_API_KEY") or "").strip() + if plantnet_key: + plantnet_args: Dict[str, Any] = {"top_k": 3, "organ": "leaf"} + plantnet_args.update(image_inputs) + plantnet_res = await tool_manager.execute_tool( + "plantnet_lookup", + plantnet_args, + agent_id=request_agent_id, + chat_id=chat_id, + user_id=user_id, + ) + plantnet_json = _parse_tool_result_json(plantnet_res.result) if plantnet_res and plantnet_res.success else {} + plantnet_candidates = _extract_top_candidates(plantnet_json, limit=2) + if not candidates and plantnet_candidates: + candidates = plantnet_candidates + + top_conf = float(candidates[0].get("confidence", 0.0)) if candidates else 0.0 + if (not candidates) or (top_conf < threshold): + response_text = _build_agromatrix_not_sure_response(candidates, threshold) + response_text = await _finalize_response_text(response_text, "plant-id-deterministic-uncertain") + if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id: + asyncio.create_task( + memory_retrieval.store_message( + agent_id=request_agent_id, + user_id=user_id, + username=username, + message_text=f"[Image][PlantIntent] {request.prompt}", + response_text=response_text, + chat_id=chat_id, + message_type="vision", + metadata={ + "deterministic_plant_id": True, + "confidence_threshold": threshold, + "candidates": candidates, + "decision": "uncertain", + }, + ) + ) + return InferResponse( + response=response_text, + model="plant-id-deterministic", + backend="plant-id-deterministic", + tokens_used=0, + ) + + # High-confidence deterministic result -> LLM explains only this result. + top = candidates[0] + classifier_payload = { + "source": nature_json.get("source") if isinstance(nature_json, dict) else "nature-id", + "threshold": threshold, + "selected": top, + "top_k": candidates, + } + explain_prompt = ( + "Користувач попросив ідентифікувати рослину на фото.\n" + f"Використай ТІЛЬКИ цей deterministic результат класифікатора: {json.dumps(classifier_payload, ensure_ascii=False)}\n\n" + "Сформуй коротку відповідь українською (2-4 речення):\n" + "1) назва культури (common + scientific),\n" + "2) confidence у %, \n" + "3) 1-2 ознаки для практичної перевірки в полі.\n" + "Не вигадуй інші види. Якщо даних замало, прямо скажи: 'не впевнений'." 
+ ) + llm_model = "plant-id-deterministic" + llm_backend = "plant-id-deterministic" + llm_tokens = 0 + try: + llm_resp = await internal_llm_complete( + InternalLLMRequest( + prompt=explain_prompt, + llm_profile="reasoning", + max_tokens=min(int(request.max_tokens or 220), 280), + temperature=0.1, + role_context="AgroMatrix classifier explainer", + metadata={"agent_id": "agromatrix"}, + ) + ) + response_text = _sanitize_vision_text_for_user(llm_resp.text) + llm_model = llm_resp.model + llm_backend = f"plant-id-explainer-{llm_resp.provider}" + llm_tokens = llm_resp.tokens_used + except Exception as e: + logger.warning(f"⚠️ Deterministic plant explanation LLM failed: {e}") + response_text = "" + + if not response_text: + response_text = _build_agromatrix_deterministic_fallback(candidates) + else: + low = response_text.lower() + top_name = str(top.get("name") or "").lower() + top_sci = str(top.get("scientific_name") or "").lower() + if (top_name and top_name not in low) and (top_sci and top_sci not in low): + response_text = _build_agromatrix_deterministic_fallback(candidates) + + response_text = await _finalize_response_text(response_text, llm_backend) + + if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id: + asyncio.create_task( + memory_retrieval.store_message( + agent_id=request_agent_id, + user_id=user_id, + username=username, + message_text=f"[Image][PlantIntent] {request.prompt}", + response_text=response_text, + chat_id=chat_id, + message_type="vision", + metadata={ + "deterministic_plant_id": True, + "confidence_threshold": threshold, + "candidates": candidates, + "decision": "high_confidence", + }, + ) + ) + return InferResponse( + response=response_text, + model=llm_model, + backend=llm_backend, + tokens_used=llm_tokens, + ) + except Exception as e: + logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}") + try: # Use Swapper's /vision endpoint (manages model loading) vision_payload = { @@ -1831,6 +2418,9 @@ async def agent_infer(agent_id: str, request: InferRequest): full_response = _sanitize_vision_text_for_user(raw_response) vision_web_query = "" vision_sources: List[Dict[str, str]] = [] + plant_intent = _is_plant_identification_request(request.prompt) + wants_web = False + uncertain = False # Debug: log full response structure logger.info( @@ -1848,39 +2438,57 @@ async def agent_infer(agent_id: str, request: InferRequest): try: wants_web = _vision_prompt_wants_web(request.prompt) uncertain = _vision_answer_uncertain(full_response or raw_response) - if wants_web or uncertain: - query = _build_vision_web_query(request.prompt, full_response or raw_response) - if not query: + if wants_web or uncertain or plant_intent: + queries = ( + _build_plant_web_queries(request.prompt, full_response or raw_response) + if plant_intent + else [_build_vision_web_query(request.prompt, full_response or raw_response)] + ) + queries = [q for q in queries if q] + if not queries: logger.info("🔎 Vision web enrich skipped: query not actionable") else: - vision_web_query = query - search_result = await tool_manager.execute_tool( - "web_search", - {"query": query, "max_results": 3}, - agent_id=request_agent_id, - chat_id=chat_id, - user_id=user_id, - ) - if search_result and search_result.success and search_result.result: - + vision_web_query = queries[0] + merged_chunks: List[str] = [] + merged_sources: List[Dict[str, str]] = [] + for query in queries: + search_result = await tool_manager.execute_tool( + "web_search", + {"query": query, 
"max_results": 3}, + agent_id=request_agent_id, + chat_id=chat_id, + user_id=user_id, + ) + if not (search_result and search_result.success and search_result.result): + continue compact_search = _compact_web_search_result( search_result.result, query=query, agent_id=request_agent_id, ) + if not compact_search or "Нічого не знайдено" in compact_search: + continue + merged_chunks.append(compact_search) + merged_sources.extend(_extract_sources_from_compact(compact_search)) - if compact_search and "Нічого не знайдено" not in compact_search: - vision_sources = _extract_sources_from_compact(compact_search) + # Deduplicate sources. + if merged_sources: + uniq: List[Dict[str, str]] = [] + seen = set() + for s in merged_sources: + key = (s.get("title", ""), s.get("url", "")) + if key in seen: + continue + seen.add(key) + uniq.append(s) + vision_sources = uniq[:5] - base_text = full_response or "Не вдалося надійно ідентифікувати об'єкт на фото." - - full_response = ( - - f"{base_text}\n\n" - - f"Додатково знайшов у відкритих джерелах:\n{compact_search}" - - ) + if merged_chunks: + base_text = full_response or "Не вдалося надійно ідентифікувати об'єкт на фото." + full_response = ( + f"{base_text}\n\n" + f"Додатково знайшов у відкритих джерелах:\n{merged_chunks[0]}" + ) logger.info( "🌐 Vision web enrichment applied " @@ -1896,6 +2504,11 @@ async def agent_infer(agent_id: str, request: InferRequest): f"query='{vision_web_query[:180]}', sources={len(vision_sources)}" ) + # Plant identification safety gate: + # avoid hard species claims when confidence is low or evidence is weak. + if request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and plant_intent and (uncertain or len(vision_sources) < 2): + full_response = _build_cautious_plant_response(full_response or raw_response, len(vision_sources)) + # Image quality gate: one soft retry if response looks empty/meta. 
if _image_response_needs_retry(full_response): try: @@ -1925,8 +2538,10 @@ async def agent_infer(agent_id: str, request: InferRequest): if _image_response_needs_retry(full_response): full_response = _build_image_fallback_response(request_agent_id, request.prompt) - elif request_agent_id == "agromatrix" and _vision_response_is_blurry(full_response): + elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response): full_response = _build_image_fallback_response(request_agent_id, request.prompt) + + full_response = await _finalize_response_text(full_response, "swapper-vision") # Store vision message in agent-specific memory if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response: @@ -1956,8 +2571,12 @@ async def agent_infer(agent_id: str, request: InferRequest): ) else: logger.error(f"❌ Swapper vision error: {vision_resp.status_code} - {vision_resp.text[:200]}") + fallback_response = await _finalize_response_text( + _build_image_fallback_response(request_agent_id, request.prompt), + "swapper-vision-fallback", + ) return InferResponse( - response=_build_image_fallback_response(request_agent_id, request.prompt), + response=fallback_response, model="qwen3-vl-8b", tokens_used=None, backend="swapper-vision-fallback" @@ -1965,8 +2584,12 @@ async def agent_infer(agent_id: str, request: InferRequest): except Exception as e: logger.error(f"❌ Vision processing failed: {e}", exc_info=True) + fallback_response = await _finalize_response_text( + _build_image_fallback_response(request_agent_id, request.prompt), + "swapper-vision-fallback", + ) return InferResponse( - response=_build_image_fallback_response(request_agent_id, request.prompt), + response=fallback_response, model="qwen3-vl-8b", tokens_used=None, backend="swapper-vision-fallback" @@ -2236,6 +2859,19 @@ async def agent_infer(agent_id: str, request: InferRequest): tool_args = {"params": {"count": 3, "timezone": "Europe/Kyiv"}} logger.info("🛠️ oneok: auto-filled schedule_propose_slots.params") + # Plant tools: inject runtime image payload from current request to avoid + # hallucinated page URLs (e.g. t.me//) that are not direct images. + if tool_name in {"nature_id_identify", "plantnet_lookup"}: + if not isinstance(tool_args, dict): + tool_args = {} + runtime_image_data = None + if isinstance(request.images, list) and request.images: + first_image = request.images[0] + if isinstance(first_image, str) and first_image.startswith("data:image/") and ";base64," in first_image: + runtime_image_data = first_image + if runtime_image_data: + tool_args["_runtime_image_data"] = runtime_image_data + result = await tool_manager.execute_tool( tool_name, tool_args, @@ -2421,6 +3057,7 @@ async def agent_infer(agent_id: str, request: InferRequest): logger.debug(f" Tool {tr['name']}: no image_base64") logger.info(f"✅ {cloud['name'].upper()} response received, {tokens_used} tokens") + response_text = await _finalize_response_text(response_text, f"{cloud['name']}-cloud") # Store message in agent-specific memory (async, non-blocking) if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id: @@ -2557,6 +3194,7 @@ async def agent_infer(agent_id: str, request: InferRequest): "Я не отримав корисну відповідь з першої спроби. " "Сформулюй запит коротко ще раз, і я відповім конкретно." 
) + local_response = await _finalize_response_text(local_response, "swapper+ollama") # Store in agent-specific memory if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response: @@ -2601,8 +3239,9 @@ async def agent_infer(agent_id: str, request: InferRequest): if generate_resp.status_code == 200: data = generate_resp.json() + fallback_text = await _finalize_response_text(data.get("response", ""), "ollama-direct") return InferResponse( - response=data.get("response", ""), + response=fallback_text, model=model, tokens_used=data.get("eval_count", 0), backend="ollama-direct" @@ -2671,6 +3310,220 @@ async def tools_execute(request: ToolExecuteRequest): return {"status": "failed", "data": data, "error": {"message": result.error or "Tool failed"}} +@app.post("/v1/documents/ingest") +async def documents_ingest(request: DocumentIngestRequest): + """ + Ingest raw document text into Qdrant {agent_id}_docs. + """ + if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval: + raise HTTPException(status_code=503, detail="Memory retrieval not available") + + agent_id = (request.agent_id or "").strip().lower() + if not agent_id: + raise HTTPException(status_code=400, detail="agent_id is required") + + text = (request.text or "").strip() + if not text: + raise HTTPException(status_code=400, detail="text is required") + + doc_id = (request.doc_id or "").strip() + if not doc_id: + # Fallback should be deterministic for same text + file + seed = f"{agent_id}:{request.file_name or ''}:{text[:400]}" + doc_id = hashlib.md5(seed.encode("utf-8")).hexdigest()[:16] + + result = await memory_retrieval.ingest_document_chunks( + agent_id=agent_id, + doc_id=doc_id, + file_name=request.file_name, + text=text, + dao_id=request.dao_id, + user_id=request.user_id, + metadata=request.metadata, + ) + if not result.get("ok"): + return { + "ok": False, + "error": result.get("error", "ingest_failed"), + "doc_id": doc_id, + "collection": result.get("collection"), + } + return result + + +@app.post("/v1/documents/query") +async def documents_query(request: DocumentQueryRequest): + """ + Query ingested document chunks and synthesize source-locked answer. 
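+
+    Retrieves the top matching chunks from the agent's Qdrant docs collection,
+    then asks the reasoning LLM to answer strictly from that context. Success
+    responses carry data.answer plus data.citations (doc_id, file_name,
+    chunk_index, version_no, score); with no relevant chunks the endpoint
+    returns ok=False instead of a guessed answer.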
+ """ + if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval: + raise HTTPException(status_code=503, detail="Memory retrieval not available") + + agent_id = (request.agent_id or "").strip().lower() + if not agent_id: + raise HTTPException(status_code=400, detail="agent_id is required") + + question = (request.question or "").strip() + if not question: + raise HTTPException(status_code=400, detail="question is required") + + lookup = await memory_retrieval.query_document_chunks( + agent_id=agent_id, + question=question, + doc_id=request.doc_id, + dao_id=request.dao_id, + limit=request.limit, + ) + chunks = lookup.get("chunks") or [] + if not chunks: + return { + "ok": False, + "error": lookup.get("error", "no_relevant_chunks"), + "data": { + "answer": None, + "citations": [], + "doc_id": request.doc_id, + }, + } + + citations: List[Dict[str, Any]] = [] + context_blocks: List[str] = [] + for i, ch in enumerate(chunks, start=1): + c_doc_id = ch.get("doc_id") or request.doc_id + c_file = ch.get("file_name") + c_idx = ch.get("chunk_index") + c_score = float(ch.get("score", 0.0) or 0.0) + citations.append( + { + "doc_id": c_doc_id, + "file_name": c_file, + "chunk_index": c_idx, + "version_no": ch.get("version_no"), + "score": round(c_score, 4), + } + ) + src = [] + if c_file: + src.append(f"file={c_file}") + if c_idx is not None: + src.append(f"chunk={int(c_idx) + 1}") + src_label = ", ".join(src) if src else "chunk" + context_blocks.append(f"[{i}] ({src_label}) {str(ch.get('text') or '').strip()[:1400]}") + + answer_text = "" + try: + llm_req = InternalLLMRequest( + prompt=( + "Питання користувача:\n" + f"{question}\n\n" + "Контекст із документа (дозволено використовувати ТІЛЬКИ його):\n" + + "\n\n".join(context_blocks) + + "\n\n" + "Правила відповіді:\n" + "1) Відповідай лише на основі наведеного контексту.\n" + "2) Якщо даних недостатньо, прямо скажи: 'Недостатньо даних у документі'.\n" + "3) В кінці додай коротке посилання на джерело у форматі [source: N].\n" + ), + llm_profile="reasoning", + max_tokens=320, + temperature=0.1, + role_context="Document QA source-locked", + metadata={"agent_id": agent_id, "mode": "documents_query"}, + ) + llm_resp = await internal_llm_complete(llm_req) + answer_text = (llm_resp.text or "").strip() + except Exception as e: + logger.warning(f"documents_query LLM synthesis failed: {e}") + + if not answer_text: + top = chunks[0] + answer_text = ( + "Знайшов релевантний фрагмент у документі, але не вдалося сформувати підсумок. " + f"Ось ключовий уривок:\n{str(top.get('text') or '').strip()[:1200]}" + ) + + return { + "ok": True, + "data": { + "answer": answer_text, + "citations": citations, + "doc_id": request.doc_id or chunks[0].get("doc_id"), + "chunks_used": len(chunks), + "collection": lookup.get("collection"), + }, + } + + +@app.post("/v1/documents/update") +async def documents_update(request: DocumentUpdateRequest): + """ + Replace document chunks for doc_id with new text and create a new version row. 
+ """ + if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval: + raise HTTPException(status_code=503, detail="Memory retrieval not available") + + agent_id = (request.agent_id or "").strip().lower() + if not agent_id: + raise HTTPException(status_code=400, detail="agent_id is required") + + doc_id = (request.doc_id or "").strip() + if not doc_id: + raise HTTPException(status_code=400, detail="doc_id is required") + + text = (request.text or "").strip() + if not text: + raise HTTPException(status_code=400, detail="text is required") + + result = await memory_retrieval.update_document_chunks( + agent_id=agent_id, + doc_id=doc_id, + file_name=request.file_name, + text=text, + dao_id=request.dao_id, + user_id=request.user_id, + metadata=request.metadata, + storage_ref=request.storage_ref, + ) + if not result.get("ok"): + return { + "ok": False, + "error": result.get("error", "update_failed"), + "doc_id": doc_id, + "collection": result.get("collection"), + } + return result + + +@app.get("/v1/documents/{doc_id}/versions") +async def documents_versions(doc_id: str, agent_id: str, limit: int = 20): + """ + List stored versions for a document. + """ + if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval: + raise HTTPException(status_code=503, detail="Memory retrieval not available") + + aid = (agent_id or "").strip().lower() + if not aid: + raise HTTPException(status_code=400, detail="agent_id is required") + + did = (doc_id or "").strip() + if not did: + raise HTTPException(status_code=400, detail="doc_id is required") + + items = await memory_retrieval.list_document_versions( + agent_id=aid, + doc_id=did, + limit=limit, + ) + return { + "ok": True, + "agent_id": aid, + "doc_id": did, + "total": len(items), + "items": items, + } + + @app.get("/v1/models") async def list_available_models(): """List all available models across backends""" @@ -2712,6 +3565,42 @@ async def list_available_models(): return {"models": models, "total": len(models)} +@app.get("/v1/agromatrix/shared-memory/pending") +async def agromatrix_shared_pending(limit: int = 50): + """List pending shared agronomy memory cases for mentor review.""" + if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval: + raise HTTPException(status_code=503, detail="Memory retrieval not available") + if not hasattr(memory_retrieval, "list_shared_pending_cases"): + raise HTTPException(status_code=501, detail="Pending review API not enabled") + items = await memory_retrieval.list_shared_pending_cases(limit=limit) + return {"items": items, "total": len(items)} + + +@app.post("/v1/agromatrix/shared-memory/review") +async def agromatrix_shared_review(req: SharedMemoryReviewRequest, request: Request): + """Approve or reject a pending shared agronomy memory case.""" + _require_agromatrix_review_auth(request) + + if not MEMORY_RETRIEVAL_AVAILABLE or not memory_retrieval: + raise HTTPException(status_code=503, detail="Memory retrieval not available") + if not hasattr(memory_retrieval, "review_shared_pending_case"): + raise HTTPException(status_code=501, detail="Review API not enabled") + + result = await memory_retrieval.review_shared_pending_case( + point_id=req.point_id, + approve=req.approve, + reviewer=req.reviewer, + note=req.note, + ) + if not isinstance(result, dict): + raise HTTPException(status_code=500, detail="Invalid review result") + if result.get("ok"): + return result + if result.get("error") == "not_found": + raise HTTPException(status_code=404, detail="Pending case not found") + raise HTTPException(status_code=500, 
detail=result.get("error", "review_failed")) + + # ============================================================================= # NEO4J GRAPH API ENDPOINTS # ============================================================================= diff --git a/services/router/memory_retrieval.py b/services/router/memory_retrieval.py index c3f887d0..bf1aaaea 100644 --- a/services/router/memory_retrieval.py +++ b/services/router/memory_retrieval.py @@ -18,6 +18,8 @@ Collections per agent: import os import json import logging +import re +import hashlib from typing import Optional, Dict, Any, List from dataclasses import dataclass, field from datetime import datetime @@ -35,6 +37,10 @@ COHERE_API_KEY = os.getenv("COHERE_API_KEY", "") NEO4J_BOLT_URL = os.getenv("NEO4J_BOLT_URL", "bolt://neo4j:7687") NEO4J_USER = os.getenv("NEO4J_USER", "neo4j") NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "neo4j") +PENDING_QUESTIONS_LIMIT = int(os.getenv("AGENT_PENDING_QUESTIONS_LIMIT", "5")) +SHARED_AGRO_LIBRARY_ENABLED = os.getenv("AGROMATRIX_SHARED_LIBRARY_ENABLED", "true").lower() == "true" +SHARED_AGRO_LIBRARY_REQUIRE_REVIEW = os.getenv("AGROMATRIX_SHARED_LIBRARY_REQUIRE_REVIEW", "true").lower() == "true" +DOC_VERSION_PREVIEW_CHARS = int(os.getenv("DOC_VERSION_PREVIEW_CHARS", "240")) @dataclass @@ -61,6 +67,7 @@ class SessionState: last_answer_fingerprint: Optional[str] = None trust_mode: bool = False apprentice_mode: bool = False + pending_questions: List[str] = field(default_factory=list) @dataclass @@ -95,6 +102,10 @@ class MemoryBrief: lines.append("📚 Режим учня — можеш ставити уточнюючі питання") if self.session_state.active_topic: lines.append(f"📌 Активна тема: {self.session_state.active_topic}") + if self.session_state.pending_questions: + lines.append("🕘 Невідповідані питання в цьому чаті (відповідай на них першочергово):") + for q in self.session_state.pending_questions[:3]: + lines.append(f" - {q[:180]}") # User facts (preferences, profile) if self.user_facts: @@ -178,6 +189,7 @@ class MemoryRetrieval: # HTTP client for embeddings self.http_client = httpx.AsyncClient(timeout=30.0) + await self._ensure_aux_tables() async def close(self): """Close connections""" @@ -187,6 +199,77 @@ class MemoryRetrieval: await self.neo4j_driver.close() if self.http_client: await self.http_client.aclose() + + async def _ensure_aux_tables(self): + """Create auxiliary tables used by agent runtime policies.""" + if not self.pg_pool: + return + try: + async with self.pg_pool.acquire() as conn: + await conn.execute( + """ + CREATE TABLE IF NOT EXISTS agent_session_state ( + channel TEXT NOT NULL, + chat_id TEXT NOT NULL, + user_id TEXT NOT NULL, + agent_id TEXT NOT NULL, + conversation_id TEXT NOT NULL, + last_user_id TEXT, + last_user_nick TEXT, + active_topic TEXT, + context_open BOOLEAN NOT NULL DEFAULT FALSE, + last_media_handled BOOLEAN NOT NULL DEFAULT TRUE, + last_answer_fingerprint TEXT, + trust_mode BOOLEAN NOT NULL DEFAULT FALSE, + apprentice_mode BOOLEAN NOT NULL DEFAULT FALSE, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (channel, chat_id, user_id, agent_id) + ); + CREATE INDEX IF NOT EXISTS idx_agent_session_state_conv + ON agent_session_state (conversation_id); + + CREATE TABLE IF NOT EXISTS agent_pending_questions ( + id BIGSERIAL PRIMARY KEY, + channel TEXT NOT NULL, + chat_id TEXT NOT NULL, + user_id TEXT NOT NULL, + agent_id TEXT NOT NULL, + question_text TEXT NOT NULL, + question_fingerprint TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 
'pending', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + answered_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::jsonb + ); + CREATE INDEX IF NOT EXISTS idx_agent_pending_questions_scope + ON agent_pending_questions (agent_id, channel, chat_id, user_id, status, created_at DESC); + CREATE UNIQUE INDEX IF NOT EXISTS idx_agent_pending_questions_unique_open + ON agent_pending_questions (agent_id, channel, chat_id, user_id, question_fingerprint, status); + + CREATE TABLE IF NOT EXISTS agent_document_versions ( + id BIGSERIAL PRIMARY KEY, + agent_id TEXT NOT NULL, + doc_id TEXT NOT NULL, + version_no INTEGER NOT NULL, + text_hash TEXT NOT NULL, + text_len INTEGER NOT NULL DEFAULT 0, + text_preview TEXT, + file_name TEXT, + dao_id TEXT, + user_id TEXT, + storage_ref TEXT, + source TEXT NOT NULL DEFAULT 'ingest', + metadata JSONB NOT NULL DEFAULT '{}'::jsonb, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (agent_id, doc_id, version_no) + ); + CREATE INDEX IF NOT EXISTS idx_agent_document_versions_latest + ON agent_document_versions (agent_id, doc_id, version_no DESC); + """ + ) + except Exception as e: + logger.warning(f"Aux tables init failed: {e}") # ========================================================================= # L2: Platform Identity Resolution @@ -236,7 +319,7 @@ class MemoryRetrieval: identity.is_mentor = bool(is_mentor) except Exception as e: - logger.warning(f"Identity resolution failed: {e}") + logger.debug(f"Identity resolution fallback: {e}") return identity @@ -248,7 +331,9 @@ class MemoryRetrieval: self, channel: str, chat_id: str, - thread_id: Optional[str] = None + thread_id: Optional[str] = None, + agent_id: Optional[str] = None, + user_id: Optional[str] = None, ) -> SessionState: """Get or create session state for conversation""" state = SessionState() @@ -258,42 +343,78 @@ class MemoryRetrieval: try: async with self.pg_pool.acquire() as conn: - # Get or create conversation - conv_id = await conn.fetchval( - "SELECT get_or_create_conversation($1, $2, $3, NULL)", - channel, chat_id, thread_id - ) - state.conversation_id = str(conv_id) if conv_id else None - - # Get conversation state - if conv_id: - row = await conn.fetchrow(""" - SELECT * FROM helion_conversation_state - WHERE conversation_id = $1 - """, conv_id) - - if row: - state.last_addressed = row.get('last_addressed_to_helion', False) - state.active_topic = row.get('active_topic_id') - state.context_open = row.get('active_context_open', False) - state.last_media_handled = row.get('last_media_handled', True) - state.last_answer_fingerprint = row.get('last_answer_fingerprint') - state.trust_mode = row.get('group_trust_mode', False) - state.apprentice_mode = row.get('apprentice_mode', False) + if agent_id and user_id: + conv_id = self._build_conversation_id(channel, chat_id, user_id, agent_id) + row = await conn.fetchrow( + """ + SELECT conversation_id, active_topic, context_open, last_media_handled, + last_answer_fingerprint, trust_mode, apprentice_mode + FROM agent_session_state + WHERE channel = $1 + AND chat_id = $2 + AND user_id = $3 + AND agent_id = $4 + """, + channel, + chat_id, + user_id, + agent_id, + ) + if not row: + await conn.execute( + """ + INSERT INTO agent_session_state + (channel, chat_id, user_id, agent_id, conversation_id) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (channel, chat_id, user_id, agent_id) DO NOTHING + """, + channel, + chat_id, + user_id, + agent_id, + conv_id, + ) + state.conversation_id = conv_id else: - # Create initial state - await conn.execute(""" - 
INSERT INTO helion_conversation_state (conversation_id) - VALUES ($1) - ON CONFLICT (conversation_id) DO NOTHING - """, conv_id) - - # Check if trusted group - is_trusted = await conn.fetchval( - "SELECT is_trusted_group($1, $2)", - channel, chat_id - ) - state.trust_mode = bool(is_trusted) + state.conversation_id = str(row.get("conversation_id") or conv_id) + state.active_topic = row.get("active_topic") + state.context_open = bool(row.get("context_open", False)) + state.last_media_handled = bool(row.get("last_media_handled", True)) + state.last_answer_fingerprint = row.get("last_answer_fingerprint") + state.trust_mode = bool(row.get("trust_mode", False)) + state.apprentice_mode = bool(row.get("apprentice_mode", False)) + else: + state.conversation_id = self._build_conversation_id( + channel, + chat_id, + user_id or "unknown", + agent_id or "agent", + ) + + if agent_id and user_id: + pending_rows = await conn.fetch( + """ + SELECT question_text + FROM agent_pending_questions + WHERE channel = $1 + AND chat_id = $2 + AND user_id = $3 + AND agent_id = $4 + AND status = 'pending' + ORDER BY created_at ASC + LIMIT $5 + """, + channel, + chat_id, + user_id, + agent_id, + PENDING_QUESTIONS_LIMIT, + ) + state.pending_questions = [ + str(r.get("question_text") or "").strip() + for r in pending_rows + if str(r.get("question_text") or "").strip() + ] except Exception as e: logger.warning(f"Session state retrieval failed: {e}") @@ -453,7 +574,7 @@ class MemoryRetrieval: # Higher threshold for messages; even higher when user asks about docs to avoid pulling old chatter. msg_thresh = 0.5 if is_doc_query else 0.4 if r.score > msg_thresh: - text = r.payload.get("text", r.payload.get("content", "")) + text = self._extract_message_text(r.payload) # Skip very short or system messages if len(text) > 20 and not text.startswith("<"): if is_doc_query and topic_keywords: @@ -493,6 +614,32 @@ class MemoryRetrieval: }) except Exception as e: logger.debug(f"{docs_collection} search: {e}") + + # Search 4: shared agronomy memory (reviewed, cross-chat, anonymized) + if ( + SHARED_AGRO_LIBRARY_ENABLED + and agent_id == "agromatrix" + and self._is_plant_query(query) + ): + try: + results = self.qdrant_client.search( + collection_name="agromatrix_shared_library", + query_vector=embedding, + limit=3, + with_payload=True + ) + for r in results: + if r.score > 0.45: + text = str(r.payload.get("text") or "").strip() + if len(text) > 20: + all_results.append({ + "text": text[:500], + "type": "shared_agro_fact", + "score": r.score + 0.05, + "source": "shared_agronomy_library" + }) + except Exception as e: + logger.debug(f"agromatrix_shared_library search: {e}") # Sort by score and deduplicate all_results.sort(key=lambda x: x.get("score", 0), reverse=True) @@ -501,7 +648,7 @@ class MemoryRetrieval: seen_texts = set() unique_results = [] for r in all_results: - text_key = r.get("text", "")[:50].lower() + text_key = self._canonical_text_key(r.get("text", "")) if text_key not in seen_texts: seen_texts.add(text_key) unique_results.append(r) @@ -511,6 +658,62 @@ class MemoryRetrieval: except Exception as e: logger.warning(f"Memory search failed for {agent_id}: {e}") return [] + + @staticmethod + def _extract_message_text(payload: Dict[str, Any]) -> str: + """ + Normalize text across both payload schemas: + - memory-service: content/text (+ role/channel_id) + - router: user_message + assistant_response (+ chat_id) + """ + if not payload: + return "" + + text = (payload.get("text") or payload.get("content") or "").strip() + if text: + 
lower = text.lower() + marker = "\n\nassistant:" + idx = lower.rfind(marker) + if lower.startswith("user:") and idx != -1: + assistant_text = text[idx + len(marker):].strip() + if assistant_text: + return assistant_text + return text + + user_text = (payload.get("user_message") or "").strip() + assistant_text = (payload.get("assistant_response") or "").strip() + if user_text and assistant_text: + return f"User: {user_text}\n\nAssistant: {assistant_text}" + return user_text or assistant_text + + @staticmethod + def _canonical_text_key(text: str) -> str: + if not text: + return "" + normalized = re.sub(r"\s+", " ", text.strip().lower()) + return normalized[:220] + + @staticmethod + def _is_plant_query(text: str) -> bool: + q = (text or "").lower() + if not q: + return False + markers = [ + "рослин", "культур", "лист", "стебл", "бур'ян", "хвороб", "шкідник", + "what plant", "identify plant", "crop", "species", "leaf", "stem", + "что за растение", "культура", "листок", "фото рослини" + ] + return any(m in q for m in markers) + + @staticmethod + def _question_fingerprint(question_text: str) -> str: + normalized = re.sub(r"\s+", " ", (question_text or "").strip().lower()) + return hashlib.sha1(normalized.encode("utf-8")).hexdigest()[:16] + + @staticmethod + def _build_conversation_id(channel: str, chat_id: str, user_id: str, agent_id: str) -> str: + seed = f"{channel}:{chat_id}:{user_id}:{agent_id}" + return hashlib.sha1(seed.encode("utf-8")).hexdigest()[:24] async def get_user_graph_context( self, @@ -604,7 +807,13 @@ class MemoryRetrieval: brief.user_identity = identity # L1: Session State - session = await self.get_session_state(channel, chat_id, thread_id) + session = await self.get_session_state( + channel, + chat_id, + thread_id, + agent_id=agent_id, + user_id=user_id, + ) brief.session_state = session brief.is_trusted_group = session.trust_mode @@ -619,6 +828,8 @@ class MemoryRetrieval: query=message, agent_id=agent_id, platform_user_id=identity.platform_user_id, + chat_id=chat_id, + user_id=user_id, limit=5 ) brief.relevant_memories = memories @@ -712,6 +923,22 @@ class MemoryRetrieval: ) ] ) + + # Optional shared agronomy memory: + # - never stores user/chat identifiers + # - supports review gate (pending vs approved) + if ( + SHARED_AGRO_LIBRARY_ENABLED + and agent_id == "agromatrix" + and message_type in {"vision", "conversation"} + and isinstance(metadata, dict) + and metadata.get("deterministic_plant_id") + ): + await self._store_shared_agronomy_memory( + message_text=message_text, + response_text=response_text, + metadata=metadata, + ) logger.debug(f"✅ Stored message in {messages_collection}: {point_id[:8]}...") return True @@ -719,6 +946,935 @@ class MemoryRetrieval: except Exception as e: logger.warning(f"Failed to store message in {messages_collection}: {e}") return False + + async def _store_shared_agronomy_memory( + self, + message_text: str, + response_text: str, + metadata: Dict[str, Any], + ) -> bool: + if not self.qdrant_client or not COHERE_API_KEY: + return False + try: + from qdrant_client.http import models as qmodels + import uuid + + reviewed = bool(metadata.get("mentor_confirmed") or metadata.get("reviewed")) + collection = "agromatrix_shared_library" + if SHARED_AGRO_LIBRARY_REQUIRE_REVIEW and not reviewed: + collection = "agromatrix_shared_pending" + + try: + self.qdrant_client.get_collection(collection) + except Exception: + self.qdrant_client.create_collection( + collection_name=collection, + vectors_config=qmodels.VectorParams( + size=1024, + 
distance=qmodels.Distance.COSINE, + ), + ) + + compact = ( + f"Plant case\nQuestion: {message_text[:800]}\n" + f"Answer: {response_text[:1200]}\n" + f"Candidates: {json.dumps(metadata.get('candidates', []), ensure_ascii=False)[:1200]}" + ) + embedding = await self.get_embedding(compact[:2000]) + if not embedding: + return False + + payload = { + "text": compact[:3000], + "type": "plant_case", + "deterministic_plant_id": True, + "decision": metadata.get("decision"), + "confidence_threshold": metadata.get("confidence_threshold"), + "candidates": metadata.get("candidates", [])[:5], + "reviewed": reviewed, + "timestamp": datetime.utcnow().isoformat(), + } + self.qdrant_client.upsert( + collection_name=collection, + points=[qmodels.PointStruct(id=str(uuid.uuid4()), vector=embedding, payload=payload)], + ) + return True + except Exception as e: + logger.debug(f"Shared agronomy memory store failed: {e}") + return False + + async def register_pending_question( + self, + channel: str, + chat_id: str, + user_id: str, + agent_id: str, + question_text: str, + metadata: Optional[Dict[str, Any]] = None, + ) -> bool: + if not self.pg_pool: + return False + text = (question_text or "").strip() + if not text: + return False + fp = self._question_fingerprint(text) + try: + async with self.pg_pool.acquire() as conn: + await conn.execute( + """ + INSERT INTO agent_pending_questions + (channel, chat_id, user_id, agent_id, question_text, question_fingerprint, status, metadata) + VALUES ($1, $2, $3, $4, $5, $6, 'pending', $7::jsonb) + ON CONFLICT (agent_id, channel, chat_id, user_id, question_fingerprint, status) + DO NOTHING + """, + channel, + chat_id, + user_id, + agent_id, + text[:1200], + fp, + json.dumps(metadata or {}, ensure_ascii=False), + ) + # Keep only last N open items. 
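+                # Example: with PENDING_QUESTIONS_LIMIT=5, a 6th open question keeps
+                # the 5 newest rows 'pending' and marks older ones 'dismissed'
+                # with reason "overflow_trim" (see the UPDATE below).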
+ await conn.execute( + """ + WITH ranked AS ( + SELECT id, ROW_NUMBER() OVER ( + PARTITION BY channel, chat_id, user_id, agent_id, status + ORDER BY created_at DESC + ) AS rn + FROM agent_pending_questions + WHERE channel = $1 + AND chat_id = $2 + AND user_id = $3 + AND agent_id = $4 + AND status = 'pending' + ) + UPDATE agent_pending_questions p + SET status = 'dismissed', + answered_at = NOW(), + metadata = COALESCE(p.metadata, '{}'::jsonb) || '{"reason":"overflow_trim"}'::jsonb + FROM ranked r + WHERE p.id = r.id + AND r.rn > $5 + """, + channel, + chat_id, + user_id, + agent_id, + max(1, PENDING_QUESTIONS_LIMIT), + ) + return True + except Exception as e: + logger.warning(f"register_pending_question failed: {e}") + return False + + async def resolve_pending_question( + self, + channel: str, + chat_id: str, + user_id: str, + agent_id: str, + answer_text: Optional[str] = None, + reason: str = "answered", + ) -> bool: + if not self.pg_pool: + return False + try: + async with self.pg_pool.acquire() as conn: + row = await conn.fetchrow( + """ + WITH target AS ( + SELECT id + FROM agent_pending_questions + WHERE channel = $1 + AND chat_id = $2 + AND user_id = $3 + AND agent_id = $4 + AND status = 'pending' + ORDER BY created_at ASC + LIMIT 1 + ) + UPDATE agent_pending_questions p + SET status = CASE WHEN $5 = 'dismissed' THEN 'dismissed' ELSE 'answered' END, + answered_at = NOW(), + metadata = COALESCE(p.metadata, '{}'::jsonb) + || jsonb_build_object( + 'resolution_reason', $5, + 'answer_fingerprint', COALESCE($6, '') + ) + FROM target t + WHERE p.id = t.id + RETURNING p.id + """, + channel, + chat_id, + user_id, + agent_id, + reason, + self._question_fingerprint(answer_text or "") if answer_text else "", + ) + return bool(row) + except Exception as e: + logger.warning(f"resolve_pending_question failed: {e}") + return False + + @staticmethod + def _to_qdrant_point_id(raw_id: Any) -> Any: + if isinstance(raw_id, int): + return raw_id + if isinstance(raw_id, float) and raw_id.is_integer(): + return int(raw_id) + if isinstance(raw_id, str): + v = raw_id.strip() + if not v: + return raw_id + if v.isdigit(): + try: + return int(v) + except Exception: + return v + return v + return raw_id + + async def list_shared_pending_cases(self, limit: int = 50) -> List[Dict[str, Any]]: + if not self.qdrant_client or not SHARED_AGRO_LIBRARY_ENABLED: + return [] + size = max(1, min(int(limit or 50), 200)) + try: + points, _ = self.qdrant_client.scroll( + collection_name="agromatrix_shared_pending", + limit=size, + with_payload=True, + with_vectors=False, + ) + except Exception as e: + logger.debug(f"list_shared_pending_cases failed: {e}") + return [] + + items: List[Dict[str, Any]] = [] + for p in points or []: + payload = getattr(p, "payload", {}) or {} + text = str(payload.get("text") or "").strip() + timestamp = payload.get("timestamp") or "" + candidates = payload.get("candidates") if isinstance(payload.get("candidates"), list) else [] + items.append( + { + "point_id": str(getattr(p, "id", "")), + "timestamp": timestamp, + "decision": payload.get("decision"), + "reviewed": bool(payload.get("reviewed")), + "excerpt": text[:240], + "candidates": candidates[:5], + } + ) + items.sort(key=lambda x: x.get("timestamp") or "", reverse=True) + return items + + async def review_shared_pending_case( + self, + point_id: str, + approve: bool, + reviewer: Optional[str] = None, + note: Optional[str] = None, + ) -> Dict[str, Any]: + if not self.qdrant_client: + return {"ok": False, "error": "qdrant_unavailable"} + + try: 
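+            # Flow sketch: fetch the pending point together with its vector,
+            # stamp a review record into the payload, copy it to
+            # agromatrix_shared_library on approve, then remove it from pending.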
+ from qdrant_client.http import models as qmodels + import uuid + + pid = self._to_qdrant_point_id(point_id) + records = self.qdrant_client.retrieve( + collection_name="agromatrix_shared_pending", + ids=[pid], + with_payload=True, + with_vectors=True, + ) + if not records: + return {"ok": False, "error": "not_found"} + + point = records[0] + payload = dict(getattr(point, "payload", {}) or {}) + now_iso = datetime.utcnow().isoformat() + payload["reviewed"] = bool(approve) + payload["review"] = { + "reviewer": (reviewer or "system")[:120], + "approved": bool(approve), + "note": (note or "")[:500], + "reviewed_at": now_iso, + } + + library_point_id: Optional[str] = None + if approve: + vector = getattr(point, "vector", None) + if isinstance(vector, dict): + # Named vectors mode: pick first vector value. + vector = next(iter(vector.values()), None) + if not vector and COHERE_API_KEY: + basis = str(payload.get("text") or payload.get("assistant_response") or "")[:2000] + vector = await self.get_embedding(basis) + if not vector: + return {"ok": False, "error": "missing_vector"} + + try: + self.qdrant_client.get_collection("agromatrix_shared_library") + except Exception: + self.qdrant_client.create_collection( + collection_name="agromatrix_shared_library", + vectors_config=qmodels.VectorParams( + size=len(vector), + distance=qmodels.Distance.COSINE, + ), + ) + + library_point_id = str(uuid.uuid4()) + payload["approved_at"] = now_iso + self.qdrant_client.upsert( + collection_name="agromatrix_shared_library", + points=[ + qmodels.PointStruct( + id=library_point_id, + vector=vector, + payload=payload, + ) + ], + ) + + self.qdrant_client.delete( + collection_name="agromatrix_shared_pending", + points_selector=qmodels.PointIdsList(points=[pid]), + ) + + return { + "ok": True, + "approved": bool(approve), + "point_id": str(getattr(point, "id", point_id)), + "library_point_id": library_point_id, + } + except Exception as e: + logger.warning(f"review_shared_pending_case failed: {e}") + return {"ok": False, "error": str(e)} + + def _chunk_document_text( + self, + text: str, + chunk_chars: int = 1200, + overlap_chars: int = 180, + ) -> List[str]: + """ + Split document text into overlap-aware chunks for RAG indexing. + Keeps paragraph structure when possible. 
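+
+        Illustrative sizing (not a contract): with the defaults chunk_chars=1200
+        and overlap_chars=180, an oversized paragraph is hard-split into windows
+        of up to 1200 chars whose starts advance by 1200 - 180 = 1020 chars.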
+ """ + raw = re.sub(r"\r\n?", "\n", text or "").strip() + if not raw: + return [] + + paragraphs = [p.strip() for p in re.split(r"\n{2,}", raw) if p and p.strip()] + if not paragraphs: + return [] + + chunks: List[str] = [] + current = "" + max_hard = max(chunk_chars, 600) + + def _push_current() -> None: + nonlocal current + if current and len(current.strip()) >= 20: + chunks.append(current.strip()) + current = "" + + for para in paragraphs: + if len(para) > max_hard * 2: + _push_current() + i = 0 + step = max_hard - max(80, min(overlap_chars, max_hard // 2)) + while i < len(para): + part = para[i : i + max_hard] + if len(part.strip()) >= 20: + chunks.append(part.strip()) + i += max(1, step) + continue + + candidate = f"{current}\n\n{para}".strip() if current else para + if len(candidate) <= max_hard: + current = candidate + continue + + _push_current() + if overlap_chars > 0 and chunks: + tail = chunks[-1][-overlap_chars:] + current = f"{tail}\n\n{para}".strip() + if len(current) > max_hard: + _push_current() + current = para + else: + current = para + + _push_current() + return chunks + + async def _next_document_version_no( + self, + agent_id: str, + doc_id: str, + ) -> int: + if self.pg_pool: + try: + async with self.pg_pool.acquire() as conn: + value = await conn.fetchval( + """ + SELECT COALESCE(MAX(version_no), 0) + 1 + FROM agent_document_versions + WHERE agent_id = $1 + AND doc_id = $2 + """, + (agent_id or "").lower(), + doc_id, + ) + return max(1, int(value or 1)) + except Exception as e: + logger.warning(f"next_document_version_no(pg) failed: {e}") + + # Fallback: infer from existing chunk payloads in Qdrant. + if self.qdrant_client: + try: + from qdrant_client.http import models as qmodels + + collection = f"{(agent_id or 'daarwizz').lower()}_docs" + points, _ = self.qdrant_client.scroll( + collection_name=collection, + scroll_filter=qmodels.Filter( + must=[ + qmodels.FieldCondition( + key="doc_id", + match=qmodels.MatchValue(value=doc_id), + ) + ] + ), + limit=256, + with_payload=True, + ) + current_max = 0 + for p in points or []: + payload = getattr(p, "payload", {}) or {} + ver = payload.get("version_no") + if isinstance(ver, int): + current_max = max(current_max, ver) + elif isinstance(ver, str) and ver.isdigit(): + current_max = max(current_max, int(ver)) + return current_max + 1 if current_max > 0 else 1 + except Exception as e: + logger.debug(f"next_document_version_no(qdrant) fallback failed: {e}") + + return 1 + + async def _latest_document_version_no( + self, + agent_id: str, + doc_id: str, + ) -> int: + nxt = await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id) + return max(0, int(nxt) - 1) + + async def _record_document_version( + self, + agent_id: str, + doc_id: str, + version_no: int, + text: str, + file_name: Optional[str] = None, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + source: str = "ingest", + storage_ref: Optional[str] = None, + ) -> Dict[str, Any]: + text_body = (text or "").strip() + text_hash = hashlib.sha256(text_body.encode("utf-8")).hexdigest() if text_body else "" + text_len = len(text_body) + preview = text_body[:DOC_VERSION_PREVIEW_CHARS] if text_body else "" + payload = metadata if isinstance(metadata, dict) else {} + + if not self.pg_pool: + return {"ok": True, "version_no": int(version_no), "id": None} + + try: + async with self.pg_pool.acquire() as conn: + row = await conn.fetchrow( + """ + INSERT INTO agent_document_versions + (agent_id, doc_id, 
version_no, text_hash, text_len, text_preview, + file_name, dao_id, user_id, storage_ref, source, metadata) + VALUES + ($1, $2, $3, $4, $5, $6, + $7, $8, $9, $10, $11, $12::jsonb) + ON CONFLICT (agent_id, doc_id, version_no) + DO UPDATE SET + text_hash = EXCLUDED.text_hash, + text_len = EXCLUDED.text_len, + text_preview = EXCLUDED.text_preview, + file_name = EXCLUDED.file_name, + dao_id = EXCLUDED.dao_id, + user_id = EXCLUDED.user_id, + storage_ref = EXCLUDED.storage_ref, + source = EXCLUDED.source, + metadata = EXCLUDED.metadata + RETURNING id, version_no + """, + (agent_id or "").lower(), + doc_id, + int(version_no), + text_hash, + int(text_len), + preview, + file_name, + dao_id, + user_id, + storage_ref, + source, + json.dumps(payload), + ) + return { + "ok": True, + "id": int(row["id"]) if row and row.get("id") is not None else None, + "version_no": int(row["version_no"]) if row and row.get("version_no") is not None else int(version_no), + } + except Exception as e: + logger.warning(f"record_document_version failed: {e}") + return {"ok": False, "error": str(e), "version_no": int(version_no)} + + async def list_document_versions( + self, + agent_id: str, + doc_id: str, + limit: int = 20, + ) -> List[Dict[str, Any]]: + rows_out: List[Dict[str, Any]] = [] + if self.pg_pool: + try: + async with self.pg_pool.acquire() as conn: + rows = await conn.fetch( + """ + SELECT id, agent_id, doc_id, version_no, text_hash, text_len, text_preview, + file_name, dao_id, user_id, storage_ref, source, metadata, created_at + FROM agent_document_versions + WHERE agent_id = $1 + AND doc_id = $2 + ORDER BY version_no DESC + LIMIT $3 + """, + (agent_id or "").lower(), + doc_id, + max(1, min(int(limit or 20), 200)), + ) + for r in rows: + meta_raw = r["metadata"] + if isinstance(meta_raw, dict): + meta_obj = meta_raw + elif isinstance(meta_raw, str): + try: + parsed = json.loads(meta_raw) + meta_obj = parsed if isinstance(parsed, dict) else {"raw": parsed} + except Exception: + meta_obj = {"raw": meta_raw} + else: + meta_obj = {} + rows_out.append( + { + "id": int(r["id"]), + "agent_id": r["agent_id"], + "doc_id": r["doc_id"], + "version_no": int(r["version_no"]), + "text_hash": r["text_hash"], + "text_len": int(r["text_len"] or 0), + "text_preview": r["text_preview"], + "file_name": r["file_name"], + "dao_id": r["dao_id"], + "user_id": r["user_id"], + "storage_ref": r["storage_ref"], + "source": r["source"], + "metadata": meta_obj, + "created_at": r["created_at"].isoformat() if r["created_at"] else None, + } + ) + return rows_out + except Exception as e: + logger.warning(f"list_document_versions failed: {e}") + + # PG unavailable fallback: aggregate distinct versions from Qdrant payloads. 
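+        # Note: this path can only recover version_no and file metadata from chunk
+        # payloads; text_hash/text_len/text_preview stay None without Postgres.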
+ if self.qdrant_client: + try: + from qdrant_client.http import models as qmodels + + collection = f"{(agent_id or 'daarwizz').lower()}_docs" + offset = None + seen: Dict[int, Dict[str, Any]] = {} + max_points = max(64, min(int(limit or 20) * 80, 4096)) + fetched = 0 + while fetched < max_points: + points, next_offset = self.qdrant_client.scroll( + collection_name=collection, + scroll_filter=qmodels.Filter( + must=[ + qmodels.FieldCondition( + key="doc_id", + match=qmodels.MatchValue(value=doc_id), + ) + ] + ), + offset=offset, + limit=256, + with_payload=True, + ) + if not points: + break + fetched += len(points) + for p in points: + payload = getattr(p, "payload", {}) or {} + ver_raw = payload.get("version_no") + if isinstance(ver_raw, int): + ver = ver_raw + elif isinstance(ver_raw, str) and ver_raw.isdigit(): + ver = int(ver_raw) + else: + ver = 1 + + existing = seen.get(ver) + ts = payload.get("timestamp") + if not existing or (ts and str(ts) > str(existing.get("created_at") or "")): + seen[ver] = { + "id": None, + "agent_id": (agent_id or "").lower(), + "doc_id": doc_id, + "version_no": int(ver), + "text_hash": None, + "text_len": None, + "text_preview": None, + "file_name": payload.get("file_name"), + "dao_id": payload.get("dao_id"), + "user_id": payload.get("user_id"), + "storage_ref": payload.get("storage_ref"), + "source": payload.get("source") or "ingest", + "metadata": payload.get("metadata") or {}, + "created_at": ts, + } + if not next_offset: + break + offset = next_offset + rows_out = sorted(seen.values(), key=lambda x: int(x.get("version_no") or 0), reverse=True)[: max(1, min(int(limit or 20), 200))] + except Exception: + pass + + return rows_out + + def _build_doc_filter( + self, + doc_id: str, + dao_id: Optional[str] = None, + ): + from qdrant_client.http import models as qmodels + + must_conditions = [ + qmodels.FieldCondition( + key="doc_id", + match=qmodels.MatchValue(value=doc_id), + ) + ] + if dao_id: + must_conditions.append( + qmodels.FieldCondition( + key="dao_id", + match=qmodels.MatchValue(value=dao_id), + ) + ) + return qmodels.Filter(must=must_conditions) + + def _delete_document_points( + self, + collection: str, + doc_id: str, + dao_id: Optional[str] = None, + ) -> bool: + if not self.qdrant_client: + return False + try: + from qdrant_client.http import models as qmodels + + self.qdrant_client.delete( + collection_name=collection, + points_selector=qmodels.FilterSelector( + filter=self._build_doc_filter(doc_id=doc_id, dao_id=dao_id) + ), + ) + return True + except Exception as e: + logger.warning(f"delete_document_points failed for {collection}/{doc_id}: {e}") + return False + + async def ingest_document_chunks( + self, + agent_id: str, + doc_id: str, + file_name: Optional[str], + text: str, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + replace_existing: bool = False, + version_no: Optional[int] = None, + source: str = "ingest", + storage_ref: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Ingest normalized document chunks into {agent_id}_docs collection. 
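+
+        On success returns a summary shaped like (illustrative values):
+            {"ok": True, "doc_id": "doc-1", "version_no": 2, "chunks_total": 8,
+             "chunks_stored": 8, "collection": "agromatrix_docs", ...}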
+ """ + if not self.qdrant_client: + return {"ok": False, "error": "qdrant_unavailable"} + if not COHERE_API_KEY: + return {"ok": False, "error": "cohere_unavailable"} + + body = (text or "").strip() + if not body: + return {"ok": False, "error": "empty_document"} + + chunks = self._chunk_document_text(body) + if not chunks: + return {"ok": False, "error": "no_chunks"} + + collection = f"{(agent_id or 'daarwizz').lower()}_docs" + stored_points = [] + + try: + from qdrant_client.http import models as qmodels + import uuid + + try: + self.qdrant_client.get_collection(collection) + except Exception: + self.qdrant_client.create_collection( + collection_name=collection, + vectors_config=qmodels.VectorParams( + size=1024, + distance=qmodels.Distance.COSINE, + ), + ) + logger.info(f"✅ Created collection: {collection}") + + total = len(chunks) + resolved_version_no = int(version_no or 0) or await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id) + for idx, chunk in enumerate(chunks): + emb = await self.get_embedding(chunk[:2000]) + if not emb: + continue + payload: Dict[str, Any] = { + "text": chunk[:6000], + "doc_id": doc_id, + "file_name": file_name, + "agent_id": (agent_id or "").lower(), + "dao_id": dao_id, + "user_id": user_id, + "chunk_index": idx, + "chunks_total": total, + "type": "document_chunk", + "version_no": int(resolved_version_no), + "source": source, + "storage_ref": storage_ref, + "timestamp": datetime.utcnow().isoformat(), + } + if isinstance(metadata, dict) and metadata: + payload["metadata"] = metadata + stored_points.append( + qmodels.PointStruct( + id=str(uuid.uuid4()), + vector=emb, + payload=payload, + ) + ) + + if not stored_points: + return {"ok": False, "error": "embedding_failed"} + + # Keep previous versions in the same collection when updating. + # Query path will select only the latest version_no for doc_id. + + self.qdrant_client.upsert(collection_name=collection, points=stored_points) + version_row = await self._record_document_version( + agent_id=agent_id, + doc_id=doc_id, + version_no=resolved_version_no, + text=body, + file_name=file_name, + dao_id=dao_id, + user_id=user_id, + metadata=metadata, + source=source, + storage_ref=storage_ref, + ) + return { + "ok": True, + "doc_id": doc_id, + "version_no": int(resolved_version_no), + "version_id": version_row.get("id"), + "chunks_total": len(chunks), + "chunks_stored": len(stored_points), + "replaced_existing": bool(replace_existing), + "collection": collection, + } + except Exception as e: + logger.warning(f"ingest_document_chunks failed for {collection}: {e}") + return {"ok": False, "error": str(e)} + + async def update_document_chunks( + self, + agent_id: str, + doc_id: str, + file_name: Optional[str], + text: str, + dao_id: Optional[str] = None, + user_id: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + storage_ref: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Update existing document content with version bump. + Keeps the same logical doc_id and replaces indexed chunks. 
+ """ + next_version = await self._next_document_version_no(agent_id=agent_id, doc_id=doc_id) + result = await self.ingest_document_chunks( + agent_id=agent_id, + doc_id=doc_id, + file_name=file_name, + text=text, + dao_id=dao_id, + user_id=user_id, + metadata=metadata, + replace_existing=False, + version_no=next_version, + source="update", + storage_ref=storage_ref, + ) + if result.get("ok"): + result["updated"] = True + result["replaced_existing"] = True + return result + + async def query_document_chunks( + self, + agent_id: str, + question: str, + doc_id: Optional[str] = None, + dao_id: Optional[str] = None, + limit: int = 5, + ) -> Dict[str, Any]: + """ + Retrieve top document chunks from {agent_id}_docs for a question. + """ + if not self.qdrant_client: + return {"ok": False, "error": "qdrant_unavailable", "chunks": []} + if not COHERE_API_KEY: + return {"ok": False, "error": "cohere_unavailable", "chunks": []} + + q = (question or "").strip() + if not q: + return {"ok": False, "error": "empty_question", "chunks": []} + + embedding = await self.get_embedding(q[:2000]) + if not embedding: + return {"ok": False, "error": "embedding_failed", "chunks": []} + + collection = f"{(agent_id or 'daarwizz').lower()}_docs" + + try: + from qdrant_client.http import models as qmodels + must_conditions = [] + if doc_id: + latest_ver = await self._latest_document_version_no(agent_id=agent_id, doc_id=doc_id) + must_conditions.append( + qmodels.FieldCondition( + key="doc_id", + match=qmodels.MatchValue(value=doc_id), + ) + ) + if latest_ver > 0: + must_conditions.append( + qmodels.FieldCondition( + key="version_no", + match=qmodels.MatchValue(value=int(latest_ver)), + ) + ) + if dao_id: + must_conditions.append( + qmodels.FieldCondition( + key="dao_id", + match=qmodels.MatchValue(value=dao_id), + ) + ) + query_filter = qmodels.Filter(must=must_conditions) if must_conditions else None + + rows = self.qdrant_client.search( + collection_name=collection, + query_vector=embedding, + query_filter=query_filter, + limit=max(1, min(int(limit or 5), 12)), + with_payload=True, + ) + except Exception as e: + logger.debug(f"query_document_chunks search failed for {collection}: {e}") + return {"ok": False, "error": "search_failed", "chunks": [], "collection": collection} + + hits: List[Dict[str, Any]] = [] + for row in rows or []: + score = float(getattr(row, "score", 0.0) or 0.0) + if score < 0.30: + continue + payload = getattr(row, "payload", {}) or {} + text = str(payload.get("text") or "").strip() + if len(text) < 10: + continue + hits.append( + { + "text": text, + "score": score, + "doc_id": payload.get("doc_id"), + "file_name": payload.get("file_name"), + "chunk_index": payload.get("chunk_index"), + "chunks_total": payload.get("chunks_total"), + "version_no": payload.get("version_no"), + } + ) + + return { + "ok": bool(hits), + "chunks": hits, + "collection": collection, + "doc_id": doc_id, + } + + async def store_interaction( + self, + channel: str, + chat_id: str, + user_id: str, + agent_id: str, + username: Optional[str], + user_message: str, + assistant_response: str, + metadata: Optional[Dict[str, Any]] = None, + ) -> bool: + # Backward-compatible wrapper for older call sites. 
+ return await self.store_message( + agent_id=agent_id, + user_id=user_id, + username=username, + message_text=user_message, + response_text=assistant_response, + chat_id=chat_id, + message_type="conversation", + metadata=metadata, + ) async def update_session_state( self, @@ -737,10 +1893,10 @@ class MemoryRetrieval: param_idx = 2 allowed_fields = [ - 'last_addressed_to_helion', 'last_user_id', 'last_user_nick', - 'active_topic_id', 'active_context_open', 'last_media_id', - 'last_media_handled', 'last_answer_fingerprint', 'group_trust_mode', - 'apprentice_mode', 'proactive_questions_today' + 'last_user_id', 'last_user_nick', + 'active_topic', 'context_open', + 'last_media_handled', 'last_answer_fingerprint', + 'trust_mode', 'apprentice_mode' ] for field, value in updates.items(): @@ -750,7 +1906,7 @@ class MemoryRetrieval: param_idx += 1 query = f""" - UPDATE helion_conversation_state + UPDATE agent_session_state SET {', '.join(set_clauses)} WHERE conversation_id = $1 """ diff --git a/services/router/router-config.yml b/services/router/router-config.yml index 14e238af..09d7f108 100644 --- a/services/router/router-config.yml +++ b/services/router/router-config.yml @@ -408,8 +408,9 @@ agents: description: "Monitor Agent - архітектор-інспектор DAGI" default_llm: local_qwen3_8b system_prompt: | - Ти - Monitor Agent, стежиш за нодами, сервісами, агентами. - Якщо бачиш у чаті інших ботів, відповідай тільки за інфраструктурою або прямим тегом. + Ти - Monitor Agent, інфраструктурний інспектор DAGI: ноди, сервіси, пайплайни, алерти. + Ти знаєш, що DAARWIZZ — головний оркестратор мережі DAARION.city; для governance/маршрутизації посилайся на нього. + Відповідай коротко і по суті; якщо даних бракує — одразу кажи, який саме метрик/лог потрібен. tools: - id: get_metrics type: builtin diff --git a/services/router/tool_manager.py b/services/router/tool_manager.py index 881f64e4..b09f397e 100644 --- a/services/router/tool_manager.py +++ b/services/router/tool_manager.py @@ -19,6 +19,7 @@ from typing import Dict, List, Any, Optional from dataclasses import dataclass from io import BytesIO, StringIO from pathlib import PurePath +from urllib.parse import urlparse import xml.etree.ElementTree as ET from xml.sax.saxutils import escape as xml_escape from zipfile import ZIP_DEFLATED, ZipFile @@ -108,6 +109,115 @@ TOOL_DEFINITIONS = [ } } }, + { + "type": "function", + "function": { + "name": "plantnet_lookup", + "description": "Визначення рослин через Pl@ntNet API. 
Повертає top-k кандидатів з confidence.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Короткий опис рослини/культури (якщо немає image_url)" + }, + "image_url": { + "type": "string", + "description": "Публічне посилання на фото рослини" + }, + "organ": { + "type": "string", + "description": "Орган рослини: leaf/flower/fruit/bark/auto", + "default": "auto" + }, + "top_k": { + "type": "integer", + "description": "Скільки кандидатів повернути (1-10)", + "default": 3 + } + } + } + } + }, + { + "type": "function", + "function": { + "name": "nature_id_identify", + "description": "Локальна/open-source ідентифікація рослин через nature-id сумісний сервіс.", + "parameters": { + "type": "object", + "properties": { + "image_url": { + "type": "string", + "description": "Публічне посилання на фото рослини" + }, + "image_data": { + "type": "string", + "description": "Data URL зображення (data:image/...;base64,...)" + }, + "top_k": { + "type": "integer", + "description": "Скільки кандидатів повернути (1-10)", + "default": 3 + }, + "min_confidence": { + "type": "number", + "description": "Поріг confidence для fallback на GBIF", + "default": 0.65 + } + } + } + } + }, + { + "type": "function", + "function": { + "name": "gbif_species_lookup", + "description": "Пошук таксонів у GBIF для валідації назви культури/рослини.", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Назва/термін для пошуку виду" + }, + "limit": { + "type": "integer", + "description": "Кількість результатів (1-10)", + "default": 5 + } + }, + "required": ["query"] + } + } + }, + { + "type": "function", + "function": { + "name": "agrovoc_lookup", + "description": "Нормалізація агро-термінів через AGROVOC (SPARQL).", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Термін культури/хвороби/технології" + }, + "lang": { + "type": "string", + "description": "Мова міток (en/uk/ru)", + "default": "en" + }, + "limit": { + "type": "integer", + "description": "Кількість результатів (1-10)", + "default": 5 + } + }, + "required": ["query"] + } + } + }, # PRIORITY 3: Generation tools { "type": "function", @@ -681,6 +791,42 @@ class ToolManager: tool_names = [t.get("function", {}).get("name") for t in filtered] logger.debug(f"Agent {agent_id} has {len(filtered)} tools: {tool_names}") return filtered + + @staticmethod + def _is_image_data_url(value: str) -> bool: + v = str(value or "").strip() + return bool(v.startswith("data:image/") and ";base64," in v) + + @staticmethod + def _is_known_non_direct_image_url(url: str) -> bool: + u = str(url or "").strip() + if not u: + return False + try: + p = urlparse(u) + except Exception: + return True + host = (p.netloc or "").lower() + if host in {"t.me", "telegram.me"}: + return True + if "web.telegram.org" in host: + return True + return False + + @staticmethod + def _normalize_confidence(value: Any) -> float: + try: + v = float(value) + except Exception: + return 0.0 + if v < 0: + return 0.0 + # Some backends return percentages (e.g. 97.6) instead of 0..1. 
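+        # Worked examples: 97.6 -> 0.976, 0.82 -> 0.82, 250.0 -> clamped to 1.0.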
+ if v > 1.0 and v <= 100.0: + v = v / 100.0 + if v > 1.0: + v = 1.0 + return v async def execute_tool( self, @@ -709,6 +855,14 @@ class ToolManager: return await self._web_search(arguments) elif tool_name == "web_extract": return await self._web_extract(arguments) + elif tool_name == "plantnet_lookup": + return await self._plantnet_lookup(arguments) + elif tool_name == "nature_id_identify": + return await self._nature_id_identify(arguments) + elif tool_name == "gbif_species_lookup": + return await self._gbif_species_lookup(arguments) + elif tool_name == "agrovoc_lookup": + return await self._agrovoc_lookup(arguments) elif tool_name == "image_generate": return await self._image_generate(arguments) elif tool_name == "comfy_generate_image": @@ -2530,6 +2684,272 @@ class ToolManager: except Exception as e: return ToolResult(success=False, result=None, error=str(e)) + async def _plantnet_lookup(self, args: Dict) -> ToolResult: + """Plant identification via Pl@ntNet API (skeleton adapter).""" + query = str(args.get("query", "") or "").strip() + image_url = str(args.get("image_url", "") or "").strip() + image_data = str(args.get("image_data", "") or "").strip() + runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip() + if not image_data and self._is_image_data_url(runtime_image_data): + image_data = runtime_image_data + organ = str(args.get("organ", "auto") or "auto").strip().lower() + top_k = max(1, min(int(args.get("top_k", 3)), 5)) + + api_key = (os.getenv("PLANTNET_API_KEY") or "").strip() + if image_url and api_key: + try: + params = { + "api-key": api_key, + "images": image_url, + "organs": "leaf" if organ == "auto" else organ, + "lang": "en", + } + resp = await self.http_client.get( + "https://my-api.plantnet.org/v2/identify/all", + params=params, + timeout=25.0, + ) + if resp.status_code == 200: + data = resp.json() + results = (data.get("results") or [])[:top_k] + if not results: + return ToolResult(success=True, result="Pl@ntNet: кандидатів не знайдено.") + lines = [] + for idx, item in enumerate(results, 1): + species = (item.get("species") or {}) + sname = species.get("scientificNameWithoutAuthor") or species.get("scientificName") or "unknown" + common = species.get("commonNames") or [] + cname = common[0] if common else "-" + score = float(item.get("score") or 0.0) + lines.append(f"{idx}. 
{sname} ({cname}) score={score:.3f}") + return ToolResult(success=True, result="Pl@ntNet candidates:\n" + "\n".join(lines)) + return ToolResult(success=False, result=None, error=f"plantnet_http_{resp.status_code}") + except Exception as e: + return ToolResult(success=False, result=None, error=f"plantnet_error: {e}") + + if image_url or image_data: + ni_args: Dict[str, Any] = {"top_k": top_k} + if image_data: + ni_args["image_data"] = image_data + else: + ni_args["image_url"] = image_url + if runtime_image_data: + ni_args["_runtime_image_data"] = runtime_image_data + ni = await self._nature_id_identify(ni_args) + if ni.success: + return ni + + if query: + return await self._gbif_species_lookup({"query": query, "limit": top_k}) + + return ToolResult( + success=False, + result=None, + error="No available plant ID backend (set PLANTNET_API_KEY or NATURE_ID_URL, or provide text query)", + ) + + async def _nature_id_identify(self, args: Dict) -> ToolResult: + """Open-source plant identification via self-hosted nature-id compatible endpoint.""" + image_url = str(args.get("image_url", "") or "").strip() + image_data = str(args.get("image_data", "") or "").strip() + runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip() + if not image_data and self._is_image_data_url(runtime_image_data): + image_data = runtime_image_data + top_k = max(1, min(int(args.get("top_k", 3)), 10)) + min_confidence = float(args.get("min_confidence", os.getenv("NATURE_ID_MIN_CONFIDENCE", "0.65"))) + + if image_url and self._is_known_non_direct_image_url(image_url): + if image_data: + logger.info("nature_id_identify: replacing non-direct image_url with runtime image_data") + image_url = "" + else: + return ToolResult( + success=False, + result=None, + error="image_url is not direct image URL; provide image_data or direct Telegram file URL", + ) + + if not image_url and not image_data: + return ToolResult(success=False, result=None, error="image_url or image_data is required") + + base = (os.getenv("NATURE_ID_URL") or "").strip().rstrip("/") + if not base: + return ToolResult(success=False, result=None, error="NATURE_ID_URL is not configured") + + try: + if image_data: + # data URL -> multipart /identify-file + if not image_data.startswith("data:") or "," not in image_data: + return ToolResult(success=False, result=None, error="invalid image_data format") + header, b64 = image_data.split(",", 1) + mime = "image/jpeg" + if ";base64" in header: + mime = header.split(":", 1)[1].split(";", 1)[0] or "image/jpeg" + ext = "jpg" + if "png" in mime: + ext = "png" + try: + image_bytes = base64.b64decode(b64) + except Exception: + return ToolResult(success=False, result=None, error="invalid image_data base64") + files = {"file": (f"upload.{ext}", image_bytes, mime)} + resp = await self.http_client.post( + f"{base}/identify-file", + params={"top_k": top_k}, + files=files, + timeout=45.0, + ) + else: + payload = {"image_url": image_url, "top_k": top_k} + resp = await self.http_client.post(f"{base}/identify", json=payload, timeout=45.0) + + if resp.status_code != 200: + return ToolResult(success=False, result=None, error=f"nature_id_http_{resp.status_code}") + + data = resp.json() or {} + status = str(data.get("status") or "success") + raw_top_k = data.get("top_k") or [] + raw_preds = data.get("predictions") or data.get("results") or [] + + top_k_rows = [] + if isinstance(raw_top_k, list) and raw_top_k: + for row in raw_top_k[:top_k]: + if not isinstance(row, dict): + continue + conf = row.get("confidence", 0.0) + 
conf_f = self._normalize_confidence(conf) + top_k_rows.append({ + "confidence": conf_f, + "name": str(row.get("name") or row.get("scientific_name") or "unknown"), + "scientific_name": str(row.get("scientific_name") or row.get("name") or "unknown"), + }) + else: + for item in raw_preds[:top_k]: + if not isinstance(item, dict): + continue + score = item.get("score", item.get("confidence", 0.0)) + score_f = self._normalize_confidence(score) + sname = item.get("scientific_name") or item.get("label") or item.get("name") or "unknown" + cname = item.get("common_name") or item.get("common") or sname + top_k_rows.append({ + "confidence": score_f, + "name": str(cname), + "scientific_name": str(sname), + }) + + if not top_k_rows: + return ToolResult(success=True, result=json.dumps({ + "status": status, + "model": data.get("model") or "aiy_plants_V1", + "source": data.get("source") or "nature-id-cli", + "top_k": [], + "confidence": 0.0, + "recommend_fallback": True, + "reason": "no_predictions", + }, ensure_ascii=False)) + + top1 = top_k_rows[0] + top1_conf = float(top1.get("confidence", 0.0)) + recommend_fallback = top1_conf < min_confidence + + out = { + "status": status, + "model": data.get("model") or "aiy_plants_V1", + "source": data.get("source") or "nature-id-cli", + "inference_time_sec": data.get("inference_time_sec"), + "top_k": top_k_rows, + "confidence": top1_conf, + "min_confidence": min_confidence, + "recommend_fallback": recommend_fallback, + "fallback": "gbif_species_lookup", + } + + if recommend_fallback: + fallback_query = str(top1.get("scientific_name") or top1.get("name") or "").strip() + if fallback_query and fallback_query.lower() != "unknown": + gbif = await self._gbif_species_lookup({"query": fallback_query, "limit": min(5, top_k)}) + if gbif.success and gbif.result: + out["gbif_validation"] = gbif.result + + return ToolResult(success=True, result=json.dumps(out, ensure_ascii=False)) + except Exception as e: + return ToolResult(success=False, result=None, error=f"nature_id_error: {e}") + + async def _gbif_species_lookup(self, args: Dict) -> ToolResult: + """Species lookup via GBIF public API.""" + query = str(args.get("query", "") or "").strip() + limit = max(1, min(int(args.get("limit", 5)), 10)) + if not query: + return ToolResult(success=False, result=None, error="query is required") + + try: + resp = await self.http_client.get( + "https://api.gbif.org/v1/species/search", + params={"q": query, "limit": limit, "status": "ACCEPTED"}, + timeout=20.0, + ) + if resp.status_code != 200: + return ToolResult(success=False, result=None, error=f"gbif_http_{resp.status_code}") + + data = resp.json() or {} + results = data.get("results") or [] + if not results: + return ToolResult(success=True, result="GBIF: результатів не знайдено.") + + lines = [] + for idx, item in enumerate(results[:limit], 1): + sci = item.get("scientificName") or item.get("canonicalName") or "unknown" + rank = item.get("rank") or "-" + status = item.get("taxonomicStatus") or "-" + key = item.get("key") + lines.append(f"{idx}. 
{sci} | rank={rank} | status={status} | key={key}")
+            return ToolResult(success=True, result="GBIF matches:\n" + "\n".join(lines))
+        except Exception as e:
+            return ToolResult(success=False, result=None, error=f"gbif_error: {e}")
+
+    async def _agrovoc_lookup(self, args: Dict) -> ToolResult:
+        """AGROVOC term normalization via public SPARQL endpoint."""
+        query = str(args.get("query", "") or "").strip()
+        lang = str(args.get("lang", "en") or "en").strip().lower()
+        limit = max(1, min(int(args.get("limit", 5)), 10))
+        if not query:
+            return ToolResult(success=False, result=None, error="query is required")
+        if lang not in {"en", "uk", "ru"}:
+            lang = "en"
+
+        safe_q = query.replace('\\', ' ').replace('"', ' ').strip()
+        sparql = (
+            "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
+            "SELECT ?concept ?label WHERE { "
+            "?concept skos:prefLabel ?label . "
+            f"FILTER(lang(?label) = '{lang}') "
+            f"FILTER(CONTAINS(LCASE(STR(?label)), LCASE(\"{safe_q}\"))) "
+            "} LIMIT " + str(limit)
+        )
+
+        try:
+            resp = await self.http_client.get(
+                "https://agrovoc.fao.org/sparql",
+                params={"query": sparql, "format": "json"},
+                timeout=25.0,
+            )
+            if resp.status_code != 200:
+                return ToolResult(success=False, result=None, error=f"agrovoc_http_{resp.status_code}")
+
+            data = resp.json() or {}
+            bindings = (((data.get("results") or {}).get("bindings")) or [])
+            if not bindings:
+                return ToolResult(success=True, result="AGROVOC: результатів не знайдено.")
+
+            lines = []
+            for idx, b in enumerate(bindings[:limit], 1):
+                label = ((b.get("label") or {}).get("value") or "").strip()
+                concept = ((b.get("concept") or {}).get("value") or "").strip()
+                lines.append(f"{idx}. {label} | {concept}")
+            return ToolResult(success=True, result="AGROVOC matches:\n" + "\n".join(lines))
+        except Exception as e:
+            return ToolResult(success=False, result=None, error=f"agrovoc_error: {e}")
+
     async def _unload_ollama_models(self):
         """Unload all Ollama models to free VRAM for heavy operations like FLUX"""
         ollama_url = os.getenv("OLLAMA_BASE_URL", "http://172.18.0.1:11434")
@@ -2942,7 +3362,11 @@ class ToolManager:
         if results:
             result = results[0] if isinstance(results, list) else results
-            markdown = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
+            raw_content = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
+            if isinstance(raw_content, (dict, list, tuple)):
+                markdown = json.dumps(raw_content, ensure_ascii=False)
+            else:
+                markdown = str(raw_content or "")
             title = result.get("title", url)
             if len(markdown) > 3000:
@@ -2951,13 +3375,30 @@
             response_parts = [f"**{title}**", "", markdown]
             if extract_links:
-                links = result.get("links", [])
-                if links:
+                links_raw = result.get("links", [])
+                normalized_links: List[Any] = []
+                if isinstance(links_raw, dict):
+                    for bucket in links_raw.values():
+                        if isinstance(bucket, list):
+                            normalized_links.extend(bucket)
+                        elif bucket:
+                            normalized_links.append(bucket)
+                elif isinstance(links_raw, list):
+                    normalized_links = links_raw
+                elif links_raw:
+                    normalized_links = [links_raw]
+
+                if normalized_links:
                     response_parts.append("")
                     response_parts.append("**Посилання:**")
-                    for link in links[:10]:
+                    for link in normalized_links[:10]:
                         if isinstance(link, dict):
-                            link_url = link.get("href", "")
+                            link_url = (
+                                link.get("href")
+                                or link.get("url")
+                                or link.get("link")
+                                or ""
+                            )
                         else:
                             link_url = str(link)
                         if link_url:
diff --git a/services/swapper-service/app/main.py b/services/swapper-service/app/main.py
index bd2a4274..07591750 100644
--- a/services/swapper-service/app/main.py +++ b/services/swapper-service/app/main.py @@ -11,10 +11,13 @@ import os import asyncio import logging import base64 +import json +import re from typing import Optional, Dict, List, Any, Union from datetime import datetime, timedelta from enum import Enum from io import BytesIO +import xml.etree.ElementTree as ET from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form from fastapi.middleware.cors import CORSMiddleware @@ -56,16 +59,34 @@ def _csv_to_markdown(content: bytes) -> str: text = _decode_text_bytes(content) reader = csv.reader(text.splitlines()) rows = list(reader) + return _rows_to_markdown(rows) + + +def _tsv_to_markdown(content: bytes) -> str: + text = _decode_text_bytes(content) + reader = csv.reader(text.splitlines(), delimiter="\t") + rows = list(reader) + return _rows_to_markdown(rows) + + +def _rows_to_markdown(rows: List[List[Any]]) -> str: if not rows: return "" - header = rows[0] - body = rows[1:] + width = max(len(r) for r in rows) + norm_rows = [] + for r in rows: + rr = [str(c) if c is not None else "" for c in r] + if len(rr) < width: + rr.extend([""] * (width - len(rr))) + norm_rows.append(rr) + header = norm_rows[0] + body = norm_rows[1:] lines = [ "| " + " | ".join(header) + " |", "| " + " | ".join(["---"] * len(header)) + " |", ] for row in body: - lines.append("| " + " | ".join(row) + " |") + lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |") return "\n".join(lines) @@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str: return "\n".join(parts) +def _xls_to_markdown(content: bytes) -> str: + try: + import xlrd + except Exception as e: + raise HTTPException(status_code=500, detail=f"xlrd not available: {e}") + wb = xlrd.open_workbook(file_contents=content) + parts = [] + for s in wb.sheets(): + parts.append(f"## Sheet: {s.name}") + rows = [] + for r in range(s.nrows): + rows.append([s.cell_value(r, c) for c in range(s.ncols)]) + if not rows: + parts.append("_Empty sheet_") + continue + parts.append(_rows_to_markdown(rows)) + return "\n\n".join(parts) + + +def _ods_to_markdown(content: bytes) -> str: + try: + from odf.opendocument import load + from odf.table import Table, TableRow, TableCell + from odf.text import P + except Exception as e: + raise HTTPException(status_code=500, detail=f"odfpy not available: {e}") + + try: + doc = load(BytesIO(content)) + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}") + + parts = [] + for table in doc.spreadsheet.getElementsByType(Table): + table_name = str(table.getAttribute("name") or "Sheet") + parts.append(f"## Sheet: {table_name}") + rows: List[List[str]] = [] + for row in table.getElementsByType(TableRow): + cells_out: List[str] = [] + for cell in row.getElementsByType(TableCell): + txt_parts = [] + for p in cell.getElementsByType(P): + txt_parts.extend( + [str(getattr(node, "data", "")).strip() for node in p.childNodes if getattr(node, "data", None)] + ) + cell_text = " ".join([t for t in txt_parts if t]).strip() + repeat_raw = cell.getAttribute("numbercolumnsrepeated") + try: + repeat = int(repeat_raw) if repeat_raw else 1 + except Exception: + repeat = 1 + repeat = max(1, min(repeat, 100)) + for _ in range(repeat): + cells_out.append(cell_text) + if cells_out: + rows.append(cells_out) + if not rows: + parts.append("_Empty sheet_") + continue + parts.append(_rows_to_markdown(rows)) + return "\n\n".join(parts) + + def _docx_to_text(content: bytes) -> str: try: 
from docx import Document @@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str: return "\n\n".join(text_content) +def _pptx_to_text(content: bytes) -> str: + try: + from pptx import Presentation + except Exception as e: + raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}") + prs = Presentation(BytesIO(content)) + parts = [] + for idx, slide in enumerate(prs.slides, start=1): + parts.append(f"## Slide {idx}") + slide_lines = [] + for shape in slide.shapes: + text = getattr(shape, "text", None) + if text and str(text).strip(): + slide_lines.append(str(text).strip()) + parts.extend(slide_lines if slide_lines else ["_No text on this slide_"]) + return "\n\n".join(parts) + + +def _json_to_text(content: bytes) -> str: + raw = _decode_text_bytes(content) + try: + parsed = json.loads(raw) + return json.dumps(parsed, ensure_ascii=False, indent=2) + except Exception: + return raw + + +def _yaml_to_text(content: bytes) -> str: + raw = _decode_text_bytes(content) + try: + parsed = yaml.safe_load(raw) + return yaml.safe_dump(parsed, allow_unicode=True, sort_keys=False) + except Exception: + return raw + + +def _xml_to_text(content: bytes) -> str: + raw = _decode_text_bytes(content) + try: + root = ET.fromstring(raw) + text = " ".join([t.strip() for t in root.itertext() if t and t.strip()]) + return text or raw + except Exception: + return raw + + +def _html_to_text(content: bytes) -> str: + raw = _decode_text_bytes(content) + try: + from bs4 import BeautifulSoup + + soup = BeautifulSoup(raw, "html.parser") + text = soup.get_text(separator="\n") + text = re.sub(r"\n{3,}", "\n\n", text) + return text.strip() or raw + except Exception: + # Minimal fallback if bs4 is unavailable + text = re.sub(r"<[^>]+>", " ", raw) + text = re.sub(r"\s+", " ", text) + return text.strip() + + +def _rtf_to_text(content: bytes) -> str: + raw = _decode_text_bytes(content) + try: + from striprtf.striprtf import rtf_to_text + return rtf_to_text(raw) + except Exception: + # Basic fallback: strip common RTF control tokens + text = re.sub(r"\\'[0-9a-fA-F]{2}", " ", raw) + text = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", text) + text = text.replace("{", " ").replace("}", " ") + return re.sub(r"\s+", " ", text).strip() + + def _extract_text_by_ext(filename: str, content: bytes) -> str: ext = filename.split(".")[-1].lower() if "." 
in filename else "" - if ext in ["txt", "md"]: + if ext in ["txt", "md", "markdown"]: return _decode_text_bytes(content) if ext == "csv": return _csv_to_markdown(content) - if ext == "xlsx": + if ext == "tsv": + return _tsv_to_markdown(content) + if ext in {"xlsx", "xlsm"}: return _xlsx_to_markdown(content) + if ext == "xls": + return _xls_to_markdown(content) + if ext == "ods": + return _ods_to_markdown(content) if ext == "docx": return _docx_to_text(content) if ext == "pdf": return _pdf_to_text(content) + if ext == "pptx": + return _pptx_to_text(content) + if ext == "json": + return _json_to_text(content) + if ext in {"yaml", "yml"}: + return _yaml_to_text(content) + if ext == "xml": + return _xml_to_text(content) + if ext in {"html", "htm"}: + return _html_to_text(content) + if ext == "rtf": + return _rtf_to_text(content) raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}") @@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10 if total_size > max_total_mb * 1024 * 1024: raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB") parts = [] - allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"} + allowed_exts = { + "txt", "md", "markdown", "csv", "tsv", + "xls", "xlsx", "xlsm", "ods", + "docx", "pdf", "pptx", + "json", "yaml", "yml", "xml", "html", "htm", "rtf", + } processed = [] skipped = [] for member in members: @@ -1655,7 +1837,8 @@ async def document_endpoint( - json: Structured JSON with document elements - text: Plain text extraction - Supported files: PDF, DOCX, PPTX, images (PNG, JPG) + Supported files: + PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images. """ try: import time @@ -1672,15 +1855,28 @@ async def document_endpoint( filename = file.filename if file else "document" file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf" - # Handle text-based formats without Docling - if file_ext in ["txt", "md", "csv", "xlsx", "zip"]: + # Handle deterministic extraction for standard office/text formats + if file_ext in [ + "txt", "md", "markdown", "csv", "tsv", + "xlsx", "xls", "xlsm", "ods", + "json", "yaml", "yml", "xml", "html", "htm", "rtf", + "pptx", "zip", + ]: try: if file_ext == "zip": content = _zip_to_markdown(doc_data) output_format = "markdown" else: content = _extract_text_by_ext(filename, doc_data) - output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text" + output_format = ( + "markdown" + if file_ext in { + "md", "markdown", "csv", "tsv", + "xlsx", "xls", "xlsm", "ods", + "json", "yaml", "yml", "xml", "html", "htm", "pptx", + } + else "text" + ) processing_time_ms = (time.time() - start_time) * 1000 return { "success": True, @@ -1764,22 +1960,27 @@ async def document_endpoint( "device": swapper.device } - # For DOCX, try python-docx - if file_ext == "docx": + # For common office/text formats, try deterministic extractors. 
+ if file_ext in { + "docx", "txt", "md", "markdown", "csv", "tsv", + "xlsx", "xls", "xlsm", "ods", + "pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf", + }: try: - content = _docx_to_text(doc_data) + content = _extract_text_by_ext(filename, doc_data) + out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text" return { "success": True, - "model": "python-docx (fallback)", - "output_format": "text", + "model": "text-extract (fallback)", + "output_format": out_fmt, "result": content, "filename": filename, "processing_time_ms": (time.time() - start_time) * 1000, "device": swapper.device } except Exception as e: - logger.error(f"DOCX fallback failed: {e}") - raise HTTPException(status_code=500, detail="DOCX extraction failed") + logger.error(f"Text fallback failed for .{file_ext}: {e}") + raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}") # For PDFs, try pdfplumber if file_ext == "pdf": @@ -1807,7 +2008,7 @@ async def document_endpoint( # For other documents, return error raise HTTPException( status_code=503, - detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)" + detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats." ) finally: @@ -2312,4 +2513,3 @@ async def get_multimodal_stack(): if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=8890) - diff --git a/services/swapper-service/app/requirements.txt b/services/swapper-service/app/requirements.txt index 6f40062a..75e38998 100644 --- a/services/swapper-service/app/requirements.txt +++ b/services/swapper-service/app/requirements.txt @@ -4,6 +4,15 @@ httpx==0.25.2 pydantic==2.5.0 pyyaml==6.0.1 python-multipart==0.0.6 +chardet>=5.2.0 +openpyxl>=3.1.2 +python-docx>=1.1.2 +pdfplumber>=0.11.0 +python-pptx>=0.6.23 +xlrd>=2.0.1 +odfpy>=1.4.1 +beautifulsoup4>=4.12.0 +striprtf>=0.0.26 # HuggingFace dependencies for OCR models torch>=2.0.0 @@ -25,4 +34,4 @@ safetensors>=0.4.0 # Web Scraping & Search trafilatura>=1.6.0 -duckduckgo-search>=4.0.0 \ No newline at end of file +duckduckgo-search>=4.0.0 diff --git a/services/swapper-service/requirements.txt b/services/swapper-service/requirements.txt index e15ea696..22acb316 100644 --- a/services/swapper-service/requirements.txt +++ b/services/swapper-service/requirements.txt @@ -43,3 +43,8 @@ pdfplumber>=0.10.0 python-docx>=1.1.0 openpyxl>=3.1.2 chardet>=5.2.0 +python-pptx>=0.6.23 +xlrd>=2.0.1 +odfpy>=1.4.1 +beautifulsoup4>=4.12.0 +striprtf>=0.0.26 diff --git a/third_party/nature-id/.gitignore b/third_party/nature-id/.gitignore new file mode 100644 index 00000000..0361d4fb --- /dev/null +++ b/third_party/nature-id/.gitignore @@ -0,0 +1,6 @@ +__pycache__/ +*.py[cod] +*$py.class +*.csv +*.tflite +*.zip diff --git a/third_party/nature-id/LICENSE b/third_party/nature-id/LICENSE new file mode 100644 index 00000000..f9ab6f16 --- /dev/null +++ b/third_party/nature-id/LICENSE @@ -0,0 +1,10 @@ +MIT License + +Copyright (c) 2020, joergmlpts + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/third_party/nature-id/README.md b/third_party/nature-id/README.md
new file mode 100644
index 00000000..32f2bc04
--- /dev/null
+++ b/third_party/nature-id/README.md
@@ -0,0 +1,372 @@
+# Identify Plants, Birds, and Insects in Photos
+
+This repository provides Python code that identifies plants, birds, and insects in photos.
+
+This project was inspired by the amazing progress that [iNaturalist](https://iNaturalist.org) has made in recent years in identifying plants, animals, and fungi from photographs. The iNaturalist team has trained machine learning models with their large collection of photos and research-grade identifications. In 2019, iNaturalist released [Seek by iNaturalist](https://www.inaturalist.org/pages/seek_app), which identifies photos offline on the phone and falls back to a taxonomic level above species when a species identification cannot be made.
+
+Google provides three models that have been trained with iNaturalist data: classification models for plants, birds, and insects. These Google models can be downloaded and used with Google's `TensorFlow` and `TensorFlow Lite` tools.
+
+This code is based on the trained models provided by Google. It was written to experiment with identifying species from photos and to try out Seek's approach to calculating scores (probabilities) across the taxonomic hierarchy.
+
+This tool `nature_id.py` has been tested on Linux and Windows. It should also work on macOS.
+
+## Usage
+
+This is a command-line tool. It is invoked with images or directories containing images and identifies the plants, birds, and insects in those images.
+
+Here is an example. This is the command for Linux and macOS:
+
+```
+./nature_id.py -m plants plant_images/Persicaria_amphibia.jpg
+```
+
+On Windows the command is:
+
+```
+python .\nature_id.py -m plants plant_images\Persicaria_amphibia.jpg
+```
+
+![Smartweed](/plant_images/Persicaria_amphibia.jpg)
+
+The above image results in this identification:
+```
+Classification of 'plant_images/Persicaria_amphibia.jpg' took 0.2 secs.
+100.0% kingdom     Plants (Plantae)
+100.0% phylum      Tracheophytes (Tracheophyta)
+100.0% subphylum   Flowering Plants (Angiospermae)
+ 99.6% class       Dicots (Magnoliopsida)
+ 99.2% order       Pinks, Cactuses, and Allies (Caryophyllales)
+ 98.8% family      Knotweed Family (Polygonaceae)
+ 98.8% subfamily   Polygonoideae
+ 98.8% tribe       Persicarieae
+ 98.8% subtribe    Persicariinae
+ 98.8% genus       Smartweeds (Persicaria)
+ 97.6% species     Water Smartweed (Persicaria amphibia)
+```
+
+These scores can be used to guide identification: define a threshold and report as the result the deepest taxon whose score is still at or above this threshold. In this example, a threshold of 95% yields an identification to the species *Persicaria amphibia*, while a threshold of 99% only reaches the order *Caryophyllales*. 95% and 99% would be unusually high thresholds; Seek, I believe, uses a threshold of 70%.
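+
+Expressed in code, this selection rule is only a few lines. Here is a minimal sketch, assuming the hierarchy is available as a list of `(score, rank, name)` tuples ordered from kingdom down to species (the tuple layout is illustrative, not an interface this tool exposes):
+
+```
+def pick_taxon(path, threshold=0.95):
+    # Scores never increase from kingdom to species, so the last entry
+    # at or above the threshold is the deepest acceptable identification.
+    best = None
+    for score, rank, name in path:
+        if score >= threshold:
+            best = (rank, name)
+    return best
+```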
+ +## Command-line Options + +This script is a command-line utility. It is called with options, filenames and directory names as arguments. These options are supported: + +``` +usage: nature_id.py [-h] [-m MODEL] [-a] [-l] [-s] [-r RESULT_SIZE] file/directory [file/directory ...] + +positional arguments: + file/directory Image files or directories with images. + +options: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model to load to identify organisms. + -a, --all_common_names + Show all common names and not just one. + -l, --label_scores_only + Compute and display only label scores, do not propagate scores up the hierarchy. + -s, --scientific_names_only + Only use scientific names, do not load common names. + -r RESULT_SIZE, --result_size RESULT_SIZE + Number of labels and their scores to report in results. +``` + +### Option -m MODEL, --model MODEL + +The `-m` and `--model` options select a classification model. Possible models are `plants`, `birds`, and `insects`. These models must be installed in the `classifiers` directory. This option is required if more than one classifier is installed. + +### Option -a, --all_common_names + +The `-a` and `--all_common_names` options cause all common names to be displayed, not just one. Multiple common names are separated by semicolons. The output with this option looks like this: + +![Phyla_nodiflora.jpg](/plant_images/Phyla_nodiflora.jpg) + +``` +Classification of 'plant_images/Phyla_nodiflora.jpg' took 0.2 secs. +100.0% kingdom Plants; Flora; Green Plants; Greenery; Foliage; Vegetation; Salpichlaena Papyrus; Trees; Bushes; Shrubs; Vines (Plantae) +100.0% phylum Tracheophytes; Seed Plants; Vascular Plants (Tracheophyta) +100.0% subphylum Flowering Plants; Angiosperms; Flowers; Basal Angiosperms; True Dicotyledons; Basal True Dicots; Rose Dicots; Daisy Dicots (Angiospermae) +100.0% class Dicots; Dicots; Dicotyledons; Eudicots (Magnoliopsida) + 98.2% order Mints, Plantains, Olives, and Allies (Lamiales) + 97.4% family Verbena Family; Lantanas (Verbenaceae) + 97.4% tribe Lantaneae + 85.5% genus Frogfruits; Fogfruits (Phyla) + 85.5% species Turkey Tangle; Lippia; Common Lippia; Turkey Tangle Frogfruit; Sawtooth Fogfruit; Carpet Weed; Roundleaf Frogfruit; Texas Frogfruit; Cape Weed; Sawtooth Frogfruit; Lipia; Turkey Tangle Fogfruit; Daisy Lawn; Fog Grass (Phyla nodiflora) +``` + +### Option -l, --label_scores_only + +The `-l` and `--label_scores_only` options switch from the taxonomic hierarchy view to a flat list of labels and their scores. The output with this option looks like this: + +![Solidago_velutina_ssp_californica.jpg](/plant_images/Solidago_velutina_ssp_californica.jpg) + +``` +Classification of 'plant_images/Solidago_velutina_ssp_californica.jpg' took 0.2 secs. + 86.1% Canada Goldenrod (Solidago canadensis) + 9.8% Late Goldenrod (Solidago altissima) + 1.6% Flat-Topped Goldenrod (Euthamia graminifolia) + 1.2% Northern Seaside Goldenrod (Solidago sempervirens) + 0.4% Stiff-Leaved Goldenrod (Solidago rigida) +``` + +Five labels with decreasing scores are shown by default. The `-r` and `--result_size` options can be used to request fewer or more labels. + +### Option -s, --scientific_names_only + +The `-s` and `--scientific_names_only` options disable common names; only the scientific names are displayed. The output with this option looks like this: + +![Trichostema_lanceolatum.jpg](/plant_images/Trichostema_lanceolatum.jpg) + +``` +Classification of 'plant_images/Trichostema_lanceolatum.jpg' took 0.2 secs. 
+100.0% kingdom     Plantae
+100.0% phylum      Tracheophyta
+100.0% subphylum   Angiospermae
+100.0% class       Magnoliopsida
+ 99.6% order       Lamiales
+ 99.6% family      Lamiaceae
+ 99.2% subfamily   Ajugoideae
+ 99.2% genus       Trichostema
+ 99.2% species     Trichostema lanceolatum
+```
+
+### Option -r RESULT_SIZE, --result_size RESULT_SIZE
+
+The `-r` and `--result_size` options modify the number of labels displayed when a flat list of labels is requested with the `-l` or `--label_scores_only` options. The default is 5; any number between 1 and 100 can be chosen.
+
+This is an example with 15 labels. The command line for Linux is:
+```
+./nature_id.py -m plants -l -r 15 plant_images/Primula_hendersonii.jpg
+```
+
+![Primula_hendersonii.jpg](/plant_images/Primula_hendersonii.jpg)
+
+```
+Classification of 'plant_images/Primula_hendersonii.jpg' took 0.2 secs.
+ 50.4% Henderson's Shooting Star (Primula hendersonii)
+ 37.2% Eastern Shooting Star (Primula meadia)
+  2.5% Dark-Throated Shooting Star (Primula pauciflora)
+  1.7% Red Ribbons (Clarkia concinna)
+  1.2% Ruby Chalice Clarkia (Clarkia rubicunda)
+  0.8% Purple Paintbrush (Castilleja purpurea)
+  0.8% Fireweed (Chamaenerion angustifolium)
+  0.4% Western Fairy-Slipper (Calypso bulbosa occidentalis)
+  0.4% Texas Skeleton Plant (Lygodesmia texana)
+  0.4% Rhodora (Rhododendron canadense)
+  0.4% Ragged-Robin (Silene flos-cuculi)
+  0.4% Hemp Dogbane (Apocynum cannabinum)
+  0.4% Garden Cosmos (Cosmos bipinnatus)
+  0.4% Farewell-To-Spring (Clarkia amoena)
+  0.4% Dwarf Fireweed (Chamaenerion latifolium)
+```
+
+## Dependencies
+
+Several things need to be installed in order for `nature_id.py` to run. Some Python packages are required, classification models need to be downloaded and installed into the `classifiers` directory, and finally the taxonomy and common names need to be downloaded into the `inaturalist-taxonomy` directory.
+
+### Python Packages
+
+This code is written in Python 3. Besides Python 3, the packages `Pillow` and `requests` are used to load and process images and to access the iNaturalist API.
+
+These packages as well as `TensorFlow Lite` can be installed on Ubuntu Linux and other Debian distributions with the commands
+
+```
+sudo apt install python3-pillow python3-requests
+pip3 install tflite-runtime
+```
+
+and on other platforms with the command
+
+```
+pip install Pillow requests tflite-runtime
+```
+
+Where appropriate, `pip3` should be called instead of `pip` to avoid accidentally installing Python 2 packages.
+
+
+### Classification Models
+
+The classification models and their labelmap files must be downloaded from Kaggle; they go into the `classifiers` directory.
+
+The classifiers can be downloaded from these links:
+
+ * [classifier for plants](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-plants-v1/1)
+ * [classifier for birds](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-birds-v1/1)
+ * [classifier for insects](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-insects-v1/1)
+
+Each classifier consists of a `.tflite` model and a `.csv` labelmap file. Both are required. Click on `Model Variations` under `TensorFlow Lite` to download the TFLite model. Please also note the paragraphs at the bottom of these web pages about appropriate and inappropriate use cases and licensing.
+
+These are the links to download the labelmaps: [aiy_insects_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_insects_V1_labelmap.csv), [aiy_birds_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_birds_V1_labelmap.csv), and [aiy_plants_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_plants_V1_labelmap.csv). On Windows, the default action for a .csv file may be to open it in Excel; be sure to save the downloaded file to disk.
+
+### Taxonomy and Common Names Files
+
+The trained models come with scientific names as labels, and many of these scientific names are already outdated. The common names and the current taxonomy are obtained from this file: [https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip](https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip). This tool expects this zip archive in the `inaturalist-taxonomy` directory.
+
+## Example Images
+
+Example pictures of plants are provided in the `plant_images` directory. The filenames indicate the species that I think is in the photo. Note that these examples lead to successful identifications only to varying degrees; the *Mentzelia lindleyi*, for instance, is certainly not identified correctly.
+
+## Messages
+
+The first call with a model transforms the labels into a taxonomic hierarchy. Each label is replaced with its representation in the current taxonomy, and all its ancestors are added. This process takes some time and results in many messages. Once the hierarchy has been successfully computed, it is written to disk. Future calls to `nature_id.py` will load the taxonomic hierarchy from disk instead of reading the labels and computing the taxonomy again.
+
+This is what the first call looks like. Again, we use the plant model as an example. The bird and insect models are smaller and result in fewer messages.
+
+```
+PS C:\nature-id> python nature_id.py -m plants .\plant_images
+Read 2,102 labels from 'classifiers\aiy_plants_V1_labelmap.csv' in 0.0 secs.
+Loading iNaturalist taxonomy...
+Loaded iNaturalist taxonomy of 993,552 taxa in 15.2 secs.
+Info: Taxon for label 'background' not found, inserting as pseudo-kingdom.
+Info: Taxon 'Eichhornia crassipes' changed to 'Pontederia crassipes', iNat taxa id 962637.
+Info: Taxon 'Potentilla anserina' changed to 'Argentina anserina', iNat taxa id 158615.
+Info: Taxon 'Stenosiphon linifolius' changed to 'Oenothera glaucifolia', iNat taxa id 914092.
+Info: Taxon 'Sophora secundiflora' changed to 'Dermatophyllum secundiflorum', iNat taxa id 499559.
+Info: Taxon 'Mimulus bigelovii' changed to 'Diplacus bigelovii', iNat taxa id 701989.
+Info: Taxon 'Botrychium dissectum' changed to 'Sceptridium dissectum', iNat taxa id 122085.
+Info: Taxon 'Trientalis borealis' changed to 'Lysimachia borealis', iNat taxa id 204174.
+Info: Taxon 'Hyptis emoryi' changed to 'Condea emoryi', iNat taxa id 489286.
+Info: Taxon 'Opuntia engelmannii lindheimeri' changed to 'Opuntia lindheimeri', iNat taxa id 119980.
+Info: Taxon 'Aquilegia caerulea' changed to 'Aquilegia coerulea', iNat taxa id 501742.
+Info: Taxon 'Fuscospora cliffortioides' changed to 'Nothofagus cliffortioides', iNat taxa id 404204.
+Info: Taxon 'Cooperia drummondii' changed to 'Zephyranthes chlorosolen', iNat taxa id 554401.
+Info: Taxon 'Dracopis amplexicaulis' changed to 'Rudbeckia amplexicaulis', iNat taxa id 200073.
+Info: Taxon 'Dodecatheon meadia' changed to 'Primula meadia', iNat taxa id 549981.
+Info: Taxon 'Aptenia cordifolia' changed to 'Mesembryanthemum cordifolium', iNat taxa id 589815. +Info: Taxon 'Chamerion latifolium' changed to 'Chamaenerion latifolium', iNat taxa id 564970. +Info: Taxon 'Echinocereus mojavensis' changed to 'Echinocereus triglochidiatus mojavensis', iNat taxa id 858352. +Warning: multiple taxa named 'Aquilegia vulgaris': species 51807, complex 1042772; choosing species. +Info: Taxon 'Dodecatheon pulchellum' changed to 'Primula pauciflora', iNat taxa id 498086. +Info: Taxon 'Mimulus lewisii' changed to 'Erythranthe lewisii', iNat taxa id 777190. +Info: Taxon 'Sambucus nigra canadensis' changed to 'Sambucus canadensis', iNat taxa id 84300. +Info: Taxon 'Asyneuma prenanthoides' changed to 'Campanula prenanthoides', iNat taxa id 851072. +Info: Taxon 'Anemone quinquefolia' changed to 'Anemonoides quinquefolia', iNat taxa id 950598. +Info: Taxon 'Hedypnois cretica' changed to 'Hedypnois rhagadioloides', iNat taxa id 492864. +Warning: multiple taxa named 'Achillea millefolium': species 52821, complex 1105043; choosing species. +Info: Taxon 'Anagallis arvensis' changed to 'Lysimachia arvensis', iNat taxa id 791928. +Info: Taxon 'Hieracium caespitosum' changed to 'Pilosella caespitosa', iNat taxa id 711086. +Info: Taxon 'Potentilla anserina pacifica' changed to 'Argentina pacifica', iNat taxa id 524900. +Info: Taxon 'Sambucus nigra caerulea' changed to 'Sambucus cerulea', iNat taxa id 143799. +Info: Taxon 'Polygala californica' changed to 'Rhinotropis californica', iNat taxa id 876453. +Info: Taxon 'Calylophus berlandieri' changed to 'Oenothera berlandieri', iNat taxa id 359779. +Info: Taxon 'Mimulus cardinalis' changed to 'Erythranthe cardinalis', iNat taxa id 319974. +Info: Taxon 'Callistemon citrinus' changed to 'Melaleuca citrina', iNat taxa id 77976. +Info: Taxon 'Liatris mucronata' changed to 'Liatris punctata mucronata', iNat taxa id 371814. +Warning: multiple taxa named 'Stellaria media': species 53298, complex 1087592; choosing species. +Info: Taxon 'Anemone americana' changed to 'Hepatica americana', iNat taxa id 741014. +Info: Taxon 'Anemone occidentalis' changed to 'Pulsatilla occidentalis', iNat taxa id 60482. +Info: Taxon 'Orobanche fasciculata' changed to 'Aphyllon fasciculatum', iNat taxa id 802543. +Info: Taxon 'Mimulus primuloides' changed to 'Erythranthe primuloides', iNat taxa id 635401. +Info: Taxon 'Polygala paucifolia' changed to 'Polygaloides paucifolia', iNat taxa id 497911. +Warning: multiple taxa named 'Campanula rotundifolia': species 62312, complex 984576; choosing species. +Info: Taxon 'Cissus incisa' changed to 'Cissus trifoliata', iNat taxa id 133333. +Info: Taxon 'Schinus terebinthifolius' changed to 'Schinus terebinthifolia', iNat taxa id 130872. +Info: Taxon 'Cooperia pedunculata' changed to 'Zephyranthes drummondii', iNat taxa id 120026. +Info: Taxon 'Scabiosa atropurpurea' changed to 'Sixalix atropurpurea', iNat taxa id 372376. +Info: Taxon 'Sphenosciadium capitellatum' changed to 'Angelica capitellata', iNat taxa id 704166. +Info: Taxon 'Trientalis latifolia' changed to 'Lysimachia latifolia', iNat taxa id 496537. +Warning: multiple taxa named 'Spiranthes cernua': species 773385, complex 931407; choosing species. +Info: Taxon 'Spartina pectinata' changed to 'Sporobolus michauxianus', iNat taxa id 772984. +Info: Taxon 'Centaurea americana' changed to 'Plectocephalus americanus', iNat taxa id 699778. +Info: Taxon 'Fuscospora solandri' changed to 'Nothofagus solandri', iNat taxa id 70246. 
+Info: Taxon 'Heliotropium tenellum' changed to 'Euploca tenella', iNat taxa id 769888. +Info: Taxon 'Blechnum spicant' changed to 'Struthiopteris spicant', iNat taxa id 774894. +Info: Taxon 'Fallopia japonica' changed to 'Reynoutria japonica', iNat taxa id 914922. +Info: Taxon 'Echinocactus texensis' changed to 'Homalocephala texensis', iNat taxa id 870496. +Info: Taxon 'Gaura parviflora' changed to 'Oenothera curtiflora', iNat taxa id 78241. +Info: Taxon 'Parentucellia viscosa' changed to 'Bellardia viscosa', iNat taxa id 537967. +Info: Taxon 'Anemone nemorosa' changed to 'Anemonoides nemorosa', iNat taxa id 950603. +Info: Taxon 'Hieracium aurantiacum' changed to 'Pilosella aurantiaca', iNat taxa id 711103. +Info: Taxon 'Anemone hepatica' changed to 'Hepatica nobilis', iNat taxa id 639660. +Info: Taxon 'Merremia dissecta' changed to 'Distimake dissectus', iNat taxa id 907480. +Info: Taxon 'Anemone canadensis' changed to 'Anemonastrum canadense', iNat taxa id 881527. +Info: Taxon 'Chamerion angustifolium' changed to 'Chamaenerion angustifolium', iNat taxa id 564969. +Info: Taxon 'Lychnis flos-cuculi' changed to 'Silene flos-cuculi', iNat taxa id 740984. +Throttling API calls, sleeping for 44.5 seconds. +Info: Taxon 'Ampelopsis brevipedunculata' changed to 'Ampelopsis glandulosa brevipedunculata', iNat taxa id 457553. +Info: Taxon 'Anemone acutiloba' changed to 'Hepatica acutiloba', iNat taxa id 179786. +Info: Taxon 'Pennisetum setaceum' changed to 'Cenchrus setaceus', iNat taxa id 430581. +Info: Taxon 'Mimulus guttatus' changed to 'Erythranthe guttata', iNat taxa id 470643. +Info: Taxon 'Blechnum fluviatile' changed to 'Cranfillia fluviatilis', iNat taxa id 700995. +Info: Taxon 'Blechnum discolor' changed to 'Lomaria discolor', iNat taxa id 403546. +Info: Taxon 'Andropogon gerardii' changed to 'Andropogon gerardi', iNat taxa id 121968. +Info: Taxon 'Ferocactus hamatacanthus' changed to 'Hamatocactus hamatacanthus', iNat taxa id 855937. +Info: Taxon 'Gaura lindheimeri' changed to 'Oenothera lindheimeri', iNat taxa id 590726. +Info: Taxon 'Gaura suffulta' changed to 'Oenothera suffulta', iNat taxa id 521639. +Info: Taxon 'Glottidium vesicarium' changed to 'Sesbania vesicaria', iNat taxa id 890511. +Info: Taxon 'Acacia farnesiana' changed to 'Vachellia farnesiana', iNat taxa id 79472. +Warning: multiple taxa named 'Rubus fruticosus': complex 55911, species 1090496; choosing species. +Info: Taxon 'Othocallis siberica' changed to 'Scilla siberica', iNat taxa id 862704. +Info: Taxon 'Mimulus aurantiacus' changed to 'Diplacus', iNat taxa id 777236. +Info: Taxon 'Phoradendron tomentosum' changed to 'Phoradendron leucarpum', iNat taxa id 49668. +Info: Taxon 'Orobanche uniflora' changed to 'Aphyllon uniflorum', iNat taxa id 802714. +Info: Taxon 'Rosmarinus officinalis' changed to 'Salvia rosmarinus', iNat taxa id 636795. +Info: Taxon 'Cynoglossum grande' changed to 'Adelinia grande', iNat taxa id 769151. +Computed taxonomic tree from labels in 64.8 secs: 4,091 taxa including 2,102 leaf taxa. +Taxonomy written to file 'classifiers\aiy_plants_V1_taxonomy.csv'. +Reading common names from 'inaturalist-taxonomy\inaturalist-taxonomy.dwca.zip' member 'VernacularNames-english.csv'... +Read 203,093 common names in 1.5 secs, loaded 3,071 in language "en_US" for 4,091 taxa. +``` + +### Messages Explained + +``` +Read 2,102 labels from 'classifiers\aiy_plants_V1_labelmap.csv' in 0.0 secs. +``` + +`nature-id` reads a label file. 
If no errors occur, a taxonomy will be written for these labels, and further runs will load `classifiers\aiy_plants_V1_taxonomy.csv` instead.
+
+```
+Loading iNaturalist taxonomy...
+Loaded iNaturalist taxonomy of 993,552 taxa in 15.2 secs.
+```
+
+The entire iNaturalist taxonomy of about 1 million taxa is loaded. `nature-id` will look up the labels in this taxonomy and insert them, along with all their ancestors, into a taxonomy for the labels.
+
+```
+Info: Taxon for label 'background' not found, inserting as pseudo-kingdom.
+```
+
+Label `background` was not found. It is not a species but denotes something else in the Google model. It is treated as a kingdom in the taxonomy; it has no ancestors.
+
+```
+Info: Taxon 'Potentilla anserina' changed to 'Argentina anserina', iNat taxa id 158615.
+```
+
+In the current taxonomy, this species belongs to a different genus. The numeric ID in this message is useful for getting more information. This number can be prefixed with `https://www.inaturalist.org/taxa/` and opened in a browser: [https://www.inaturalist.org/taxa/158615](https://www.inaturalist.org/taxa/158615).
+
+```
+Warning: multiple taxa named 'Achillea millefolium': species 52821, complex 1105043; choosing species.
+```
+
+The label name for this common yarrow is not unique; there are several taxa with this scientific name. `nature-id` assumes that the species is the one we want.
+
+```
+Throttling API calls, sleeping for 44.5 seconds.
+```
+
+This message is followed by 45 seconds of silence. When a name is not found in the current taxonomy, the one previously loaded with about 1 million taxa, iNaturalist API calls are made to look up the inactive scientific name. The iNaturalist team asks that API calls be throttled to no more than 60 calls per minute; this delay has been implemented to accommodate their request.
+
+```
+Info: Taxon 'Mimulus aurantiacus' changed to 'Diplacus', iNat taxa id 777236.
+```
+
+The species *Mimulus aurantiacus* in the label file is replaced with the genus *Diplacus* and not with the current species *Diplacus aurantiacus*. This looks like a bug and hence deserves a closer look.
+
+The reason `nature_id` decides this way is that *Mimulus aurantiacus* consisted of several varieties: *Mimulus aurantiacus aurantiacus*, *Mimulus aurantiacus grandiflorus*, *Mimulus aurantiacus parviflorus*, and 3 more.
+
+In the current taxonomy, these varieties are the species *Diplacus aurantiacus*, *Diplacus grandiflorus*, and *Diplacus parviflorus*. *Diplacus aurantiacus* does not replace *Mimulus aurantiacus*; it replaces the variety *Mimulus aurantiacus aurantiacus*.
+
+Another way to understand this issue is to realize that photos of all varieties *Mimulus aurantiacus aurantiacus*, *Mimulus aurantiacus grandiflorus*, *Mimulus aurantiacus parviflorus*, and the 3 others were used to train the classification model to recognize *Mimulus aurantiacus*. In the current taxonomy, this label is triggered by each of the species *Diplacus aurantiacus*, *Diplacus grandiflorus*, and *Diplacus parviflorus*. `nature_id` cannot say which of the current species it sees. It can only identify images as genus *Diplacus*.
+
+```
+Taxonomy written to file 'classifiers\aiy_plants_V1_taxonomy.csv'.
+```
+
+A taxonomy for the scientific names in the label file has been successfully computed, and this taxonomy was written to disk. Future calls will load this taxonomy instead of loading the labels and re-computing the taxonomy.
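+
+The cached file is an ordinary CSV with the five columns the script writes: `parent_taxon_id`, `taxon_id`, `rank_level`, `leaf_class_id`, and `name`. The excerpt below is only an illustration (the column layout is real, the row values are made up); internal nodes leave `leaf_class_id` empty, while leaf taxa carry the index, or several `;`-separated indices, of their label in the model's output:
+
+```
+parent_taxon_id,taxon_id,rank_level,leaf_class_id,name
+,47126,70,,Plantae
+47126,211194,60,,Tracheophyta
+55791,55792,10,1042,Persicaria amphibia
+```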
+
+```
+Reading common names from 'inaturalist-taxonomy\inaturalist-taxonomy.dwca.zip' member 'VernacularNames-english.csv'...
+Read 203,093 common names in 1.5 secs, loaded 3,071 in language "en_US" for 4,091 taxa.
+```
+
+Common names have been read. They are always selected for the system's locale, which is not necessarily English as shown here.
diff --git a/third_party/nature-id/classifiers/README.md b/third_party/nature-id/classifiers/README.md
new file mode 100644
index 00000000..973bd679
--- /dev/null
+++ b/third_party/nature-id/classifiers/README.md
@@ -0,0 +1,13 @@
+# Download Instructions
+
+The [Tensorflow Lite](https://www.tensorflow.org/lite/guide) classifiers that go in this directory can be downloaded from these websites:
+
+ * [classifier for plants](https://tfhub.dev/google/aiy/vision/classifier/plants_V1/1)
+ * [classifier for birds](https://tfhub.dev/google/aiy/vision/classifier/birds_V1/1)
+ * [classifier for insects](https://tfhub.dev/google/aiy/vision/classifier/insects_V1/1)
+
+Each classifier consists of a `.tflite` model and a `.csv` labelmap file. Both are required.
+
+On each of the above websites scroll down and under `Output` click on `labelmap` to download the labels. Then scroll back up and under `Model formats` switch to `TFLite (aiyvision/classifier/...)`. There click on `Download` to get the `.tflite` file.
+
+If you happen to have the classifier included in [Seek](https://www.inaturalist.org/pages/seek_app), it can go in this directory as well. It consists of two files `optimized_model_v1.tflite` and `taxonomy_v1.csv`.
diff --git a/third_party/nature-id/inat_api.py b/third_party/nature-id/inat_api.py
new file mode 100644
index 00000000..570df037
--- /dev/null
+++ b/third_party/nature-id/inat_api.py
@@ -0,0 +1,110 @@
+import json, os, pickle, requests, shelve, sys, time
+
+##############################################################################
+#                                                                            #
+# API calls to obtain taxonomic information. Used in case of name changes.   #
+#                                                                            #
+# See documentation at https://api.inaturalist.org/v1/docs/#/Taxa            #
+#                                                                            #
+# We throttle the number of calls to less than 60 per minute. We also        #
+# implement a cache to avoid repeated lookups of the same taxa across runs.  #
+# Cache entries include time stamps and they expire after two weeks.         #
+#                                                                            #
+##############################################################################
+
+API_HOST = "https://api.inaturalist.org/v1"
+CACHE_EXPIRATION = 14 * 24 * 3600 # cache expires after 2 weeks
+TOO_MANY_API_CALLS_DELAY = 60     # wait this long after error 429
+
+# The cache stores the json responses.
+
+if sys.platform == 'win32':
+    DATA_DIR = os.path.join(os.path.expanduser('~'),
+                            'AppData', 'Local', 'inat_api')
+else:
+    DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'inat_api')
+
+if not os.path.exists(DATA_DIR):
+    os.makedirs(DATA_DIR)
+
+cache = shelve.open(os.path.join(DATA_DIR, 'api.cache'))
+
+# API call throttling.
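+# The Throttle class below keeps a sliding window of recent call timestamps;
+# wait() sleeps until the oldest timestamp leaves the window, capping traffic
+# at API_MAX_CALLS calls per API_INTERVAL seconds.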
+ +class Throttle: + + API_MAX_CALLS = 60 # max 60 calls per minute + API_INTERVAL = 60 # 1 minute + + def __init__(self): + self.callTimes = [] # times of api calls + + # wait if necessary to avoid more than API_MAX_CALLS in API_INTERVAL + def wait(self): + while len(self.callTimes) >= self.API_MAX_CALLS: + waitTime = self.callTimes[0] - (time.time() - self.API_INTERVAL) + if waitTime > 0: + print('Throttling API calls, ' + f'sleeping for {waitTime:.1f} seconds.') + time.sleep(waitTime) + continue + self.callTimes = self.callTimes[1:] + self.callTimes.append(time.time()) + +api_call_throttle = Throttle() + +# argument is an id or a list of id's +def get_taxa_by_id(id): + if type(id) is list: + url = API_HOST + '/taxa/' + '%2C'.join([str(i) for i in id]) + else: + url = API_HOST + f'/taxa/{id}' + tim = time.time() + if not url in cache or cache[url][0] < tim - CACHE_EXPIRATION: + delay = TOO_MANY_API_CALLS_DELAY + headers = {'Content-type' : 'application/json' } + while True: + api_call_throttle.wait() + response = requests.get(url, headers=headers) + if response.status_code == requests.codes.too_many: + time.sleep(delay) + delay *= 2 + else: + break + if response.status_code == requests.codes.ok: + cache[url] = (tim, response.json()) + else: + print(response.text) + return None + return cache[url][1] + +# returns taxa by name +def get_taxa(params): + url = API_HOST + '/taxa' + for key, val in params.items(): + if type(val) == bool: + params[key] = 'true' if val else 'false' + key = pickle.dumps((url, params)).hex() + tim = time.time() + if not key in cache or cache[key][0] < tim - CACHE_EXPIRATION: + delay = TOO_MANY_API_CALLS_DELAY + headers = {'Content-type' : 'application/json' } + while True: + api_call_throttle.wait() + response = requests.get(url, headers=headers, params=params) + if response.status_code == requests.codes.too_many: + time.sleep(delay) + delay *= 2 + else: + break + if response.status_code == requests.codes.ok: + cache[key] = (tim, response.json()) + else: + print(response.text) + return None + return cache[key][1] + + +if __name__ == '__main__': + + assert not 'Not a top-level Python module!' diff --git a/third_party/nature-id/inat_taxonomy.py b/third_party/nature-id/inat_taxonomy.py new file mode 100644 index 00000000..b666a413 --- /dev/null +++ b/third_party/nature-id/inat_taxonomy.py @@ -0,0 +1,318 @@ +import csv, sys, os, time, locale, zipfile, io +import inat_api +from dataclasses import dataclass +from typing import List, Dict + +# The directory where this Python script is located. +INSTALL_DIR = os.path.dirname(__file__) +while os.path.islink(INSTALL_DIR): + INSTALL_DIR = os.path.join(INSTALL_DIR, + os.path.dirname(os.readlink(INSTALL_DIR))) + +# This zip file contains the taxonomy and all common names. +# Download https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip and +# leave this zip file in directory 'inaturalist-taxonomy'. Do not extract the +# files from this zip archive. +INAT_TAXONOMY = os.path.join(INSTALL_DIR, 'inaturalist-taxonomy', + 'inaturalist-taxonomy.dwca.zip') + +# A special node represents the root of the tree, the parent of kingdoms. 
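+# Taxon 48460 is iNaturalist's root taxon 'Life' (rank 'stateofmatter',
+# level 100), one level above the kingdoms at rank level 70.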
+ROOT_TAXON_ID = 48460 +ROOT_NAME = 'Life' +ROOT_RANK_LEVEL = 100 + +# maps rank-level to its name +gRankLevel2Name = { + ROOT_RANK_LEVEL : 'stateofmatter', # used for the parent of kingdoms + 70 : 'kingdom', + 67 : 'subkingdom', + 60 : 'phylum', + 57 : 'subphylum', + 53 : 'superclass', + 50 : 'class', + 47 : 'subclass', + 45 : 'infraclass', + 44 : 'subterclass', + 43 : 'superorder', + 40 : 'order', + 37 : 'suborder', + 35 : 'infraorder', + 34.5: 'parvorder', + 34 : 'zoosection', + 33.5: 'zoosubsection', + 33 : 'superfamily', + 32 : 'epifamily', + 30 : 'family', + 27 : 'subfamily', + 26 : 'supertribe', + 25 : 'tribe', + 24 : 'subtribe', + 20 : 'genus', + 19 : 'genushybrid', # changed, was same as genus in iNaturalist + 15 : 'subgenus', + 13 : 'section', + 12 : 'subsection', + 11 : 'complex', + 10 : 'species', + 9 : 'hybrid', # changed, was same as species in iNaturalist + 5 : 'subspecies', + 4 : 'variety', # changed, was same as subspecies in iNaturalist + 3 : 'form', # changed, was same as subspecies in iNaturalist + 2 : 'infrahybrid' # changed, was same as subspecies in iNaturalist +} + +# maps rank name to numeric rank-level +gName2RankLevel = {} +for key, value in gRankLevel2Name.items(): + gName2RankLevel[value] = key + +KINGDOM_RANK_LEVEL = gName2RankLevel['kingdom'] + +def get_rank_level(rank): + assert rank in gName2RankLevel + return gName2RankLevel[rank] + +def get_rank_name(rank_level, default_name = 'clade'): + return gRankLevel2Name[rank_level] if rank_level in gRankLevel2Name \ + else default_name + +@dataclass(frozen=True) +class Taxon: + id : int + parent_id : int + name : str + rank_level: float + +# iNaturalist taxa, only loaded when a taxonomic tree needs +# to be computed from a label file. + +gName2Taxa: Dict[str,List[Taxon]] = {} +"maps taxon name to list of taxa" + +gId2Taxon: Dict[int,Taxon] = {} +"maps taxon id to taxon" + +def load_inat_taxonomy(): + "Load all iNaturalist taxa from file 'taxa.csv'." 
+ global gName2Taxa + global gId2Taxon + + if gName2Taxa and gId2Taxon: + return True # already loaded + + print('Loading iNaturalist taxonomy...') + start_time = time.time() + gName2Taxa = {} + gId2Taxon = {} + + try: + with zipfile.ZipFile(INAT_TAXONOMY, 'r') as zf: + with zf.open('taxa.csv', 'r') as zfile: + with io.TextIOWrapper(zfile, encoding = 'latin-1') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + id = int(row['id']) + parent_id = row['parentNameUsageID'].split('/')[-1] + parent_id = int(parent_id) if parent_id else \ + ROOT_TAXON_ID if id != ROOT_TAXON_ID else None + name = row['scientificName'] + rank = row['taxonRank'] + if not rank in gName2RankLevel: + response = inat_api.get_taxa_by_id(id) + if response and 'results' in response: + rank_level = response['results'][0]\ + ['rank_level'] + gName2RankLevel[rank] = rank_level + if not rank_level in gRankLevel2Name: + gRankLevel2Name[rank_level] = rank + print(f"Please add rank '{rank}' to gName2Rank" + f"Level, numeric value {rank_level}.") + else: + gName2RankLevel[rank] = -1 + rank_level = gName2RankLevel[rank] + inat_taxon = Taxon(id, parent_id, name, rank_level) + if name in gName2Taxa: + gName2Taxa[name].append(inat_taxon) + else: + gName2Taxa[name] = [inat_taxon] + assert not id in gId2Taxon + gId2Taxon[id] = inat_taxon + if len(gId2Taxon) % 10000 == 0: + print(f' {len(gId2Taxon):,} ' if len(gId2Taxon) % + 100000 == 0 else '.', end='') + sys.stdout.flush() + + assert ROOT_TAXON_ID in gId2Taxon + print(f' {len(gId2Taxon):,}.') + print(f'Loaded iNaturalist taxonomy of {len(gId2Taxon):,} taxa ' + f'in {time.time()-start_time:.1f} secs.') + return True + + except Exception as e: + print("Cannot load taxonomy 'taxa.csv' from archive " + f"'{INAT_TAXONOMY}': {str(e)}.") + gName2Taxa = {} + gId2Taxon = {} + return False + +def beautify_common_name(name): + "Capitalize (most) words in common name; helper function for common names." + if name.endswith(' [paraphyletic]'): + name = name[:-15] # fix dicots + name = '-'.join(word[0].upper() + word[1:] + for word in name.split('-')) + return ' '.join(word if word == 'and' or word.endswith('.') + else word[0].upper() + word[1:] + for word in name.split()) + +def annotate_common_names(id2taxon, all_common_names = False): + """ + Load the common names in our language, annotate taxonomic tree with them. + The parameter `id2taxon' includes the taxa we are interested in. 
+    """
+    start_time = time.time()
+    language, _ = locale.getdefaultlocale()
+
+    if language in ['C', 'C.UTF-8', 'POSIX']:
+        language = 'en'
+
+    if not os.path.isfile(INAT_TAXONOMY):
+        print("Cannot load common names, archive "
+              f"'{INAT_TAXONOMY}' does not exist.")
+        return
+
+    try:
+        with zipfile.ZipFile(INAT_TAXONOMY, 'r') as zf:
+            perfect_match = []
+            other_matches = []
+
+            # check all common names files for names in our language
+            for fname in zf.namelist():
+                if fname.startswith("VernacularNames-") and \
+                   fname.endswith(".csv"):
+                    with zf.open(fname, 'r') as zfile:
+                        with io.TextIOWrapper(zfile, encoding='utf-8') as csvf:
+                            reader = csv.DictReader(csvf)
+                            for row in reader:
+                                lang = row['language']
+                                if lang == language:
+                                    perfect_match.append(fname) # en vs en
+                                elif len(lang) < len(language) and \
+                                     lang == language[:len(lang)]:
+                                    other_matches.append(fname) # en vs en_US
+                                break
+
+            if not perfect_match and not other_matches:
+                print(f"Cannot find common names for language '{language}'.")
+                return
+
+            # annotate the taxa with common names
+            total_names = loaded_names = 0
+            for fname in perfect_match + other_matches:
+                print(f"Reading common names from '{INAT_TAXONOMY}' "
+                      f"member '{fname}'...")
+                with zf.open(fname, 'r') as zfile:
+                    with io.TextIOWrapper(zfile, encoding='utf-8') as csvf:
+                        reader = csv.DictReader(csvf)
+                        for row in reader:
+                            total_names += 1
+                            id = int(row['id'])
+                            if id in id2taxon and (all_common_names or \
+                               id2taxon[id].common_name is None):
+                                loaded_names += 1
+                                cname = beautify_common_name(row['vernacular'
+                                                                 'Name'])
+                                if id2taxon[id].common_name is None:
+                                    id2taxon[id].common_name = cname
+                                else:
+                                    id2taxon[id].common_name += '; ' + cname
+
+        print(f'Read {total_names:,} common names in '
+              f'{time.time()-start_time:.1f} secs, loaded {loaded_names:,} '
+              f'in language "{language}" for {len(id2taxon)-1:,} taxa.')
+
+    except Exception as e:
+        print(f"Cannot load common names from archive '{INAT_TAXONOMY}':"
+              f" {str(e)}.")
+
+def get_ancestors(id, ancestors):
+    """
+    Ancestors are a list of instances of Taxon; they are ordered from the
+    kingdom down.
+    """
+    taxon = gId2Taxon[id]
+    if taxon.rank_level < KINGDOM_RANK_LEVEL:
+        get_ancestors(taxon.parent_id, ancestors)
+    ancestors.append(taxon)
+
+def lookup_id(name, desired_ranks = ['species', 'subspecies']):
+    """
+    Lookup by name, returns a pair, a Taxon and its ancestors, a list of
+    Taxon. Desired_ranks are returned in case of ambiguities (duplicate names).
+ """ + if not gName2Taxa: + return None # taxonomy not loaded + if name in gName2Taxa: + taxa = gName2Taxa[name] + if len(taxa) > 1: + species = None + subspecies = None + print(f"Warning: multiple taxa named '{name}':", end='') + prefix = ' ' + taxon = None + for t in taxa: + rank = get_rank_name(t.rank_level) + print(f"{prefix}{rank} {t.id}", end='') + if rank in desired_ranks: + taxon = t + prefix = ', ' + if not taxon: + taxon = taxa[0] + rank = get_rank_name(taxon.rank_level) + print(f"; choosing {rank}.") + else: + taxon = taxa[0] + ancestors = [] + if taxon.rank_level < KINGDOM_RANK_LEVEL: + get_ancestors(taxon.parent_id, ancestors) + return (taxon, ancestors) + else: + # likely taxon change, query iNat API + response = inat_api.get_taxa({ 'q' : name, + 'all_names' : 'true', + 'per_page' : 200 }) + if not response: + print(f"API lookup for name '{name}' failed.") + return + taxa = response['results'] + if len(taxa) > 1: + # more than one taxon, find the one that used to have this name + exact_matches = [taxon for taxon in taxa for nam in taxon['names'] + if nam['locale'] == 'sci' and nam['name'] == name] + if exact_matches: + taxa = exact_matches + ids = [taxon['id'] for taxon in taxa] + taxa = set([gId2Taxon[id] for id in ids if id in gId2Taxon]) + if not taxa: + return + while len(taxa) > 1: + # multiple taxa, find their common ancestor + min_rank_level = min([taxon.rank_level for taxon in taxa]) + new_taxa = set() + for taxon in taxa: + new_taxon = gId2Taxon[taxon.parent_id] \ + if taxon.rank_level == min_rank_level \ + else taxon + if not new_taxon in new_taxa: + new_taxa.add(new_taxon) + taxa = new_taxa + taxon = taxa.pop() + ancestors = [] + if taxon.rank_level < KINGDOM_RANK_LEVEL: + get_ancestors(taxon.parent_id, ancestors) + return (taxon, ancestors) + + +if __name__ == '__main__': + + assert not 'Not a top-level Python module!' diff --git a/third_party/nature-id/inaturalist-taxonomy/README b/third_party/nature-id/inaturalist-taxonomy/README new file mode 100644 index 00000000..2a9eb72e --- /dev/null +++ b/third_party/nature-id/inaturalist-taxonomy/README @@ -0,0 +1,3 @@ +The .zip archive with the taxonomy and common names belongs in this directory. + +Download https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip to this directory. Do not unpack this archive. 
diff --git a/third_party/nature-id/inaturalist-taxonomy/install.sh b/third_party/nature-id/inaturalist-taxonomy/install.sh new file mode 100755 index 00000000..dfda0a16 --- /dev/null +++ b/third_party/nature-id/inaturalist-taxonomy/install.sh @@ -0,0 +1,4 @@ +#!/bin/sh +rm -f inaturalist-taxonomy.dwca.zip +curl https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip \ + -o inaturalist-taxonomy.dwca.zip diff --git a/third_party/nature-id/nature_id.py b/third_party/nature-id/nature_id.py new file mode 100755 index 00000000..79d6881e --- /dev/null +++ b/third_party/nature-id/nature_id.py @@ -0,0 +1,537 @@ +#!/usr/bin/env python3 + +import numpy as np +from PIL import Image, ImageOps +import csv, sys, os, time +import inat_taxonomy + +try: + # try importing TensorFlow Lite first + import tflite_runtime.interpreter as tflite +except Exception: + try: + # TensorFlow Lite not found, try to import full TensorFlow + import tensorflow.lite as tflite + except Exception: + print('Error: TensorFlow Lite could not be loaded.', file=sys.stderr) + print(' Follow instructions at https://www.tensorflow.org/lite/' + 'guide/python to install it.', file=sys.stderr) + sys.exit(1) + +# The directory where this Python script is located. +INSTALL_DIR = inat_taxonomy.INSTALL_DIR + +# This directory contains models, label files, and taxonomy files. +CLASSIFIER_DIRECTORY = os.path.join(INSTALL_DIR, 'classifiers') + +# These flags can be modified with command-line options. +scientific_names_only = False # only scientific names or also common names +label_scores_only = False # scores for labels or hierarchical +all_common_names = False # show only one or all common names +result_sz = 5 # result size (for label_scores_only) + +# This class is used by class Taxonomy. +class Taxon: + + def __init__(self, taxon_id): + self.taxon_id = taxon_id # for internal lookups and iNat API calls + self.rank_level = None # taxonomic rank, e.g. species, genus, family + self.name = None # scientific name + self.common_name = None # common name or None + self.children = [] # list of child taxa + self.leaf_class_ids = [] # list of indices into scores; there + # can be more than one when we use old models + # whose taxa have since been lumped together + + def add_child(self, child_taxon): + self.children.append(child_taxon) + + # get taxonomic rank as a string + def get_rank(self): + if self.taxon_id < 0: # pseudo-kingdom? + assert self.rank_level == inat_taxonomy.KINGDOM_RANK_LEVEL + return '' + return inat_taxonomy.get_rank_name(self.rank_level) + + # get the name to display; customize here to show common names differently + def get_name(self): + if self.common_name: + return f'{self.common_name} ({self.name})' + else: + return self.name + + +# This taxonomy is represented in terms of instances of class Taxon. +class Taxonomy: + + def __init__(self): + # The taxonomy file may contain multiple trees, one for each kingdom. + # In order to have a single tree for prediction, we add a node for + # Life as the parent of all kingdoms. This will be the root of our tree. 
+ self.root = Taxon(inat_taxonomy.ROOT_TAXON_ID) + self.root.name = inat_taxonomy.ROOT_NAME + self.root.rank_level = inat_taxonomy.ROOT_RANK_LEVEL + self.id2taxon = { self.root.taxon_id : self.root } + self.idx2label = {} + + def reset(self): + self.root.children = [] + self.id2taxon = { self.root.taxon_id : self.root } + self.idx2label = {} + + def taxonomy_available(self): + return len(self.root.children) > 0 + + def read_taxonomy(self, filename): + start_time = time.time() + self.reset() + with open(filename, newline='', encoding='latin-1') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + if 'id' in row: # this is a label file + self.idx2label[int(row['id'])] = row['name'] + continue + + taxon_id = int(row['taxon_id']) + if taxon_id in self.id2taxon: + taxon = self.id2taxon[taxon_id] # inserted earlier as parent + else: + self.id2taxon[taxon_id] = taxon = Taxon(taxon_id) + + taxon.name = row['name'] + if row['rank_level'].isdigit(): + taxon.rank_level = int(row['rank_level']) + else: + taxon.rank_level = float(row['rank_level']) + + if len(row['leaf_class_id']): + for leaf_class_id in row['leaf_class_id'].split(';'): + leaf_class_id = int(leaf_class_id) + taxon.leaf_class_ids.append(leaf_class_id) + self.idx2label[leaf_class_id] = taxon.name + + if len(row['parent_taxon_id']): + parent_taxon_id = int(row['parent_taxon_id']) + else: + parent_taxon_id = self.root.taxon_id + if not parent_taxon_id in self.id2taxon: + self.id2taxon[parent_taxon_id] = Taxon(parent_taxon_id) + + self.id2taxon[parent_taxon_id].add_child(taxon) + + if not self.taxonomy_available(): + # We parsed a label file; unless told otherwise, we use these + # labels to build a taxonomic tree. + print(f"Read {len(self.idx2label):,} labels from '{filename}' " + f"in {time.time() - start_time:.1f} secs.") + + if not label_scores_only: + self.compute_taxonomic_tree() + if self.taxonomy_available(): + self.write_taxonomic_tree(filename.replace('labelmap', + 'taxonomy')) + else: + print(f"Read taxonomy from '{filename}' in " + f"{time.time() - start_time:.1f} secs: " + f"{len(self.id2taxon) - 1:,} taxa including " + f"{len(self.idx2label):,} leaf taxa.") + + if not scientific_names_only and self.taxonomy_available(): + inat_taxonomy.annotate_common_names(self.id2taxon, all_common_names) + if label_scores_only: + self.annotate_labels_with_common_names() + del self.id2taxon # not needed anymore + + # augment labels with common names + def annotate_labels_with_common_names(self): + for taxon in self.id2taxon.values(): + for leaf_class_id in taxon.leaf_class_ids: + self.idx2label[leaf_class_id] = taxon.get_name() + + # write one row to taxonomy file + def write_row(self, writer, taxon, parent_taxon_id): + writer.writerow([parent_taxon_id, taxon.taxon_id, taxon.rank_level, + ';'.join([str(id) for id in taxon.leaf_class_ids]), + taxon.name]) + for child in taxon.children: + self.write_row(writer, child, taxon.taxon_id) + + # write taxonomy file + def write_taxonomic_tree(self, filename): + try: + with open(filename, 'w', newline='', encoding='latin-1') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(['parent_taxon_id', 'taxon_id', 'rank_level', + 'leaf_class_id', 'name']) + for child in self.root.children: + self.write_row(writer, child, '') + print(f"Taxonomy written to file '{filename}'.") + except Exception as e: + print(f"Failure writing taxonomy to file '{filename}':", str(e)) + try: + os.remove(filename) + except Exception: + pass + + # Called after loading label file for Google's AIY Vision 
Kit. + # Adds all the labels' direct and indirect ancestors to compute + # the taxonomic tree. + def compute_taxonomic_tree(self): + global label_scores_only + if not inat_taxonomy.load_inat_taxonomy(): + label_scores_only = True + return + + start_time = time.time() + new_id = 0 # id's we add on the fly for pseudo-kingdoms + + for idx, name in self.idx2label.items(): + inat_taxa = inat_taxonomy.lookup_id(name) + if not inat_taxa: + print(f"Info: Taxon for label '{name}' not found, " + "inserting as pseudo-kingdom.") + new_id -= 1 + taxon_id = new_id + self.id2taxon[taxon_id] = taxon = Taxon(taxon_id) + taxon.rank_level = inat_taxonomy.KINGDOM_RANK_LEVEL + taxon.name = name + taxon.leaf_class_ids = [idx] + self.root.add_child(taxon) + continue + + inat_taxon, ancestors = inat_taxa + if name != inat_taxon.name: + print(f"Info: Taxon '{name}' changed to " + f"'{inat_taxon.name}', iNat taxa " + f"id {inat_taxon.id}.") + + # ancestor taxa + prev_ancestor = self.root + for ancestor in ancestors: + if ancestor.id in self.id2taxon: + prev_ancestor = self.id2taxon[ancestor.id] + else: + self.id2taxon[ancestor.id] = ancestor_taxon = Taxon(ancestor.id) + ancestor_taxon.name = ancestor.name + ancestor_taxon.rank_level = ancestor.rank_level + prev_ancestor.add_child(ancestor_taxon) + prev_ancestor = ancestor_taxon + + # this taxon + if inat_taxon.id in self.id2taxon: + taxon = self.id2taxon[inat_taxon.id] + assert taxon.name == inat_taxon.name + assert taxon.rank_level == inat_taxon.rank_level + else: + self.id2taxon[inat_taxon.id] = taxon = Taxon(inat_taxon.id) + taxon.name = inat_taxon.name + taxon.rank_level = inat_taxon.rank_level + prev_ancestor.add_child(taxon) + taxon.leaf_class_ids.append(idx) + + print("Computed taxonomic tree from labels in " + f"{time.time() - start_time:.1f} secs: {len(self.id2taxon)-1:,} " + f"taxa including {len(self.idx2label):,} leaf taxa.") + + # propagate scores to taxon and all below + def assign_scores(self, taxon, scores): + taxon.score = 0.0 + for leaf_class_id in taxon.leaf_class_ids: + taxon.score += scores[leaf_class_id] + for child in taxon.children: + self.assign_scores(child, scores) + taxon.score += child.score + + # Returns list of 5-tuples (score, taxon_id, taxonomic rank, + # scientific name, common name) ordered by taxonomic rank from kingdom + # down to e.g. species. + # Returns pairs (score, scientific name) if label_scores_only + # is set. + def prediction(self, scores): + + if label_scores_only: + # return list of pairs (score, scientific name) + total = np.sum(scores) + indices = np.argpartition(scores, -result_sz)[-result_sz:] + results = [(scores[i] / total, self.idx2label[i]) + for i in indices if scores[i] != 0] + results.sort(reverse=True) + return results + + # annotate all taxa across the hierarchy with scores. + self.assign_scores(self.root, scores) + + # return one hierarchical path guided by scores + path = [] + taxon = self.root + while taxon.children: + # Find child with highest score. + best_child = None + for child in taxon.children: + if not best_child or child.score > best_child.score: + best_child = child + + # Truncate path if all the other children combined are better + if best_child.score < 0.5 * taxon.score: + break + + path.append((best_child.score / self.root.score, + best_child.taxon_id, best_child.get_rank(), + best_child.get_name())) + + taxon = best_child + + return path + +# +# Offline image classification. 
+
+#
+# Offline image classification.
+#
+
+class OfflineClassifier:
+
+    def __init__(self, filenames):
+        self.min_pixel_value = 0.0
+        self.max_pixel_value = 255.0
+
+        if os.path.split(filenames[0])[1] in ['optimized_model.tflite',
+                                              'optimized_model_v1.tflite']:
+            self.min_pixel_value = -1.0
+            self.max_pixel_value = 1.0
+
+        # Load TFLite model and allocate tensors.
+        self.mInterpreter = tflite.Interpreter(model_path=filenames[0])
+        self.mInterpreter.allocate_tensors()
+
+        # Get input and output tensors.
+        self.mInput_details = self.mInterpreter.get_input_details()
+        self.mOutput_details = self.mInterpreter.get_output_details()
+
+        # Read labels or taxonomy.
+        self.mTaxonomy = Taxonomy()
+        self.mTaxonomy.read_taxonomy(filenames[1])
+
+    def classify_image(self, image_filename):
+        start_time = time.time()
+        try:
+            img = Image.open(image_filename)
+        except Exception:
+            print(f"Error: cannot load image '{image_filename}'.")
+            return []
+
+        if img.mode != 'RGB':
+            print(f"Error: image '{image_filename}' is of mode '{img.mode}',"
+                  " only mode RGB is supported.")
+            return []
+
+        # rotate image if needed as it may contain an EXIF orientation tag
+        img = ImageOps.exif_transpose(img)
+
+        model_size = tuple(self.mInput_details[0]['shape'][1:3])
+
+        # square target shape expected by the crop code below
+        assert model_size[0] == model_size[1]
+
+        if img.size != model_size:
+            # We need to scale and maybe want to crop the image.
+            width, height = img.size
+            if width != height:
+                # Before scaling, we crop the image to a centered square.
+                left = 0
+                right = width
+                top = 0
+                bottom = height
+                if width < height:
+                    top = (height - width) // 2
+                    bottom = top + width
+                else:
+                    left = (width - height) // 2
+                    right = left + height
+                img = img.crop((left, top, right, bottom))
+
+            # scale image to the model's input size
+            img = img.resize(model_size)
+
+        # img.show()  # uncomment to inspect the preprocessed image
+
+        # pixels are in range 0 ... 255; turn them into a numpy batch of one
+        input_data = np.array([np.array(img, self.mInput_details[0]['dtype'])])
+
+        if self.mInput_details[0]['dtype'] == np.float32:
+            # rescale pixels into the range the model expects
+            input_data *= (self.max_pixel_value - self.min_pixel_value) / 255.0
+            input_data += self.min_pixel_value
+
+        self.mInterpreter.set_tensor(self.mInput_details[0]['index'],
+                                     input_data)
+        self.mInterpreter.invoke()
+
+        output_data = self.mInterpreter.get_tensor(self.mOutput_details[0]
+                                                   ['index'])
+        path = self.mTaxonomy.prediction(output_data[0])
+        print()
+        print(f"Classification of '{image_filename}' took "
+              f"{time.time() - start_time:.1f} secs.")
+        return path
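+
+# Minimal usage sketch (illustrative; the file names are hypothetical and
+# must point to an installed model/label pair, see get_installed_models()
+# below):
+#
+#     classifier = OfflineClassifier(('plants_model.tflite',
+#                                     'plants_labelmap.csv'))
+#     for score, taxon_id, rank, name in classifier.classify_image('img.jpg'):
+#         print(f'{100 * score:5.1f}% {rank:11s} {name}')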
+
+# Returns a dictionary that maps each available classifier name to a pair
+# (tflite model file, csv label/taxonomy file).
+def get_installed_models():
+
+    if not os.path.isdir(CLASSIFIER_DIRECTORY):
+        print("Cannot load classifiers, directory "
+              f"'{CLASSIFIER_DIRECTORY}' does not exist.")
+        sys.exit(1)
+
+    choices = ['birds', 'insects', 'plants']
+    models = {}
+
+    for filename in os.listdir(CLASSIFIER_DIRECTORY):
+        model = None
+        if filename.endswith(".csv"):
+            if filename == 'taxonomy_v2_13.csv':
+                model = 'v2_13'
+            elif filename == 'taxonomy_v1.csv':
+                model = 'Seek'
+            else:
+                for m in choices:
+                    if m in filename:
+                        model = m
+                        break
+            if model:
+                filename = os.path.join(CLASSIFIER_DIRECTORY, filename)
+                if model in models:
+                    # prefer a taxonomy file over a labelmap file
+                    if not models[model][1] or \
+                       models[model][1].endswith('labelmap.csv'):
+                        models[model] = (models[model][0], filename)
+                else:
+                    models[model] = (None, filename)
+        elif filename.endswith(".tflite"):
+            if filename == 'optimized_model_v2_13.tflite':
+                model = 'v2_13'
+            elif filename == 'optimized_model_v1.tflite':
+                model = 'Seek'
+            else:
+                for m in choices:
+                    if m in filename:
+                        model = m
+                        break
+            if model:
+                filename = os.path.join(CLASSIFIER_DIRECTORY, filename)
+                if model in models:
+                    models[model] = (filename, models[model][1])
+                else:
+                    models[model] = (filename, None)
+
+    delete_elements = []  # postponed deletion, cannot delete during iteration
+    for name, files in models.items():
+        if not files[0] or not files[1]:
+            tf_missing = ".csv file but no .tflite file"
+            csv_missing = ".tflite file but no .csv file"
+            print("Installation issue: Excluding incomplete classifier for"
+                  f" '{name}': {tf_missing if files[1] else csv_missing}.")
+            delete_elements.append(name)
+
+    for element in delete_elements:
+        del models[element]
+
+    if not models:
+        print(f"No classifiers found in directory '{CLASSIFIER_DIRECTORY}'; "
+              "follow instructions in "
+              f"'{os.path.join(CLASSIFIER_DIRECTORY, 'README.md')}'"
+              " to install them.", file=sys.stderr)
+        sys.exit(1)
+    return models
+
+def identify_species(classifier, filename):
+    result = classifier.classify_image(filename)
+    if result:
+        # Print the list of tuples (score, taxon id, taxonomic rank, name)
+        # ordered by taxonomic rank from kingdom down to species.
+        for entry in result:
+            if len(entry) == 2:  # labels only
+                print(f'{100 * entry[0]:5.1f}% {entry[1]}')
+                continue
+            print(f'{100 * entry[0]:5.1f}% {entry[2]:11s} {entry[3]}')
+
+# command-line parsing
+
+models = get_installed_models()
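+
+# The mapping returned above pairs each model name with its two files,
+# e.g. (hypothetical paths):
+#     {'plants': ('classifiers/plants_model.tflite',
+#                 'classifiers/plants_labelmap.csv')}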
+
+def model_parameter_check(arg):
+    if arg not in models:
+        available = ', '.join(f"'{m}'" for m in models)
+        raise argparse.ArgumentTypeError(
+            f"Model '{arg}' not available. Available "
+            f"model{'' if len(models) == 1 else 's'}: {available}.")
+    return arg
+
+def result_size_check(arg):
+    if arg.isdigit() and 0 < int(arg) <= 100:
+        return int(arg)
+    raise argparse.ArgumentTypeError(f"'{arg}' is not a number "
+                                     "between 1 and 100.")
+
+def file_directory_check(arg):
+    if os.path.isdir(arg) or os.path.isfile(arg):
+        return arg
+    raise argparse.ArgumentTypeError(f"'{arg}' is not a file or directory.")
+
+#
+# Identify species for picture files and directories given as command-line
+# arguments.
+#
+
+if __name__ == '__main__':
+    import argparse
+
+    preferred1 = 'v2_13'  # default if this model is available
+    preferred2 = 'Seek'   # second preference
+
+    parser = argparse.ArgumentParser()
+    if len(models) == 1 or preferred1 in models or preferred2 in models:
+        default_model = (preferred1 if preferred1 in models else
+                         preferred2 if preferred2 in models else
+                         next(iter(models)))
+        parser.add_argument("-m", "--model", type=model_parameter_check,
+                            default=default_model,
+                            help="Model to load to identify organisms.")
+    else:  # no default for classification model
+        parser.add_argument("-m", "--model", type=model_parameter_check,
+                            required=True,
+                            help="Model to load to identify organisms.")
+    parser.add_argument('-a', '--all_common_names', action="store_true",
+                        help='Show all common names and not just one.')
+    parser.add_argument('-l', '--label_scores_only', action="store_true",
+                        help='Compute and display only label scores, '
+                             'do not propagate scores up the hierarchy.')
+    parser.add_argument('-s', '--scientific_names_only', action="store_true",
+                        help='Only use scientific names, do not load common '
+                             'names.')
+    parser.add_argument('-r', '--result_size', type=result_size_check,
+                        default=result_sz, help='Number of labels and their '
+                        'scores to report in results.')
+    parser.add_argument('files_dirs', metavar='file/directory',
+                        type=file_directory_check, nargs='+',
+                        help='Image files or directories with images.')
+    args = parser.parse_args()
+
+    scientific_names_only = args.scientific_names_only
+    label_scores_only = args.label_scores_only
+    all_common_names = args.all_common_names
+    result_sz = args.result_size
+
+    # make classifier instance
+    classifier = OfflineClassifier(models[args.model])
+
+    # process photos
+    for arg in args.files_dirs:
+        if os.path.isfile(arg):
+            identify_species(classifier, arg)
+        elif os.path.isdir(arg):
+            for file in os.listdir(arg):
+                ext = os.path.splitext(file)[1].lower()
+                if ext in ['.jpg', '.jpeg', '.png']:
+                    identify_species(classifier, os.path.join(arg, file))
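+
+# Example invocations (illustrative; the script and image names are
+# hypothetical):
+#     python3 identify.py flower.jpg               # default model, if any
+#     python3 identify.py -m plants plant_images/  # classify a directory
+#     python3 identify.py -l -r 10 bird.jpg        # top-10 raw label scores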
diff --git a/third_party/nature-id/plant_images/Mentzelia_lindleyi.jpg b/third_party/nature-id/plant_images/Mentzelia_lindleyi.jpg
new file mode 100644
index 00000000..40616345
Binary files /dev/null and b/third_party/nature-id/plant_images/Mentzelia_lindleyi.jpg differ
diff --git a/third_party/nature-id/plant_images/Persicaria_amphibia.jpg b/third_party/nature-id/plant_images/Persicaria_amphibia.jpg
new file mode 100644
index 00000000..caafe36e
Binary files /dev/null and b/third_party/nature-id/plant_images/Persicaria_amphibia.jpg differ
diff --git a/third_party/nature-id/plant_images/Phyla_nodiflora.jpg b/third_party/nature-id/plant_images/Phyla_nodiflora.jpg
new file mode 100644
index 00000000..3181d3c2
Binary files /dev/null and b/third_party/nature-id/plant_images/Phyla_nodiflora.jpg differ
diff --git a/third_party/nature-id/plant_images/Primula_hendersonii.jpg b/third_party/nature-id/plant_images/Primula_hendersonii.jpg
new file mode 100644
index 00000000..42366f98
Binary files /dev/null and b/third_party/nature-id/plant_images/Primula_hendersonii.jpg differ
diff --git a/third_party/nature-id/plant_images/Solidago_velutina_ssp_californica.jpg b/third_party/nature-id/plant_images/Solidago_velutina_ssp_californica.jpg
new file mode 100644
index 00000000..5b400ec1
Binary files /dev/null and b/third_party/nature-id/plant_images/Solidago_velutina_ssp_californica.jpg differ
diff --git a/third_party/nature-id/plant_images/Tragopogon_porrifolius.jpg b/third_party/nature-id/plant_images/Tragopogon_porrifolius.jpg
new file mode 100644
index 00000000..4c680c1e
Binary files /dev/null and b/third_party/nature-id/plant_images/Tragopogon_porrifolius.jpg differ
diff --git a/third_party/nature-id/plant_images/Trichostema_lanceolatum.jpg b/third_party/nature-id/plant_images/Trichostema_lanceolatum.jpg
new file mode 100644
index 00000000..934739f7
Binary files /dev/null and b/third_party/nature-id/plant_images/Trichostema_lanceolatum.jpg differ
diff --git a/third_party/nature-id/requirements.txt b/third_party/nature-id/requirements.txt
new file mode 100644
index 00000000..b123b221
--- /dev/null
+++ b/third_party/nature-id/requirements.txt
@@ -0,0 +1,3 @@
+Pillow
+requests
+tflite-runtime