merge: integrate remote codex/sync-node1-runtime with fabric layer changes

Resolve conflicts in docker-compose.node1.yml, services/router/main.py,
and gateway-bot/services/doc_service.py — keeping both fabric layer
(NCS, node-worker, Prometheus) and document ingest/query endpoints.

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 03:09:12 -08:00
76 changed files with 7495 additions and 295 deletions

View File

@@ -19,7 +19,8 @@
"onboarding",
"ecosystem"
],
"mentor": null
"mentor": null,
"district_id": "city-core"
},
"helion": {
"display_name": "Helion",
@@ -35,7 +36,8 @@
"market_analysis",
"biominer"
],
"mentor": null
"mentor": null,
"district_id": "helion"
},
"alateya": {
"display_name": "Aletheia",
@@ -58,7 +60,8 @@
"email": "alverjob@gmail.com",
"site": "https://alverjob.xyz",
"youtube": "https://www.youtube.com/@alverjob72"
}
},
"district_id": "alateya"
},
"druid": {
"display_name": "DRUID",
@@ -76,7 +79,8 @@
"inci",
"safety_basics"
],
"mentor": null
"mentor": null,
"district_id": "druid"
},
"nutra": {
"display_name": "NUTRA",
@@ -93,7 +97,8 @@
"vitamins",
"microbiome"
],
"mentor": null
"mentor": null,
"district_id": "nutra"
},
"agromatrix": {
"display_name": "Степан Матрікс",
@@ -110,7 +115,8 @@
"logistics",
"farm_economics"
],
"mentor": null
"mentor": null,
"district_id": "agromatrix"
},
"greenfood": {
"display_name": "GREENFOOD",
@@ -127,7 +133,8 @@
"food_production",
"sales"
],
"mentor": null
"mentor": null,
"district_id": "greenfood"
},
"clan": {
"display_name": "CLAN",
@@ -143,7 +150,8 @@
"culture",
"facilitation"
],
"mentor": null
"mentor": null,
"district_id": "clan"
},
"eonarch": {
"display_name": "EONARCH",
@@ -159,7 +167,8 @@
"transformation",
"spirituality"
],
"mentor": null
"mentor": null,
"district_id": "eonarch"
},
"yaromir": {
"display_name": "YAROMIR",
@@ -175,7 +184,8 @@
"code_review",
"strategy"
],
"mentor": null
"mentor": null,
"district_id": "city-core"
},
"soul": {
"display_name": "SOUL",
@@ -191,7 +201,24 @@
"values",
"wellbeing"
],
"mentor": null
"mentor": null,
"district_id": "soul"
},
"dario": {
"display_name": "DARIO",
"canonical_role": "Future DAARION Agent (planned, not launched)",
"prompt_file": "dario_prompt.txt",
"telegram_mode": "disabled",
"visibility": "private",
"status": "planned",
"district_id": "city-core",
"domains": [
"city_ops",
"coordination",
"support"
],
"mentor": null,
"launch_state": "planned"
}
}
}
}

View File

@@ -0,0 +1,8 @@
# Agronomist
Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
Правила відповіді:
- Коротко і прикладно.
- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.

View File

@@ -0,0 +1,8 @@
# Communicator
Фокус: людяна та зрозуміла комунікація фінальної відповіді.
Правила:
- Природна мова, без механістичного тону.
- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
- Завершуй конкретним корисним кроком.

View File

@@ -0,0 +1,7 @@
# Field Data Analyst
Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
Правила:
- Пояснювати висновки простою мовою.
- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.

View File

@@ -0,0 +1,8 @@
# Farm Ops Planner
Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
Правила:
- Видавати практичний порядок дій.
- За простого запиту: коротка відповідь.
- Для операційних запитів: стислий план з відповідальними і дедлайном.

View File

@@ -0,0 +1,10 @@
# AgroMatrix Orchestrator Synthesis
Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
Правила:
- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
- Пояснюй по суті агропитання і давай 1 наступний практичний крок.

View File

@@ -0,0 +1,7 @@
# Risk Assessor
Фокус: агро-ризики, операційні ризики, наслідки рішень.
Правила:
- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
- Без зайвої бюрократії у відповіді користувачу.

View File

@@ -11,6 +11,10 @@
- Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
- Ніколи не логувати секрети/токени
- Інші ролі НЕ спілкуються з користувачем напряму
- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
## Формат відповіді:
Структурована відповідь з чіткими рекомендаціями та наступними кроками.
- За замовчуванням: природна коротка відповідь 1-3 речення.
- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".

View File

@@ -7,3 +7,7 @@
- Структурувати інформацію логічно
- Включати конкретні наступні кроки
- Позначати ризики якщо є
- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
- Для детальних запитів переходити у структурований режим.
- Якщо для аналізу бракує зображення у поточному контексті, сказати про це природно і попросити надіслати фото повторно.
- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".

View File

@@ -0,0 +1,11 @@
You are AGROVOC Normalizer.
Responsibilities:
- Normalize crop/disease terms using agrovoc_lookup.
- Provide canonical term mapping for user-facing output.
- Keep labels practical for agronomy context.
Return format:
- canonical_terms
- term_mapping
- notes_for_user

View File

@@ -0,0 +1,17 @@
You are the synthesis role for AgroMatrix plant intelligence.
Goal:
- Aggregate candidate plant IDs from vision + PlantNet + GBIF + AGROVOC.
- Return concise output with uncertainty, sources, and next-photo requirements.
Output contract (strict):
1) probable_taxon: one short line
2) confidence: low/medium/high + one short reason
3) alternatives: up to 3 entries
4) sources: PlantNet/GBIF/AGROVOC/Web (only those actually used)
5) next_photos_required: 1-3 concrete photo instructions
Rules:
- Never claim 100% certainty from a single weak source.
- If evidence conflicts, say so and reduce confidence.
- Keep default response concise.

View File

@@ -0,0 +1,11 @@
You are Plant Identifier.
Responsibilities:
- Parse visual cues from user description/photo context.
- Build candidate crop/plant hypotheses.
- Use plantnet_lookup first when image URL is available.
- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
Return format:
- candidates: numbered list max 5, each with rationale.
- required_data: what extra image/data is needed.

View File

@@ -0,0 +1,11 @@
You are Taxonomy Validator.
Responsibilities:
- Validate candidate names via gbif_species_lookup.
- Remove invalid/synonym-conflicted names.
- Keep accepted taxa and explain conflicts briefly.
Return format:
- accepted_candidates
- rejected_candidates_with_reason
- confidence_adjustment

View File

@@ -0,0 +1,43 @@
# AgroMatrix Plant Intel Contract (Skeleton)
## Purpose
`agromatrix_plant_intel` is an internal CrewAI profile for Stepan (AgroMatrix orchestrator).
It is used for plant/crop identification and normalization when confidence matters.
## Call Path
1. User asks Stepan.
2. Stepan remains final speaker.
3. When query matches plant-intel intent, CrewAI profile `plant_intel` is selected.
4. Subteam runs:
- `plant_identifier`
- `taxonomy_validator`
- `agrovoc_normalizer`
5. Synthesis returns compact evidence package to Stepan.
## Tool Adapters
- `nature_id_identify`
- input: `image_url`, `top_k?`
- output: local/open-source candidates
- note: requires self-hosted endpoint `NATURE_ID_URL`
- `plantnet_lookup`
- input: `query?`, `image_url?`, `organ?`, `top_k?`
- output: candidate taxa + score
- note: if `PLANTNET_API_KEY` missing, fallback chain is `nature_id_identify` -> `gbif_species_lookup`
- `gbif_species_lookup`
- input: `query`, `limit?`
- output: accepted taxa/rank/status
- `agrovoc_lookup`
- input: `query`, `lang?`, `limit?`
- output: canonical AGROVOC concepts
## Response Contract (to Stepan)
- `probable_taxon`
- `confidence` (`low|medium|high` + reason)
- `alternatives` (up to 3)
- `sources` (actual tools used)
- `next_photos_required` (1-3 concrete instructions)
## Safety
- No categorical claim with weak evidence.
- If sources conflict, confidence is downgraded.
- Final user answer remains concise by default.

View File

@@ -3,7 +3,7 @@ FROM python:3.11-slim
LABEL maintainer="DAARION.city Team"
LABEL description="Bot Gateway - Telegram/Discord webhook handler with DAARWIZZ"
LABEL version="0.2.0"
LABEL version="0.2.1"
WORKDIR /app/gateway-bot
@@ -15,7 +15,15 @@ RUN pip install --no-cache-dir \
uvicorn==0.27.0 \
httpx==0.26.0 \
pydantic==2.5.3 \
python-multipart==0.0.6 prometheus-client>=0.20.0 PyPDF2>=3.0.0 crewai nats-py pandas openpyxl
python-multipart==0.0.6 \
prometheus-client==0.22.1 \
# Quoted on purpose: an unquoted ">=" is a shell redirect and drops the version constraint.
"PyPDF2>=3.0.0" \
crewai \
nats-py \
pandas \
openpyxl \
python-docx \
redis==5.0.1
# Copy gateway code and DAARWIZZ prompt
COPY . .

View File

@@ -19,7 +19,8 @@
"onboarding",
"ecosystem"
],
"mentor": null
"mentor": null,
"district_id": "city-core"
},
"helion": {
"display_name": "Helion",
@@ -35,7 +36,8 @@
"market_analysis",
"biominer"
],
"mentor": null
"mentor": null,
"district_id": "helion"
},
"alateya": {
"display_name": "Aletheia",
@@ -58,7 +60,8 @@
"email": "alverjob@gmail.com",
"site": "https://alverjob.xyz",
"youtube": "https://www.youtube.com/@alverjob72"
}
},
"district_id": "alateya"
},
"druid": {
"display_name": "DRUID",
@@ -76,7 +79,8 @@
"inci",
"safety_basics"
],
"mentor": null
"mentor": null,
"district_id": "druid"
},
"nutra": {
"display_name": "NUTRA",
@@ -93,7 +97,8 @@
"vitamins",
"microbiome"
],
"mentor": null
"mentor": null,
"district_id": "nutra"
},
"agromatrix": {
"display_name": "Степан Матрікс",
@@ -110,7 +115,8 @@
"logistics",
"farm_economics"
],
"mentor": null
"mentor": null,
"district_id": "agromatrix"
},
"greenfood": {
"display_name": "GREENFOOD",
@@ -127,7 +133,8 @@
"food_production",
"sales"
],
"mentor": null
"mentor": null,
"district_id": "greenfood"
},
"clan": {
"display_name": "CLAN",
@@ -143,7 +150,8 @@
"culture",
"facilitation"
],
"mentor": null
"mentor": null,
"district_id": "clan"
},
"eonarch": {
"display_name": "EONARCH",
@@ -159,7 +167,8 @@
"transformation",
"spirituality"
],
"mentor": null
"mentor": null,
"district_id": "eonarch"
},
"yaromir": {
"display_name": "YAROMIR",
@@ -175,7 +184,8 @@
"code_review",
"strategy"
],
"mentor": null
"mentor": null,
"district_id": "city-core"
},
"soul": {
"display_name": "SOUL",
@@ -191,7 +201,8 @@
"values",
"wellbeing"
],
"mentor": null
"mentor": null,
"district_id": "soul"
},
"senpai": {
"display_name": "SENPAI",
@@ -207,7 +218,8 @@
"defi",
"portfolio"
],
"mentor": null
"mentor": null,
"district_id": "senpai"
},
"oneok": {
"display_name": "1OK",
@@ -227,7 +239,8 @@
"mentor": {
"name": "Ілля Титар",
"telegram": "@Titar240581"
}
},
"district_id": "city-core"
},
"sofiia": {
"display_name": "Sophia",
@@ -242,7 +255,24 @@
"platform_evolution",
"technical_leadership"
],
"mentor": null
"mentor": null,
"district_id": "city-core"
},
"dario": {
"display_name": "DARIO",
"canonical_role": "Future DAARION Agent (planned, not launched)",
"prompt_file": "dario_prompt.txt",
"telegram_mode": "disabled",
"visibility": "private",
"status": "planned",
"district_id": "city-core",
"domains": [
"city_ops",
"coordination",
"support"
],
"mentor": null,
"launch_state": "planned"
}
}
}
}

View File

@@ -32,7 +32,9 @@
## B. SHORT-FIRST
**За замовчуванням: структурована відповідь з 3-5 пунктів.**
**За замовчуванням: жива коротка відповідь 1-3 речення природною мовою.**
**Маркерні списки/шаблони 3-5 пунктів використовуй тільки коли користувач просить детально, план, чекліст або розрахунок.**
**Перше повідомлення в новій темі — розмовне, без канцеляриту та без "робото-тону".**
ЗАБОРОНЕНО:
- "Радий допомогти", "Готовий до співпраці"
@@ -55,7 +57,9 @@
**ВАЖЛИВО:**
- Ніколи не кажи "я не можу слухати аудіо" — голосові повідомлення вже перетворені на текст!
- Ніколи не кажи "я не можу бачити/аналізувати зображення" — ти МАЄШ Vision API і МОЖЕШ аналізувати фото! Якщо в історії розмови є твій опис зображення — це означає ти його вже проаналізував(ла) через Vision. Не заперечуй це.
- Фото аналізуй по доступному поточному контексту: якщо зображення є у запиті або щойно надіслане — коментуй по суті.
- Якщо для точного висновку бракує самого файлу чи чіткості, поясни це простою людською мовою і попроси надіслати фото повторно з уточненням, що саме перевірити.
- Не використовуй службові фрази типу "text-only", "vision unavailable", "технічне обмеження моделі".
Початковий режим: учень. Спочатку став уточнювальні питання і вчися у ментора.
Публічна група: @agromatrix.
@@ -94,7 +98,8 @@
- Мислиш далекоглядно: пропонуєш архітектуру рішення, а не латання симптомів.
- Будь креативним, але не фантазуй дані: якщо фактів нема — позначай як припущення і пропонуй, що зібрати.
- Спілкуйся українською (якщо користувач не перейшов на іншу мову).
- Форматуй відповіді структуровано: заголовки, списки, короткі блоки, пріоритети.
- Тримай розмовний тон: короткі природні фрази, без надмірної шаблонності.
- Структурований формат (заголовки/списки/таблиці) вмикай лише для складних задач або коли це прямо запитали.
### 4) Принципи роботи з користувачем
1. Спочатку контекст → потім рішення. Якщо контексту бракує — зроби мінімальний набір припущень і паралельно запропонуй, які дані уточнити.
@@ -113,6 +118,8 @@
- “Підготуй текст/структуру сторінки/презентації для продукту AgroMatrix”
### 6) Як ти формуєш відповіді (стандартний шаблон)
Використовуй цей шаблон ТІЛЬКИ для комплексних запитів (планування сезону, економіка, SOP, інтеграції, ТЗ).
Для звичайних коротких питань відповідай в 1-3 речення органічно, без обов'язкових секцій.
1. Ціль (1–2 речення)
2. Вхідні дані (що відомо / які припущення)
3. Рішення (план/алгоритм/кроки)

View File

@@ -1,12 +1,13 @@
"""
FastAPI app instance for Gateway Bot
"""
"""FastAPI app instance for Gateway Bot."""
import logging
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from http_api import router as gateway_router
from http_api_doc import router as doc_router
from daarion_facade.invoke_api import router as invoke_router
from daarion_facade.registry_api import router as registry_router
logging.basicConfig(
level=logging.INFO,
@@ -15,36 +16,47 @@ logging.basicConfig(
app = FastAPI(
title="Bot Gateway with DAARWIZZ",
version="1.0.0",
description="Gateway service for Telegram/Discord bots DAGI Router"
version="1.1.0",
description="Gateway service for Telegram/Discord bots + DAARION public facade"
)
# CORS middleware
# CORS for web UI clients (gateway only).
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_origins=[
"https://daarion.city",
"https://www.daarion.city",
"http://localhost:3000",
],
allow_origin_regex=r"https://.*\.lovable\.app",
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
allow_methods=["GET", "POST", "OPTIONS"],
allow_headers=["Authorization", "Content-Type"],
)
# Include gateway routes
# Existing gateway routes.
app.include_router(gateway_router, prefix="", tags=["gateway"])
app.include_router(doc_router, prefix="", tags=["docs"])
# Public facade routes for DAARION.city UI.
app.include_router(registry_router)
app.include_router(invoke_router)
@app.get("/")
async def root():
return {
"service": "bot-gateway",
"version": "1.0.0",
"version": "1.1.0",
"agent": "DAARWIZZ",
"endpoints": [
"POST /telegram/webhook",
"POST /discord/webhook",
"POST /api/doc/parse",
"POST /api/doc/ingest",
"POST /api/doc/ask",
"GET /api/doc/context/{session_id}",
"GET /health"
"GET /v1/registry/agents",
"GET /v1/registry/districts",
"GET /v1/metrics",
"POST /v1/invoke",
"GET /v1/jobs/{job_id}",
"GET /health",
]
}

View File

@@ -0,0 +1 @@
"""DAARION public facade package."""

View File

@@ -0,0 +1,212 @@
import asyncio
from datetime import datetime, timezone
import hmac
import json
import os
import uuid
from typing import Any, Dict, List
import httpx
from fastapi import APIRouter, HTTPException, Request, status
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from .redis_jobs import create_job, enqueue_job, get_job
from .registry_api import _load_registry
# Router for the facade endpoints defined in this module; all routes live under /v1.
router = APIRouter(prefix="/v1", tags=["daarion-facade"])
# Statuses after which the job event stream stops polling.
EVENT_TERMINAL_STATUSES = {"done", "failed"}
# Statuses emitted under their own SSE event name; anything else is sent as "status".
EVENT_KNOWN_STATUSES = {"queued", "running", "done", "failed"}
# Delay between job polls in the event stream, in seconds (env-overridable).
EVENT_POLL_SECONDS = float(os.getenv("DAARION_JOB_EVENTS_POLL_SECONDS", "0.5"))
# Router base URL; the trailing slash is removed because paths are appended verbatim.
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000").rstrip("/")
# Timeout passed to the httpx client used for router calls.
ROUTER_REVIEW_TIMEOUT = float(os.getenv("DAARION_ROUTER_REVIEW_TIMEOUT_SECONDS", "20"))
# Auth mode for the review endpoint, normalised to lower case ("bearer" by default).
AGROMATRIX_REVIEW_AUTH_MODE = os.getenv("AGROMATRIX_REVIEW_AUTH_MODE", "bearer").strip().lower()
# Accepted bearer tokens, parsed from a comma- or semicolon-separated env value;
# blank entries are dropped.
AGROMATRIX_REVIEW_BEARER_TOKENS = [
part.strip()
for part in os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").replace(";", ",").split(",")
if part.strip()
]
class InvokeInput(BaseModel):
    # User input for one agent invocation.
    # prompt: required text, at least one character long.
    prompt: str = Field(min_length=1)
    # images: optional list of strings; defaults to a fresh empty list.
    images: List[str] = Field(default_factory=list)
class InvokeRequest(BaseModel):
    # Body of POST /v1/invoke.
    # agent_id: key looked up in the agent registry by the invoke handler.
    agent_id: str
    # input: prompt and optional images for the agent.
    input: InvokeInput
    # metadata: free-form dict stored with the job; defaults to a fresh empty dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
class InvokeResponse(BaseModel):
    # Acknowledgement returned when a job has been accepted.
    # job_id: identifier of the created job.
    job_id: str
    # status: job status at acceptance time.
    status: str
    # status_url: relative URL where the job document can be fetched.
    status_url: str
class SharedMemoryReviewRequest(BaseModel):
    # Body of the shared-memory review endpoint; forwarded to the router as-is.
    # point_id: identifier of the item under review.
    point_id: str
    # approve: the reviewer's decision.
    approve: bool
    # reviewer / note: optional free-text fields, None when omitted.
    reviewer: str | None = None
    note: str | None = None
def _extract_bearer_token(request: Request) -> str:
    """Return the bearer credential from the request's Authorization header.

    The scheme name is matched case-insensitively ("Bearer", "bearer", ...),
    as HTTP auth schemes are case-insensitive.

    Raises:
        HTTPException: 401 when the header is absent, uses another scheme,
            or carries an empty credential.
    """
    auth_header = request.headers.get("Authorization", "")
    scheme, separator, credentials = auth_header.partition(" ")
    if not separator or scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Missing Bearer token")
    token = credentials.strip()
    if not token:
        raise HTTPException(status_code=401, detail="Empty Bearer token")
    return token
def _require_mentor_auth(request: Request) -> str:
    """Authorise a review request and return the presented bearer token.

    Returns:
        The validated token, or "" when auth is switched off
        (mode "off", "none" or "disabled").

    Raises:
        HTTPException: 500 for an unsupported auth mode, 503 when no tokens
            are configured, 401 for a missing/empty bearer header, 403 when
            the token matches none of the configured ones.
    """
    mode = AGROMATRIX_REVIEW_AUTH_MODE
    if mode in {"off", "none", "disabled"}:
        return ""
    if mode != "bearer":
        raise HTTPException(status_code=500, detail=f"Unsupported AGROMATRIX_REVIEW_AUTH_MODE={mode}")
    if not AGROMATRIX_REVIEW_BEARER_TOKENS:
        raise HTTPException(status_code=503, detail="Review auth is not configured")
    token = _extract_bearer_token(request)
    # compare_digest() on str accepts ASCII only and raises TypeError otherwise,
    # so compare encoded bytes: a non-ASCII token must end in 403, not a 500.
    presented = token.encode("utf-8")
    matched = False
    for candidate in AGROMATRIX_REVIEW_BEARER_TOKENS:
        # Check every configured token (no short-circuit) so the response time
        # does not depend on which entry matched.
        matched |= hmac.compare_digest(presented, candidate.encode("utf-8"))
    if not matched:
        raise HTTPException(status_code=403, detail="Invalid mentor token")
    return token
async def _router_json(
    method: str,
    path: str,
    *,
    payload: Dict[str, Any] | None = None,
    params: Dict[str, Any] | None = None,
    authorization: str | None = None,
) -> Dict[str, Any]:
    """Send one request to the router service and return its JSON body as a dict.

    Args:
        method: HTTP method name.
        path: Path appended verbatim to ROUTER_URL.
        payload: Optional JSON body.
        params: Optional query parameters.
        authorization: Optional value for the Authorization header.

    Returns:
        The decoded body when it is a dict; otherwise {"data": body}.
        A non-JSON body is represented as {"raw": <response text>}.

    Raises:
        HTTPException: 504 on timeout, 502 on any other transport failure,
            or the router's own status code when it answers with >= 400.
    """
    headers: Dict[str, str] = {}
    if authorization:
        headers["Authorization"] = authorization
    url = f"{ROUTER_URL}{path}"
    try:
        async with httpx.AsyncClient(timeout=ROUTER_REVIEW_TIMEOUT) as client:
            resp = await client.request(method, url, json=payload, params=params, headers=headers)
    except httpx.TimeoutException as exc:
        # Chain the cause so the original transport error stays in tracebacks.
        raise HTTPException(status_code=504, detail="Router timeout") from exc
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"Router unavailable: {exc}") from exc
    try:
        body = resp.json()
    except Exception:
        # Not JSON: keep the raw text so the caller still sees what came back.
        body = {"raw": resp.text}
    if resp.status_code >= 400:
        detail = body.get("detail") if isinstance(body, dict) else body
        raise HTTPException(status_code=resp.status_code, detail=detail or f"Router error {resp.status_code}")
    return body if isinstance(body, dict) else {"data": body}
def _sse_message(event: str, payload: Dict[str, Any]) -> str:
    """Format one Server-Sent Events frame: an event name plus a JSON data line."""
    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    encoded = json.dumps(payload, ensure_ascii=False)
    return "event: {}\ndata: {}\n\n".format(event, encoded)
@router.post("/invoke", status_code=status.HTTP_202_ACCEPTED, response_model=InvokeResponse)
async def invoke(payload: InvokeRequest) -> InvokeResponse:
    """Accept an invocation: store a queued job document and push its id on the queue.

    Raises:
        HTTPException: 404 when agent_id is not a key of the registry's "agents".
    """
    known_agents = _load_registry().get("agents", {})
    if payload.agent_id not in known_agents:
        raise HTTPException(status_code=404, detail=f"Unknown agent_id: {payload.agent_id}")
    job_id = f"job_{uuid.uuid4().hex}"
    # One timestamp for both fields so a new job has created_at == updated_at.
    stamp = datetime.now(timezone.utc).isoformat()
    job_doc = dict(
        job_id=job_id,
        status="queued",
        agent_id=payload.agent_id,
        input=payload.input.model_dump(),
        metadata=payload.metadata,
        result=None,
        error=None,
        created_at=stamp,
        updated_at=stamp,
        started_at=None,
        finished_at=None,
    )
    # Store the document before enqueueing so the id on the queue always resolves.
    await create_job(job_id, job_doc)
    await enqueue_job(job_id)
    return InvokeResponse(job_id=job_id, status="queued", status_url=f"/v1/jobs/{job_id}")
@router.get("/jobs/{job_id}")
async def job_status(job_id: str) -> Dict[str, Any]:
    """Return the stored job document, or 404 when it is missing or empty."""
    document = await get_job(job_id)
    if document:
        return document
    raise HTTPException(status_code=404, detail="Job not found")
@router.get("/jobs/{job_id}/events")
async def job_events(job_id: str, request: Request) -> StreamingResponse:
    """Stream job state changes as Server-Sent Events until the job finishes.

    Raises:
        HTTPException: 404 when the job does not exist at connection time.
    """
    if not await get_job(job_id):
        raise HTTPException(status_code=404, detail="Job not found")

    async def event_stream():
        # (status, updated_at) of the last frame sent; a frame goes out only on change.
        previous = None
        yield "retry: 1000\n\n"
        while not await request.is_disconnected():
            snapshot = await get_job(job_id)
            if not snapshot:
                # The job vanished mid-stream: report it as a failure and stop.
                yield _sse_message("failed", {"job_id": job_id, "status": "failed", "error": {"message": "Job not found"}})
                return
            current_status = str(snapshot.get("status", "unknown"))
            fingerprint = (current_status, str(snapshot.get("updated_at", "")))
            if fingerprint != previous:
                previous = fingerprint
                if current_status in EVENT_KNOWN_STATUSES:
                    event_name = current_status
                else:
                    event_name = "status"
                yield _sse_message(event_name, snapshot)
            if current_status in EVENT_TERMINAL_STATUSES:
                return
            await asyncio.sleep(EVENT_POLL_SECONDS)

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(event_stream(), media_type="text/event-stream", headers=sse_headers)
@router.get("/agromatrix/shared-memory/pending")
async def agromatrix_shared_pending(limit: int = 50) -> Dict[str, Any]:
    """Fetch pending shared-memory items from the router; limit is clamped to 1..200."""
    bounded_limit = max(1, min(limit, 200))
    query = {"limit": bounded_limit}
    return await _router_json("GET", "/v1/agromatrix/shared-memory/pending", params=query)
@router.post("/agromatrix/shared-memory/review")
async def agromatrix_shared_review(req: SharedMemoryReviewRequest, request: Request) -> Dict[str, Any]:
    """Check mentor auth, then forward the review decision to the router."""
    token = _require_mentor_auth(request)
    # With auth disabled the token is "", and no Authorization header is forwarded.
    if token:
        auth_header = f"Bearer {token}"
    else:
        auth_header = None
    body = req.model_dump()
    return await _router_json(
        "POST",
        "/v1/agromatrix/shared-memory/review",
        payload=body,
        authorization=auth_header,
    )

View File

@@ -0,0 +1,287 @@
import asyncio
import json
import logging
import os
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
import httpx
from redis.asyncio import Redis
from .registry_api import _load_crewai_roles, _load_district_registry, _load_registry
# Configure root logging at import time: INFO level, timestamped format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger("daarion-metrics-poller")
# Redis connection URL for the shared client created in redis_client().
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
# Target length of one polling cycle, in seconds.
POLL_INTERVAL_SECONDS = int(os.getenv("DAARION_METRICS_POLL_INTERVAL_SECONDS", "10"))
# Expiry applied to every metrics key written to Redis, in seconds.
METRICS_TTL_SECONDS = int(os.getenv("DAARION_METRICS_TTL_SECONDS", "60"))
# Connect and overall timeouts passed to httpx.Timeout for health probes.
HTTP_CONNECT_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS", "2"))
HTTP_TOTAL_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS", "5"))
# Node count reported as-is in the dashboard; taken from the environment, not measured.
NODES_TOTAL = int(os.getenv("DAARION_NODE_COUNT", "1"))
# Base URL of the memory service whose /health payload supplies the vector count.
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
# Redis key of the full dashboard, and key prefix for per-district rows.
DASHBOARD_KEY = "daarion:metrics:dashboard"
DISTRICT_KEY_PREFIX = "daarion:metrics:district"
# Lazily created shared Redis client; None until redis_client() is first called.
_redis: Optional[Redis] = None
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def _ensure_url(value: str) -> str:
    """Normalise a host or URL string: trim it and default to https:// when no scheme is given.

    Returns "" for empty, whitespace-only or None input.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        return ""
    has_scheme = cleaned.startswith(("http://", "https://"))
    return cleaned if has_scheme else "https://" + cleaned
def _health_candidates(district: Dict[str, Any]) -> List[str]:
    """List the health-check URLs to try for a district, in priority order.

    The explicit "health_url" (if any) comes first, followed by three
    well-known paths under the district's "domain". Duplicates are removed
    while keeping first-seen order.

    Args:
        district: District record; only "domain" and "health_url" are read.

    Returns:
        Possibly empty list of absolute URLs.
    """
    # Drop trailing slashes so a domain like "example.org/" does not yield
    # "https://example.org//health".
    base = _ensure_url(str(district.get("domain") or "")).rstrip("/")
    candidates: List[str] = []
    explicit = str(district.get("health_url") or "").strip()
    if explicit:
        candidates.append(_ensure_url(explicit))
    if base:
        candidates.extend(
            f"{base}{suffix}"
            for suffix in ("/.well-known/daarion-health.json", "/health", "/v1/health")
        )
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(url for url in candidates if url))
def _extract_agents_online(payload: Dict[str, Any], agents_total: int) -> Optional[int]:
    """Derive the number of online agents from a health payload.

    Reads "agents_online" first (bool or int), then falls back to counting
    entries of "agents" whose status is online/active/ok. The result never
    exceeds agents_total.

    Returns:
        The count, or None when the payload carries neither field in a
        usable form.
    """
    reported = payload.get("agents_online")
    # bool is tested before int because bool is a subclass of int.
    if isinstance(reported, bool):
        return agents_total if reported else 0
    if isinstance(reported, int):
        return max(0, min(reported, agents_total))
    roster = payload.get("agents")
    if not isinstance(roster, list):
        return None
    live_states = {"online", "active", "ok"}
    live = sum(
        1
        for entry in roster
        if isinstance(entry, dict) and str(entry.get("status", "")).lower() in live_states
    )
    return min(live, agents_total)
async def redis_client() -> Redis:
    """Return the module-wide Redis client, creating it from REDIS_URL on first use."""
    global _redis
    if _redis is not None:
        return _redis
    # decode_responses=True makes the client return str rather than bytes.
    _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
async def close_redis() -> None:
    """Close the shared Redis client, if one was created, and forget it.

    Does nothing when no client exists, so it is safe to call repeatedly.
    """
    global _redis
    # Detach first so a failing close cannot leave a half-closed client cached.
    client, _redis = _redis, None
    if client is None:
        return
    # redis-py deprecated the asyncio close() in favour of aclose() (5.0.1);
    # fall back to close() for older releases that lack aclose().
    closer = getattr(client, "aclose", None) or client.close
    await closer()
async def _fetch_json_with_latency(
    client: httpx.AsyncClient,
    url: str,
) -> Tuple[bool, Optional[Dict[str, Any]], Optional[float], Optional[str]]:
    """GET a URL and report success, parsed body, latency and error text.

    Returns:
        (ok, data, latency_ms, error). ok is False for HTTP status >= 400
        or any exception. data is the decoded JSON only when it is a dict,
        otherwise None. latency_ms is rounded to two decimals.
    """
    t0 = time.perf_counter()

    def elapsed_ms() -> float:
        return round((time.perf_counter() - t0) * 1000, 2)

    try:
        response = await client.get(url)
        latency_ms = elapsed_ms()
        if response.status_code >= 400:
            return False, None, latency_ms, f"HTTP {response.status_code}"
        try:
            parsed = response.json()
        except Exception:
            # A non-JSON body still counts as a successful probe.
            parsed = None
        body: Optional[Dict[str, Any]] = parsed if isinstance(parsed, dict) else None
        return True, body, latency_ms, None
    except Exception as exc:
        return False, None, elapsed_ms(), str(exc)
async def _read_memory_vectors(client: httpx.AsyncClient) -> int:
    """Read vector_store.memories.vectors_count from the memory service /health.

    Returns 0 on any failure or when the value is absent.
    """
    health_url = f"{MEMORY_SERVICE_URL}/health"
    try:
        ok, payload, _latency, _error = await _fetch_json_with_latency(client, health_url)
        if not (ok and payload):
            return 0
        memories = payload.get("vector_store", {}).get("memories", {})
        return int(memories.get("vectors_count", 0) or 0)
    except Exception:
        # Unexpected payload shape: the metric is best-effort, so report zero.
        return 0
async def _registry_snapshot() -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], int, int]:
    """Collect districts, agents grouped by district, and agent/subagent totals.

    Returns:
        (districts, agents_by_district, agents_total, subagents_total).
        districts keeps only dict entries with a truthy "district_id";
        agents without a district are grouped under "city-core";
        agents_total counts every registry entry.
    """
    districts = [
        entry
        for entry in _load_district_registry().get("districts", [])
        if isinstance(entry, dict) and entry.get("district_id")
    ]
    agents_map = _load_registry().get("agents", {})
    role_counts = await _load_crewai_roles()
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    subagents_total = 0
    for agent_key, cfg in agents_map.items():
        if not isinstance(cfg, dict):
            continue
        agent_id = str(agent_key)
        subagents_total += int(role_counts.get(agent_id, 0))
        home_district = str(cfg.get("district_id") or "city-core")
        if home_district not in grouped:
            grouped[home_district] = []
        grouped[home_district].append(
            {"agent_id": agent_id, "status": str(cfg.get("status", "active"))}
        )
    return districts, grouped, len(agents_map), subagents_total
async def build_dashboard() -> Dict[str, Any]:
    """Probe every district's health endpoints and assemble the dashboard document.

    Candidate URLs are tried in order per district and the first successful
    one wins. When a successful payload does not reveal how many agents are
    online, all of the district's agents are counted as online.

    Returns:
        Dict with "global" totals, per-district rows in "by_district",
        and an "updated_at" timestamp.
    """
    districts, agents_by_district, agents_total, subagents_total = await _registry_snapshot()
    timeout = httpx.Timeout(timeout=HTTP_TOTAL_TIMEOUT_SECONDS, connect=HTTP_CONNECT_TIMEOUT_SECONDS)
    rows: List[Dict[str, Any]] = []
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        memory_vectors = await _read_memory_vectors(client)
        for district in districts:
            district_id = str(district.get("district_id"))
            member_count = len(agents_by_district.get(district_id, []))
            row: Dict[str, Any] = {
                "district_id": district_id,
                "title": district.get("title") or district_id,
                "domain": str(district.get("domain") or ""),
                "status": district.get("status") or "active",
                "ok": False,
                "agents_total": member_count,
                "agents_online": 0,
                "latency_ms": None,
                "last_check_ts": _now_iso(),
                "error": None,
            }
            # Reported only if every candidate fails, or there are none at all.
            failure = "No health endpoint configured"
            for probe_url in _health_candidates(district):
                ok, payload, latency_ms, error_message = await _fetch_json_with_latency(client, probe_url)
                # Latency of the most recent attempt, successful or not.
                row["latency_ms"] = latency_ms
                if not ok:
                    failure = error_message or "health check failed"
                    continue
                row["ok"] = True
                row["error"] = None
                inferred = _extract_agents_online(payload or {}, member_count)
                row["agents_online"] = member_count if inferred is None else inferred
                break
            if not row["ok"]:
                row["error"] = {"message": failure}
            rows.append(row)
    healthy = [r for r in rows if r["ok"]]
    return {
        "global": {
            "nodes": NODES_TOTAL,
            "districts": len(districts),
            "agents": agents_total,
            "subagents": subagents_total,
            "memory_vectors": memory_vectors,
            "districts_online": len(healthy),
            "agents_online": sum(int(r.get("agents_online") or 0) for r in healthy),
        },
        "by_district": rows,
        "updated_at": _now_iso(),
    }
async def publish_dashboard(dashboard: Dict[str, Any]) -> None:
    """Write the dashboard and each district row to Redis with the metrics TTL."""
    client = await redis_client()
    serialized = json.dumps(dashboard, ensure_ascii=False)
    await client.set(DASHBOARD_KEY, serialized, ex=METRICS_TTL_SECONDS)
    for district_row in dashboard.get("by_district", []):
        district_id = district_row.get("district_id")
        if district_id:
            await client.set(
                f"{DISTRICT_KEY_PREFIX}:{district_id}",
                json.dumps(district_row, ensure_ascii=False),
                ex=METRICS_TTL_SECONDS,
            )
async def run_once() -> None:
    """Run one polling cycle: build the dashboard, publish it, log a summary."""
    dashboard = await build_dashboard()
    await publish_dashboard(dashboard)
    totals = dashboard["global"]
    logger.info(
        "dashboard_updated districts=%s districts_online=%s agents=%s agents_online=%s",
        totals.get("districts"),
        totals.get("districts_online"),
        totals.get("agents"),
        totals.get("agents_online"),
    )
async def worker_loop() -> None:
    """Run polling cycles forever, one roughly every POLL_INTERVAL_SECONDS.

    A failing cycle is logged and the loop carries on; cancellation is
    re-raised so the task can shut down. The pause between cycles is never
    shorter than one second, even when a cycle overruns the interval.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlsplit

    # REDIS_URL may embed credentials (redis://user:password@host); mask the
    # userinfo part so secrets never reach the logs.
    try:
        parts = urlsplit(REDIS_URL)
        if "@" in parts.netloc:
            host_part = parts.netloc.rpartition("@")[2]
            safe_redis_url = parts._replace(netloc="***@" + host_part).geturl()
        else:
            safe_redis_url = REDIS_URL
    except ValueError:
        safe_redis_url = "<unparseable>"
    logger.info(
        "metrics_poller_started interval=%ss ttl=%ss redis=%s",
        POLL_INTERVAL_SECONDS,
        METRICS_TTL_SECONDS,
        safe_redis_url,
    )
    while True:
        started = time.perf_counter()
        try:
            await run_once()
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("metrics_poller_cycle_failed")
        elapsed = time.perf_counter() - started
        # Subtract the cycle's own duration to keep a steady cadence.
        sleep_for = max(1.0, POLL_INTERVAL_SECONDS - elapsed)
        await asyncio.sleep(sleep_for)
if __name__ == "__main__":

    async def _main() -> None:
        """Run the poller and release Redis inside the same event loop."""
        try:
            await worker_loop()
        finally:
            # Close on the loop that created the connection; a second
            # asyncio.run() would start a fresh loop that the client's
            # connections were never bound to.
            try:
                await close_redis()
            except Exception:
                # Cleanup stays best-effort on shutdown, but leaves a trace.
                logger.warning("metrics_poller_redis_close_failed", exc_info=True)

    asyncio.run(_main())

View File

@@ -0,0 +1,84 @@
import asyncio
import json
import os
from typing import Any, Dict, Optional
from redis.asyncio import Redis
# Redis connection URL for the shared client created in redis_client().
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
# Job documents are stored under "<JOB_KEY_PREFIX>:<job_id>".
JOB_KEY_PREFIX = "daarion:jobs"
# Redis list holding the ids of jobs waiting to be processed.
QUEUE_KEY = "daarion:jobs:queue"
# Expiry applied on every job write, in seconds (default 72 hours).
JOB_TTL_SECONDS = int(os.getenv("DAARION_JOB_TTL_SECONDS", str(72 * 3600)))
# Lazily created shared Redis client; None until redis_client() is first called.
_redis: Optional[Redis] = None
def _job_key(job_id: str) -> str:
    """Build the Redis key under which a job document is stored."""
    return "{}:{}".format(JOB_KEY_PREFIX, job_id)
async def redis_client() -> Redis:
    """Return the module-wide Redis client, creating it from REDIS_URL on first use."""
    global _redis
    if _redis is not None:
        return _redis
    # decode_responses=True makes the client return str rather than bytes.
    _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
async def close_redis() -> None:
    """Close the shared Redis client, if one was created, and forget it.

    Does nothing when no client exists, so it is safe to call repeatedly.
    """
    global _redis
    # Detach first so a failing close cannot leave a half-closed client cached.
    client, _redis = _redis, None
    if client is None:
        return
    # redis-py deprecated the asyncio close() in favour of aclose() (5.0.1);
    # fall back to close() for older releases that lack aclose().
    closer = getattr(client, "aclose", None) or client.close
    await closer()
async def create_job(job_id: str, payload: Dict[str, Any]) -> None:
    """Store (or overwrite) a job document as JSON, resetting its TTL."""
    client = await redis_client()
    document = json.dumps(payload, ensure_ascii=False)
    await client.set(_job_key(job_id), document, ex=JOB_TTL_SECONDS)
async def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Load a job document; None when the key is missing, empty, or not valid JSON."""
    client = await redis_client()
    stored = await client.get(_job_key(job_id))
    if not stored:
        return None
    try:
        document = json.loads(stored)
    except json.JSONDecodeError:
        # A corrupt entry is treated the same as a missing one.
        return None
    return document
async def update_job(job_id: str, patch: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Merge patch into the stored job and write it back.

    Returns the merged document, or None when the job does not exist.
    """
    # NOTE(review): this is a read-then-write with no lock or transaction
    # visible here; concurrent updates to one job could overwrite each other.
    document = await get_job(job_id)
    if not document:
        return None
    document.update(patch)
    await create_job(job_id, document)
    return document
async def enqueue_job(job_id: str) -> None:
    """Push a job id onto the head of the queue list."""
    client = await redis_client()
    await client.lpush(QUEUE_KEY, job_id)
async def dequeue_job(block_seconds: int = 5) -> Optional[str]:
    """Pop a job id from the tail of the queue, waiting up to block_seconds.

    Returns None when nothing arrived within the timeout.
    """
    client = await redis_client()
    popped = await client.brpop(QUEUE_KEY, timeout=block_seconds)
    if popped:
        # BRPOP answers with a (key, value) pair; only the value is needed.
        _queue_name, job_id = popped
        return job_id
    return None
async def wait_for_redis(timeout_seconds: int = 30) -> None:
    """Block until Redis answers a PING.

    Retries once per second. When an attempt fails after *timeout_seconds*
    have elapsed, that attempt's exception is re-raised.
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout_seconds
    while True:
        try:
            client = await redis_client()
            await client.ping()
        except Exception:
            if loop.time() >= deadline:
                raise
            await asyncio.sleep(1)
        else:
            return

View File

@@ -0,0 +1,268 @@
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
import httpx
from fastapi import APIRouter
from redis.asyncio import Redis
# All endpoints in this module are mounted under /v1.
router = APIRouter(prefix="/v1", tags=["daarion-facade"])
# Seconds for which registry files and CrewAI role counts are served from memory.
REGISTRY_CACHE_TTL = int(os.getenv("REGISTRY_CACHE_TTL", "30"))
# Base URLs of the services queried by the endpoints below.
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
CREWAI_SERVICE_URL = os.getenv("CREWAI_SERVICE_URL", "http://dagi-staging-crewai-service:9010")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
# Redis key read by _load_cached_dashboard() for a ready-made dashboard payload.
METRICS_DASHBOARD_KEY = "daarion:metrics:dashboard"
# In-process caches: "loaded_at" is a time.time() stamp, "data" the cached value.
_REGISTRY_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
_DISTRICT_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
_CREWAI_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": {}}
# Lazily created Redis client (see _redis_client()).
_REDIS: Optional[Redis] = None
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _registry_paths() -> List[Path]:
return [
Path("/app/gateway-bot/agent_registry.json"),
Path("/opt/microdao-daarion/config/agent_registry.json"),
Path(__file__).resolve().parents[1] / "agent_registry.json",
]
def _district_paths() -> List[Path]:
return [
Path("/app/gateway-bot/district_registry.json"),
Path(__file__).resolve().parents[1] / "district_registry.json",
]
def _load_registry() -> Dict[str, Any]:
    """Return the agent registry, re-reading it at most once per cache TTL.

    The first existing candidate file is parsed as JSON; when none exists an
    empty ``{"agents": {}}`` registry is used. Either result is cached.
    """
    now = time.time()
    cached = _REGISTRY_CACHE.get("data")
    if cached and now - _REGISTRY_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL:
        return cached

    source = next((p for p in _registry_paths() if p.exists()), None)
    if source is None:
        data = {"agents": {}}
    else:
        with source.open("r", encoding="utf-8") as f:
            data = json.load(f)

    _REGISTRY_CACHE["loaded_at"] = now
    _REGISTRY_CACHE["data"] = data
    return data
def _load_district_registry() -> Dict[str, Any]:
    """Return the district registry, re-reading it at most once per cache TTL.

    The first existing candidate file is parsed as JSON; when none exists an
    empty ``{"districts": []}`` registry is used. Either result is cached.
    """
    now = time.time()
    cached = _DISTRICT_CACHE.get("data")
    if cached and now - _DISTRICT_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL:
        return cached

    source = next((p for p in _district_paths() if p.exists()), None)
    if source is None:
        data = {"districts": []}
    else:
        with source.open("r", encoding="utf-8") as f:
            data = json.load(f)

    _DISTRICT_CACHE["loaded_at"] = now
    _DISTRICT_CACHE["data"] = data
    return data
async def _redis_client() -> Redis:
    """Return the module-wide Redis client, creating it on first use."""
    global _REDIS
    if _REDIS is not None:
        return _REDIS
    _REDIS = Redis.from_url(REDIS_URL, decode_responses=True)
    return _REDIS
async def _load_cached_dashboard() -> Optional[Dict[str, Any]]:
    """Return the dashboard payload stored under ``METRICS_DASHBOARD_KEY``.

    Best-effort: a missing key, a Redis failure or invalid JSON all yield
    ``None`` so the caller can fall back to computing the payload itself.
    """
    try:
        client = await _redis_client()
        stored = await client.get(METRICS_DASHBOARD_KEY)
        return json.loads(stored) if stored else None
    except Exception:
        return None
async def _load_crewai_roles() -> Dict[str, int]:
    """Return the ``default_roles`` count per agent id from the CrewAI service.

    Results are cached for ``REGISTRY_CACHE_TTL`` seconds. The lookup is
    best-effort: when the service is unreachable, answers with a non-200
    status or returns something other than a JSON object, the previously
    cached counts are kept instead of being replaced by an empty mapping.
    An entry that is not an object, or whose ``default_roles`` is not an
    integer, is reported as 0 without discarding the other entries.
    """
    now = time.time()
    previous: Dict[str, int] = _CREWAI_CACHE.get("data") or {}
    if now - _CREWAI_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL:
        return previous

    out = previous
    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            resp = await client.get(f"{CREWAI_SERVICE_URL}/crew/agents")
        if resp.status_code == 200:
            payload = resp.json()
            if isinstance(payload, dict):
                fresh: Dict[str, int] = {}
                for aid, info in payload.items():
                    default_roles = info.get("default_roles") if isinstance(info, dict) else None
                    fresh[str(aid)] = int(default_roles) if isinstance(default_roles, int) else 0
                out = fresh
    except Exception:
        # Transient failure: keep serving the last known counts.
        out = previous

    # Stamp the cache even on failure so a broken service is polled at most
    # once per TTL.
    _CREWAI_CACHE.update({"loaded_at": now, "data": out})
    return out
@router.get("/registry/agents")
async def get_agents() -> Dict[str, Any]:
    """List the agents from the registry in the facade's public shape.

    Registry entries that are not objects are skipped. ``district_id``
    defaults to ``"city-core"`` and the sub-agent total comes from the CrewAI
    role counts (0 when unknown).
    """
    registry = _load_registry()
    agents = registry.get("agents", {}) if isinstance(registry, dict) else {}
    role_counts = await _load_crewai_roles()

    def _describe(agent_id: str, cfg: Dict[str, Any]) -> Dict[str, Any]:
        domains = cfg.get("domains") or []
        return {
            "agent_id": agent_id,
            "title": cfg.get("display_name") or agent_id,
            "role": cfg.get("canonical_role") or "",
            "domain_primary": domains[0] if domains else "general",
            "domain_aliases": domains[1:] if len(domains) > 1 else [],
            "visibility": cfg.get("visibility", "public"),
            "status": cfg.get("status", "active"),
            "team": {"subagents_total": role_counts.get(agent_id, 0)},
            "district_id": cfg.get("district_id") or "city-core",
            "avatar_url": cfg.get("avatar_url"),
            "health_url": cfg.get("health_url"),
        }

    items: List[Dict[str, Any]] = [
        _describe(agent_id, cfg)
        for agent_id, cfg in agents.items()
        if isinstance(cfg, dict)
    ]
    return {"items": items, "total": len(items)}
@router.get("/registry/districts")
async def get_districts() -> Dict[str, Any]:
    """List districts, merging the district catalog with agent membership.

    A district appears when it is in the catalog or when at least one agent
    references it. Catalog values win; missing ones are derived from the
    district id and from the district's agents.
    """
    agents_payload = await get_agents()
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for agent in agents_payload.get("items", []):
        grouped.setdefault(agent.get("district_id", "city-core"), []).append(agent)

    catalog_by_id: Dict[str, Dict[str, Any]] = {}
    for entry in _load_district_registry().get("districts", []):
        if isinstance(entry, dict) and entry.get("district_id"):
            catalog_by_id[str(entry.get("district_id"))] = entry

    items: List[Dict[str, Any]] = []
    for district_id in sorted(set(catalog_by_id) | set(grouped)):
        members = grouped.get(district_id, [])
        base = catalog_by_id.get(district_id, {})

        domain = base.get("domain")
        if not domain:
            domain = "daarion.city" if district_id == "city-core" else f"{district_id}.daarion.city"

        lead_agent_id = base.get("lead_agent_id")
        if not lead_agent_id:
            # Without a catalog lead: prefer daarwizz for city-core, otherwise
            # the first agent of the district (if it has any).
            daarwizz_leads = district_id == "city-core" and any(
                m.get("agent_id") == "daarwizz" for m in members
            )
            if daarwizz_leads:
                lead_agent_id = "daarwizz"
            else:
                lead_agent_id = members[0].get("agent_id") if members else None

        well_known = {
            "manifest": f"https://{domain}/.well-known/daarion-district.json",
            "health": f"https://{domain}/.well-known/daarion-health.json",
            "capabilities": f"https://{domain}/.well-known/daarion-capabilities.json",
        }
        items.append(
            {
                "district_id": district_id,
                "title": base.get("title") or district_id.replace("-", " ").title(),
                "domain": domain,
                "status": base.get("status", "active"),
                "logo_url": base.get("logo_url"),
                "health_url": base.get("health_url"),
                "well_known": well_known,
                "lead_agent_id": lead_agent_id,
                "agents_total": len(members),
            }
        )
    return {"items": items, "total": len(items)}
@router.get("/metrics")
async def get_metrics() -> Dict[str, Any]:
    """Return headline counters: nodes, districts, agents, sub-agents, vectors.

    ``memory_vectors`` is read from the memory service's ``/health`` response
    and falls back to 0 on any failure. ``nodes`` is the constant 1.
    """
    agents_payload = await get_agents()
    districts_payload = await get_districts()
    agents = agents_payload.get("items", [])

    memory_vectors = 0
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(f"{MEMORY_SERVICE_URL}/health")
            if resp.status_code == 200:
                memories = resp.json().get("vector_store", {}).get("memories", {})
                memory_vectors = int(memories.get("vectors_count", 0))
    except Exception:
        memory_vectors = 0

    subagents = 0
    for agent in agents:
        team = agent.get("team") or {}
        subagents += int(team.get("subagents_total", 0))

    return {
        "nodes": 1,
        "districts": districts_payload.get("total", 0),
        "agents": len(agents),
        "subagents": subagents,
        "memory_vectors": memory_vectors,
    }
@router.get("/metrics/dashboard")
async def get_metrics_dashboard() -> Dict[str, Any]:
    """Return the dashboard payload, preferring the copy cached in Redis.

    Without a cached copy, a fallback is assembled from the registries. It
    carries no liveness data: the per-district health fields are ``None`` and
    the online counters are 0, and ``source`` is ``"fallback_registry"``.
    """
    cached = await _load_cached_dashboard()
    if cached:
        return cached

    metrics = await get_metrics()
    districts_payload = await get_districts()
    by_district = [
        {
            "district_id": d.get("district_id"),
            "title": d.get("title"),
            "domain": d.get("domain"),
            "status": d.get("status"),
            "ok": None,
            "agents_total": d.get("agents_total", 0),
            "agents_online": None,
            "latency_ms": None,
            "last_check_ts": None,
        }
        for d in districts_payload.get("items", [])
    ]
    global_section = {
        "nodes": metrics.get("nodes", 1),
        "districts": metrics.get("districts", 0),
        "agents": metrics.get("agents", 0),
        "subagents": metrics.get("subagents", 0),
        "memory_vectors": metrics.get("memory_vectors", 0),
        "districts_online": 0,
        "agents_online": 0,
    }
    return {
        "global": global_section,
        "by_district": by_district,
        "updated_at": _now_iso(),
        "source": "fallback_registry",
    }

View File

@@ -0,0 +1,100 @@
import asyncio
import logging
import os
from typing import Dict
import httpx
from .reminders import close_redis, pop_due_reminders
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
# httpx logs each request line, including the full URL, at INFO level. The
# Telegram URL used by this worker embeds the bot token, so keep the httpx
# logger at WARNING to avoid writing tokens into the log output.
logging.getLogger("httpx").setLevel(logging.WARNING)
logger = logging.getLogger("daarion-reminder-worker")
# Pause between polling cycles of worker_loop(), in seconds.
POLL_SECONDS = float(os.getenv("DAARION_REMINDER_POLL_SECONDS", "2"))
# Timeout passed to the httpx client used for Telegram requests, in seconds.
TELEGRAM_TIMEOUT = float(os.getenv("DAARION_REMINDER_TELEGRAM_TIMEOUT", "20"))
# Lower-case agent id -> name of the environment variable holding that
# agent's Telegram bot token (resolved by _token_for_agent()).
AGENT_TOKEN_ENV: Dict[str, str] = {
    "daarwizz": "DAARWIZZ_TELEGRAM_BOT_TOKEN",
    "helion": "HELION_TELEGRAM_BOT_TOKEN",
    "greenfood": "GREENFOOD_TELEGRAM_BOT_TOKEN",
    "agromatrix": "AGROMATRIX_TELEGRAM_BOT_TOKEN",
    "alateya": "ALATEYA_TELEGRAM_BOT_TOKEN",
    "nutra": "NUTRA_TELEGRAM_BOT_TOKEN",
    "druid": "DRUID_TELEGRAM_BOT_TOKEN",
    "clan": "CLAN_TELEGRAM_BOT_TOKEN",
    "eonarch": "EONARCH_TELEGRAM_BOT_TOKEN",
    "senpai": "SENPAI_TELEGRAM_BOT_TOKEN",
    "oneok": "ONEOK_TELEGRAM_BOT_TOKEN",
    "soul": "SOUL_TELEGRAM_BOT_TOKEN",
    "yaromir": "YAROMIR_TELEGRAM_BOT_TOKEN",
    "sofiia": "SOFIIA_TELEGRAM_BOT_TOKEN",
}
def _token_for_agent(agent_id: str) -> str:
    """Return the Telegram bot token configured for *agent_id*.

    The lookup is case-insensitive. Returns ``""`` for an unknown agent or
    when the mapped environment variable is not set.
    """
    env_name = AGENT_TOKEN_ENV.get((agent_id or "").lower(), "")
    if not env_name:
        return ""
    return os.getenv(env_name, "")
async def _send_reminder(item: Dict[str, str]) -> bool:
    """Deliver one reminder through the Telegram ``sendMessage`` API.

    Returns ``True`` only for an HTTP 200 response. A reminder without a bot
    token, chat id or text is skipped with a warning and reported as
    ``False``. Transport errors raised by httpx propagate to the caller.
    """
    reminder_id = item.get("reminder_id")
    agent_id = str(item.get("agent_id", ""))
    chat_id = str(item.get("chat_id", ""))
    reminder_text = str(item.get("text", "")).strip()
    due_at = str(item.get("due_at", ""))

    token = _token_for_agent(agent_id)
    if not token:
        logger.warning("reminder_skip_no_token agent=%s reminder_id=%s", agent_id, reminder_id)
        return False
    if not (chat_id and reminder_text):
        logger.warning("reminder_skip_invalid_payload reminder_id=%s", reminder_id)
        return False

    message = f"⏰ Нагадування ({agent_id})\n\n{reminder_text}\n\n🕒 {due_at}"
    async with httpx.AsyncClient(timeout=TELEGRAM_TIMEOUT) as client:
        resp = await client.post(
            f"https://api.telegram.org/bot{token}/sendMessage",
            json={"chat_id": chat_id, "text": message},
        )

    if resp.status_code == 200:
        logger.info("reminder_sent reminder_id=%s agent=%s chat=%s", reminder_id, agent_id, chat_id)
        return True
    logger.warning(
        "reminder_send_failed reminder_id=%s status=%s body=%s",
        reminder_id,
        resp.status_code,
        resp.text[:300],
    )
    return False
async def worker_loop() -> None:
    """Poll for due reminders forever and send each of them.

    Every cycle takes up to 20 due reminders. A failure while sending one
    reminder, or while fetching the batch, is logged and does not stop the
    loop; cancellation is propagated.
    """
    logger.info("reminder_worker_started poll_seconds=%s", POLL_SECONDS)
    while True:
        try:
            due_items = await pop_due_reminders(limit=20)
            for item in due_items or []:
                try:
                    await _send_reminder(item)
                except Exception:
                    logger.exception("reminder_send_exception reminder_id=%s", item.get("reminder_id"))
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("reminder_worker_cycle_failed")
        await asyncio.sleep(POLL_SECONDS)
if __name__ == "__main__":

    async def _main() -> None:
        """Run the reminder worker and release Redis on the same event loop.

        ``asyncio.run`` creates a fresh event loop per call, so closing the
        client in a second ``asyncio.run`` would touch connections that were
        opened on a loop which is already closed.
        """
        try:
            await worker_loop()
        finally:
            try:
                await close_redis()
            except Exception:
                # Best-effort cleanup: report it, but never mask the reason
                # the worker is exiting.
                logger.exception("reminder_worker_redis_close_failed")

    asyncio.run(_main())

View File

@@ -0,0 +1,154 @@
import json
import os
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from redis.asyncio import Redis
# Redis connection URL; overridable through the REDIS_URL environment variable.
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
# Common prefix of every reminder-related key.
REMINDER_PREFIX = "daarion:reminders"
# Prefix of the per-reminder JSON payload keys ("<prefix>:<reminder_id>").
REMINDER_BY_ID = f"{REMINDER_PREFIX}:by_id"
# Sorted set of reminder ids scored by their due timestamp.
REMINDER_SCHEDULE = f"{REMINDER_PREFIX}:schedule"
# Expiry (seconds) applied to a reminder payload; default 30 days.
REMINDER_TTL_SECONDS = int(os.getenv("DAARION_REMINDER_TTL_SECONDS", str(30 * 24 * 3600)))
# Module-wide client, created lazily by redis_client() and reset by close_redis().
_redis: Optional[Redis] = None
@dataclass
class Reminder:
    """A scheduled reminder as it is stored in Redis."""

    reminder_id: str
    agent_id: str
    chat_id: str
    user_id: str
    text: str
    due_ts: int  # Unix timestamp (seconds) at which the reminder is due
    created_at: str  # ISO 8601 creation time

    def to_dict(self) -> Dict[str, Any]:
        """Return the reminder as a plain dict, fields in declaration order."""
        return dict(
            reminder_id=self.reminder_id,
            agent_id=self.agent_id,
            chat_id=self.chat_id,
            user_id=self.user_id,
            text=self.text,
            due_ts=self.due_ts,
            created_at=self.created_at,
        )
async def redis_client() -> Redis:
    """Return the module-wide Redis client, creating it on first use.

    The client is built from ``REDIS_URL`` with ``decode_responses=True``.
    """
    global _redis
    if _redis is not None:
        return _redis
    _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
async def close_redis() -> None:
    """Close the module-wide Redis client (if any) and forget it."""
    global _redis
    if _redis is None:
        return
    await _redis.close()
    _redis = None
def _iso_now() -> str:
return datetime.now(timezone.utc).isoformat()
def _iso_from_ts(ts: int) -> str:
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
async def create_reminder(agent_id: str, chat_id: str, user_id: str, text: str, due_ts: int) -> Dict[str, Any]:
    """Store a new reminder and schedule it for *due_ts*.

    The payload is written as JSON under its own key and the id is added to
    the schedule sorted set with the due timestamp as score.

    Args:
        agent_id: Agent the reminder belongs to.
        chat_id: Chat to notify; stored as a string.
        user_id: Owner of the reminder; stored as a string.
        text: Reminder text; surrounding whitespace is stripped.
        due_ts: Unix timestamp (seconds) at which the reminder is due.

    Returns:
        The stored fields plus ``due_at``, the due time as an ISO 8601 string.
    """
    reminder = Reminder(
        reminder_id=f"rem_{uuid.uuid4().hex[:16]}",
        agent_id=agent_id,
        chat_id=str(chat_id),
        user_id=str(user_id),
        text=text.strip(),
        due_ts=int(due_ts),
        created_at=_iso_now(),
    )
    r = await redis_client()
    key = f"{REMINDER_BY_ID}:{reminder.reminder_id}"
    payload = json.dumps(reminder.to_dict(), ensure_ascii=False)
    # Count the TTL from the due time rather than from now: with a fixed TTL a
    # reminder scheduled further ahead than REMINDER_TTL_SECONDS would expire
    # before it is due and then be skipped as "payload missing" when popped.
    seconds_until_due = max(0, reminder.due_ts - int(time.time()))
    await r.set(key, payload, ex=seconds_until_due + REMINDER_TTL_SECONDS)
    await r.zadd(REMINDER_SCHEDULE, {reminder.reminder_id: float(reminder.due_ts)})
    result = reminder.to_dict()
    result["due_at"] = _iso_from_ts(reminder.due_ts)
    return result
async def list_reminders(agent_id: str, chat_id: str, user_id: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Return up to *limit* scheduled reminders owned by the given user.

    Candidates are read from the schedule in due-time order, starting one
    year in the past, and filtered by agent, chat and user. Only the first
    ``limit * 5`` scheduled ids are inspected, so matches beyond that window
    are not returned. Each item gets a ``due_at`` ISO 8601 field.

    Records whose payload is missing, is not a JSON object or has an unusable
    ``due_ts`` are skipped so one bad record cannot break the listing.
    """
    r = await redis_client()
    now_ts = int(time.time())
    ids = await r.zrangebyscore(
        REMINDER_SCHEDULE,
        min=now_ts - 365 * 24 * 3600,
        max="+inf",
        start=0,
        num=max(1, limit * 5),
    )
    out: List[Dict[str, Any]] = []
    for reminder_id in ids:
        raw = await r.get(f"{REMINDER_BY_ID}:{reminder_id}")
        if not raw:
            continue
        try:
            item = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(item, dict):
            continue
        if item.get("agent_id") != agent_id:
            continue
        if str(item.get("chat_id")) != str(chat_id):
            continue
        if str(item.get("user_id")) != str(user_id):
            continue
        try:
            item["due_at"] = _iso_from_ts(int(item.get("due_ts", 0)))
        except (TypeError, ValueError, OverflowError, OSError):
            # Non-numeric or out-of-range due_ts: treat the record as corrupt.
            continue
        out.append(item)
        if len(out) >= limit:
            break
    return out
async def cancel_reminder(reminder_id: str, agent_id: str, chat_id: str, user_id: str) -> bool:
    """Delete a reminder if it belongs to the given agent, chat and user.

    Returns ``True`` when the payload and its schedule entry were removed,
    ``False`` when the reminder is missing, unreadable or owned by someone
    else.
    """
    client = await redis_client()
    key = f"{REMINDER_BY_ID}:{reminder_id}"
    stored = await client.get(key)
    if not stored:
        return False
    try:
        item = json.loads(stored)
    except json.JSONDecodeError:
        return False

    same_owner = (
        item.get("agent_id") == agent_id
        and str(item.get("chat_id")) == str(chat_id)
        and str(item.get("user_id")) == str(user_id)
    )
    if not same_owner:
        return False

    await client.delete(key)
    await client.zrem(REMINDER_SCHEDULE, reminder_id)
    return True
async def pop_due_reminders(limit: int = 20) -> List[Dict[str, Any]]:
    """Remove and return up to *limit* reminders that are due now.

    Each due id is removed from the schedule first; only the caller whose
    removal actually took effect goes on to read and delete the payload.
    Returned items carry a ``due_at`` ISO 8601 field.

    A record that is missing, is not a JSON object or has an unusable
    ``due_ts`` is dropped without raising: the reminders collected earlier in
    the same call have already been deleted from Redis, so an exception here
    would lose them.
    """
    r = await redis_client()
    now_ts = int(time.time())
    ids = await r.zrangebyscore(REMINDER_SCHEDULE, min="-inf", max=now_ts, start=0, num=max(1, limit))
    out: List[Dict[str, Any]] = []
    for reminder_id in ids:
        # ZREM returns the number of members removed; 0 means the id was
        # already taken out of the schedule (popped or cancelled elsewhere).
        removed = await r.zrem(REMINDER_SCHEDULE, reminder_id)
        if removed == 0:
            continue
        key = f"{REMINDER_BY_ID}:{reminder_id}"
        raw = await r.get(key)
        if not raw:
            continue
        await r.delete(key)
        try:
            item = json.loads(raw)
            if not isinstance(item, dict):
                continue
            item["due_at"] = _iso_from_ts(int(item.get("due_ts", now_ts)))
        except (TypeError, ValueError, OverflowError, OSError):
            # json.JSONDecodeError is a ValueError; the other types cover a
            # non-numeric or out-of-range due_ts.
            continue
        out.append(item)
    return out

View File

@@ -0,0 +1,107 @@
import asyncio
from datetime import datetime, timezone
import logging
import os
from typing import Any, Dict
import httpx
from .redis_jobs import close_redis, dequeue_job, get_job, update_job, wait_for_redis
# Process-wide logging setup for the worker (INFO and above, timestamped).
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger("daarion-gateway-worker")
# Router base URL: ROUTER_BASE_URL wins, then ROUTER_URL, then the default.
ROUTER_BASE_URL = os.getenv("ROUTER_BASE_URL", os.getenv("ROUTER_URL", "http://router:8000"))
# Timeout (seconds) passed to the httpx client used for router calls.
ROUTER_TIMEOUT_SECONDS = float(os.getenv("ROUTER_WORKER_TIMEOUT", "60"))
def _now() -> str:
return datetime.now(timezone.utc).isoformat()
async def _call_router(agent_id: str, input_payload: Dict[str, Any], metadata: Dict[str, Any]) -> Dict[str, Any]:
    """POST an inference request for *agent_id* to the router.

    Sends the prompt and metadata, plus ``images`` when the input has any.
    Raises the httpx error for a non-success status. Returns the
    ``response``, ``model``, ``backend`` and ``tokens_used`` fields of the
    router's JSON reply.
    """
    request_body: Dict[str, Any] = {
        "prompt": input_payload.get("prompt", ""),
        "metadata": metadata or {},
    }
    images = input_payload.get("images") or []
    if images:
        request_body["images"] = images

    async with httpx.AsyncClient(timeout=ROUTER_TIMEOUT_SECONDS) as client:
        resp = await client.post(f"{ROUTER_BASE_URL}/v1/agents/{agent_id}/infer", json=request_body)
        resp.raise_for_status()
        data = resp.json()

    result: Dict[str, Any] = {"response": data.get("response", "")}
    for field in ("model", "backend", "tokens_used"):
        result[field] = data.get(field)
    return result
async def run_once(job_id: str) -> None:
    """Execute one queued job and record its outcome on the job record.

    The job moves to ``running``, the router is called for the job's agent,
    and the record ends as ``done`` (with ``result``) or ``failed`` (with an
    ``error`` of exception type and message). A job whose record is missing
    is only logged. A job without ``agent_id`` is failed immediately instead
    of being sent to the router with a meaningless agent path.
    """
    job = await get_job(job_id)
    if not job:
        logger.warning("job_missing: %s", job_id)
        return

    agent_id = job.get("agent_id")
    if not agent_id:
        now = _now()
        await update_job(
            job_id,
            {
                "status": "failed",
                "error": {"type": "ValueError", "message": "job has no agent_id"},
                "finished_at": now,
                "updated_at": now,
            },
        )
        logger.warning("job_failed: %s agent=%s", job_id, agent_id)
        return

    started = _now()
    await update_job(job_id, {"status": "running", "started_at": started, "updated_at": started})
    input_payload = job.get("input") or {}
    metadata = job.get("metadata") or {}
    try:
        result = await _call_router(agent_id, input_payload, metadata)
        finished = _now()
        await update_job(
            job_id,
            {
                "status": "done",
                "result": result,
                "error": None,
                "finished_at": finished,
                "updated_at": finished,
            },
        )
        logger.info("job_done: %s agent=%s", job_id, agent_id)
    except Exception as e:
        finished = _now()
        await update_job(
            job_id,
            {
                "status": "failed",
                "error": {"type": e.__class__.__name__, "message": str(e)},
                "finished_at": finished,
                "updated_at": finished,
            },
        )
        logger.exception("job_failed: %s agent=%s", job_id, agent_id)
async def worker_loop() -> None:
    """Wait for Redis, then process queued jobs forever.

    Each iteration blocks up to 10 seconds for a job id and runs it. An
    unexpected error is logged and followed by a one-second pause;
    cancellation is propagated.
    """
    await wait_for_redis(60)
    logger.info("worker_started router=%s", ROUTER_BASE_URL)
    while True:
        try:
            next_job_id = await dequeue_job(block_seconds=10)
            if next_job_id:
                await run_once(next_job_id)
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("worker_loop_error")
            await asyncio.sleep(1)
if __name__ == "__main__":

    async def _main() -> None:
        """Run the gateway worker and release Redis on the same event loop.

        ``asyncio.run`` creates a fresh event loop per call, so closing the
        client in a second ``asyncio.run`` would touch connections that were
        opened on a loop which is already closed.
        """
        try:
            await worker_loop()
        finally:
            try:
                await close_redis()
            except Exception:
                # Best-effort cleanup: report it, but never mask the reason
                # the worker is exiting.
                logger.exception("gateway_worker_redis_close_failed")

    asyncio.run(_main())

View File

@@ -0,0 +1,92 @@
{
"districts": [
{
"district_id": "city-core",
"title": "City Core - DAARION.city",
"domain": "daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "daarwizz"
},
{
"district_id": "helion",
"title": "Helion District",
"domain": "helion.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "helion"
},
{
"district_id": "alateya",
"title": "Alateya District",
"domain": "alateya.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "alateya"
},
{
"district_id": "druid",
"title": "Druid District",
"domain": "druid.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "druid"
},
{
"district_id": "nutra",
"title": "Nutra District",
"domain": "nutra.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "nutra"
},
{
"district_id": "agromatrix",
"title": "AgroMatrix District",
"domain": "agromatrix.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "agromatrix"
},
{
"district_id": "greenfood",
"title": "GreenFood District",
"domain": "greenfood.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "greenfood"
},
{
"district_id": "clan",
"title": "Clan District",
"domain": "clan.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "clan"
},
{
"district_id": "eonarch",
"title": "Eonarch District",
"domain": "eonarch.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "eonarch"
},
{
"district_id": "soul",
"title": "Soul District",
"domain": "soul.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "soul"
},
{
"district_id": "senpai",
"title": "Senpai District",
"domain": "senpai.daarion.city",
"status": "active",
"logo_url": null,
"lead_agent_id": "senpai"
}
]
}

View File

@@ -1871,23 +1871,53 @@ async def process_document(
Dict з результатом обробки
"""
mime_type = document.get("mime_type", "")
mime_type_l = (mime_type or "").lower()
file_name = document.get("file_name", "")
file_id = document.get("file_id")
file_name_lower = file_name.lower()
allowed_exts = {".pdf", ".docx", ".txt", ".md", ".csv", ".xlsx", ".zip"}
allowed_exts = {
".pdf", ".doc", ".docx", ".rtf", ".odt",
".txt", ".md", ".markdown",
".csv", ".tsv", ".xls", ".xlsx", ".xlsm", ".ods",
".ppt", ".pptx", ".odp",
".json", ".yaml", ".yml", ".xml", ".html", ".htm",
".zip",
".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff",
}
is_allowed = any(file_name_lower.endswith(ext) for ext in allowed_exts)
if mime_type == "application/pdf":
if mime_type_l == "application/pdf":
is_allowed = True
if mime_type in {
if mime_type_l in {
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/rtf",
"text/rtf",
"application/vnd.oasis.opendocument.text",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-excel.sheet.macroenabled.12",
"application/vnd.oasis.opendocument.spreadsheet",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.oasis.opendocument.presentation",
"text/plain",
"text/markdown",
"text/csv",
"text/tab-separated-values",
"application/json",
"application/yaml",
"application/x-yaml",
"text/yaml",
"application/xml",
"text/xml",
"text/html",
"application/zip",
"application/x-zip-compressed",
}:
is_allowed = True
if mime_type_l.startswith("image/"):
is_allowed = True
if is_allowed and file_id:
logger.info(f"{agent_config.name}: Document from {username} (tg:{user_id}), file_id: {file_id}, file_name: {file_name}")
@@ -2027,7 +2057,7 @@ async def process_document(
telegram_token = agent_config.get_telegram_token()
await send_telegram_message(
chat_id,
"Наразі підтримуються формати: PDF, DOCX, TXT, MD, CSV, XLSX, ZIP.",
"Підтримуються формати: PDF/DOC/DOCX/RTF/ODT, TXT/MD/CSV/TSV, XLS/XLSX/XLSM/ODS, PPT/PPTX/ODP, JSON/YAML/XML/HTML, ZIP, зображення.",
telegram_token,
)
return {"ok": False, "error": "Unsupported document type"}
@@ -3681,7 +3711,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
doc_url=file_url,
file_name=file_name,
dao_id=dao_id,
user_id=f"tg:{user_id}"
user_id=f"tg:{user_id}",
agent_id=agent_config.agent_id,
)
if result.success:
@@ -3705,7 +3736,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
result = await ingest_document(
session_id=session_id,
dao_id=dao_id,
user_id=f"tg:{user_id}"
user_id=f"tg:{user_id}",
agent_id=agent_config.agent_id,
)
if result.success:

View File

@@ -6,20 +6,32 @@ Endpoints:
- POST /api/doc/parse - Parse a document
- POST /api/doc/ingest - Ingest document to RAG
- POST /api/doc/ask - Ask question about document
- POST /api/doc/update - Update existing document text (versioned)
- POST /api/doc/publish - Publish physical file version via artifact registry
- GET /api/doc/versions/{doc_id} - List document versions
- GET /api/doc/artifacts/{artifact_id}/versions/{version_id}/download - Download via gateway proxy
"""
import logging
import os
import re
from typing import Optional, Dict, Any
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from fastapi.responses import Response
from pydantic import BaseModel
import httpx
from services.doc_service import (
doc_service,
parse_document,
ingest_document,
ask_about_document,
update_document,
list_document_versions,
publish_document_artifact,
get_doc_context,
ParsedResult,
IngestResult,
UpdateResult,
QAResult,
DocContext
)
@@ -27,6 +39,8 @@ from services.doc_service import (
logger = logging.getLogger(__name__)
router = APIRouter()
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
DOC_DOWNLOAD_TIMEOUT_SECONDS = float(os.getenv("DOC_DOWNLOAD_TIMEOUT_SECONDS", "60"))
# ========================================
@@ -52,6 +66,7 @@ class IngestDocumentRequest(BaseModel):
file_name: Optional[str] = None
dao_id: Optional[str] = None
user_id: Optional[str] = None
agent_id: str = "daarwizz"
class AskDocumentRequest(BaseModel):
@@ -61,6 +76,40 @@ class AskDocumentRequest(BaseModel):
doc_id: Optional[str] = None
dao_id: Optional[str] = None
user_id: Optional[str] = None
agent_id: str = "daarwizz"
class UpdateDocumentRequest(BaseModel):
    """Request body for POST /api/doc/update (update a document, bump its version).

    Every field is forwarded unchanged to ``update_document``.
    """
    # Required: session the document belongs to.
    session_id: str
    # Document identification and, optionally, its replacement text.
    doc_id: Optional[str] = None
    doc_url: Optional[str] = None
    file_name: Optional[str] = None
    text: Optional[str] = None
    dao_id: Optional[str] = None
    user_id: Optional[str] = None
    # Agent the document belongs to; defaults to "daarwizz".
    agent_id: str = "daarwizz"
    storage_ref: Optional[str] = None
    # Artifact options, passed through together with publish_artifact.
    publish_artifact: bool = False
    artifact_id: Optional[str] = None
    target_format: Optional[str] = None
    artifact_label: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
class PublishDocumentRequest(BaseModel):
    """Request body for POST /api/doc/publish (publish a physical artifact version).

    Every field is forwarded unchanged to ``publish_document_artifact``.
    """
    # Required: session the document belongs to.
    session_id: str
    # Document identification and, optionally, the text to publish.
    doc_id: Optional[str] = None
    doc_url: Optional[str] = None
    file_name: Optional[str] = None
    text: Optional[str] = None
    dao_id: Optional[str] = None
    user_id: Optional[str] = None
    # Artifact options.
    artifact_id: Optional[str] = None
    target_format: Optional[str] = None
    artifact_label: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
# ========================================
@@ -167,7 +216,8 @@ async def ingest_document_endpoint(request: IngestDocumentRequest):
doc_url=request.doc_url,
file_name=request.file_name,
dao_id=request.dao_id,
user_id=request.user_id
user_id=request.user_id,
agent_id=request.agent_id,
)
if not result.success:
@@ -209,7 +259,8 @@ async def ask_about_document_endpoint(request: AskDocumentRequest):
question=request.question,
doc_id=doc_id,
dao_id=request.dao_id,
user_id=request.user_id
user_id=request.user_id,
agent_id=request.agent_id,
)
if not result.success:
@@ -227,6 +278,107 @@ async def ask_about_document_endpoint(request: AskDocumentRequest):
raise HTTPException(status_code=500, detail=str(e))
@router.post("/api/doc/update")
async def update_document_endpoint(request: UpdateDocumentRequest):
    """
    Update a document and bump its version.

    If text is omitted and doc_url exists, text is re-parsed from the source document.
    Responds 400 with the service's error when the update is unsuccessful and
    500 on an unexpected exception.
    """
    reported_fields = (
        "doc_id",
        "version_no",
        "version_id",
        "updated_chunks",
        "status",
        "publish_error",
        "artifact_id",
        "artifact_version_id",
        "artifact_storage_key",
        "artifact_mime",
        "artifact_download_url",
    )
    try:
        result = await update_document(
            session_id=request.session_id,
            doc_id=request.doc_id,
            doc_url=request.doc_url,
            file_name=request.file_name,
            text=request.text,
            dao_id=request.dao_id,
            user_id=request.user_id,
            agent_id=request.agent_id,
            storage_ref=request.storage_ref,
            publish_artifact=request.publish_artifact,
            artifact_id=request.artifact_id,
            target_format=request.target_format,
            artifact_label=request.artifact_label,
            metadata=request.metadata,
        )
        if not result.success:
            raise HTTPException(status_code=400, detail=result.error)
        return {"ok": True, **{name: getattr(result, name) for name in reported_fields}}
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Update document error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@router.post("/api/doc/publish")
async def publish_document_endpoint(request: PublishDocumentRequest):
    """
    Publish current document text as physical file artifact version.

    Responds 400 with the service's error when publishing is unsuccessful and
    500 on an unexpected exception.
    """
    reported_fields = (
        "artifact_id",
        "version_id",
        "storage_key",
        "mime",
        "file_name",
        "download_url",
    )
    try:
        result = await publish_document_artifact(
            session_id=request.session_id,
            doc_id=request.doc_id,
            doc_url=request.doc_url,
            file_name=request.file_name,
            text=request.text,
            dao_id=request.dao_id,
            user_id=request.user_id,
            artifact_id=request.artifact_id,
            target_format=request.target_format,
            artifact_label=request.artifact_label,
            metadata=request.metadata,
        )
        if not result.success:
            raise HTTPException(status_code=400, detail=result.error)
        return {"ok": True, **{name: getattr(result, name) for name in reported_fields}}
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Publish document error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/api/doc/versions/{doc_id}")
async def list_document_versions_endpoint(doc_id: str, agent_id: str = "daarwizz", limit: int = 20):
    """
    List document versions for agent/doc pair.

    Returns the service payload as-is when it reports ``ok``; otherwise
    responds 400 with its error, or 500 on an unexpected exception.
    """
    try:
        data = await list_document_versions(agent_id=agent_id, doc_id=doc_id, limit=limit)
        if data.get("ok"):
            return data
        raise HTTPException(status_code=400, detail=data.get("error", "Failed to load versions"))
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"List document versions error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/api/doc/context/{session_id}")
async def get_document_context(session_id: str):
"""
@@ -258,3 +410,56 @@ async def get_document_context(session_id: str):
logger.error(f"Get document context error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.get("/api/doc/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version_via_gateway(
    artifact_id: str,
    version_id: str,
    filename: Optional[str] = None,
    inline: bool = False,
):
    """
    Proxy download for artifact version to avoid exposing internal MinIO host to browser clients.

    Asks the artifact registry for the version's download info, fetches the
    file from the URL it returns and sends the bytes back to the caller.

    Args:
        artifact_id: Artifact to download; blank or dot-segment values give 400.
        version_id: Version to download; blank or dot-segment values give 400.
        filename: Optional download name; defaults to the last segment of the
            registry's storage key. The name is reduced to ``[A-Za-z0-9._-]``.
        inline: Use ``Content-Disposition: inline`` instead of ``attachment``.

    Raises:
        HTTPException: the registry's own status for a registry error, 502
            for an empty download URL or a failed storage fetch, 500 for any
            other failure.
    """
    from urllib.parse import quote

    aid = (artifact_id or "").strip()
    vid = (version_id or "").strip()
    if not aid or not vid:
        raise HTTPException(status_code=400, detail="artifact_id and version_id are required")
    if aid in {".", ".."} or vid in {".", ".."}:
        raise HTTPException(status_code=400, detail="invalid artifact_id or version_id")

    # The ids are client-supplied path values. Percent-encode them so that
    # characters such as "?", "#" or "/" cannot alter the path or query of
    # the internal registry request.
    info_url = (
        f"{ARTIFACT_REGISTRY_URL}/artifacts/{quote(aid, safe='')}"
        f"/versions/{quote(vid, safe='')}/download"
    )
    try:
        async with httpx.AsyncClient(timeout=DOC_DOWNLOAD_TIMEOUT_SECONDS) as client:
            meta_resp = await client.get(info_url)
            if meta_resp.status_code >= 400:
                detail = ""
                try:
                    detail = meta_resp.json().get("detail")  # type: ignore[assignment]
                except Exception:
                    detail = meta_resp.text[:200]
                raise HTTPException(status_code=meta_resp.status_code, detail=detail or "Version download info failed")
            meta = meta_resp.json()
            signed_url = (meta.get("url") or "").strip()
            if not signed_url:
                raise HTTPException(status_code=502, detail="artifact-registry returned empty download URL")

            file_resp = await client.get(signed_url)
            if file_resp.status_code >= 400:
                raise HTTPException(status_code=502, detail=f"Artifact storage download failed: {file_resp.status_code}")

        mime = (meta.get("mime") or file_resp.headers.get("content-type") or "application/octet-stream").strip()
        storage_key = str(meta.get("storage_key") or "")
        inferred_name = storage_key.rsplit("/", 1)[-1] if "/" in storage_key else storage_key
        out_name = (filename or inferred_name or f"{aid}_{vid}.bin").strip()
        # Keep the header value safe: only a conservative character set survives.
        out_name = re.sub(r"[^A-Za-z0-9._-]+", "_", out_name).strip("._") or f"{aid}_{vid}.bin"
        disposition = "inline" if inline else "attachment"
        headers = {
            "Content-Disposition": f'{disposition}; filename="{out_name}"',
            "Cache-Control": "private, max-age=60",
        }
        return Response(content=file_resp.content, media_type=mime, headers=headers)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Artifact version proxy download failed: aid={aid}, vid={vid}, err={e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Artifact proxy download failed")

View File

@@ -143,6 +143,10 @@ class MemoryClient:
"body_text": e.get("content", ""),
"kind": e.get("kind", "message"),
"type": "user" if e.get("role") == "user" else "agent",
"role": e.get("role", "unknown"),
"timestamp": e.get("timestamp"),
"user_id": e.get("user_id"),
"sender_name": e.get("sender_name"),
}
for e in events
if e.get("content")
@@ -445,4 +449,3 @@ class MemoryClient:
# Глобальний екземпляр клієнта
memory_client = MemoryClient()

View File

@@ -11,18 +11,23 @@ This service can be used by:
import os
import logging
import hashlib
import base64
import json
import re
from typing import Optional, Dict, Any, List
from pydantic import BaseModel
from datetime import datetime
from io import BytesIO
from router_client import send_to_router
from memory_client import memory_client
logger = logging.getLogger(__name__)
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service")
GATEWAY_PUBLIC_BASE_URL = os.getenv("GATEWAY_PUBLIC_BASE_URL", "").rstrip("/")
class QAItem(BaseModel):
@@ -51,6 +56,35 @@ class IngestResult(BaseModel):
error: Optional[str] = None
class UpdateResult(BaseModel):
    """Result of document update with version bump."""
    # False when the update failed; ``error`` then carries the reason.
    success: bool
    doc_id: Optional[str] = None
    version_no: Optional[int] = None
    version_id: Optional[int] = None
    updated_chunks: int = 0
    status: str = "unknown"
    # Set when the update succeeded but publishing the artifact did not
    # (presumably; confirm against update_document).
    publish_error: Optional[str] = None
    # Details of the published artifact version, when one was produced.
    artifact_id: Optional[str] = None
    artifact_version_id: Optional[str] = None
    artifact_storage_key: Optional[str] = None
    artifact_mime: Optional[str] = None
    artifact_download_url: Optional[str] = None
    error: Optional[str] = None
class PublishResult(BaseModel):
    """Result of artifact write-back publish."""
    # True only when a new artifact version was created and has a version id.
    success: bool
    # Id of the artifact that received the version (supplied or newly created).
    artifact_id: Optional[str] = None
    # Version id returned by the artifact registry, kept as a string.
    version_id: Optional[str] = None
    # "storage_key" value from the registry's version response, passed through.
    storage_key: Optional[str] = None
    # MIME type and file name of the rendered document bytes.
    mime: Optional[str] = None
    file_name: Optional[str] = None
    # Gateway download path; prefixed with GATEWAY_PUBLIC_BASE_URL when set.
    download_url: Optional[str] = None
    # Human-readable failure reason when `success` is False.
    error: Optional[str] = None
class QAResult(BaseModel):
"""Result of RAG query about a document"""
success: bool
@@ -84,6 +118,266 @@ class DocumentService:
"""Initialize document service"""
self.memory_client = memory_client
async def _router_post_json(
    self,
    path: str,
    payload: Dict[str, Any],
    timeout: float = 45.0,
) -> Dict[str, Any]:
    """POST ``payload`` as JSON to the router and return the decoded object.

    Args:
        path: Router path starting with ``/``; appended to ``ROUTER_URL``.
        payload: JSON-serialisable request body.
        timeout: Request timeout in seconds passed to ``httpx.AsyncClient``.

    Returns:
        The decoded JSON object. A body that is not valid JSON, or is valid
        JSON but not an object, is replaced by ``{"ok": False, "error": ...}``.

    Raises:
        RuntimeError: If the router answers with HTTP status >= 400.
    """
    import httpx

    url = f"{ROUTER_URL.rstrip('/')}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(url, json=payload)

    try:
        body = resp.json()
    except ValueError:
        # JSON and text decoding errors are both ValueError subclasses.
        body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}

    if resp.status_code >= 400:
        # An error body may be valid JSON without being an object (list,
        # string, null); only look up fields when it really is a dict so the
        # HTTP error is reported instead of an AttributeError.
        details = body if isinstance(body, dict) else {}
        err = details.get("detail") or details.get("error") or f"HTTP {resp.status_code}"
        raise RuntimeError(f"Router error on {path}: {err}")

    if not isinstance(body, dict):
        return {"ok": False, "error": "Invalid router response type"}
    return body
async def _router_get_json(
    self,
    path: str,
    timeout: float = 30.0,
) -> Dict[str, Any]:
    """GET a router endpoint and return the decoded JSON object.

    Args:
        path: Router path (including any query string) starting with ``/``;
            appended to ``ROUTER_URL``.
        timeout: Request timeout in seconds passed to ``httpx.AsyncClient``.

    Returns:
        The decoded JSON object. A body that is not valid JSON, or is valid
        JSON but not an object, is replaced by ``{"ok": False, "error": ...}``.

    Raises:
        RuntimeError: If the router answers with HTTP status >= 400.
    """
    import httpx

    url = f"{ROUTER_URL.rstrip('/')}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.get(url)

    try:
        body = resp.json()
    except ValueError:
        # JSON and text decoding errors are both ValueError subclasses.
        body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}

    if resp.status_code >= 400:
        # Guard against non-object JSON error bodies so the HTTP error is
        # reported instead of an AttributeError from ``.get``.
        details = body if isinstance(body, dict) else {}
        err = details.get("detail") or details.get("error") or f"HTTP {resp.status_code}"
        raise RuntimeError(f"Router error on {path}: {err}")

    if not isinstance(body, dict):
        return {"ok": False, "error": "Invalid router response type"}
    return body
async def _artifact_post_json(
    self,
    path: str,
    payload: Dict[str, Any],
    timeout: float = 45.0,
) -> Dict[str, Any]:
    """POST ``payload`` as JSON to the artifact registry and return the object.

    Args:
        path: Registry path starting with ``/``; appended to
            ``ARTIFACT_REGISTRY_URL``.
        payload: JSON-serialisable request body.
        timeout: Request timeout in seconds passed to ``httpx.AsyncClient``.

    Returns:
        The decoded JSON object. A body that is not valid JSON, or is valid
        JSON but not an object, is replaced by ``{"ok": False, "error": ...}``.

    Raises:
        RuntimeError: If the registry answers with HTTP status >= 400.
    """
    import httpx

    url = f"{ARTIFACT_REGISTRY_URL.rstrip('/')}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.post(url, json=payload)

    try:
        body = resp.json()
    except ValueError:
        # JSON and text decoding errors are both ValueError subclasses.
        body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}

    if resp.status_code >= 400:
        # Guard against non-object JSON error bodies so the HTTP error is
        # reported instead of an AttributeError from ``.get``.
        details = body if isinstance(body, dict) else {}
        err = details.get("detail") or details.get("error") or f"HTTP {resp.status_code}"
        raise RuntimeError(f"Artifact registry error on {path}: {err}")

    if not isinstance(body, dict):
        return {"ok": False, "error": "Invalid artifact response type"}
    return body
async def _artifact_get_json(
    self,
    path: str,
    timeout: float = 30.0,
) -> Dict[str, Any]:
    """GET an artifact-registry endpoint and return the decoded JSON object.

    Args:
        path: Registry path (including any query string) starting with ``/``;
            appended to ``ARTIFACT_REGISTRY_URL``.
        timeout: Request timeout in seconds passed to ``httpx.AsyncClient``.

    Returns:
        The decoded JSON object. A body that is not valid JSON, or is valid
        JSON but not an object, is replaced by ``{"ok": False, "error": ...}``.

    Raises:
        RuntimeError: If the registry answers with HTTP status >= 400.
    """
    import httpx

    url = f"{ARTIFACT_REGISTRY_URL.rstrip('/')}{path}"
    async with httpx.AsyncClient(timeout=timeout) as client:
        resp = await client.get(url)

    try:
        body = resp.json()
    except ValueError:
        # JSON and text decoding errors are both ValueError subclasses.
        body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}

    if resp.status_code >= 400:
        # Guard against non-object JSON error bodies so the HTTP error is
        # reported instead of an AttributeError from ``.get``.
        details = body if isinstance(body, dict) else {}
        err = details.get("detail") or details.get("error") or f"HTTP {resp.status_code}"
        raise RuntimeError(f"Artifact registry error on {path}: {err}")

    if not isinstance(body, dict):
        return {"ok": False, "error": "Invalid artifact response type"}
    return body
def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str:
fmt = (target_format or "").strip().lower().lstrip(".")
if fmt:
return fmt
if file_name and "." in file_name:
return file_name.rsplit(".", 1)[1].strip().lower()
return "txt"
def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str:
base = "document"
if file_name:
base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
if "." in base:
base = base.rsplit(".", 1)[0]
elif doc_id:
base = doc_id
safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document"
return f"{safe_base}.{fmt}"
def _gateway_artifact_download_path(self, artifact_id: str, version_id: str) -> str:
aid = (artifact_id or "").strip()
vid = (version_id or "").strip()
return f"/api/doc/artifacts/{aid}/versions/{vid}/download"
def _gateway_artifact_download_url(self, artifact_id: str, version_id: str) -> str:
    """Return the download URL for an artifact version.

    The result is absolute when ``GATEWAY_PUBLIC_BASE_URL`` is configured and
    the bare gateway-relative path otherwise.
    """
    relative = self._gateway_artifact_download_path(artifact_id, version_id)
    if not GATEWAY_PUBLIC_BASE_URL:
        return relative
    return f"{GATEWAY_PUBLIC_BASE_URL}{relative}"
def _render_document_bytes(
    self,
    text: str,
    file_name: Optional[str],
    doc_id: str,
    target_format: Optional[str] = None,
) -> Dict[str, Any]:
    """Render document text into file bytes for the resolved output format.

    Args:
        text: Document text; surrounding whitespace is stripped.
        file_name: Original file name, used for the extension and output name.
        doc_id: Document id, used for the output name when no file name exists.
        target_format: Explicit output format overriding the file extension.

    Returns:
        Dict with ``bytes`` (rendered content), ``mime`` and ``file_name``.
        Unknown formats are rendered as UTF-8 plain text with a ``.txt`` name.

    Raises:
        ValueError: If ``text`` is empty after stripping.
        RuntimeError: If an Excel or Word format is requested but the optional
            ``openpyxl`` / ``python-docx`` package cannot be imported.
    """
    body = (text or "").strip()
    if not body:
        raise ValueError("Cannot render empty document text")

    fmt = self._resolve_format(file_name=file_name, target_format=target_format)
    output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt)

    # Text formats whose content is written out unchanged as UTF-8.
    passthrough_mimes = {
        "txt": "text/plain; charset=utf-8",
        "md": "text/markdown; charset=utf-8",
        "markdown": "text/markdown; charset=utf-8",
        "csv": "text/csv; charset=utf-8",
    }
    if fmt in passthrough_mimes:
        return {
            "bytes": body.encode("utf-8"),
            "mime": passthrough_mimes[fmt],
            "file_name": output_name,
        }

    if fmt == "json":
        # Valid JSON is re-serialised pretty-printed; anything else is wrapped
        # so the output is always a valid JSON document.
        parsed: Any
        try:
            parsed = json.loads(body)
        except (ValueError, RecursionError):
            parsed = {"text": body}
        payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8")
        return {"bytes": payload, "mime": "application/json", "file_name": output_name}

    if fmt in {"xlsx", "xlsm", "xls"}:
        try:
            from openpyxl import Workbook
        except ImportError as e:
            raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}") from e
        wb = Workbook()
        ws = wb.active
        ws.title = "Document"
        # One text line per row in column A. ``body`` is non-empty and
        # stripped, so splitlines() always yields at least one line.
        # NOTE(review): lines starting with "=" may be stored as formulas by
        # openpyxl - confirm whether that matters for untrusted text.
        for idx, line in enumerate(body.splitlines(), start=1):
            ws.cell(row=idx, column=1, value=line)
        buf = BytesIO()
        wb.save(buf)
        mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        # The workbook is always saved as .xlsx, even for xls/xlsm requests.
        return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")}

    if fmt == "docx":
        try:
            from docx import Document
        except ImportError as e:
            raise RuntimeError(f"python-docx is required for docx rendering: {e}") from e
        doc = Document()
        for line in body.splitlines():
            # Blank lines become a single-space paragraph to keep spacing.
            doc.add_paragraph(line if line else " ")
        buf = BytesIO()
        doc.save(buf)
        mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")}

    # Unknown format: fall back to plain text with a .txt name.
    fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt")
    return {"bytes": body.encode("utf-8"), "mime": "text/plain; charset=utf-8", "file_name": fallback_name}
async def _publish_text_artifact(
    self,
    text: str,
    doc_id: str,
    file_name: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Render ``text`` and store it as a new artifact version.

    When ``artifact_id`` is empty a new ``doc`` artifact is created first.
    The rendered bytes are uploaded base64-encoded. Every failure, including
    exceptions, is reported through the returned ``PublishResult`` and never
    raised.
    """
    try:
        rendered = self._render_document_bytes(
            text=text,
            file_name=file_name,
            doc_id=doc_id,
            target_format=target_format,
        )
        encoded = base64.b64encode(rendered["bytes"]).decode("ascii")

        # Reuse the caller's artifact or create a fresh one.
        target_artifact = (artifact_id or "").strip()
        if not target_artifact:
            created = await self._artifact_post_json(
                "/artifacts",
                {
                    "type": "doc",
                    "title": file_name or doc_id,
                    "project_id": dao_id,
                    "acl_ref": dao_id,
                    "created_by": user_id or DOC_WRITEBACK_CREATED_BY,
                },
                timeout=30.0,
            )
            target_artifact = str(created.get("artifact_id") or "").strip()
        if not target_artifact:
            return PublishResult(success=False, error="Artifact create failed: empty artifact_id")

        # Caller metadata may override the default keys.
        version_meta = {"doc_id": doc_id, "source": "doc_update_publish"}
        if isinstance(metadata, dict):
            version_meta.update(metadata)

        version_payload = {
            "content_base64": encoded,
            "mime": rendered["mime"],
            "filename": rendered["file_name"],
            "label": label or "edited",
            "meta_json": version_meta,
        }
        created_version = await self._artifact_post_json(
            f"/artifacts/{target_artifact}/versions/from_base64",
            version_payload,
            timeout=45.0,
        )
        new_version_id = str(created_version.get("version_id") or "").strip()
        if not new_version_id:
            return PublishResult(
                success=False,
                artifact_id=target_artifact,
                error="Artifact version create failed: empty version_id",
            )

        return PublishResult(
            success=True,
            artifact_id=target_artifact,
            version_id=new_version_id,
            storage_key=created_version.get("storage_key"),
            mime=rendered["mime"],
            file_name=rendered["file_name"],
            download_url=self._gateway_artifact_download_url(
                artifact_id=target_artifact,
                version_id=new_version_id,
            ),
        )
    except Exception as e:
        logger.error(f"publish_text_artifact failed: {e}", exc_info=True)
        return PublishResult(success=False, error=str(e))
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
if not file_name:
return False
@@ -462,7 +756,8 @@ class DocumentService:
doc_url: Optional[str] = None,
file_name: Optional[str] = None,
dao_id: str = None,
user_id: str = None
user_id: str = None,
agent_id: str = "daarwizz",
) -> IngestResult:
"""
Ingest document chunks into RAG/Memory.
@@ -488,64 +783,60 @@ class DocumentService:
file_name = file_name or doc_context.file_name
dao_id = dao_id or doc_context.dao_id
if not doc_id and not doc_url:
if not doc_url:
return IngestResult(
success=False,
error="No document ID or URL provided"
error="No document URL available for ingest"
)
# Build request to Router with ingest flag
router_request = {
"mode": "doc_parse",
"agent": "parser",
parsed = await self.parse_document(
session_id=session_id,
doc_url=doc_url,
file_name=file_name or "document",
dao_id=dao_id or "",
user_id=user_id or "",
output_mode="markdown",
metadata={"source": self._extract_source(session_id), "mode": "ingest"},
)
if not parsed.success:
return IngestResult(success=False, error=parsed.error or "Document parse failed")
effective_doc_id = doc_id or parsed.doc_id
if not effective_doc_id:
effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
doc_text = (parsed.markdown or "").strip()
if not doc_text:
return IngestResult(success=False, error="No extractable text for ingestion")
payload = {
"agent_id": (agent_id or "daarwizz").lower(),
"doc_id": effective_doc_id,
"file_name": file_name or "document",
"text": doc_text,
"dao_id": dao_id,
"user_id": user_id,
"metadata": {
"source": self._extract_source(session_id),
"dao_id": dao_id,
"user_id": user_id,
"session_id": session_id,
},
"payload": {
"output_mode": "chunks", # Use chunks for RAG ingestion
"dao_id": dao_id,
"user_id": user_id,
"ingest": True, # Flag for ingestion
"source": self._extract_source(session_id),
},
}
if doc_url:
router_request["payload"]["doc_url"] = doc_url
router_request["payload"]["file_name"] = file_name or "document.pdf"
if doc_id:
router_request["payload"]["doc_id"] = doc_id
logger.info(f"Ingesting document: session={session_id}, doc_id={doc_id}")
# Send to Router
response = await send_to_router(router_request)
if not isinstance(response, dict):
return IngestResult(
success=False,
error="Invalid response from router"
)
data = response.get("data", {})
chunks = data.get("chunks", [])
if chunks:
response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)
if response.get("ok"):
return IngestResult(
success=True,
doc_id=doc_id or data.get("doc_id"),
ingested_chunks=len(chunks),
status="ingested"
)
else:
return IngestResult(
success=False,
status="failed",
error="No chunks to ingest"
doc_id=response.get("doc_id") or effective_doc_id,
ingested_chunks=int(response.get("chunks_stored", 0) or 0),
status="ingested",
)
return IngestResult(
success=False,
doc_id=effective_doc_id,
status="failed",
error=response.get("error", "Router ingest failed"),
)
except Exception as e:
logger.error(f"Document ingestion failed: {e}", exc_info=True)
@@ -553,6 +844,245 @@ class DocumentService:
success=False,
error=str(e)
)
async def update_document(
    self,
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
    storage_ref: Optional[str] = None,
    publish_artifact: bool = False,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
    """Update a document's content in router memory and bump its version.

    Missing ``doc_id`` / ``doc_url`` / ``file_name`` / ``dao_id`` are filled
    from the saved session context. If ``text`` is empty the document at
    ``doc_url`` is re-parsed to markdown and that text is used. After the
    router accepts the update the session context is saved again and, when
    ``publish_artifact`` is set, the text is also published as an artifact
    version. A publish failure does not fail the update; it is reported via
    ``status`` and ``publish_error``. Exceptions are converted into a failed
    ``UpdateResult``.
    """
    try:
        stored = await self.get_doc_context(session_id)
        if stored:
            doc_id = doc_id or stored.doc_id
            doc_url = doc_url or stored.doc_url
            file_name = file_name or stored.file_name
            dao_id = dao_id or stored.dao_id

        if not doc_id:
            return UpdateResult(
                success=False,
                status="failed",
                error="No document context found. Provide doc_id or parse/ingest first.",
            )

        new_text = (text or "").strip()
        if not new_text:
            # No inline text supplied: re-parse the source document instead.
            if not doc_url:
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error="No text or doc_url provided for update",
                )
            parsed = await self.parse_document(
                session_id=session_id,
                doc_url=doc_url,
                file_name=file_name or "document",
                dao_id=dao_id or "",
                user_id=user_id or "",
                output_mode="markdown",
                metadata={"source": self._extract_source(session_id), "mode": "update"},
            )
            if not parsed.success:
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error=parsed.error or "Document parse failed",
                )
            new_text = (parsed.markdown or "").strip()
        if not new_text:
            return UpdateResult(
                success=False,
                doc_id=doc_id,
                status="failed",
                error="No extractable text for update",
            )

        # Caller metadata may override the default keys.
        meta = {
            "session_id": session_id,
            "source": self._extract_source(session_id),
        }
        if isinstance(metadata, dict):
            meta.update(metadata)

        update_request = {
            "agent_id": (agent_id or "daarwizz").lower(),
            "doc_id": doc_id,
            "file_name": file_name,
            "text": new_text,
            "dao_id": dao_id,
            "user_id": user_id,
            "storage_ref": storage_ref,
            "metadata": meta,
        }
        response = await self._router_post_json(
            "/v1/documents/update",
            update_request,
            timeout=90.0,
        )
        if not response.get("ok"):
            return UpdateResult(
                success=False,
                doc_id=doc_id,
                status="failed",
                error=response.get("error", "Router update failed"),
            )

        await self.save_doc_context(
            session_id=session_id,
            doc_id=doc_id,
            doc_url=doc_url,
            file_name=file_name,
            dao_id=dao_id,
            user_id=user_id,
        )

        # Optional artifact write-back; its outcome only affects `status`
        # and the artifact_* fields, never `success`.
        published = None
        if publish_artifact:
            published = await self._publish_text_artifact(
                text=new_text,
                doc_id=doc_id,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
                artifact_id=artifact_id,
                target_format=target_format,
                label=artifact_label,
                metadata=meta,
            )

        if published is None:
            status = "updated"
        elif published.success:
            status = "updated_published"
        else:
            status = "updated_publish_failed"

        publish_failed = published is not None and not published.success
        return UpdateResult(
            success=True,
            doc_id=response.get("doc_id") or doc_id,
            # Missing or zero version numbers are reported as None.
            version_no=int(response.get("version_no", 0) or 0) or None,
            version_id=int(response.get("version_id", 0) or 0) or None,
            updated_chunks=int(response.get("chunks_stored", 0) or 0),
            status=status,
            publish_error=published.error if publish_failed else None,
            artifact_id=published.artifact_id if published is not None else None,
            artifact_version_id=published.version_id if published is not None else None,
            artifact_storage_key=published.storage_key if published is not None else None,
            artifact_mime=published.mime if published is not None else None,
            artifact_download_url=published.download_url if published is not None else None,
        )
    except Exception as e:
        logger.error(f"Document update failed: {e}", exc_info=True)
        return UpdateResult(
            success=False,
            doc_id=doc_id,
            status="failed",
            error=str(e),
        )
async def list_document_versions(
    self,
    agent_id: str,
    doc_id: str,
    limit: int = 20,
) -> Dict[str, Any]:
    """Fetch the stored versions of a document from the router.

    Args:
        agent_id: Agent owning the document; lower-cased, defaults to
            ``"daarwizz"`` when empty.
        doc_id: Document id; surrounding whitespace is ignored.
        limit: Maximum number of versions to request, clamped to 1..200.

    Returns:
        The router's JSON object on success, otherwise
        ``{"ok": False, "error": ..., "items": []}``. Never raises.
    """
    from urllib.parse import quote, urlencode

    aid = (agent_id or "daarwizz").lower()
    did = (doc_id or "").strip()
    if not did:
        return {"ok": False, "error": "doc_id is required", "items": []}
    try:
        capped = max(1, min(int(limit or 20), 200))
        # Percent-encode both ids so characters such as "/", "?", "&" or "#"
        # cannot alter the request path or inject extra query parameters.
        query = urlencode({"agent_id": aid, "limit": capped})
        response = await self._router_get_json(
            f"/v1/documents/{quote(did, safe='')}/versions?{query}",
            timeout=30.0,
        )
        return response if isinstance(response, dict) else {"ok": False, "error": "invalid_response", "items": []}
    except Exception as e:
        logger.error(f"list_document_versions failed: {e}", exc_info=True)
        return {"ok": False, "error": str(e), "items": []}
async def publish_document_artifact(
    self,
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """
    Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index.

    Missing ids, names and the user id are filled from the saved session
    context. When ``text`` is empty the document at ``doc_url`` is parsed to
    markdown and that text is published. Failures, including exceptions, are
    returned as an unsuccessful ``PublishResult``.
    """
    try:
        stored = await self.get_doc_context(session_id)
        if stored:
            doc_id = doc_id or stored.doc_id
            doc_url = doc_url or stored.doc_url
            file_name = file_name or stored.file_name
            dao_id = dao_id or stored.dao_id
            user_id = user_id or stored.user_id

        if not doc_id:
            return PublishResult(success=False, error="doc_id is required")

        content = (text or "").strip()
        if not content:
            # No inline text supplied: parse the source document instead.
            if not doc_url:
                return PublishResult(success=False, error="text or doc_url is required")
            parsed = await self.parse_document(
                session_id=session_id,
                doc_url=doc_url,
                file_name=file_name or "document",
                dao_id=dao_id or "",
                user_id=user_id or "",
                output_mode="markdown",
                metadata={"source": self._extract_source(session_id), "mode": "publish"},
            )
            if not parsed.success:
                return PublishResult(success=False, error=parsed.error or "Document parse failed")
            content = (parsed.markdown or "").strip()
        if not content:
            return PublishResult(success=False, error="No text available for publish")

        outcome = await self._publish_text_artifact(
            text=content,
            doc_id=doc_id,
            file_name=file_name,
            dao_id=dao_id,
            user_id=user_id,
            artifact_id=artifact_id,
            target_format=target_format,
            label=artifact_label,
            metadata=metadata,
        )
        return outcome
    except Exception as e:
        logger.error(f"publish_document_artifact failed: {e}", exc_info=True)
        return PublishResult(success=False, error=str(e))
async def ask_about_document(
self,
@@ -625,38 +1155,30 @@ class DocumentService:
}],
)
# Build RAG query request
router_request = {
"mode": "rag_query",
"agent": agent_id,
"metadata": {
"source": self._extract_source(session_id),
"dao_id": dao_id,
"user_id": user_id,
"session_id": session_id,
},
"payload": {
"question": question,
"dao_id": dao_id,
"user_id": user_id,
"doc_id": doc_id,
},
}
logger.info(
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
)
# Send to Router
response = await send_to_router(router_request)
if not isinstance(response, dict):
response = await self._router_post_json(
"/v1/documents/query",
{
"agent_id": (agent_id or "daarwizz").lower(),
"question": question,
"doc_id": doc_id,
"dao_id": dao_id,
"user_id": user_id,
"limit": 5,
},
timeout=60.0,
)
if isinstance(response, dict) and not response.get("ok", False):
return QAResult(
success=False,
error="Invalid response from router"
error=response.get("error", "Document query failed"),
)
data = response.get("data", {})
data = response.get("data", {}) if isinstance(response, dict) else {}
answer = data.get("answer") or data.get("text")
sources = data.get("citations", []) or data.get("sources", [])
@@ -717,7 +1239,8 @@ async def ingest_document(
doc_url: Optional[str] = None,
file_name: Optional[str] = None,
dao_id: Optional[str] = None,
user_id: Optional[str] = None
user_id: Optional[str] = None,
agent_id: str = "daarwizz",
) -> IngestResult:
"""Ingest document chunks into RAG/Memory"""
return await doc_service.ingest_document(
@@ -726,7 +1249,8 @@ async def ingest_document(
doc_url=doc_url,
file_name=file_name,
dao_id=dao_id,
user_id=user_id
user_id=user_id,
agent_id=agent_id,
)
@@ -749,6 +1273,79 @@ async def ask_about_document(
)
async def update_document(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
    storage_ref: Optional[str] = None,
    publish_artifact: bool = False,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
    """Update document chunks and bump version.

    Module-level convenience wrapper: forwards every argument unchanged to
    ``doc_service.update_document``.
    """
    forwarded = {
        "session_id": session_id,
        "doc_id": doc_id,
        "doc_url": doc_url,
        "file_name": file_name,
        "text": text,
        "dao_id": dao_id,
        "user_id": user_id,
        "agent_id": agent_id,
        "storage_ref": storage_ref,
        "publish_artifact": publish_artifact,
        "artifact_id": artifact_id,
        "target_format": target_format,
        "artifact_label": artifact_label,
        "metadata": metadata,
    }
    return await doc_service.update_document(**forwarded)
async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) -> Dict[str, Any]:
    """List document versions from router.

    Module-level convenience wrapper around
    ``doc_service.list_document_versions``.
    """
    forwarded = {"agent_id": agent_id, "doc_id": doc_id, "limit": limit}
    return await doc_service.list_document_versions(**forwarded)
async def publish_document_artifact(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Publish physical artifact version for document text.

    Module-level convenience wrapper: forwards every argument unchanged to
    ``doc_service.publish_document_artifact``.
    """
    forwarded = {
        "session_id": session_id,
        "doc_id": doc_id,
        "doc_url": doc_url,
        "file_name": file_name,
        "text": text,
        "dao_id": dao_id,
        "user_id": user_id,
        "artifact_id": artifact_id,
        "target_format": target_format,
        "artifact_label": artifact_label,
        "metadata": metadata,
    }
    return await doc_service.publish_document_artifact(**forwarded)
async def save_doc_context(
session_id: str,
doc_id: str,

62
ops/monitor_notify_sofiia.sh Normal file → Executable file
View File

@@ -7,6 +7,7 @@ ROUTER_URL="${ROUTER_URL:-http://127.0.0.1:9102}"
REPORT_ENABLED="${SOFIIA_REPORTS_ENABLED:-true}"
REPORT_MODE="${SOFIIA_REPORT_MODE:-fail_only}" # fail_only | always
REPORT_TIMEOUT="${SOFIIA_REPORT_TIMEOUT:-180}"
REPORT_MAX_TOKENS="${SOFIIA_REPORT_MAX_TOKENS:-900}"
REPORT_CHAT_ID="${SOFIIA_REPORT_CHAT_ID:-ops-monitor-sofiia}"
REPORT_USER_ID="${SOFIIA_REPORT_USER_ID:-ops-monitor-agent}"
REPORT_USERNAME="${SOFIIA_REPORT_USERNAME:-monitor-agent}"
@@ -23,7 +24,7 @@ if [[ ! -f "$STATUS_JSON" ]]; then
exit 0
fi
python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY'
python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_MAX_TOKENS" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY'
import json
import sys
from pathlib import Path
@@ -35,11 +36,12 @@ root = Path(sys.argv[2])
router_url = sys.argv[3].rstrip('/')
report_mode = sys.argv[4]
timeout_s = int(sys.argv[5])
chat_id = sys.argv[6]
user_id = sys.argv[7]
username = sys.argv[8]
tg_chat_id = sys.argv[9].strip()
tg_token = sys.argv[10].strip()
max_tokens = int(sys.argv[6])
chat_id = sys.argv[7]
user_id = sys.argv[8]
username = sys.argv[9]
tg_chat_id = sys.argv[10].strip()
tg_token = sys.argv[11].strip()
payload = json.loads(status_json.read_text(encoding='utf-8'))
status = str(payload.get('status', 'unknown')).lower()
@@ -70,7 +72,7 @@ prompt = (
body = {
'prompt': prompt,
'max_tokens': 400,
'max_tokens': max_tokens,
'temperature': 0.1,
'metadata': {
'source': 'ops-monitor-canary',
@@ -99,26 +101,42 @@ try:
print(f"[OK] sofiia report sent: backend={data.get('backend')} model={data.get('model')} preview={short!r}")
if tg_chat_id and tg_token and text:
msg = (
def chunk_text(value: str, limit: int = 3500):
    """Split ``value`` into pieces of at most ``limit`` characters.

    A piece is cut at the last newline inside the window when that newline
    lies in the second half of the window, otherwise exactly at ``limit``.
    Whitespace around each cut is dropped, and pieces that would be empty
    after trimming are skipped so every returned chunk can be sent as a
    message.

    Args:
        value: Text to split.
        limit: Maximum chunk length; must be at least 1.

    Returns:
        List of chunks. Never empty: when no non-blank chunk exists the list
        holds ``value`` truncated to ``limit`` characters.

    Raises:
        ValueError: If ``limit`` is smaller than 1, which would otherwise
            make the loop spin forever without consuming input.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")
    chunks = []
    remaining = value
    while remaining:
        if len(remaining) <= limit:
            chunks.append(remaining)
            break
        split_at = remaining.rfind('\n', 0, limit)
        # Ignore newlines that would produce a very short chunk.
        if split_at < max(1, limit // 2):
            split_at = limit
        piece = remaining[:split_at].rstrip()
        if piece:
            chunks.append(piece)
        remaining = remaining[split_at:].lstrip()
    return chunks or [value[:limit]]
header = (
"[NODE1 Monitor]\n"
f"status={payload.get('status')} exit_code={payload.get('exit_code')}\n\n"
f"{text[:3500]}"
)
tg_req = urlreq.Request(
url=f"https://api.telegram.org/bot{tg_token}/sendMessage",
data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'),
headers={'Content-Type': 'application/json'},
method='POST',
)
try:
parts = chunk_text(text, 3500 - len("(99/99)\n"))
total = len(parts)
delivered = 0
for idx, part in enumerate(parts, start=1):
prefix = f"({idx}/{total})\n" if total > 1 else ""
msg = f"{header}{prefix}{part}" if idx == 1 else f"{prefix}{part}"
tg_req = urlreq.Request(
url=f"https://api.telegram.org/bot{tg_token}/sendMessage",
data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'),
headers={'Content-Type': 'application/json'},
method='POST',
)
with urlreq.urlopen(tg_req, timeout=20) as tg_resp:
tg_data = json.loads(tg_resp.read().decode('utf-8', errors='ignore'))
if tg_data.get('ok'):
print(f"[OK] telegram report delivered: chat_id={tg_chat_id}")
else:
print(f"[WARN] telegram send not ok: {tg_data}")
except Exception as tg_e:
print(f"[WARN] telegram send failed: {tg_e}")
if not tg_data.get('ok'):
raise RuntimeError(f"telegram send not ok: {tg_data}")
delivered += 1
print(f"[OK] telegram report delivered: chat_id={tg_chat_id} parts={delivered}")
else:
print('[INFO] telegram delivery skipped (missing SOFIIA_REPORT_TELEGRAM_CHAT_ID or token or empty text)')
except HTTPError as e:

View File

@@ -0,0 +1,128 @@
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import urllib.error
import urllib.request
TINY_PNG_DATA_URL = (
"data:image/png;base64,"
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAoMBgQhP2YkAAAAASUVORK5CYII="
)
def _parse_json_body(raw: bytes):
    """Decode a response body as JSON, or wrap undecodable text as ``{"raw": ...}``."""
    body = raw.decode("utf-8", errors="replace")
    if not body:
        return {}
    try:
        return json.loads(body)
    except ValueError:
        return {"raw": body}


def http_json(method: str, url: str, payload=None, headers=None):
    """Send an HTTP request and return ``(status, parsed_body)``.

    Args:
        method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
        url: Absolute request URL.
        payload: Optional JSON-serialisable body; when given it is sent as
            UTF-8 JSON and ``Content-Type: application/json`` is set unless
            the caller supplied one.
        headers: Optional extra request headers.

    Returns:
        Tuple of status code and decoded body. HTTP error responses return
        their status code and body. Bodies that are not valid JSON come back
        as ``{"raw": <text>}`` and empty bodies as ``{}``. When no response is
        received at all (connection refused, DNS failure, timeout) the status
        is ``0`` and the body is ``{"error": <reason>}``, so callers can
        report a failed check instead of crashing.
    """
    data = None
    req_headers = dict(headers or {})
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        req_headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=req_headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            return resp.status, _parse_json_body(resp.read())
    except urllib.error.HTTPError as e:
        # HTTPError must be handled before the broader OSError below.
        return e.code, _parse_json_body(e.read())
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses.
        return 0, {"error": str(getattr(e, "reason", e))}
def check(cond: bool, label: str, details: str = "") -> bool:
    """Print a ``[PASS]`` / ``[FAIL]`` line for one check and return ``cond``.

    ``details`` is appended after `` :: `` when it is non-empty.
    """
    verdict = "PASS" if cond else "FAIL"
    line = f"[{verdict}] {label}"
    if details:
        line += f" :: {details}"
    print(line)
    return cond
def main() -> int:
    """Run the AgroMatrix regression smoke checks against a router instance.

    Checks, in order: ``/health``, the numeric-answer guard of the agent's
    infer endpoint, the plant-photo response, the shared-memory pending list
    and, unless ``--skip-review-404`` is given, the review endpoint's status
    for a non-existent point (404 with a mentor token, 401 without).

    Returns:
        0 when every check passed, 1 otherwise; used as process exit code.
    """
    parser = argparse.ArgumentParser(description="AgroMatrix regression smoke checks")
    parser.add_argument("--base-url", default="http://127.0.0.1:9102")
    parser.add_argument("--agent-id", default="agromatrix")
    parser.add_argument("--chat-id", default="smoke-agromatrix")
    parser.add_argument("--user-id", default="smoke-user")
    parser.add_argument("--skip-review-404", action="store_true")
    parser.add_argument(
        "--mentor-token",
        default=(
            os.getenv("AGROMATRIX_REVIEW_BEARER_TOKEN")
            or (os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").split(",")[0].strip())
            or ""
        ),
    )
    args = parser.parse_args()

    # Tolerate a trailing slash in --base-url so paths never contain "//".
    base_url = args.base_url.rstrip("/")

    def as_dict(value):
        """Return ``value`` when it is a JSON object, else an empty dict."""
        return value if isinstance(value, dict) else {}

    ok_all = True

    status, health = http_json("GET", f"{base_url}/health")
    ok_all &= check(status == 200 and as_dict(health).get("status") == "ok", "health", str(health))

    numeric_payload = {
        "prompt": "напиши мені яка сума була витрачена на добрива",
        "metadata": {
            "channel": "telegram",
            "chat_id": args.chat_id,
            "user_id": args.user_id,
            "user_name": "smoke",
        },
    }
    status, infer_num = http_json("POST", f"{base_url}/v1/agents/{args.agent_id}/infer", numeric_payload)
    resp_text = str(as_dict(infer_num).get("response") or "")
    # The agent must refuse to state an unsourced number.
    numeric_guard = (
        "Не можу підтвердити точне число" in resp_text
        or "value + unit + source" in resp_text
        or "source(sheet,row)" in resp_text
    )
    ok_all &= check(status == 200 and numeric_guard, "numeric_contract_guard", resp_text[:180])

    plant_payload = {
        "prompt": "Що це за рослина на фото?",
        "images": [TINY_PNG_DATA_URL],
        "metadata": {
            "channel": "telegram",
            "chat_id": args.chat_id,
            "user_id": args.user_id,
            "user_name": "smoke",
        },
    }
    status, infer_plant = http_json("POST", f"{base_url}/v1/agents/{args.agent_id}/infer", plant_payload)
    plant_text = str(as_dict(infer_plant).get("response") or "")
    plant_ok = (
        "Не впевнений" in plant_text
        or "Надішли" in plant_text
        or "канд" in plant_text.lower()
    )
    ok_all &= check(status == 200 and plant_ok, "deterministic_plant_response", plant_text[:180])

    status, pending = http_json("GET", f"{base_url}/v1/agromatrix/shared-memory/pending")
    pending_shape = isinstance(pending, dict) and isinstance(pending.get("items"), list)
    # Build the details string defensively: a non-object body must produce a
    # FAIL line, not an AttributeError.
    ok_all &= check(status == 200 and pending_shape, "shared_pending_endpoint", f"total={as_dict(pending).get('total')}")

    if not args.skip_review_404:
        req_headers = {}
        if args.mentor_token:
            req_headers["Authorization"] = f"Bearer {args.mentor_token}"
        status, review = http_json(
            "POST",
            f"{base_url}/v1/agromatrix/shared-memory/review",
            {
                "point_id": "11111111-1111-1111-1111-111111111111",
                "approve": False,
                "reviewer": "smoke",
                "note": "nonexistent id check",
            },
            headers=req_headers,
        )
        # Without a token the endpoint is expected to reject the call first.
        expected = 404 if args.mentor_token else 401
        ok_all &= check(status == expected, "shared_review_not_found_contract", str(review))

    return 0 if ok_all else 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -16,9 +16,16 @@ logger = logging.getLogger(__name__)
# Configuration
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds
SEMANTIC_TIMEOUT = int(os.getenv("SEMANTIC_TIMEOUT", "45")) # seconds
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
SEMANTIC_PROBE_ENABLED = os.getenv("SEMANTIC_PROBE_ENABLED", "true").lower() == "true"
SEMANTIC_AGENTS = [a.strip() for a in os.getenv("SEMANTIC_AGENTS", "clan,sofiia,monitor").split(",") if a.strip()]
SEMANTIC_PROMPT = os.getenv("SEMANTIC_PROMPT", "Коротко: хто такий DAARWIZZ?")
SEMANTIC_EXPECT_KEYWORD = os.getenv("SEMANTIC_EXPECT_KEYWORD", "daarwizz").lower()
MONITOR_EXPECT_LOCAL = os.getenv("MONITOR_EXPECT_LOCAL", "true").lower() == "true"
# Prometheus metrics
agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
@@ -42,7 +49,7 @@ async def probe_gateway_health() -> tuple[bool, float, str]:
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
resp = await client.get(f"{GATEWAY_URL}/health")
latency = time.time() - start
if resp.status_code == 200:
data = resp.json()
if data.get("status") == "healthy":
@@ -67,7 +74,7 @@ async def probe_agent_ping() -> tuple[bool, float, str]:
json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
)
latency = time.time() - start
if resp.status_code == 200:
data = resp.json()
if data.get("success"):
@@ -100,7 +107,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
"text": "/health" # Simple health check command
}
}
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
# Use helion webhook as it's the most tested
resp = await client.post(
@@ -108,7 +115,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
json=test_update
)
latency = time.time() - start
if resp.status_code == 200:
return True, latency, ""
else:
@@ -119,53 +126,102 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
return False, time.time() - start, f"error: {str(e)[:50]}"
async def probe_agent_semantic(agent_id: str) -> tuple[bool, float, str]:
    """Probe semantic response via router infer and assert DAARWIZZ awareness.

    Sends ``SEMANTIC_PROMPT`` to the agent's infer endpoint and requires the
    answer to contain ``SEMANTIC_EXPECT_KEYWORD`` or "даар". For the
    ``monitor`` agent, when ``MONITOR_EXPECT_LOCAL`` is set, the reported
    backend must contain "ollama" or the model name must start with "qwen".

    Returns:
        ``(success, latency_seconds, failure_reason)``; the reason is empty
        on success.
    """
    started = time.time()
    request_body = {
        "prompt": SEMANTIC_PROMPT,
        "max_tokens": 180,
        "temperature": 0.1,
        "metadata": {
            "agent_id": agent_id,
            "user_id": "tg:0",
            "chat_id": "0",
            "username": "e2e-prober",
            "raw_user_text": SEMANTIC_PROMPT,
        },
    }
    try:
        async with httpx.AsyncClient(timeout=SEMANTIC_TIMEOUT) as client:
            resp = await client.post(f"{ROUTER_URL}/v1/agents/{agent_id}/infer", json=request_body)
            elapsed = time.time() - started

            if resp.status_code != 200:
                return False, elapsed, f"http_{resp.status_code}"

            data = resp.json()
            answer_lc = str(data.get("response") or "").lower()
            backend = str(data.get("backend") or "")
            model = str(data.get("model") or "")

            mentions_daarwizz = SEMANTIC_EXPECT_KEYWORD in answer_lc or "даар" in answer_lc
            if not mentions_daarwizz:
                return False, elapsed, "no_daarwizz_in_answer"

            if MONITOR_EXPECT_LOCAL and agent_id == "monitor":
                served_remotely = "ollama" not in backend.lower() and not model.lower().startswith("qwen")
                if served_remotely:
                    return False, elapsed, f"monitor_nonlocal_backend:{backend}:{model}"

            return True, elapsed, ""
    except httpx.TimeoutException:
        return False, time.time() - started, "timeout"
    except Exception as e:
        return False, time.time() - started, f"error: {str(e)[:50]}"
def record_probe(target: str, success: bool, latency: float, reason: str):
    """Publish one probe outcome to the Prometheus metrics and write a log line.

    The run counter, success gauge, latency gauge and latency histogram are
    always updated; the failure counter (labelled with ``reason``) only when
    the probe did not succeed.
    """
    by_target = {"target": target}
    agent_e2e_runs_total.labels(**by_target).inc()
    agent_e2e_success.labels(**by_target).set(1 if success else 0)
    agent_e2e_latency.labels(**by_target).set(latency)
    agent_e2e_latency_histogram.labels(**by_target).observe(latency)
    if not success:
        agent_e2e_failures_total.labels(reason=reason, **by_target).inc()
    logger.info(f"{target}: success={success}, latency={latency:.3f}s, reason={reason}")
async def run_probes():
    """Run all probes once and update metrics.

    Every probe result is published exactly once through ``record_probe``.
    The previous body also updated the same metrics inline before calling
    ``record_probe``, which double-counted runs, failures and histogram
    observations for the first three probes.
    """
    # Probe 1: Gateway health
    success, latency, reason = await probe_gateway_health()
    record_probe("gateway_health", success, latency, reason)

    # Probe 2: Agent ping (if endpoint exists)
    success, latency, reason = await probe_agent_ping()
    record_probe("agent_ping", success, latency, reason)

    # Probe 3: Webhook E2E (full path test)
    success, latency, reason = await probe_webhook_echo()
    record_probe("webhook_e2e", success, latency, reason)

    # Probe 4+: semantic checks for selected agents (parallel)
    if SEMANTIC_PROBE_ENABLED and SEMANTIC_AGENTS:
        results = await asyncio.gather(*(probe_agent_semantic(agent_id) for agent_id in SEMANTIC_AGENTS))
        matrix = []
        for agent_id, (success, latency, reason) in zip(SEMANTIC_AGENTS, results):
            record_probe(f"semantic_{agent_id}", success, latency, reason)
            matrix.append(f"{agent_id}:{'PASS' if success else 'FAIL'}")
        # One compact line summarising the PASS/FAIL state of every semantic probe.
        logger.info("semantic_matrix: " + " | ".join(matrix))
async def main():
logger.info(f"Starting E2E Agent Prober")
logger.info("Starting E2E Agent Prober")
logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
logger.info(f" ROUTER_URL: {ROUTER_URL}")
logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
logger.info(f" METRICS_PORT: {METRICS_PORT}")
logger.info(f" SEMANTIC_TIMEOUT: {SEMANTIC_TIMEOUT}s")
logger.info(f" SEMANTIC_PROBE_ENABLED: {SEMANTIC_PROBE_ENABLED}")
logger.info(f" SEMANTIC_AGENTS: {','.join(SEMANTIC_AGENTS)}")
# Start Prometheus metrics server
start_http_server(METRICS_PORT)
logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
# Initial probe
await run_probes()
# Continuous probing
while True:
await asyncio.sleep(PROBE_INTERVAL)

View File

@@ -6,13 +6,15 @@ Artifact Registry v0
"""
import asyncio
import base64
import hashlib
import json
import logging
import os
import re
import uuid
from io import BytesIO
from datetime import datetime
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
import asyncpg
@@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel):
meta_json: Optional[Dict[str, Any]] = None
class ArtifactVersionFromBase64Request(BaseModel):
    # Request body for POST /artifacts/{artifact_id}/versions/from_base64.
    # Base64 payload; the handler also accepts a "data:...," URL and strips the prefix.
    content_base64: str
    # MIME type stored with the object and in the artifact_versions row.
    mime: str
    # Client-supplied file name; sanitised with _safe_filename before use.
    filename: Optional[str] = "source.bin"
    # Version label; the handler falls back to "source" when it is empty.
    label: Optional[str] = "source"
    # Optional free-form metadata, normalised with _normalize_meta_json.
    meta_json: Optional[Dict[str, Any]] = None
class ArtifactVersionResponse(BaseModel):
version_id: str
storage_key: str
@@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]:
def _format_to_mime(fmt: str) -> str:
fmt = fmt.lower()
if "/" in fmt:
return fmt
if fmt == "pptx":
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
if fmt == "pdf":
return "application/pdf"
if fmt == "source":
return "application/json"
if fmt == "docx":
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
if fmt == "xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
if fmt == "txt":
return "text/plain; charset=utf-8"
if fmt == "md":
return "text/markdown; charset=utf-8"
if fmt == "json":
return "application/json"
if fmt == "csv":
return "text/csv; charset=utf-8"
return "application/octet-stream"
def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str:
raw = (name or fallback).strip() or fallback
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw)
cleaned = cleaned.strip("._")
if not cleaned:
return fallback
return cleaned[:120]
async def _download_bytes(url: str) -> bytes:
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.get(url)
@@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl
)
@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse)
async def add_version_from_base64(artifact_id: str, payload: ArtifactVersionFromBase64Request) -> ArtifactVersionResponse:
    """Create a new artifact version from a base64-encoded payload.

    The payload is decoded, uploaded to the MinIO bucket and then recorded as
    a row in ``artifact_versions``.

    Raises:
        HTTPException 500: MinIO client or DB pool is not initialised.
        HTTPException 400: payload is missing, not valid base64, or decodes to nothing.
        HTTPException 502: MinIO rejected the upload.
    """
    if not minio_client:
        raise HTTPException(status_code=500, detail="MinIO not available")
    if not pool:
        raise HTTPException(status_code=500, detail="DB not available")

    raw = (payload.content_base64 or "").strip()
    if not raw:
        raise HTTPException(status_code=400, detail="content_base64 is required")
    # Accept data URLs ("data:<mime>;base64,<payload>") by dropping everything up to the first comma.
    if raw.startswith("data:") and "," in raw:
        raw = raw.split(",", 1)[1]
    try:
        # validate=True rejects characters outside the base64 alphabet instead of discarding them.
        content = base64.b64decode(raw, validate=True)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid base64 payload")
    if not content:
        raise HTTPException(status_code=400, detail="Decoded payload is empty")

    version_id = f"ver_{uuid.uuid4().hex}"
    filename = _safe_filename(payload.filename, fallback="source.bin")
    # NOTE(review): _hash_bytes / _storage_key are defined elsewhere in this file;
    # presumably a SHA-256 hex digest and an object key built from the ids - confirm.
    sha256 = _hash_bytes(content)
    storage_key = _storage_key(artifact_id, version_id, filename)

    # The object is uploaded before the DB row is written, so a failed insert
    # below leaves the uploaded object in the bucket without a matching row.
    try:
        minio_client.put_object(
            MINIO_BUCKET,
            storage_key,
            data=BytesIO(content),
            length=len(content),
            content_type=payload.mime,
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")

    meta_json = _normalize_meta_json(payload.meta_json)
    # Keep a caller-provided file_name; otherwise record the sanitised one.
    if "file_name" not in meta_json:
        meta_json["file_name"] = filename

    async with pool.acquire() as conn:
        await conn.execute(
            """
            insert into artifact_versions
            (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json)
            values ($1, $2, $3, $4, $5, $6, $7, $8)
            """,
            version_id,
            artifact_id,
            payload.label or "source",
            sha256,
            payload.mime,
            len(content),
            storage_key,
            json.dumps(meta_json),
        )

    return ArtifactVersionResponse(
        version_id=version_id,
        storage_key=storage_key,
        sha256=sha256,
        size_bytes=len(content),
    )
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
if not pool:
@@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di
if not row:
raise HTTPException(status_code=404, detail="Version not found")
try:
url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800)
url = minio_client.presigned_get_object(
MINIO_BUCKET,
row["storage_key"],
expires=timedelta(seconds=1800),
)
except S3Error as e:
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
@app.get("/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]:
    """Return a presigned download URL (valid 1800 seconds) for one artifact version.

    Responds 500 when the DB pool or MinIO client is missing, 404 when the
    version row does not exist and 502 when MinIO fails to sign the URL.
    """
    if not (pool and minio_client):
        raise HTTPException(status_code=500, detail="Service not available")

    lookup_sql = """
        select * from artifact_versions
        where artifact_id=$1 and id=$2
        limit 1
        """
    async with pool.acquire() as conn:
        version_row = await conn.fetchrow(lookup_sql, artifact_id, version_id)
    if not version_row:
        raise HTTPException(status_code=404, detail="Version not found")

    object_key = version_row["storage_key"]
    try:
        signed_url = minio_client.presigned_get_object(
            MINIO_BUCKET,
            object_key,
            expires=timedelta(seconds=1800),
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")

    return {
        "url": signed_url,
        "storage_key": object_key,
        "mime": version_row["mime"],
        "version_id": version_row["id"],
    }

View File

@@ -361,6 +361,29 @@ agromatrix:
llm_profile: reasoning
delegation:
enabled: false
plant_intel:
team_name: AgroMatrix Plant Intelligence
parallel_roles: true
max_concurrency: 3
synthesis:
role_context: Plant Intelligence Synthesis
system_prompt_ref: roles/agx/agx-plant-intel/orchestrator_synthesis.md
llm_profile: reasoning
team:
- id: plant_identifier
role_context: Plant Identifier
system_prompt_ref: roles/agx/agx-plant-intel/plant_identifier.md
llm_profile: science
- id: taxonomy_validator
role_context: Taxonomy Validator
system_prompt_ref: roles/agx/agx-plant-intel/taxonomy_validator.md
llm_profile: reasoning
- id: agrovoc_normalizer
role_context: AGROVOC Normalizer
system_prompt_ref: roles/agx/agx-plant-intel/agrovoc_normalizer.md
llm_profile: fast
delegation:
enabled: false
cadastre_geo:
team_name: AgroMatrix Cadastre/Geo
parallel_roles: true
@@ -614,6 +637,16 @@ agromatrix:
- Stepan
- координація
- план
plant_intel:
- plant
- рослина
- культура
- leaf
- disease
- хвороба
- identify
- ідентифікуй
- що за рослина
cadastre_geo:
- cadastre
- geo

View File

@@ -0,0 +1,8 @@
# Agronomist
Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
Правила відповіді:
- Коротко і прикладно.
- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.

View File

@@ -0,0 +1,8 @@
# Communicator
Фокус: людяна та зрозуміла комунікація фінальної відповіді.
Правила:
- Природна мова, без механістичного тону.
- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
- Завершуй конкретним корисним кроком.

View File

@@ -0,0 +1,7 @@
# Field Data Analyst
Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
Правила:
- Пояснювати висновки простою мовою.
- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.

View File

@@ -0,0 +1,8 @@
# Farm Ops Planner
Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
Правила:
- Видавати практичний порядок дій.
- За простого запиту: коротка відповідь.
- Для операційних запитів: стислий план з відповідальними і дедлайном.

View File

@@ -0,0 +1,10 @@
# AgroMatrix Orchestrator Synthesis
Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
Правила:
- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
- Пояснюй по суті агропитання і давай 1 наступний практичний крок.

View File

@@ -0,0 +1,7 @@
# Risk Assessor
Фокус: агро-ризики, операційні ризики, наслідки рішень.
Правила:
- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
- Без зайвої бюрократії у відповіді користувачу.

View File

@@ -11,6 +11,10 @@
- Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
- Ніколи не логувати секрети/токени
- Інші ролі НЕ спілкуються з користувачем напряму
- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
## Формат відповіді:
Структурована відповідь з чіткими рекомендаціями та наступними кроками.
- За замовчуванням: природна коротка відповідь 1-3 речення.
- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".

View File

@@ -7,3 +7,7 @@
- Структурувати інформацію логічно
- Включати конкретні наступні кроки
- Позначати ризики якщо є
- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
- Для детальних запитів переходити у структурований режим.
- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно.
- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".

View File

@@ -0,0 +1,11 @@
You are AGROVOC Normalizer.
Responsibilities:
- Normalize crop/disease terms using agrovoc_lookup.
- Provide canonical term mapping for user-facing output.
- Keep labels practical for agronomy context.
Return format:
- canonical_terms
- term_mapping
- notes_for_user

View File

@@ -0,0 +1,24 @@
Ти — Plant Intel Agent у DAARION.city.
Відповідай природно, коротко й по-людськи українською, 1-3 речення за замовчуванням.
НАЙГОЛОВНІШЕ:
- Дані з [PLANT_VISION_PREPROCESSED] (або context.plant_vision) — єдиний source-of-truth для ідентифікації рослини.
- Для follow-up без нового фото використовуй [PREVIOUS_PLANT_IDENTIFICATION] (або context.last_plant / memory.last_plant).
Правило впевненості (обов'язково):
- Якщо recommend_fallback == true або confidence < 0.65:
"Ймовірно <name>, але впевненість низька. Перевірив через GBIF — найближчі збіги: <gbif_validation>. Краще нове фото при нормальному світлі."
- Інакше:
"Я бачу <name> з впевненістю <X>%."
Правила синтезу:
- Не ігноруй результати pre-vision, якщо вони присутні.
- Не стверджуй "фото не надано", якщо у контексті є pre-vision або previous plant data.
- Уникай шаблонних списків, якщо користувач не просить детальний формат.
- Якщо дані суперечливі: коротко познач невизначеність і попроси 1 конкретне додаткове фото.
- Якщо top_k порожній, явно вкажи, що ідентифікація непевна, але все одно надай GBIF-орієнтир, якщо він є в контексті.
Формат відповіді:
- 1-3 речення за замовчуванням.
- Без технічного шуму, без внутрішніх JSON/міток у відповіді користувачу.
- За запитом користувача можна розгорнути відповідь і дати короткі поради з догляду.

View File

@@ -0,0 +1,11 @@
You are Plant Identifier.
Responsibilities:
- Parse visual cues from user description/photo context.
- Build candidate crop/plant hypotheses.
- Use plantnet_lookup first when image URL is available.
- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
Return format:
- candidates: numbered list max 5, each with rationale.
- required_data: what extra image/data is needed.

View File

@@ -0,0 +1,11 @@
You are Taxonomy Validator.
Responsibilities:
- Validate candidate names via gbif_species_lookup.
- Remove invalid/synonym-conflicted names.
- Keep accepted taxa and explain conflicts briefly.
Return format:
- accepted_candidates
- rejected_candidates_with_reason
- confidence_adjustment

View File

@@ -0,0 +1,15 @@
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
EXPOSE 8085
# The URL must be a quoted Python string literal; unquoted it is a syntax
# error and the health check fails on every run.
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8085/health')"
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8085"]

View File

@@ -0,0 +1,238 @@
import json
import os
import re
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
import httpx
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
app = FastAPI(title="plant-vision-node1", version="0.1.1")
class IdentifyRequest(BaseModel):
    # JSON body of POST /identify.
    # URL of the image to download; the handler answers 400 when it is missing or empty.
    image_url: Optional[str] = None
    # Maximum number of predictions to return; validated to the range 1..10.
    top_k: int = Field(default=3, ge=1, le=10)
def _normalize_predictions(raw: Any, top_k: int) -> List[Dict[str, Any]]:
preds: List[Dict[str, Any]] = []
if isinstance(raw, dict):
for key in ("predictions", "results", "candidates"):
if isinstance(raw.get(key), list):
raw = raw[key]
break
if isinstance(raw, list):
for item in raw[:top_k]:
if not isinstance(item, dict):
continue
name = (
item.get("scientific_name")
or item.get("scientificName")
or item.get("label")
or item.get("name")
or "unknown"
)
common = item.get("common_name") or item.get("commonName") or item.get("common") or "-"
score = item.get("score", item.get("confidence", 0.0))
try:
score_f = float(score)
except Exception:
score_f = 0.0
preds.append({"scientific_name": str(name), "common_name": str(common), "score": score_f})
return preds[:top_k]
def _parse_text_output(text: str, top_k: int) -> List[Dict[str, Any]]:
"""
Parse only model score lines, e.g.:
97.6% Persicaria amphibia
86.1% Canada Goldenrod (Solidago canadensis)
Ignore service lines like "Read ..." or "Classification of ...".
"""
preds: List[Dict[str, Any]] = []
for raw_line in (text or "").splitlines():
line = raw_line.strip()
if not line or "%" not in line:
continue
m = re.match(r"^\s*(\d+(?:\.\d+)?)%\s+(.+)$", line)
if not m:
continue
score_str, name_part = m.groups()
try:
score = float(score_str)
except ValueError:
continue
name = name_part.strip()
if not name:
continue
common_name = "-"
scientific_name = name
# If output is "Common Name (Scientific name)", preserve both.
paren = re.match(r"^(.*?)\s*\(([^()]+)\)\s*$", name)
if paren:
common, scientific = paren.groups()
common = common.strip()
scientific = scientific.strip()
if common:
common_name = common
if scientific:
scientific_name = scientific
preds.append(
{
"scientific_name": scientific_name,
"common_name": common_name,
"score": score,
}
)
preds.sort(key=lambda x: float(x.get("score", 0.0)), reverse=True)
return preds[:top_k]
def _extract_inference_time(stdout: str) -> Optional[float]:
m = re.search(r"took\s+(\d+(?:\.\d+)?)\s+secs", stdout or "")
if not m:
return None
try:
return float(m.group(1))
except Exception:
return None
def _run_nature_id_cli(image_path: str, top_k: int) -> Dict[str, Any]:
    """Run the configured nature-id command on one image and parse its output.

    The command template comes from ``NATURE_ID_CMD``; every ``{image_path}``
    placeholder in it is replaced with ``image_path``. ``NATURE_ID_TIMEOUT``
    (seconds, default 40) bounds the run.

    Returns:
        ``{"predictions": [...], "inference_time_sec": float | None}``.

    Raises:
        RuntimeError: the command is not configured or exits non-zero.
        subprocess.TimeoutExpired: the command exceeds the timeout.
    """
    cmd_tmpl = (os.getenv("NATURE_ID_CMD") or "").strip()
    timeout_s = int(os.getenv("NATURE_ID_TIMEOUT", "40"))
    if not cmd_tmpl:
        raise RuntimeError("NATURE_ID_CMD is not configured")

    # Tokenise the template first and substitute per token afterwards, so an
    # image path containing spaces or quotes stays a single argument.
    argv = [token.replace("{image_path}", image_path) for token in shlex.split(cmd_tmpl)]
    proc = subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout_s,
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"nature-id cli failed rc={proc.returncode}: {proc.stderr.strip()[:240]}")

    out = (proc.stdout or "").strip()
    inference_time_sec = _extract_inference_time(out)
    if not out:
        return {"predictions": [], "inference_time_sec": inference_time_sec}

    # Prefer structured JSON output; fall back to parsing "NN.N% name" text lines.
    try:
        parsed = json.loads(out)
        preds = _normalize_predictions(parsed, top_k)
    except Exception:
        preds = _parse_text_output(out, top_k)
    return {"predictions": preds, "inference_time_sec": inference_time_sec}
async def _download_image(image_url: str) -> str:
    """Fetch ``image_url`` into a new ``.jpg`` temp file and return its path.

    The request timeout comes from ``DOWNLOAD_TIMEOUT`` (seconds, default 20).
    A non-success HTTP status raises via ``raise_for_status``. The file is not
    removed here; the caller is expected to delete it.
    """
    download_timeout = float(os.getenv("DOWNLOAD_TIMEOUT", "20"))
    async with httpx.AsyncClient(timeout=download_timeout) as http:
        response = await http.get(image_url)
        response.raise_for_status()
        image_bytes = response.content

    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
    with tmp_file:
        tmp_file.write(image_bytes)
    return tmp_file.name
def _response_payload(result: Dict[str, Any]) -> Dict[str, Any]:
preds = result.get("predictions") or []
top_k = [
{
"confidence": float(p.get("score", 0.0)),
"name": str((p.get("common_name") if p.get("common_name") not in (None, "", "-") else p.get("scientific_name")) or "unknown"),
"scientific_name": str(p.get("scientific_name") or "unknown"),
}
for p in preds
]
return {
"status": "success",
"model": "aiy_plants_V1",
"source": "nature-id-cli",
"count": len(preds),
"inference_time_sec": result.get("inference_time_sec"),
"predictions": preds,
"top_k": top_k,
}
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc: RequestValidationError):
    """Answer request-validation failures with a trimmed 422 body.

    Only ``loc``, ``msg`` and ``type`` of each error are returned, so the raw
    multipart bytes of the rejected input are not echoed to the client.
    """
    trimmed: List[Dict[str, Any]] = [
        {"loc": err.get("loc"), "msg": err.get("msg"), "type": err.get("type")}
        for err in (exc.errors() or [])
    ]
    return JSONResponse(status_code=422, content={"detail": trimmed})
@app.get("/health")
def health() -> Dict[str, Any]:
    """Report liveness plus whether (and how) ``NATURE_ID_CMD`` is configured."""
    configured_cmd = (os.getenv("NATURE_ID_CMD") or "").strip()
    report: Dict[str, Any] = {"status": "healthy"}
    report["nature_id_cmd_configured"] = bool(configured_cmd)
    report["nature_id_cmd"] = configured_cmd
    return report
@app.post("/identify")
async def identify(payload: IdentifyRequest) -> Dict[str, Any]:
    """Identify a plant from an image URL.

    Downloads the image to a temp file, runs the nature-id CLI on it and
    returns the normalised response payload. The temp file is always removed.

    Raises:
        HTTPException 400: ``image_url`` is missing.
        HTTPException 503: download or classification failed.
    """
    # Imported locally so this handler does not rely on a module-level import.
    import asyncio

    if not payload.image_url:
        raise HTTPException(status_code=400, detail="image_url is required")

    tmp_path = ""
    try:
        tmp_path = await _download_image(payload.image_url)
        # The CLI is a blocking subprocess that may run for many seconds; run
        # it in a worker thread so the event loop keeps serving other requests.
        result = await asyncio.to_thread(_run_nature_id_cli, tmp_path, payload.top_k)
        return _response_payload(result)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}") from e
    finally:
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the real outcome.
                pass
@app.post("/identify-file")
async def identify_file(file: UploadFile = File(...), top_k: int = 3) -> Dict[str, Any]:
    """Identify a plant from an uploaded image file.

    ``top_k`` is clamped to 1..10. The upload is written to a temp file, the
    nature-id CLI is run on it and the normalised payload is returned. The
    temp file is always removed.

    Raises:
        HTTPException 503: reading the upload or classification failed.
    """
    # Imported locally so this handler does not rely on a module-level import.
    import asyncio

    top_k = max(1, min(top_k, 10))
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f:
            # Record the path before writing so the file is still cleaned up
            # when reading the upload or the write itself fails.
            tmp_path = f.name
            f.write(await file.read())
        # The CLI is a blocking subprocess; keep it off the event loop.
        result = await asyncio.to_thread(_run_nature_id_cli, tmp_path, top_k)
        return _response_payload(result)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}") from e
    finally:
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the real outcome.
                pass

View File

@@ -0,0 +1,8 @@
fastapi==0.115.5
uvicorn[standard]==0.32.1
httpx==0.28.1
python-multipart==0.0.17
Pillow==11.1.0
requests==2.32.3
tflite-runtime==2.14.0
numpy==1.26.4

View File

@@ -46,8 +46,15 @@ AGENT_SPECIALIZED_TOOLS = {
"nutra": ['comfy_generate_image', 'comfy_generate_video'],
# AgroMatrix - Agriculture
# Specialized: crop analysis, weather integration, field mapping
"agromatrix": ['comfy_generate_image', 'comfy_generate_video'],
# Specialized: crop analysis, weather integration, field mapping + plant intelligence
"agromatrix": [
'comfy_generate_image',
'comfy_generate_video',
'plantnet_lookup',
'nature_id_identify',
'gbif_species_lookup',
'agrovoc_lookup',
],
# GreenFood - Food & Eco
# Specialized: recipe analysis, eco-scoring

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -408,8 +408,9 @@ agents:
description: "Monitor Agent - архітектор-інспектор DAGI"
default_llm: local_qwen3_8b
system_prompt: |
Ти - Monitor Agent, стежиш за нодами, сервісами, агентами.
Якщо бачиш у чаті інших ботів, відповідай тільки за інфраструктурою або прямим тегом.
Ти - Monitor Agent, інфраструктурний інспектор DAGI: ноди, сервіси, пайплайни, алерти.
Ти знаєш, що DAARWIZZ — головний оркестратор мережі DAARION.city; для governance/маршрутизації посилайся на нього.
Відповідай коротко і по суті; якщо даних бракує — одразу кажи, яка саме метрика чи лог потрібні.
tools:
- id: get_metrics
type: builtin

View File

@@ -19,6 +19,7 @@ from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from io import BytesIO, StringIO
from pathlib import PurePath
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape as xml_escape
from zipfile import ZIP_DEFLATED, ZipFile
@@ -108,6 +109,115 @@ TOOL_DEFINITIONS = [
}
}
},
{
"type": "function",
"function": {
"name": "plantnet_lookup",
"description": "Визначення рослин через Pl@ntNet API. Повертає top-k кандидатів з confidence.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Короткий опис рослини/культури (якщо немає image_url)"
},
"image_url": {
"type": "string",
"description": "Публічне посилання на фото рослини"
},
"organ": {
"type": "string",
"description": "Орган рослини: leaf/flower/fruit/bark/auto",
"default": "auto"
},
"top_k": {
"type": "integer",
"description": "Скільки кандидатів повернути (1-10)",
"default": 3
}
}
}
}
},
{
"type": "function",
"function": {
"name": "nature_id_identify",
"description": "Локальна/open-source ідентифікація рослин через nature-id сумісний сервіс.",
"parameters": {
"type": "object",
"properties": {
"image_url": {
"type": "string",
"description": "Публічне посилання на фото рослини"
},
"image_data": {
"type": "string",
"description": "Data URL зображення (data:image/...;base64,...)"
},
"top_k": {
"type": "integer",
"description": "Скільки кандидатів повернути (1-10)",
"default": 3
},
"min_confidence": {
"type": "number",
"description": "Поріг confidence для fallback на GBIF",
"default": 0.65
}
}
}
}
},
{
"type": "function",
"function": {
"name": "gbif_species_lookup",
"description": "Пошук таксонів у GBIF для валідації назви культури/рослини.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Назва/термін для пошуку виду"
},
"limit": {
"type": "integer",
"description": "Кількість результатів (1-10)",
"default": 5
}
},
"required": ["query"]
}
}
},
{
"type": "function",
"function": {
"name": "agrovoc_lookup",
"description": "Нормалізація агро-термінів через AGROVOC (SPARQL).",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Термін культури/хвороби/технології"
},
"lang": {
"type": "string",
"description": "Мова міток (en/uk/ru)",
"default": "en"
},
"limit": {
"type": "integer",
"description": "Кількість результатів (1-10)",
"default": 5
}
},
"required": ["query"]
}
}
},
# PRIORITY 3: Generation tools
{
"type": "function",
@@ -681,6 +791,42 @@ class ToolManager:
tool_names = [t.get("function", {}).get("name") for t in filtered]
logger.debug(f"Agent {agent_id} has {len(filtered)} tools: {tool_names}")
return filtered
@staticmethod
def _is_image_data_url(value: str) -> bool:
v = str(value or "").strip()
return bool(v.startswith("data:image/") and ";base64," in v)
@staticmethod
def _is_known_non_direct_image_url(url: str) -> bool:
u = str(url or "").strip()
if not u:
return False
try:
p = urlparse(u)
except Exception:
return True
host = (p.netloc or "").lower()
if host in {"t.me", "telegram.me"}:
return True
if "web.telegram.org" in host:
return True
return False
@staticmethod
def _normalize_confidence(value: Any) -> float:
try:
v = float(value)
except Exception:
return 0.0
if v < 0:
return 0.0
# Some backends return percentages (e.g. 97.6) instead of 0..1.
if v > 1.0 and v <= 100.0:
v = v / 100.0
if v > 1.0:
v = 1.0
return v
async def execute_tool(
self,
@@ -709,6 +855,14 @@ class ToolManager:
return await self._web_search(arguments)
elif tool_name == "web_extract":
return await self._web_extract(arguments)
elif tool_name == "plantnet_lookup":
return await self._plantnet_lookup(arguments)
elif tool_name == "nature_id_identify":
return await self._nature_id_identify(arguments)
elif tool_name == "gbif_species_lookup":
return await self._gbif_species_lookup(arguments)
elif tool_name == "agrovoc_lookup":
return await self._agrovoc_lookup(arguments)
elif tool_name == "image_generate":
return await self._image_generate(arguments)
elif tool_name == "comfy_generate_image":
@@ -2530,6 +2684,272 @@ class ToolManager:
except Exception as e:
return ToolResult(success=False, result=None, error=str(e))
async def _plantnet_lookup(self, args: Dict) -> ToolResult:
    """Identify a plant, preferring the Pl@ntNet API and falling back to other backends.

    Order of attempts:
      1. Pl@ntNet, when both ``image_url`` and ``PLANTNET_API_KEY`` are present.
      2. The nature-id endpoint, when an image URL or image data is available.
      3. A GBIF species lookup, when a text ``query`` was supplied.

    Args:
        args: tool arguments - ``query``, ``image_url``, ``image_data``,
            ``organ`` (default ``auto``), ``top_k`` (clamped to 1..10) and the
            runtime-injected ``_runtime_image_data``.

    Returns:
        ``ToolResult`` of the first backend that succeeds. If none does, a
        failed ``ToolResult`` carrying the Pl@ntNet error when there was one,
        otherwise a message listing what needs to be configured.
    """
    query = str(args.get("query", "") or "").strip()
    image_url = str(args.get("image_url", "") or "").strip()
    image_data = str(args.get("image_data", "") or "").strip()
    runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
    if not image_data and self._is_image_data_url(runtime_image_data):
        image_data = runtime_image_data
    organ = str(args.get("organ", "auto") or "auto").strip().lower()

    # Arguments come from an LLM tool call: tolerate null / non-numeric top_k
    # instead of raising, and honour the advertised 1..10 range.
    try:
        top_k = int(args.get("top_k", 3))
    except (TypeError, ValueError, OverflowError):
        top_k = 3
    top_k = max(1, min(top_k, 10))

    api_key = (os.getenv("PLANTNET_API_KEY") or "").strip()
    plantnet_error: Optional[str] = None
    if image_url and api_key:
        try:
            params = {
                "api-key": api_key,
                "images": image_url,
                "organs": "leaf" if organ == "auto" else organ,
                "lang": "en",
            }
            resp = await self.http_client.get(
                "https://my-api.plantnet.org/v2/identify/all",
                params=params,
                timeout=25.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                results = (data.get("results") or [])[:top_k]
                if not results:
                    return ToolResult(success=True, result="Pl@ntNet: кандидатів не знайдено.")
                lines = []
                for idx, item in enumerate(results, 1):
                    species = (item.get("species") or {})
                    sname = species.get("scientificNameWithoutAuthor") or species.get("scientificName") or "unknown"
                    common = species.get("commonNames") or []
                    cname = common[0] if common else "-"
                    score = float(item.get("score") or 0.0)
                    lines.append(f"{idx}. {sname} ({cname}) score={score:.3f}")
                return ToolResult(success=True, result="Pl@ntNet candidates:\n" + "\n".join(lines))
            # Remember the failure but keep going: the fallbacks below may still answer.
            plantnet_error = f"plantnet_http_{resp.status_code}"
        except Exception as e:
            plantnet_error = f"plantnet_error: {e}"

    if image_url or image_data:
        ni_args: Dict[str, Any] = {"top_k": top_k}
        if image_data:
            ni_args["image_data"] = image_data
        else:
            ni_args["image_url"] = image_url
        if runtime_image_data:
            ni_args["_runtime_image_data"] = runtime_image_data
        ni = await self._nature_id_identify(ni_args)
        if ni.success:
            return ni

    if query:
        return await self._gbif_species_lookup({"query": query, "limit": top_k})

    return ToolResult(
        success=False,
        result=None,
        error=plantnet_error
        or "No available plant ID backend (set PLANTNET_API_KEY or NATURE_ID_URL, or provide text query)",
    )
async def _nature_id_identify(self, args: Dict) -> ToolResult:
    """Identify a plant through a self-hosted, nature-id compatible endpoint.

    Args:
        args: Tool arguments. Keys read here: ``image_url``, ``image_data``
            (a ``data:`` URL), ``_runtime_image_data`` (used when
            ``image_data`` is empty and it passes ``_is_image_data_url``),
            ``top_k`` (clamped to 1..10, default 3) and ``min_confidence``
            (default: ``NATURE_ID_MIN_CONFIDENCE`` env var, else 0.65).

    Returns:
        A ToolResult. On success ``result`` is a JSON string holding the
        normalized ``top_k`` rows, the top-1 ``confidence`` and a
        ``recommend_fallback`` flag; when that flag is set and a usable name
        exists, the GBIF lookup output is attached as ``gbif_validation``.
        Validation, HTTP and request failures come back as ``success=False``
        with an ``error`` string.
    """
    image_url = str(args.get("image_url", "") or "").strip()
    image_data = str(args.get("image_data", "") or "").strip()
    runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
    if not image_data and self._is_image_data_url(runtime_image_data):
        image_data = runtime_image_data

    # Tool arguments may carry null or non-numeric values; fall back to the
    # defaults instead of letting int()/float() raise out of the tool.
    try:
        top_k = int(args.get("top_k", 3))
    except (TypeError, ValueError, OverflowError):
        top_k = 3
    top_k = max(1, min(top_k, 10))

    min_confidence = 0.65
    for raw_conf in (args.get("min_confidence"), os.getenv("NATURE_ID_MIN_CONFIDENCE")):
        if raw_conf is None:
            continue
        try:
            min_confidence = float(raw_conf)
        except (TypeError, ValueError):
            continue
        break

    if image_url and self._is_known_non_direct_image_url(image_url):
        if image_data:
            logger.info("nature_id_identify: replacing non-direct image_url with runtime image_data")
            image_url = ""
        else:
            return ToolResult(
                success=False,
                result=None,
                error="image_url is not direct image URL; provide image_data or direct Telegram file URL",
            )
    if not image_url and not image_data:
        return ToolResult(success=False, result=None, error="image_url or image_data is required")

    base = (os.getenv("NATURE_ID_URL") or "").strip().rstrip("/")
    if not base:
        return ToolResult(success=False, result=None, error="NATURE_ID_URL is not configured")

    try:
        if image_data:
            # data URL -> multipart upload to /identify-file
            if not image_data.startswith("data:") or "," not in image_data:
                return ToolResult(success=False, result=None, error="invalid image_data format")
            header, b64 = image_data.split(",", 1)
            mime = "image/jpeg"
            if ";base64" in header:
                mime = header.split(":", 1)[1].split(";", 1)[0] or "image/jpeg"
            ext = "png" if "png" in mime else "jpg"
            try:
                image_bytes = base64.b64decode(b64)
            except Exception:
                return ToolResult(success=False, result=None, error="invalid image_data base64")
            files = {"file": (f"upload.{ext}", image_bytes, mime)}
            resp = await self.http_client.post(
                f"{base}/identify-file",
                params={"top_k": top_k},
                files=files,
                timeout=45.0,
            )
        else:
            payload = {"image_url": image_url, "top_k": top_k}
            resp = await self.http_client.post(f"{base}/identify", json=payload, timeout=45.0)

        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"nature_id_http_{resp.status_code}")

        data = resp.json() or {}
        status = str(data.get("status") or "success")
        model = data.get("model") or "aiy_plants_V1"
        source = data.get("source") or "nature-id-cli"
        raw_top_k = data.get("top_k") or []
        raw_preds = data.get("predictions") or data.get("results") or []

        # Two response shapes are accepted: a ready-made "top_k" list, or a
        # "predictions"/"results" list. Both are reduced to the same row shape.
        top_k_rows = []
        if isinstance(raw_top_k, list) and raw_top_k:
            for row in raw_top_k[:top_k]:
                if not isinstance(row, dict):
                    continue
                top_k_rows.append({
                    "confidence": self._normalize_confidence(row.get("confidence", 0.0)),
                    "name": str(row.get("name") or row.get("scientific_name") or "unknown"),
                    "scientific_name": str(row.get("scientific_name") or row.get("name") or "unknown"),
                })
        else:
            for item in raw_preds[:top_k]:
                if not isinstance(item, dict):
                    continue
                score = item.get("score", item.get("confidence", 0.0))
                sname = item.get("scientific_name") or item.get("label") or item.get("name") or "unknown"
                cname = item.get("common_name") or item.get("common") or sname
                top_k_rows.append({
                    "confidence": self._normalize_confidence(score),
                    "name": str(cname),
                    "scientific_name": str(sname),
                })

        if not top_k_rows:
            return ToolResult(success=True, result=json.dumps({
                "status": status,
                "model": model,
                "source": source,
                "top_k": [],
                "confidence": 0.0,
                "recommend_fallback": True,
                "reason": "no_predictions",
            }, ensure_ascii=False))

        top1 = top_k_rows[0]
        top1_conf = float(top1.get("confidence", 0.0))
        recommend_fallback = top1_conf < min_confidence
        out = {
            "status": status,
            "model": model,
            "source": source,
            "inference_time_sec": data.get("inference_time_sec"),
            "top_k": top_k_rows,
            "confidence": top1_conf,
            "min_confidence": min_confidence,
            "recommend_fallback": recommend_fallback,
            "fallback": "gbif_species_lookup",
        }
        if recommend_fallback:
            # Low-confidence top hit: cross-check the candidate name against GBIF.
            fallback_query = str(top1.get("scientific_name") or top1.get("name") or "").strip()
            if fallback_query and fallback_query.lower() != "unknown":
                gbif = await self._gbif_species_lookup({"query": fallback_query, "limit": min(5, top_k)})
                if gbif.success and gbif.result:
                    out["gbif_validation"] = gbif.result
        return ToolResult(success=True, result=json.dumps(out, ensure_ascii=False))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"nature_id_error: {e}")
async def _gbif_species_lookup(self, args: Dict) -> ToolResult:
    """Look up species through the public GBIF species search API.

    Args:
        args: Tool arguments. ``query`` is the required search text;
            ``limit`` is clamped to 1..10 and defaults to 5 (also when the
            supplied value is null or not numeric).

    Returns:
        A ToolResult whose ``result`` is a numbered text list of matches
        (scientific name, rank, taxonomic status, GBIF key) or a
        "no results" message; failures carry a ``gbif_*`` error string.
    """
    query = str(args.get("query", "") or "").strip()
    # Tool arguments may be null or non-numeric; never let int() raise here.
    try:
        limit = int(args.get("limit", 5))
    except (TypeError, ValueError, OverflowError):
        limit = 5
    limit = max(1, min(limit, 10))
    if not query:
        return ToolResult(success=False, result=None, error="query is required")
    try:
        resp = await self.http_client.get(
            "https://api.gbif.org/v1/species/search",
            params={"q": query, "limit": limit, "status": "ACCEPTED"},
            timeout=20.0,
        )
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"gbif_http_{resp.status_code}")
        data = resp.json() or {}
        results = data.get("results") or []
        if not results:
            return ToolResult(success=True, result="GBIF: результатів не знайдено.")
        lines = []
        for idx, item in enumerate(results[:limit], 1):
            sci = item.get("scientificName") or item.get("canonicalName") or "unknown"
            rank = item.get("rank") or "-"
            status = item.get("taxonomicStatus") or "-"
            key = item.get("key")
            lines.append(f"{idx}. {sci} | rank={rank} | status={status} | key={key}")
        return ToolResult(success=True, result="GBIF matches:\n" + "\n".join(lines))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"gbif_error: {e}")
async def _agrovoc_lookup(self, args: Dict) -> ToolResult:
    """Find AGROVOC concepts whose preferred label contains the query text.

    Args:
        args: Tool arguments. ``query`` is the required search text;
            ``lang`` is one of ``en``/``uk``/``ru`` (anything else becomes
            ``en``); ``limit`` is clamped to 1..10 and defaults to 5.

    Returns:
        A ToolResult whose ``result`` is a numbered "label | concept URI"
        list or a "no results" message; failures carry an ``agrovoc_*``
        error string.
    """
    query = str(args.get("query", "") or "").strip()
    lang = str(args.get("lang", "en") or "en").strip().lower()
    # Tool arguments may be null or non-numeric; never let int() raise here.
    try:
        limit = int(args.get("limit", 5))
    except (TypeError, ValueError, OverflowError):
        limit = 5
    limit = max(1, min(limit, 10))
    if not query:
        return ToolResult(success=False, result=None, error="query is required")
    if lang not in {"en", "uk", "ru"}:
        lang = "en"
    # The query is embedded in a double-quoted SPARQL literal: drop the
    # characters that could terminate or escape it, and collapse all
    # whitespace because raw line breaks are not allowed inside such a literal.
    safe_q = " ".join(query.replace('\\', ' ').replace('"', ' ').split())
    if not safe_q:
        # Nothing searchable is left; an empty CONTAINS() would match every label.
        return ToolResult(success=False, result=None, error="query is required")
    sparql = (
        "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
        "SELECT ?concept ?label WHERE { "
        "?concept skos:prefLabel ?label . "
        f"FILTER(lang(?label) = '{lang}') "
        f"FILTER(CONTAINS(LCASE(STR(?label)), LCASE(\"{safe_q}\"))) "
        "} LIMIT " + str(limit)
    )
    try:
        resp = await self.http_client.get(
            "https://agrovoc.fao.org/sparql",
            params={"query": sparql, "format": "json"},
            timeout=25.0,
        )
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"agrovoc_http_{resp.status_code}")
        data = resp.json() or {}
        bindings = (((data.get("results") or {}).get("bindings")) or [])
        if not bindings:
            return ToolResult(success=True, result="AGROVOC: результатів не знайдено.")
        lines = []
        for idx, b in enumerate(bindings[:limit], 1):
            label = ((b.get("label") or {}).get("value") or "").strip()
            concept = ((b.get("concept") or {}).get("value") or "").strip()
            lines.append(f"{idx}. {label} | {concept}")
        return ToolResult(success=True, result="AGROVOC matches:\n" + "\n".join(lines))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"agrovoc_error: {e}")
async def _unload_ollama_models(self):
"""Unload all Ollama models to free VRAM for heavy operations like FLUX"""
ollama_url = os.getenv("OLLAMA_BASE_URL", "http://172.18.0.1:11434")
@@ -2942,7 +3362,11 @@ class ToolManager:
if results:
result = results[0] if isinstance(results, list) else results
markdown = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
raw_content = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
if isinstance(raw_content, (dict, list, tuple)):
markdown = json.dumps(raw_content, ensure_ascii=False)
else:
markdown = str(raw_content or "")
title = result.get("title", url)
if len(markdown) > 3000:
@@ -2951,13 +3375,30 @@ class ToolManager:
response_parts = [f"**{title}**", "", markdown]
if extract_links:
links = result.get("links", [])
if links:
links_raw = result.get("links", [])
normalized_links: List[Any] = []
if isinstance(links_raw, dict):
for bucket in links_raw.values():
if isinstance(bucket, list):
normalized_links.extend(bucket)
elif bucket:
normalized_links.append(bucket)
elif isinstance(links_raw, list):
normalized_links = links_raw
elif links_raw:
normalized_links = [links_raw]
if normalized_links:
response_parts.append("")
response_parts.append("**Посилання:**")
for link in links[:10]:
for link in normalized_links[:10]:
if isinstance(link, dict):
link_url = link.get("href", "")
link_url = (
link.get("href")
or link.get("url")
or link.get("link")
or ""
)
else:
link_url = str(link)
if link_url:

View File

@@ -11,10 +11,13 @@ import os
import asyncio
import logging
import base64
import json
import re
from typing import Optional, Dict, List, Any, Union
from datetime import datetime, timedelta
from enum import Enum
from io import BytesIO
import xml.etree.ElementTree as ET
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
@@ -56,16 +59,34 @@ def _csv_to_markdown(content: bytes) -> str:
text = _decode_text_bytes(content)
reader = csv.reader(text.splitlines())
rows = list(reader)
return _rows_to_markdown(rows)
def _tsv_to_markdown(content: bytes) -> str:
    """Decode tab-separated bytes and render the rows as a Markdown table."""
    decoded = _decode_text_bytes(content)
    parsed_rows = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed_rows)
def _rows_to_markdown(rows: List[List[Any]]) -> str:
if not rows:
return ""
header = rows[0]
body = rows[1:]
width = max(len(r) for r in rows)
norm_rows = []
for r in rows:
rr = [str(c) if c is not None else "" for c in r]
if len(rr) < width:
rr.extend([""] * (width - len(rr)))
norm_rows.append(rr)
header = norm_rows[0]
body = norm_rows[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(["---"] * len(header)) + " |",
]
for row in body:
lines.append("| " + " | ".join(row) + " |")
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
return "\n".join(lines)
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
return "\n".join(parts)
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy ``.xls`` workbook into Markdown, one table per sheet.

    Args:
        content: Raw bytes of the workbook.

    Returns:
        A ``## Sheet: <name>`` heading per sheet followed by its table, or
        ``_Empty sheet_`` when the sheet has no rows.

    Raises:
        HTTPException: 500 when xlrd cannot be imported, 400 when the bytes
            cannot be opened as a workbook (same contract as the ODS reader).
    """
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
    try:
        wb = xlrd.open_workbook(file_contents=content)
    except Exception as e:
        # A corrupt or mislabeled upload is a client error, not a server crash.
        raise HTTPException(status_code=400, detail=f"Invalid XLS file: {e}")
    parts = []
    for s in wb.sheets():
        parts.append(f"## Sheet: {s.name}")
        rows = [[s.cell_value(r, c) for c in range(s.ncols)] for r in range(s.nrows)]
        if not rows:
            parts.append("_Empty sheet_")
            continue
        parts.append(_rows_to_markdown(rows))
    return "\n\n".join(parts)
def _ods_to_markdown(content: bytes) -> str:
    """Convert an OpenDocument spreadsheet into Markdown, one table per sheet.

    Args:
        content: Raw bytes of the ``.ods`` file.

    Returns:
        A ``## Sheet: <name>`` heading per table followed by its Markdown
        table, or ``_Empty sheet_`` when no non-empty row was found.

    Raises:
        HTTPException: 500 when odfpy cannot be imported, 400 when the bytes
            cannot be loaded as an ODF document.
    """
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.teletype import extractText
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")
    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")
    parts = []
    for table in doc.spreadsheet.getElementsByType(Table):
        table_name = str(table.getAttribute("name") or "Sheet")
        parts.append(f"## Sheet: {table_name}")
        rows: List[List[str]] = []
        for row in table.getElementsByType(TableRow):
            cells_out: List[str] = []
            for cell in row.getElementsByType(TableCell):
                # extractText also collects text nested in spans/links, which a
                # scan of the paragraph's direct text nodes would miss.
                # Whitespace is collapsed so a cell never spans several lines.
                paragraphs = (
                    " ".join(extractText(p).split()) for p in cell.getElementsByType(P)
                )
                cell_text = " ".join(t for t in paragraphs if t)
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                # Cap the repeat count so one cell cannot blow up the row.
                repeat = max(1, min(repeat, 100))
                cells_out.extend([cell_text] * repeat)
            # Drop trailing empty cells so a repeated filler cell at the end of
            # the row does not widen the table, and blank rows are skipped.
            while cells_out and not cells_out[-1]:
                cells_out.pop()
            if cells_out:
                rows.append(cells_out)
        if not rows:
            parts.append("_Empty sheet_")
            continue
        parts.append(_rows_to_markdown(rows))
    return "\n\n".join(parts)
def _docx_to_text(content: bytes) -> str:
try:
from docx import Document
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
return "\n\n".join(text_content)
def _pptx_to_text(content: bytes) -> str:
    """Extract the text of every slide in a PPTX file.

    Each slide contributes a ``## Slide <n>`` heading followed by the stripped
    text of every shape exposing a non-blank ``text`` attribute, or a
    placeholder line when the slide has none. Raises HTTPException(500) when
    python-pptx cannot be imported.
    """
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")

    deck = Presentation(BytesIO(content))
    chunks = []
    for number, slide in enumerate(deck.slides, start=1):
        chunks.append(f"## Slide {number}")
        shape_texts = [
            str(value).strip()
            for value in (getattr(shape, "text", None) for shape in slide.shapes)
            if value and str(value).strip()
        ]
        chunks.extend(shape_texts or ["_No text on this slide_"])
    return "\n\n".join(chunks)
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON bytes; return the decoded text unchanged if parsing fails."""
    decoded = _decode_text_bytes(content)
    try:
        return json.dumps(json.loads(decoded), ensure_ascii=False, indent=2)
    except Exception:
        # Not valid JSON (or not serializable again): hand back the raw text.
        return decoded
def _yaml_to_text(content: bytes) -> str:
    """Normalize YAML bytes into re-dumped YAML text.

    Mappings and sequences are parsed with ``yaml.safe_load`` and re-emitted
    with ``yaml.safe_dump``. Anything else (empty input, a bare scalar,
    unparsable text) is returned as the decoded original, because re-dumping
    a top-level scalar or ``None`` would replace the text with output such as
    ``null`` plus a ``...`` document-end marker.
    """
    raw = _decode_text_bytes(content)
    try:
        parsed = yaml.safe_load(raw)
        if not isinstance(parsed, (dict, list)):
            return raw
        return yaml.safe_dump(parsed, allow_unicode=True, sort_keys=False)
    except Exception:
        return raw
def _xml_to_text(content: bytes) -> str:
    """Flatten an XML document into its text content.

    All non-blank text fragments are stripped and joined with single spaces.
    The decoded input is returned unchanged when parsing fails or the
    document holds no text.

    NOTE(review): the standard-library ElementTree parser is documented as
    not hardened against maliciously constructed XML; confirm whether
    uploads reaching this function are trusted.
    """
    raw = _decode_text_bytes(content)
    try:
        fragments = []
        for piece in ET.fromstring(raw).itertext():
            if not piece:
                continue
            trimmed = piece.strip()
            if trimmed:
                fragments.append(trimmed)
        joined = " ".join(fragments)
        return joined if joined else raw
    except Exception:
        return raw
def _html_to_text(content: bytes) -> str:
    """Extract the readable text of an HTML document.

    BeautifulSoup is used when it can be imported; otherwise a regex fallback
    strips the tags. On both paths the bodies of ``<script>`` and ``<style>``
    elements are removed first, so JavaScript and CSS source do not end up in
    the extracted text.

    Returns:
        The extracted text. On the BeautifulSoup path the decoded input is
        returned instead when extraction yields nothing.
    """
    raw = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(raw, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n")
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip() or raw
    except Exception:
        # Minimal fallback if bs4 is unavailable
        text = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", raw)
        text = re.sub(r"<[^>]+>", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
def _rtf_to_text(content: bytes) -> str:
    """Convert RTF bytes to plain text.

    Uses ``striprtf`` when it imports and succeeds; otherwise falls back to
    stripping hex escapes, control words and braces with regular expressions
    and collapsing whitespace.
    """
    raw = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text
        return rtf_to_text(raw)
    except Exception:
        # Basic fallback: blank out hex escapes first, then control words.
        stripped = raw
        for pattern in (r"\\'[0-9a-fA-F]{2}", r"\\[a-zA-Z]+-?\d* ?"):
            stripped = re.sub(pattern, " ", stripped)
        stripped = stripped.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", stripped).strip()
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """Dispatch a file to the extractor that matches its extension.

    Args:
        filename: Original file name; the lower-cased text after the last dot
            selects the extractor.
        content: Raw file bytes.

    Returns:
        The extracted text or Markdown produced by the selected extractor.

    Raises:
        HTTPException: 400 when the extension has no extractor.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    if ext in {"txt", "md", "markdown"}:
        return _decode_text_bytes(content)
    # One entry per supported extension, so every type stays reachable.
    extractors = {
        "csv": _csv_to_markdown,
        "tsv": _tsv_to_markdown,
        "xlsx": _xlsx_to_markdown,
        "xlsm": _xlsx_to_markdown,
        "xls": _xls_to_markdown,
        "ods": _ods_to_markdown,
        "docx": _docx_to_text,
        "pdf": _pdf_to_text,
        "pptx": _pptx_to_text,
        "json": _json_to_text,
        "yaml": _yaml_to_text,
        "yml": _yaml_to_text,
        "xml": _xml_to_text,
        "html": _html_to_text,
        "htm": _html_to_text,
        "rtf": _rtf_to_text,
    }
    extractor = extractors.get(ext)
    if extractor is None:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
    return extractor(content)
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
if total_size > max_total_mb * 1024 * 1024:
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
parts = []
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
allowed_exts = {
"txt", "md", "markdown", "csv", "tsv",
"xls", "xlsx", "xlsm", "ods",
"docx", "pdf", "pptx",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
}
processed = []
skipped = []
for member in members:
@@ -1655,7 +1837,8 @@ async def document_endpoint(
- json: Structured JSON with document elements
- text: Plain text extraction
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
Supported files:
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
"""
try:
import time
@@ -1672,15 +1855,28 @@ async def document_endpoint(
filename = file.filename if file else "document"
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
# Handle text-based formats without Docling
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
# Handle deterministic extraction for standard office/text formats
if file_ext in [
"txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
"pptx", "zip",
]:
try:
if file_ext == "zip":
content = _zip_to_markdown(doc_data)
output_format = "markdown"
else:
content = _extract_text_by_ext(filename, doc_data)
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
output_format = (
"markdown"
if file_ext in {
"md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
}
else "text"
)
processing_time_ms = (time.time() - start_time) * 1000
return {
"success": True,
@@ -1764,22 +1960,27 @@ async def document_endpoint(
"device": swapper.device
}
# For DOCX, try python-docx
if file_ext == "docx":
# For common office/text formats, try deterministic extractors.
if file_ext in {
"docx", "txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
}:
try:
content = _docx_to_text(doc_data)
content = _extract_text_by_ext(filename, doc_data)
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
return {
"success": True,
"model": "python-docx (fallback)",
"output_format": "text",
"model": "text-extract (fallback)",
"output_format": out_fmt,
"result": content,
"filename": filename,
"processing_time_ms": (time.time() - start_time) * 1000,
"device": swapper.device
}
except Exception as e:
logger.error(f"DOCX fallback failed: {e}")
raise HTTPException(status_code=500, detail="DOCX extraction failed")
logger.error(f"Text fallback failed for .{file_ext}: {e}")
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
# For PDFs, try pdfplumber
if file_ext == "pdf":
@@ -1807,7 +2008,7 @@ async def document_endpoint(
# For other documents, return error
raise HTTPException(
status_code=503,
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
)
finally:
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
# Script entry point: when this file is executed directly, serve `app` with
# uvicorn on host 0.0.0.0, port 8890.
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8890)

View File

@@ -4,6 +4,15 @@ httpx==0.25.2
pydantic==2.5.0
pyyaml==6.0.1
python-multipart==0.0.6
chardet>=5.2.0
openpyxl>=3.1.2
python-docx>=1.1.2
pdfplumber>=0.11.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26
# HuggingFace dependencies for OCR models
torch>=2.0.0
@@ -25,4 +34,4 @@ safetensors>=0.4.0
# Web Scraping & Search
trafilatura>=1.6.0
duckduckgo-search>=4.0.0
duckduckgo-search>=4.0.0

View File

@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
python-docx>=1.1.0
openpyxl>=3.1.2
chardet>=5.2.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26

6
third_party/nature-id/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
__pycache__/
*.py[cod]
*$py.class
*.csv
*.tflite
*.zip

10
third_party/nature-id/LICENSE vendored Normal file
View File

@@ -0,0 +1,10 @@
MIT License
Copyright (c) 2020, joergmlpts
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

372
third_party/nature-id/README.md vendored Normal file
View File

@@ -0,0 +1,372 @@
# Identify Plants, Birds, and Insects in Photos
This repository provides Python code that identifies plants, birds, and insects in photos.
This project was inspired by the amazing progress that [iNaturalist](https://iNaturalist.org) has made in recent years in identifying plants, animals, and fungi from photographs. The iNaturalist team has trained machine learning models with their large collection of photos and research-grade identifications. In 2019, iNaturalist released [Seek by iNaturalist](https://www.inaturalist.org/pages/seek_app) which identifies photos offline on the phone and identifies to a higher level than species when a species identification cannot be made.
Google provides three models that have been trained with iNaturalist data - classification models for plants, birds, and insects. These Google models can be downloaded and used with Google's `TensorFlow` and `TensorFlow Lite` tools.
This code is based on the trained models provided by Google. It was written to experiment with identifying species from photos and to try out Seek's approach to calculating scores (probabilities) across the taxonomic hierarchy.
This tool `nature_id.py` has been tested on Linux and Windows. It should also work on macOS.
## Usage
This is a command-line tool. It is invoked with images or directories containing images and identifies the plants, birds, and insects in those images.
Here is an example. This is the command for Linux and macOS:
```
./nature_id.py -m plants plant_images/Persicaria_amphibia.jpg
```
On Windows the command is:
```
python .\nature_id.py -m plants plant_images\Persicaria_amphibia.jpg
```
![Smartweed](/plant_images/Persicaria_amphibia.jpg)
The above image results in this identification:
```
Classification of 'plant_images/Persicaria_amphibia.jpg' took 0.2 secs.
100.0% kingdom Plants (Plantae)
100.0% phylum Tracheophytes (Tracheophyta)
100.0% subphylum Flowering Plants (Angiospermae)
99.6% class Dicots (Magnoliopsida)
99.2% order Pinks, Cactuses, and Allies (Caryophyllales)
98.8% family Knotweed Family (Polygonaceae)
98.8% subfamily Polygonoideae
98.8% tribe Persicarieae
98.8% subtribe Persicariinae
98.8% genus Smartweeds (Persicaria)
97.6% species Water Smartweed (Persicaria amphibia)
```
These scores can be used to guide identification: define a threshold and report as result the taxon with the lowest score that is larger than or equal to this threshold. In this example for a threshold of 95% an identification to species *Persicaria amphibia* has been achieved. For a threshold of 99%, this is only an identification to order *Caryophyllales*. 95% and 99% would be unusually high thresholds; Seek, I believe, uses a threshold of 70%.
## Command-line Options
This script is a command-line utility. It is called with options, filenames and directory names as arguments. These options are supported:
```
usage: nature_id.py [-h] [-m MODEL] [-a] [-l] [-s] [-r RESULT_SIZE] file/directory [file/directory ...]
positional arguments:
file/directory Image files or directories with images.
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
Model to load to identify organisms.
-a, --all_common_names
Show all common names and not just one.
-l, --label_scores_only
Compute and display only label scores, do not propagate scores up the hierarchy.
-s, --scientific_names_only
Only use scientific names, do not load common names.
-r RESULT_SIZE, --result_size RESULT_SIZE
Number of labels and their scores to report in results.
```
### Option -m MODEL, --model MODEL
The `-m` and `--model` options select a classification model. Possible models are `plants`, `birds`, and `insects`. These models must be installed in the `classifiers` directory. This option is required if more than one classifier is installed.
### Option -a, --all_common_names
The `-a` and `--all_common_names` options cause all common names to be displayed, not just one. Multiple common names are separated by semicolons. The output with this option looks like this:
![Phyla_nodiflora.jpg](/plant_images/Phyla_nodiflora.jpg)
```
Classification of 'plant_images/Phyla_nodiflora.jpg' took 0.2 secs.
100.0% kingdom Plants; Flora; Green Plants; Greenery; Foliage; Vegetation; Salpichlaena Papyrus; Trees; Bushes; Shrubs; Vines (Plantae)
100.0% phylum Tracheophytes; Seed Plants; Vascular Plants (Tracheophyta)
100.0% subphylum Flowering Plants; Angiosperms; Flowers; Basal Angiosperms; True Dicotyledons; Basal True Dicots; Rose Dicots; Daisy Dicots (Angiospermae)
100.0% class Dicots; Dicots; Dicotyledons; Eudicots (Magnoliopsida)
98.2% order Mints, Plantains, Olives, and Allies (Lamiales)
97.4% family Verbena Family; Lantanas (Verbenaceae)
97.4% tribe Lantaneae
85.5% genus Frogfruits; Fogfruits (Phyla)
85.5% species Turkey Tangle; Lippia; Common Lippia; Turkey Tangle Frogfruit; Sawtooth Fogfruit; Carpet Weed; Roundleaf Frogfruit; Texas Frogfruit; Cape Weed; Sawtooth Frogfruit; Lipia; Turkey Tangle Fogfruit; Daisy Lawn; Fog Grass (Phyla nodiflora)
```
### Option -l, --label_scores_only
The `-l` and `--label_scores_only` options switch from the taxonomic hierarchy view to a flat list of labels and their scores. The output with this option looks like this:
![Solidago_velutina_ssp_californica.jpg](/plant_images/Solidago_velutina_ssp_californica.jpg)
```
Classification of 'plant_images/Solidago_velutina_ssp_californica.jpg' took 0.2 secs.
86.1% Canada Goldenrod (Solidago canadensis)
9.8% Late Goldenrod (Solidago altissima)
1.6% Flat-Topped Goldenrod (Euthamia graminifolia)
1.2% Northern Seaside Goldenrod (Solidago sempervirens)
0.4% Stiff-Leaved Goldenrod (Solidago rigida)
```
Five labels with decreasing scores are shown by default. The `-r` and `--result_size` options can be used to request fewer or more labels.
### Option -s, --scientific_names_only
The `-s` and `--scientific_names_only` options disable common names; only the scientific names are displayed. The output with this option looks like this:
![Trichostema_lanceolatum.jpg](/plant_images/Trichostema_lanceolatum.jpg)
```
Classification of 'plant_images/Trichostema_lanceolatum.jpg' took 0.2 secs.
100.0% kingdom Plantae
100.0% phylum Tracheophyta
100.0% subphylum Angiospermae
100.0% class Magnoliopsida
99.6% order Lamiales
99.6% family Lamiaceae
99.2% subfamily Ajugoideae
99.2% genus Trichostema
99.2% species Trichostema lanceolatum
```
### Option -r RESULT_SIZE, --result_size RESULT_SIZE
The `-r` and `--result_size` options modify the number of labels displayed when a flat list of labels is requested with the `-l` or `--label_scores_only` options. The default is 5. Options `-r` and `--result_size` allow you to choose a number between 1 and 100.
This is an example with 15 labels. The command-line for Linux is
```
./nature_id.py -m plants -l -r 15 plant_images/Primula_hendersonii.jpg
```
![Primula_hendersonii.jpg](/plant_images/Primula_hendersonii.jpg)
```
Classification of 'plant_images/Primula_hendersonii.jpg' took 0.2 secs.
50.4% Henderson's Shooting Star (Primula hendersonii)
37.2% Eastern Shooting Star (Primula meadia)
2.5% Dark-Throated Shooting Star (Primula pauciflora)
1.7% Red Ribbons (Clarkia concinna)
1.2% Ruby Chalice Clarkia (Clarkia rubicunda)
0.8% Purple Paintbrush (Castilleja purpurea)
0.8% Fireweed (Chamaenerion angustifolium)
0.4% Western Fairy-Slipper (Calypso bulbosa occidentalis)
0.4% Texas Skeleton Plant (Lygodesmia texana)
0.4% Rhodora (Rhododendron canadense)
0.4% Ragged-Robin (Silene flos-cuculi)
0.4% Hemp Dogbane (Apocynum cannabinum)
0.4% Garden Cosmos (Cosmos bipinnatus)
0.4% Farewell-To-Spring (Clarkia amoena)
0.4% Dwarf Fireweed (Chamaenerion latifolium)
```
## Dependencies
Several things need to be installed in order for `nature_id.py` to run. Some Python packages are required, classification models need to be downloaded and installed into the `classifiers` directory, and finally the taxonomy and common names need to be downloaded into the `inaturalist-taxonomy` directory.
### Python Packages
This code is written in Python 3. Besides Python 3, the packages `Pillow` and `requests` are used to load and process images and to access the iNaturalist API.
These packages as well as `TensorFlow Lite` can be installed on Ubuntu Linux and other Debian distributions with the command
```
sudo apt install python3-pillow python3-requests
pip3 install tflite-runtime
```
and on other platforms with the command
```
pip install Pillow requests tflite-runtime
```
Where appropriate `pip3` should be called instead of `pip` to avoid accidentally installing Python 2 packages.
### Classification Models
The classification models and their labelmap files have to be downloaded from Kaggle and they go into directory `classifiers`.
The classifiers can be downloaded from these links:
* [classifier for plants](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-plants-v1/1)
* [classifier for birds](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-birds-v1/1)
* [classifier for insects](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-insects-v1/1)
Each classifier consists of a `.tflite` model and a `.csv` labelmap file. Both are required. Click on `Model Variations` under `TensorFlow Lite` to download the TFLite model. Please also note the paragraphs at the bottom of these web pages about appropriate and inappropriate use cases and licensing.
These are the links to download the labelmaps: [aiy_insects_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_insects_V1_labelmap.csv), [aiy_birds_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_birds_V1_labelmap.csv), and [aiy_plants_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_plants_V1_labelmap.csv). On Windows, the default action for a .csv file may be to open it in Excel; be sure to save the downloaded file to disk.
### Taxonomy and Common Names Files
The trained models come with scientific names as labels and many of these scientific names are already outdated. The common names and the current taxonomy are obtained from this file: [https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip](https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip) This tool expects this zip archive in the `inaturalist-taxonomy` directory.
## Example Images
Example pictures of plants are provided in the `plant_images` directory. The filenames indicate the species that I think is in the photo. Note that these examples only lead to successful identification to varying degrees. The *Mentzelia lindleyi* is certainly not correctly identified.
## Messages
The first call with a model transforms the labels into a taxonomic hierarchy. Each label is replaced with its representation in the current taxonomy and all its ancestors are added. This process takes some time and results in many messages. Once the hierarchy has been successfully computed, it is written to disk. Future calls to `nature_id.py` will load the taxonomic hierarchy from disk instead of reading the labels and computing the taxonomy again.
This is what the first calls look like. Again, we use the plant model as an example. The bird and insect models are smaller and result in fewer messages.
```
PS C:\nature-id> python nature_id.py -m plants .\plant_images
Read 2,102 labels from 'classifiers\aiy_plants_V1_labelmap.csv' in 0.0 secs.
Loading iNaturalist taxonomy...
Loaded iNaturalist taxonomy of 993,552 taxa in 15.2 secs.
Info: Taxon for label 'background' not found, inserting as pseudo-kingdom.
Info: Taxon 'Eichhornia crassipes' changed to 'Pontederia crassipes', iNat taxa id 962637.
Info: Taxon 'Potentilla anserina' changed to 'Argentina anserina', iNat taxa id 158615.
Info: Taxon 'Stenosiphon linifolius' changed to 'Oenothera glaucifolia', iNat taxa id 914092.
Info: Taxon 'Sophora secundiflora' changed to 'Dermatophyllum secundiflorum', iNat taxa id 499559.
Info: Taxon 'Mimulus bigelovii' changed to 'Diplacus bigelovii', iNat taxa id 701989.
Info: Taxon 'Botrychium dissectum' changed to 'Sceptridium dissectum', iNat taxa id 122085.
Info: Taxon 'Trientalis borealis' changed to 'Lysimachia borealis', iNat taxa id 204174.
Info: Taxon 'Hyptis emoryi' changed to 'Condea emoryi', iNat taxa id 489286.
Info: Taxon 'Opuntia engelmannii lindheimeri' changed to 'Opuntia lindheimeri', iNat taxa id 119980.
Info: Taxon 'Aquilegia caerulea' changed to 'Aquilegia coerulea', iNat taxa id 501742.
Info: Taxon 'Fuscospora cliffortioides' changed to 'Nothofagus cliffortioides', iNat taxa id 404204.
Info: Taxon 'Cooperia drummondii' changed to 'Zephyranthes chlorosolen', iNat taxa id 554401.
Info: Taxon 'Dracopis amplexicaulis' changed to 'Rudbeckia amplexicaulis', iNat taxa id 200073.
Info: Taxon 'Dodecatheon meadia' changed to 'Primula meadia', iNat taxa id 549981.
Info: Taxon 'Aptenia cordifolia' changed to 'Mesembryanthemum cordifolium', iNat taxa id 589815.
Info: Taxon 'Chamerion latifolium' changed to 'Chamaenerion latifolium', iNat taxa id 564970.
Info: Taxon 'Echinocereus mojavensis' changed to 'Echinocereus triglochidiatus mojavensis', iNat taxa id 858352.
Warning: multiple taxa named 'Aquilegia vulgaris': species 51807, complex 1042772; choosing species.
Info: Taxon 'Dodecatheon pulchellum' changed to 'Primula pauciflora', iNat taxa id 498086.
Info: Taxon 'Mimulus lewisii' changed to 'Erythranthe lewisii', iNat taxa id 777190.
Info: Taxon 'Sambucus nigra canadensis' changed to 'Sambucus canadensis', iNat taxa id 84300.
Info: Taxon 'Asyneuma prenanthoides' changed to 'Campanula prenanthoides', iNat taxa id 851072.
Info: Taxon 'Anemone quinquefolia' changed to 'Anemonoides quinquefolia', iNat taxa id 950598.
Info: Taxon 'Hedypnois cretica' changed to 'Hedypnois rhagadioloides', iNat taxa id 492864.
Warning: multiple taxa named 'Achillea millefolium': species 52821, complex 1105043; choosing species.
Info: Taxon 'Anagallis arvensis' changed to 'Lysimachia arvensis', iNat taxa id 791928.
Info: Taxon 'Hieracium caespitosum' changed to 'Pilosella caespitosa', iNat taxa id 711086.
Info: Taxon 'Potentilla anserina pacifica' changed to 'Argentina pacifica', iNat taxa id 524900.
Info: Taxon 'Sambucus nigra caerulea' changed to 'Sambucus cerulea', iNat taxa id 143799.
Info: Taxon 'Polygala californica' changed to 'Rhinotropis californica', iNat taxa id 876453.
Info: Taxon 'Calylophus berlandieri' changed to 'Oenothera berlandieri', iNat taxa id 359779.
Info: Taxon 'Mimulus cardinalis' changed to 'Erythranthe cardinalis', iNat taxa id 319974.
Info: Taxon 'Callistemon citrinus' changed to 'Melaleuca citrina', iNat taxa id 77976.
Info: Taxon 'Liatris mucronata' changed to 'Liatris punctata mucronata', iNat taxa id 371814.
Warning: multiple taxa named 'Stellaria media': species 53298, complex 1087592; choosing species.
Info: Taxon 'Anemone americana' changed to 'Hepatica americana', iNat taxa id 741014.
Info: Taxon 'Anemone occidentalis' changed to 'Pulsatilla occidentalis', iNat taxa id 60482.
Info: Taxon 'Orobanche fasciculata' changed to 'Aphyllon fasciculatum', iNat taxa id 802543.
Info: Taxon 'Mimulus primuloides' changed to 'Erythranthe primuloides', iNat taxa id 635401.
Info: Taxon 'Polygala paucifolia' changed to 'Polygaloides paucifolia', iNat taxa id 497911.
Warning: multiple taxa named 'Campanula rotundifolia': species 62312, complex 984576; choosing species.
Info: Taxon 'Cissus incisa' changed to 'Cissus trifoliata', iNat taxa id 133333.
Info: Taxon 'Schinus terebinthifolius' changed to 'Schinus terebinthifolia', iNat taxa id 130872.
Info: Taxon 'Cooperia pedunculata' changed to 'Zephyranthes drummondii', iNat taxa id 120026.
Info: Taxon 'Scabiosa atropurpurea' changed to 'Sixalix atropurpurea', iNat taxa id 372376.
Info: Taxon 'Sphenosciadium capitellatum' changed to 'Angelica capitellata', iNat taxa id 704166.
Info: Taxon 'Trientalis latifolia' changed to 'Lysimachia latifolia', iNat taxa id 496537.
Warning: multiple taxa named 'Spiranthes cernua': species 773385, complex 931407; choosing species.
Info: Taxon 'Spartina pectinata' changed to 'Sporobolus michauxianus', iNat taxa id 772984.
Info: Taxon 'Centaurea americana' changed to 'Plectocephalus americanus', iNat taxa id 699778.
Info: Taxon 'Fuscospora solandri' changed to 'Nothofagus solandri', iNat taxa id 70246.
Info: Taxon 'Heliotropium tenellum' changed to 'Euploca tenella', iNat taxa id 769888.
Info: Taxon 'Blechnum spicant' changed to 'Struthiopteris spicant', iNat taxa id 774894.
Info: Taxon 'Fallopia japonica' changed to 'Reynoutria japonica', iNat taxa id 914922.
Info: Taxon 'Echinocactus texensis' changed to 'Homalocephala texensis', iNat taxa id 870496.
Info: Taxon 'Gaura parviflora' changed to 'Oenothera curtiflora', iNat taxa id 78241.
Info: Taxon 'Parentucellia viscosa' changed to 'Bellardia viscosa', iNat taxa id 537967.
Info: Taxon 'Anemone nemorosa' changed to 'Anemonoides nemorosa', iNat taxa id 950603.
Info: Taxon 'Hieracium aurantiacum' changed to 'Pilosella aurantiaca', iNat taxa id 711103.
Info: Taxon 'Anemone hepatica' changed to 'Hepatica nobilis', iNat taxa id 639660.
Info: Taxon 'Merremia dissecta' changed to 'Distimake dissectus', iNat taxa id 907480.
Info: Taxon 'Anemone canadensis' changed to 'Anemonastrum canadense', iNat taxa id 881527.
Info: Taxon 'Chamerion angustifolium' changed to 'Chamaenerion angustifolium', iNat taxa id 564969.
Info: Taxon 'Lychnis flos-cuculi' changed to 'Silene flos-cuculi', iNat taxa id 740984.
Throttling API calls, sleeping for 44.5 seconds.
Info: Taxon 'Ampelopsis brevipedunculata' changed to 'Ampelopsis glandulosa brevipedunculata', iNat taxa id 457553.
Info: Taxon 'Anemone acutiloba' changed to 'Hepatica acutiloba', iNat taxa id 179786.
Info: Taxon 'Pennisetum setaceum' changed to 'Cenchrus setaceus', iNat taxa id 430581.
Info: Taxon 'Mimulus guttatus' changed to 'Erythranthe guttata', iNat taxa id 470643.
Info: Taxon 'Blechnum fluviatile' changed to 'Cranfillia fluviatilis', iNat taxa id 700995.
Info: Taxon 'Blechnum discolor' changed to 'Lomaria discolor', iNat taxa id 403546.
Info: Taxon 'Andropogon gerardii' changed to 'Andropogon gerardi', iNat taxa id 121968.
Info: Taxon 'Ferocactus hamatacanthus' changed to 'Hamatocactus hamatacanthus', iNat taxa id 855937.
Info: Taxon 'Gaura lindheimeri' changed to 'Oenothera lindheimeri', iNat taxa id 590726.
Info: Taxon 'Gaura suffulta' changed to 'Oenothera suffulta', iNat taxa id 521639.
Info: Taxon 'Glottidium vesicarium' changed to 'Sesbania vesicaria', iNat taxa id 890511.
Info: Taxon 'Acacia farnesiana' changed to 'Vachellia farnesiana', iNat taxa id 79472.
Warning: multiple taxa named 'Rubus fruticosus': complex 55911, species 1090496; choosing species.
Info: Taxon 'Othocallis siberica' changed to 'Scilla siberica', iNat taxa id 862704.
Info: Taxon 'Mimulus aurantiacus' changed to 'Diplacus', iNat taxa id 777236.
Info: Taxon 'Phoradendron tomentosum' changed to 'Phoradendron leucarpum', iNat taxa id 49668.
Info: Taxon 'Orobanche uniflora' changed to 'Aphyllon uniflorum', iNat taxa id 802714.
Info: Taxon 'Rosmarinus officinalis' changed to 'Salvia rosmarinus', iNat taxa id 636795.
Info: Taxon 'Cynoglossum grande' changed to 'Adelinia grande', iNat taxa id 769151.
Computed taxonomic tree from labels in 64.8 secs: 4,091 taxa including 2,102 leaf taxa.
Taxonomy written to file 'classifiers\aiy_plants_V1_taxonomy.csv'.
Reading common names from 'inaturalist-taxonomy\inaturalist-taxonomy.dwca.zip' member 'VernacularNames-english.csv'...
Read 203,093 common names in 1.5 secs, loaded 3,071 in language "en_US" for 4,091 taxa.
```
### Messages Explained
```
Read 2,102 labels from 'classifiers\aiy_plants_V1_labelmap.csv' in 0.0 secs.
```
`nature-id` reads a label file. If no errors occur, a taxonomy will be written for these labels and further runs will load `classifiers\aiy_plants_V1_taxonomy.csv` instead.
```
Loading iNaturalist taxonomy...
Loaded iNaturalist taxonomy of 993,552 taxa in 15.2 secs.
```
The entire iNaturalist taxonomy of about 1 million taxa is loaded. `nature-id` will look up the labels in this taxonomy and insert them, along with all their ancestors, into a taxonomy for the labels.
```
Info: Taxon for label 'background' not found, inserting as pseudo-kingdom.
```
Label `background` was not found. It is not a species, but denotes something else in the Google model. It is treated as a kingdom in the taxonomy; it has no ancestors.
```
Info: Taxon 'Potentilla anserina' changed to 'Argentina anserina', iNat taxa id 158615.
```
In the current taxonomy, this species belongs to a different genus. The numeric ID in this message is useful for getting more information. This number can be prefixed with `https://www.inaturalist.org/taxa/` and opened in a browser: [https://www.inaturalist.org/taxa/158615](https://www.inaturalist.org/taxa/158615).
```
Warning: multiple taxa named 'Achillea millefolium': species 52821, complex 1105043; choosing species.
```
The label name for this common yarrow is not unique; there are several taxa with this scientific name. `nature-id` assumes that the species is the one we want.
```
Throttling API calls, sleeping for 44.5 seconds.
```
This message is followed by 45 seconds of silence. When a name is not found in the current taxonomy, the one previously loaded with about 1 million taxa, iNaturalist API calls are made to look up the inactive scientific name. The iNaturalist team would like us to throttle API calls to no more than 60 calls per minute. This delay has been implemented to accommodate their request.
```
Info: Taxon 'Mimulus aurantiacus' changed to 'Diplacus', iNat taxa id 777236.
```
The species *Mimulus aurantiacus* in the label file is replaced with the genus *Diplacus* and not with the current species *Diplacus aurantiacus*. This looks like a bug and hence deserves a closer look.
The reason for this decision of `nature_id` is that *Mimulus aurantiacus* consisted of several varieties *Mimulus aurantiacus aurantiacus*, *Mimulus aurantiacus grandiflorus*, *Mimulus aurantiacus parviflorus*, and 3 more.
In the current taxonomy, these varieties are species *Diplacus aurantiacus*, *Diplacus grandiflorus*, and *Diplacus parviflorus*. *Diplacus aurantiacus* does not replace *Mimulus aurantiacus*; it replaces the variety *Mimulus aurantiacus aurantiacus*.
Another way to understand this issue is to realize that photos of all varieties *Mimulus aurantiacus aurantiacus*, *Mimulus aurantiacus grandiflorus*, *Mimulus aurantiacus parviflorus* and the 3 others were used to train the classification model to recognize *Mimulus aurantiacus*. In the current taxonomy, this label is triggered for each of the species *Diplacus aurantiacus*, *Diplacus grandiflorus*, and *Diplacus parviflorus*. `nature_id` cannot say which of the current species it sees. It can only identify images as genus *Diplacus*.
```
Taxonomy written to file 'classifiers\aiy_plants_V1_taxonomy.csv'.
```
A taxonomy for the scientific names in the label file has been successfully computed and this taxonomy was written to disk. Future calls will load this taxonomy instead of loading the labels and re-computing the taxonomy.
```
Reading common names from 'inaturalist-taxonomy\inaturalist-taxonomy.dwca.zip' member 'VernacularNames-english.csv'...
Read 203,093 common names in 1.5 secs, loaded 3,071 in language "en_US" for 4,091 taxa.
```
Common names have been read. The common names are always selected for the local language, not necessarily for English as shown here.

View File

@@ -0,0 +1,13 @@
# Download Instructions
The [Tensorflow Lite](https://www.tensorflow.org/lite/guide) classifiers that go in this directory can be downloaded from these websites:
* [classifier for plants](https://tfhub.dev/google/aiy/vision/classifier/plants_V1/1)
* [classifier for birds](https://tfhub.dev/google/aiy/vision/classifier/birds_V1/1)
* [classifier for insects](https://tfhub.dev/google/aiy/vision/classifier/insects_V1/1)
Each classifier consists of a `.tflite` model and a `.csv` labelmap file. Both are required.
On each of the above websites scroll down and under `Output` click on `labelmap` to download the labels. Then scroll back up and under `Model formats` switch to `TFLite (aiyvision/classifier/...)`. There click on `Download` to get the `.tflite` file.
If you happen to have the classifier included in [Seek](https://www.inaturalist.org/pages/seek_app), it can go in this directory as well. It consists of two files `optimized_model_v1.tflite` and `taxonomy_v1.csv`.

110
third_party/nature-id/inat_api.py vendored Normal file
View File

@@ -0,0 +1,110 @@
import json, os, pickle, requests, shelve, sys, time
#############################################################################
# #
# API calls to obtain taxonomic information. Used in case of name changes. #
# #
# See documentation at https://api.inaturalist.org/v1/docs/#/Taxa #
# #
# We throttle the number of calls to less than 60 per minute. We also #
# implement a cache to avoid repeated lookups of the same taxa across runs. #
# Cache entries include time stamps and they expire after two weeks. #
# #
#############################################################################
# Base URL of the iNaturalist API; the request paths are appended to it.
API_HOST = "https://api.inaturalist.org/v1"
CACHE_EXPIRATION = 14 * 24 * 3600 # cache expires after 2 weeks
TOO_MANY_API_CALLS_DELAY = 60 # wait this long after error 429

# The cache stores the json responses. It lives in a per-user directory
# whose location depends on the platform.
if sys.platform == 'win32':
    DATA_DIR = os.path.join(os.path.expanduser('~'),
                            'AppData', 'Local', 'inat_api')
else:
    DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'inat_api')

# exist_ok=True replaces the former exists()-then-makedirs() sequence, which
# could fail when two processes started at the same time.
os.makedirs(DATA_DIR, exist_ok=True)

# Persistent mapping from request key to (time stamp, json response).
cache = shelve.open(os.path.join(DATA_DIR, 'api.cache'))
# API call throttling.
class Throttle:
    """
    Limits the rate of API calls to at most API_MAX_CALLS within any
    period of API_INTERVAL seconds.

    Call wait() immediately before each API call; it sleeps as long as
    necessary and then records the time of the call.
    """

    API_MAX_CALLS = 60 # max 60 calls per minute
    API_INTERVAL  = 60 # 1 minute

    def __init__(self):
        self.callTimes = [] # times of api calls, oldest first

    # wait if necessary to avoid more than API_MAX_CALLS in API_INTERVAL
    def wait(self):
        # The monotonic clock is used instead of time.time() because the
        # wall clock may be adjusted while we run, which would corrupt the
        # interval arithmetic (overly long sleeps or no throttling at all).
        while len(self.callTimes) >= self.API_MAX_CALLS:
            waitTime = (self.callTimes[0] + self.API_INTERVAL
                        - time.monotonic())
            if waitTime > 0:
                print('Throttling API calls, '
                      f'sleeping for {waitTime:.1f} seconds.')
                time.sleep(waitTime)
                continue # re-check, the oldest call may have expired now
            del self.callTimes[0] # oldest call is outside the interval
        self.callTimes.append(time.monotonic())

# The single throttle shared by all API calls of this module.
api_call_throttle = Throttle()
# argument is an id or a list of id's
def get_taxa_by_id(id, timeout=60):
    """
    Fetch taxa by id from the iNaturalist API, going through the cache.

    `id` is a single taxon id or a list (or tuple) of taxon ids. `timeout`
    is the number of seconds to wait for the server before requests raises
    an exception; without it a stalled connection would block forever.

    Returns the decoded json response, or None if the server answered with
    an error status (the response text is printed in that case).
    """
    # The parameter name `id` shadows the builtin; it is kept because it is
    # part of the function's interface.
    if isinstance(id, (list, tuple)):
        url = API_HOST + '/taxa/' + '%2C'.join(str(i) for i in id)
    else:
        url = API_HOST + f'/taxa/{id}'
    tim = time.time()
    if url not in cache or cache[url][0] < tim - CACHE_EXPIRATION:
        # not cached or cache entry expired, call the API
        delay = TOO_MANY_API_CALLS_DELAY
        headers = {'Content-type' : 'application/json' }
        while True:
            api_call_throttle.wait()
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code != requests.codes.too_many:
                break
            # error 429: back off, doubling the delay on each retry
            time.sleep(delay)
            delay *= 2
        if response.status_code != requests.codes.ok:
            print(response.text)
            return None
        cache[url] = (tim, response.json())
    return cache[url][1]
# returns taxa by name
def get_taxa(params, timeout=60):
    """
    Query the iNaturalist API endpoint '/taxa', going through the cache.

    `params` is a dict of query parameters; boolean values are sent as
    'true' and 'false'. The caller's dict is not modified. `timeout` is the
    number of seconds to wait for the server before requests raises an
    exception; without it a stalled connection would block forever.

    Returns the decoded json response, or None if the server answered with
    an error status (the response text is printed in that case).
    """
    url = API_HOST + '/taxa'
    # Build a converted copy instead of rewriting the caller's dict in place.
    # Order and values match what the in-place conversion produced, so the
    # cache keys computed below stay the same.
    params = { name : ('true' if val else 'false') if isinstance(val, bool)
                      else val
               for name, val in params.items() }
    key = pickle.dumps((url, params)).hex()
    tim = time.time()
    if key not in cache or cache[key][0] < tim - CACHE_EXPIRATION:
        # not cached or cache entry expired, call the API
        delay = TOO_MANY_API_CALLS_DELAY
        headers = {'Content-type' : 'application/json' }
        while True:
            api_call_throttle.wait()
            response = requests.get(url, headers=headers, params=params,
                                    timeout=timeout)
            if response.status_code != requests.codes.too_many:
                break
            # error 429: back off, doubling the delay on each retry
            time.sleep(delay)
            delay *= 2
        if response.status_code != requests.codes.ok:
            print(response.text)
            return None
        cache[key] = (tim, response.json())
    return cache[key][1]
if __name__ == '__main__':
    # This module is only meant to be imported. An explicit exit is used
    # because an `assert` is skipped when Python runs with option -O.
    sys.exit('Not a top-level Python module!')

318
third_party/nature-id/inat_taxonomy.py vendored Normal file
View File

@@ -0,0 +1,318 @@
import csv, sys, os, time, locale, zipfile, io
import inat_api
from dataclasses import dataclass
from typing import List, Dict
# The directory where this Python script is located.
INSTALL_DIR = os.path.dirname(__file__)
# While that directory path is a symbolic link, replace it with the path
# obtained by joining it with the directory part of the link's target.
# NOTE(review): presumably this is meant to find the real install directory
# when the script is reached through a symbolic link -- confirm.
while os.path.islink(INSTALL_DIR):
    INSTALL_DIR = os.path.join(INSTALL_DIR,
                               os.path.dirname(os.readlink(INSTALL_DIR)))
# This zip file contains the taxonomy and all common names.
# Download https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip and
# leave this zip file in directory 'inaturalist-taxonomy'. Do not extract the
# files from this zip archive.
INAT_TAXONOMY = os.path.join(INSTALL_DIR, 'inaturalist-taxonomy',
                             'inaturalist-taxonomy.dwca.zip')
# The root of the tree is a special node, it is the parent of all kingdoms.
ROOT_TAXON_ID = 48460
ROOT_NAME = 'Life'
ROOT_RANK_LEVEL = 100

# Table of taxonomic ranks: numeric rank-level -> rank name. Larger
# rank-levels are closer to the root of the tree.
gRankLevel2Name = {
    ROOT_RANK_LEVEL : 'stateofmatter', # used for the parent of kingdoms
    70  : 'kingdom',
    67  : 'subkingdom',
    60  : 'phylum',
    57  : 'subphylum',
    53  : 'superclass',
    50  : 'class',
    47  : 'subclass',
    45  : 'infraclass',
    44  : 'subterclass',
    43  : 'superorder',
    40  : 'order',
    37  : 'suborder',
    35  : 'infraorder',
    34.5: 'parvorder',
    34  : 'zoosection',
    33.5: 'zoosubsection',
    33  : 'superfamily',
    32  : 'epifamily',
    30  : 'family',
    27  : 'subfamily',
    26  : 'supertribe',
    25  : 'tribe',
    24  : 'subtribe',
    20  : 'genus',
    19  : 'genushybrid', # changed, was same as genus in iNaturalist
    15  : 'subgenus',
    13  : 'section',
    12  : 'subsection',
    11  : 'complex',
    10  : 'species',
    9   : 'hybrid',      # changed, was same as species in iNaturalist
    5   : 'subspecies',
    4   : 'variety',     # changed, was same as subspecies in iNaturalist
    3   : 'form',        # changed, was same as subspecies in iNaturalist
    2   : 'infrahybrid'  # changed, was same as subspecies in iNaturalist
}

# The inverse table: rank name -> numeric rank-level.
gName2RankLevel = { rank_name : rank_level
                    for rank_level, rank_name in gRankLevel2Name.items() }

KINGDOM_RANK_LEVEL = gName2RankLevel['kingdom']
def get_rank_level(rank):
    "Return the numeric rank-level of rank name `rank`, which must be known."
    known_levels = gName2RankLevel
    assert rank in known_levels
    return known_levels[rank]
def get_rank_name(rank_level, default_name = 'clade'):
    "Return the rank name for `rank_level`, `default_name` if it is unknown."
    return gRankLevel2Name.get(rank_level, default_name)
@dataclass(frozen=True)
class Taxon:
    "Immutable record of one taxon, created from a row of 'taxa.csv'."
    id        : int   # iNaturalist taxon id
    parent_id : int   # id of the parent taxon; the loader stores None
                      # here for the root taxon
    name      : str   # scientific name
    rank_level: float # numeric rank-level; the loader stores -1 when
                      # the rank could not be resolved

# iNaturalist taxa, only loaded when a taxonomic tree needs
# to be computed from a label file.
gName2Taxa: Dict[str,List[Taxon]] = {}
"maps taxon name to list of taxa"
gId2Taxon: Dict[int,Taxon] = {}
"maps taxon id to taxon"
def load_inat_taxonomy():
    """
    Load all iNaturalist taxa from file 'taxa.csv' in the taxonomy archive.

    Fills the module-level tables gName2Taxa and gId2Taxon. Rank names that
    are not in gName2RankLevel are looked up with an API call and added to
    the rank tables; if that lookup fails they get rank-level -1.

    Returns True if the taxonomy is loaded (or was already loaded). On any
    error, prints a message, empties both tables and returns False.
    """
    global gName2Taxa
    global gId2Taxon
    if gName2Taxa and gId2Taxon:
        return True # already loaded
    print('Loading iNaturalist taxonomy...')
    start_time = time.time()
    gName2Taxa = {}
    gId2Taxon = {}
    try:
        with zipfile.ZipFile(INAT_TAXONOMY, 'r') as zf, \
             zf.open('taxa.csv', 'r') as zfile, \
             io.TextIOWrapper(zfile, encoding = 'latin-1') as csvfile:
            for row in csv.DictReader(csvfile):
                taxon_id = int(row['id'])
                # the parent is given as a URL that ends in the parent's id
                parent_id = row['parentNameUsageID'].split('/')[-1]
                if parent_id:
                    parent_id = int(parent_id)
                elif taxon_id != ROOT_TAXON_ID:
                    parent_id = ROOT_TAXON_ID # no parent given, use root
                else:
                    parent_id = None          # the root itself
                name = row['scientificName']
                rank = row['taxonRank']
                if rank not in gName2RankLevel:
                    # unknown rank name, ask the API for its rank-level
                    response = inat_api.get_taxa_by_id(taxon_id)
                    results = response['results'] \
                        if response and 'results' in response else None
                    if results: # also guards against an empty result list
                        rank_level = results[0]['rank_level']
                        gName2RankLevel[rank] = rank_level
                        if rank_level not in gRankLevel2Name:
                            gRankLevel2Name[rank_level] = rank
                        print(f"Please add rank '{rank}' to gName2Rank"
                              f"Level, numeric value {rank_level}.")
                    else:
                        gName2RankLevel[rank] = -1
                rank_level = gName2RankLevel[rank]
                inat_taxon = Taxon(taxon_id, parent_id, name, rank_level)
                gName2Taxa.setdefault(name, []).append(inat_taxon)
                # Explicit exceptions instead of `assert`: they are not
                # skipped with option -O and they give the error message
                # printed below some content.
                if taxon_id in gId2Taxon:
                    raise ValueError(f'duplicate taxon id {taxon_id}')
                gId2Taxon[taxon_id] = inat_taxon
                # progress indicator
                loaded = len(gId2Taxon)
                if loaded % 10000 == 0:
                    print(f' {loaded:,} ' if loaded % 100000 == 0 else '.',
                          end='', flush=True)
        if ROOT_TAXON_ID not in gId2Taxon:
            raise ValueError(f'root taxon {ROOT_TAXON_ID} is missing')
        print(f' {len(gId2Taxon):,}.')
        print(f'Loaded iNaturalist taxonomy of {len(gId2Taxon):,} taxa '
              f'in {time.time()-start_time:.1f} secs.')
        return True
    except Exception as e:
        print("Cannot load taxonomy 'taxa.csv' from archive "
              f"'{INAT_TAXONOMY}': {str(e)}.")
        gName2Taxa = {}
        gId2Taxon = {}
        return False
def beautify_common_name(name):
    """
    Capitalize (most) words in common name; helper function for common names.

    A trailing ' [paraphyletic]' is removed. The first letter of each
    hyphen-separated part and of each word is capitalized; the word 'and'
    and words that end with a period are left as they are. Empty names and
    empty hyphen-separated parts are passed through unchanged.
    """
    suffix = ' [paraphyletic]'
    if name.endswith(suffix):
        name = name[:-len(suffix)] # fix dicots
    # part[:1] instead of part[0]: an empty name or an empty part, as in
    # 'a--b' or 'a-', must not raise an IndexError
    name = '-'.join(part[:1].upper() + part[1:]
                    for part in name.split('-'))
    return ' '.join(word if word == 'and' or word.endswith('.')
                    else word[:1].upper() + word[1:]
                    for word in name.split())
def annotate_common_names(id2taxon, all_common_names = False):
    """
    Load the common names in our language, annotate taxonomic tree with them.
    The parameter `id2taxon' includes the taxa we are interested in; it maps
    taxon id to an object whose attribute `common_name` is set here.

    With `all_common_names` all common names of a taxon are stored,
    separated by '; ', otherwise only the first one found. Files whose
    language equals the locale's language are read before files whose
    language is only a prefix of it. Problems are reported with a printed
    message; no exception is raised for them.
    """
    start_time = time.time()
    language, _ = locale.getdefaultlocale()
    # The locale may be undetermined (None); fall back to English then,
    # like for the language-neutral locales.
    if not language or language in ['C', 'C.UTF-8', 'POSIX']:
        language = 'en'
    if not os.path.isfile(INAT_TAXONOMY):
        print("Cannot load common names, archive "
              f"'{INAT_TAXONOMY}' does not exist.")
        return
    try:
        with zipfile.ZipFile(INAT_TAXONOMY, 'r') as zf:
            perfect_match = []
            other_matches = []
            # check all common names files for names in our language
            for fname in zf.namelist():
                if not (fname.startswith("VernacularNames-") and
                        fname.endswith(".csv")):
                    continue
                with zf.open(fname, 'r') as zfile:
                    with io.TextIOWrapper(zfile, encoding='utf-8') as csvf:
                        # only the first row is inspected
                        row = next(csv.DictReader(csvf), None)
                        if row is None:
                            continue
                        lang = row['language']
                        if lang == language:
                            perfect_match.append(fname) # en vs en
                        elif len(lang) < len(language) and \
                             language.startswith(lang):
                            other_matches.append(fname) # en vs en_US
            if not perfect_match and not other_matches:
                # this message lacked the f prefix and printed the text
                # '{language}' instead of the language
                print(f"Cannot find common names for language '{language}'.")
                return
            # annotate the taxa with common names
            total_names = loaded_names = 0
            for fname in perfect_match + other_matches:
                print(f"Reading common names from '{INAT_TAXONOMY}' "
                      f"member '{fname}'...")
                with zf.open(fname, 'r') as zfile:
                    with io.TextIOWrapper(zfile, encoding='utf-8') as csvf:
                        for row in csv.DictReader(csvf):
                            total_names += 1
                            taxon = id2taxon.get(int(row['id']))
                            if taxon is None:
                                continue # not a taxon we are interested in
                            if not all_common_names and \
                               taxon.common_name is not None:
                                continue # already has a common name
                            loaded_names += 1
                            cname = beautify_common_name(row['vernacular'
                                                             'Name'])
                            if taxon.common_name is None:
                                taxon.common_name = cname
                            else:
                                taxon.common_name += '; ' + cname
            print(f'Read {total_names:,} common names in '
                  f'{time.time()-start_time:.1f} secs, loaded {loaded_names:,} '
                  f'in language "{language}" for {len(id2taxon)-1:,} taxa.')
    except Exception as e:
        print(f"Cannot load common names from archive '{INAT_TAXONOMY}':"
              f" {str(e)}.")
def get_ancestors(id, ancestors):
    """
    Appends the taxon with the given id and its ancestors up to and
    including the kingdom to list `ancestors`. The appended instances of
    Taxon are ordered from the kingdom down.
    """
    # walk up the tree, this collects the taxa from the bottom up
    lineage = []
    taxon = gId2Taxon[id]
    while True:
        lineage.append(taxon)
        if not taxon.rank_level < KINGDOM_RANK_LEVEL:
            break
        taxon = gId2Taxon[taxon.parent_id]
    # hand them out in the opposite order, from the top down
    ancestors.extend(reversed(lineage))
def lookup_id(name, desired_ranks = ('species', 'subspecies')):
    """
    Lookup by name, returns a pair, a Taxon and its ancestors, a list of
    Taxon. Desired_ranks are returned in case of ambiguities (duplicate names).

    A name that is not in the loaded taxonomy is looked up with an API call;
    if that results in several taxa, their common ancestor is returned.

    Returns None if the taxonomy is not loaded, if the API lookup fails or
    if none of the taxa found by the API is in the loaded taxonomy.
    """
    if not gName2Taxa:
        return None # taxonomy not loaded

    if name in gName2Taxa:
        taxa = gName2Taxa[name]
        if len(taxa) > 1:
            # duplicate name; list all candidates, prefer a desired rank
            print(f"Warning: multiple taxa named '{name}':", end='')
            prefix = ' '
            taxon = None
            for t in taxa:
                rank = get_rank_name(t.rank_level)
                print(f"{prefix}{rank} {t.id}", end='')
                if rank in desired_ranks:
                    taxon = t
                prefix = ', '
            if taxon is None:
                taxon = taxa[0]
            rank = get_rank_name(taxon.rank_level)
            print(f"; choosing {rank}.")
        else:
            taxon = taxa[0]
    else:
        # likely taxon change, query iNat API
        response = inat_api.get_taxa({ 'q'         : name,
                                       'all_names' : 'true',
                                       'per_page'  : 200 })
        if not response:
            print(f"API lookup for name '{name}' failed.")
            return None
        taxa = response.get('results') or []
        if len(taxa) > 1:
            # more than one taxon, find the one that used to have this name
            exact_matches = [taxon for taxon in taxa for nam in taxon['names']
                             if nam['locale'] == 'sci' and nam['name'] == name]
            if exact_matches:
                taxa = exact_matches
        # continue with the taxa that are in the loaded taxonomy
        taxa = { gId2Taxon[t['id']] for t in taxa if t['id'] in gId2Taxon }
        if not taxa:
            return None
        while len(taxa) > 1:
            # multiple taxa, find their common ancestor: replace the taxa
            # of the lowest rank with their parents until one taxon is left
            min_rank_level = min(taxon.rank_level for taxon in taxa)
            taxa = { gId2Taxon[taxon.parent_id]
                     if taxon.rank_level == min_rank_level else taxon
                     for taxon in taxa }
        taxon = taxa.pop()

    ancestors = []
    if taxon.rank_level < KINGDOM_RANK_LEVEL:
        get_ancestors(taxon.parent_id, ancestors)
    return (taxon, ancestors)
if __name__ == '__main__':
    # This module is only meant to be imported. An explicit exit is used
    # because an `assert` is skipped when Python runs with option -O.
    sys.exit('Not a top-level Python module!')

View File

@@ -0,0 +1,3 @@
The .zip archive with the taxonomy and common names belongs in this directory.
Download https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip to this directory. Do not unpack this archive.

View File

@@ -0,0 +1,4 @@
#!/bin/sh
# Download the iNaturalist taxonomy archive into the current directory.
#
# The archive is downloaded to a temporary file and only replaces an
# existing archive after the download has succeeded. --fail makes curl
# report HTTP errors instead of saving the error page as the archive,
# --location follows redirects.
if curl --fail --location \
        https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip \
        -o inaturalist-taxonomy.dwca.zip.tmp
then
    mv -f inaturalist-taxonomy.dwca.zip.tmp inaturalist-taxonomy.dwca.zip
else
    rm -f inaturalist-taxonomy.dwca.zip.tmp
    exit 1
fi

537
third_party/nature-id/nature_id.py vendored Executable file
View File

@@ -0,0 +1,537 @@
#!/usr/bin/env python3
import numpy as np
from PIL import Image, ImageOps
import csv, sys, os, time
import inat_taxonomy
try:
# try importing TensorFlow Lite first
import tflite_runtime.interpreter as tflite
except Exception:
try:
# TensorFlow Lite not found, try to import full TensorFlow
import tensorflow.lite as tflite
except Exception:
print('Error: TensorFlow Lite could not be loaded.', file=sys.stderr)
print(' Follow instructions at https://www.tensorflow.org/lite/'
'guide/python to install it.', file=sys.stderr)
sys.exit(1)
# The directory where this Python script is located.
INSTALL_DIR = inat_taxonomy.INSTALL_DIR
# This directory contains models, label files, and taxonomy files.
CLASSIFIER_DIRECTORY = os.path.join(INSTALL_DIR, 'classifiers')
# These flags can be modified with command-line options. They are
# module-level globals that class Taxonomy reads; label_scores_only is also
# switched on by Taxonomy.compute_taxonomic_tree() when the iNaturalist
# taxonomy cannot be loaded.
scientific_names_only = False # only scientific names or also common names
label_scores_only = False # scores for labels or hierarchical
all_common_names = False # show only one or all common names
result_sz = 5 # result size (for label_scores_only)
# This class is used by class Taxonomy.
class Taxon:
    "A node of the taxonomic tree that class Taxonomy maintains."

    def __init__(self, taxon_id):
        self.taxon_id = taxon_id  # for internal lookups and iNat API calls
        self.rank_level = None    # taxonomic rank, e.g. species, genus, family
        self.name = None          # scientific name
        self.common_name = None   # common name or None
        self.children = []        # list of child taxa
        # List of indices into scores; there can be more than one when we
        # use old models whose taxa have since been lumped together.
        self.leaf_class_ids = []

    def add_child(self, child_taxon):
        "Append `child_taxon` to the list of children."
        self.children += [child_taxon]

    def get_rank(self):
        "Get taxonomic rank as a string, empty for pseudo-kingdoms."
        if self.taxon_id >= 0:
            return inat_taxonomy.get_rank_name(self.rank_level)
        # negative id: a pseudo-kingdom
        assert self.rank_level == inat_taxonomy.KINGDOM_RANK_LEVEL
        return ''

    def get_name(self):
        "Get the name to display; customize here to show common names differently."
        if not self.common_name:
            return self.name
        return f'{self.common_name} ({self.name})'
# This taxonomy is represented in terms of instances of class Taxon.
class Taxonomy:
def __init__(self):
    # A taxonomy file can hold several trees, one per kingdom. Prediction
    # wants a single tree, therefore a node for Life is created as the
    # parent of all kingdoms and serves as the root of our tree.
    root = Taxon(inat_taxonomy.ROOT_TAXON_ID)
    root.name = inat_taxonomy.ROOT_NAME
    root.rank_level = inat_taxonomy.ROOT_RANK_LEVEL
    self.root = root
    self.id2taxon = { root.taxon_id : root }
    self.idx2label = {}
def reset(self):
self.root.children = []
self.id2taxon = { self.root.taxon_id : self.root }
self.idx2label = {}
def taxonomy_available(self):
return len(self.root.children) > 0
def read_taxonomy(self, filename):
    """
    Read csv file `filename`, which is a label file or a taxonomy file.

    Rows with a column 'id' are taken as rows of a label file; they only
    fill self.idx2label. Other rows are taken as rows of a taxonomy file
    and are inserted into the tree below self.root.

    If no tree results from reading the file, a taxonomic tree is computed
    from the labels (unless label_scores_only is set) and written to a
    file whose name is `filename` with 'labelmap' replaced by 'taxonomy'.
    Common names are added unless scientific_names_only is set.
    """
    start_time = time.time()
    self.reset()
    with open(filename, newline='', encoding='latin-1') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if 'id' in row: # this is a label file
                self.idx2label[int(row['id'])] = row['name']
                continue
            taxon_id = int(row['taxon_id'])
            if taxon_id in self.id2taxon:
                taxon = self.id2taxon[taxon_id] # inserted earlier as parent
            else:
                self.id2taxon[taxon_id] = taxon = Taxon(taxon_id)
            taxon.name = row['name']
            # rank-levels are ints except for a few like 34.5
            if row['rank_level'].isdigit():
                taxon.rank_level = int(row['rank_level'])
            else:
                taxon.rank_level = float(row['rank_level'])
            # the leaf class ids of a taxon are separated by ';'
            if len(row['leaf_class_id']):
                for leaf_class_id in row['leaf_class_id'].split(';'):
                    leaf_class_id = int(leaf_class_id)
                    taxon.leaf_class_ids.append(leaf_class_id)
                    self.idx2label[leaf_class_id] = taxon.name
            # an empty parent id denotes a child of the root
            if len(row['parent_taxon_id']):
                parent_taxon_id = int(row['parent_taxon_id'])
            else:
                parent_taxon_id = self.root.taxon_id
            # a parent that has not been read yet is created here; its
            # name and rank are filled in when its own row is read
            if not parent_taxon_id in self.id2taxon:
                self.id2taxon[parent_taxon_id] = Taxon(parent_taxon_id)
            self.id2taxon[parent_taxon_id].add_child(taxon)
    if not self.taxonomy_available():
        # We parsed a label file; unless told otherwise, we use these
        # labels to build a taxonomic tree.
        print(f"Read {len(self.idx2label):,} labels from '{filename}' "
              f"in {time.time() - start_time:.1f} secs.")
        if not label_scores_only:
            self.compute_taxonomic_tree()
            if self.taxonomy_available():
                self.write_taxonomic_tree(filename.replace('labelmap',
                                                           'taxonomy'))
    else:
        print(f"Read taxonomy from '{filename}' in "
              f"{time.time() - start_time:.1f} secs: "
              f"{len(self.id2taxon) - 1:,} taxa including "
              f"{len(self.idx2label):,} leaf taxa.")
    if not scientific_names_only and self.taxonomy_available():
        inat_taxonomy.annotate_common_names(self.id2taxon, all_common_names)
        if label_scores_only:
            self.annotate_labels_with_common_names()
    del self.id2taxon # not needed anymore
# augment labels with common names
def annotate_labels_with_common_names(self):
for taxon in self.id2taxon.values():
for leaf_class_id in taxon.leaf_class_ids:
self.idx2label[leaf_class_id] = taxon.get_name()
# write one row to taxonomy file
def write_row(self, writer, taxon, parent_taxon_id):
writer.writerow([parent_taxon_id, taxon.taxon_id, taxon.rank_level,
';'.join([str(id) for id in taxon.leaf_class_ids]),
taxon.name])
for child in taxon.children:
self.write_row(writer, child, taxon.taxon_id)
# write taxonomy file
def write_taxonomic_tree(self, filename):
try:
with open(filename, 'w', newline='', encoding='latin-1') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['parent_taxon_id', 'taxon_id', 'rank_level',
'leaf_class_id', 'name'])
for child in self.root.children:
self.write_row(writer, child, '')
print(f"Taxonomy written to file '{filename}'.")
except Exception as e:
print(f"Failure writing taxonomy to file '{filename}':", str(e))
try:
os.remove(filename)
except Exception:
pass
# Called after loading label file for Google's AIY Vision Kit.
# Adds all the labels' direct and indirect ancestors to compute
# the taxonomic tree.
def compute_taxonomic_tree(self):
    """Build the taxonomic tree for the labels in ``self.idx2label``.

    Every label name is looked up with ``inat_taxonomy.lookup_id``. A hit
    yields the taxon and its chain of ancestors; nodes that are not yet in
    ``self.id2taxon`` are created and attached below the previous node of
    the chain, starting at ``self.root``. A miss is inserted directly
    below the root as a pseudo-kingdom with a fresh negative id. In both
    cases the label's index ends up in the taxon's ``leaf_class_ids``.

    If the iNaturalist taxonomy cannot be loaded, the module-level flag
    ``label_scores_only`` is switched on and no tree is built.
    """
    global label_scores_only
    taxonomy_loaded = inat_taxonomy.load_inat_taxonomy()
    if not taxonomy_loaded:
        label_scores_only = True
        return

    began = time.time()
    nodes = self.id2taxon
    pseudo_id = 0  # counts down: -1, -2, ... for labels unknown to iNat

    for label_idx, label in self.idx2label.items():
        found = inat_taxonomy.lookup_id(label)
        if not found:
            print(f"Info: Taxon for label '{label}' not found, "
                  "inserting as pseudo-kingdom.")
            pseudo_id -= 1
            kingdom = Taxon(pseudo_id)
            nodes[pseudo_id] = kingdom
            kingdom.rank_level = inat_taxonomy.KINGDOM_RANK_LEVEL
            kingdom.name = label
            kingdom.leaf_class_ids = [label_idx]
            self.root.add_child(kingdom)
            continue

        inat_taxon, ancestors = found
        if label != inat_taxon.name:
            print(f"Info: Taxon '{label}' changed to "
                  f"'{inat_taxon.name}', iNat taxa "
                  f"id {inat_taxon.id}.")

        # Walk down the ancestor chain, creating the nodes we lack.
        parent = self.root
        for ancestor in ancestors:
            if ancestor.id in nodes:
                parent = nodes[ancestor.id]
                continue
            created = Taxon(ancestor.id)
            nodes[ancestor.id] = created
            created.name = ancestor.name
            created.rank_level = ancestor.rank_level
            parent.add_child(created)
            parent = created

        # The labelled taxon itself; it may already exist, e.g. as an
        # ancestor of a label handled earlier.
        if inat_taxon.id in nodes:
            node = nodes[inat_taxon.id]
            assert node.name == inat_taxon.name
            assert node.rank_level == inat_taxon.rank_level
        else:
            node = Taxon(inat_taxon.id)
            nodes[inat_taxon.id] = node
            node.name = inat_taxon.name
            node.rank_level = inat_taxon.rank_level
            parent.add_child(node)
        node.leaf_class_ids.append(label_idx)

    print("Computed taxonomic tree from labels in "
          f"{time.time() - began:.1f} secs: {len(self.id2taxon)-1:,} "
          f"taxa including {len(self.idx2label):,} leaf taxa.")
# propagate scores to taxon and all below
def assign_scores(self, taxon, scores):
    """Set ``score`` on `taxon` and on every taxon below it.

    A taxon's score is the sum of `scores` at its own leaf class ids plus
    the scores of all its children, which are computed recursively first.
    """
    subtotal = 0.0
    for class_id in taxon.leaf_class_ids:
        subtotal += scores[class_id]
    for descendant in taxon.children:
        self.assign_scores(descendant, scores)
        subtotal += descendant.score
    taxon.score = subtotal
# Returns list of 5-tuples (score, taxon_id, taxonomic rank,
# scientific name, common name) ordered by taxonomic rank from kingdom
# down to e.g. species.
# Returns pairs (score, scientific name) if label_scores_only
# is set.
def prediction(self, scores):
    """Turn the classifier's raw `scores` into a ranked result.

    With the module-level flag ``label_scores_only`` set, returns up to
    ``result_sz`` pairs ``(score / sum of scores, label)`` for the labels
    with the highest non-zero scores, best first.

    Otherwise the scores are propagated through the tree with
    `assign_scores` and one path is followed from the root, always taking
    the child with the highest score. The path stops at a leaf, or as soon
    as the best child holds less than half of its parent's score. Each
    path element is a tuple ``(score / root score, taxon id, rank, name)``.
    """
    if label_scores_only:
        # np.argpartition raises ValueError when asked for more elements
        # than there are scores, so never request more than we have.
        top_n = min(result_sz, len(scores))
        if top_n == 0:
            return []
        total = np.sum(scores)
        indices = np.argpartition(scores, -top_n)[-top_n:]
        results = [(scores[i] / total, self.idx2label[i])
                   for i in indices if scores[i] != 0]
        results.sort(reverse=True)
        return results

    # annotate all taxa across the hierarchy with scores.
    self.assign_scores(self.root, scores)

    # return one hierarchical path guided by scores
    path = []
    taxon = self.root
    while taxon.children:
        # max() keeps the first of several equally good children.
        best_child = max(taxon.children, key=lambda child: child.score)
        # Truncate path if all the other children combined are better
        if best_child.score < 0.5 * taxon.score:
            break
        path.append((best_child.score / self.root.score,
                     best_child.taxon_id, best_child.get_rank(),
                     best_child.get_name()))
        taxon = best_child
    return path
#
# Offline image classification.
#
class OfflineClassifier:
    """Classifies image files with a TFLite model and a `Taxonomy`."""

    def __init__(self, filenames):
        """Load the model and its labels.

        `filenames` is a pair: element 0 is the path of the .tflite model,
        element 1 the path of the label or taxonomy file that is handed to
        `Taxonomy.read_taxonomy`.
        """
        # Range that float input pixels are scaled to; the two models
        # named below take input in -1 ... 1 instead of 0 ... 255.
        self.min_pixel_value = 0.0
        self.max_pixel_value = 255.0
        if os.path.split(filenames[0])[1] in ['optimized_model.tflite',
                                              'optimized_model_v1.tflite']:
            self.min_pixel_value = -1.0
            self.max_pixel_value = 1.0

        # Load TFLite model and allocate tensors.
        self.mInterpreter = tflite.Interpreter(model_path=filenames[0])
        self.mInterpreter.allocate_tensors()

        # Get input and output tensors.
        self.mInput_details = self.mInterpreter.get_input_details()
        self.mOutput_details = self.mInterpreter.get_output_details()

        # Read labels or taxonomy
        self.mTaxonomy = Taxonomy()
        self.mTaxonomy.read_taxonomy(filenames[1])

    @staticmethod
    def _square_crop_box(width, height):
        """Return the centered square box (left, top, right, bottom).

        Integer arithmetic keeps the box exactly square; a box with
        fractional coordinates is rounded per coordinate by Pillow and can
        come out one pixel taller or wider than intended.
        """
        side = min(width, height)
        left = (width - side) // 2
        top = (height - side) // 2
        return (left, top, left + side, top + side)

    def classify_image(self, image_filename):
        """Classify one image file.

        Returns whatever `Taxonomy.prediction` returns for the model's
        output, or an empty list when the image cannot be loaded or is not
        in RGB mode. The time taken is printed to stdout.
        """
        start_time = time.time()
        try:
            img = Image.open(image_filename)
        except Exception:
            # Not a bare except: Ctrl-C and SystemExit must still get
            # through while a directory of images is being processed.
            print(f"Error: cannot load image '{image_filename}'.")
            return []

        if img.mode != 'RGB':
            print(f"Error: image '{image_filename}' is of mode '{img.mode}',"
                  " only mode RGB is supported.")
            return []

        # rotate image if needed as it may contain EXIF orientation tag
        img = ImageOps.exif_transpose(img)

        model_size = tuple(self.mInput_details[0]['shape'][1:3])
        # square target shape expected by crop code below
        assert model_size[0] == model_size[1]

        if img.size != model_size:
            # We need to scale and maybe want to crop image.
            width, height = img.size
            if width != height:
                # Before scaling, we crop image to square shape.
                img = img.crop(self._square_crop_box(width, height))
            # scale image
            img = img.resize(model_size)

        # pixels are in range 0 ... 255, turn into numpy array
        input_data = np.array([np.array(img, self.mInput_details[0]['dtype'])])
        if self.mInput_details[0]['dtype'] == np.float32:
            # map 0 ... 255 to min_pixel_value ... max_pixel_value
            input_data *= (self.max_pixel_value - self.min_pixel_value) / 255.0
            input_data += self.min_pixel_value

        self.mInterpreter.set_tensor(self.mInput_details[0]['index'],
                                     input_data)
        self.mInterpreter.invoke()
        output_data = self.mInterpreter.get_tensor(self.mOutput_details[0]
                                                   ['index'])
        path = self.mTaxonomy.prediction(output_data[0])
        print()
        print(f"Classification of '{image_filename}' took "
              f"{time.time() - start_time:.1f} secs.")
        return path
# Returns a dictionary that maps available classifiers to a pair of filenames.
def get_installed_models():
    """Scan CLASSIFIER_DIRECTORY for installed classifiers.

    Returns a dictionary that maps a model name to a pair
    ``(path of .tflite file, path of .csv file)``. A few file names map to
    fixed model names; any other .csv or .tflite file belongs to the first
    of 'birds', 'insects', 'plants' that occurs in its name. When a model
    has several .csv files, one that does not end in 'labelmap.csv' wins.
    Models lacking either file are reported and left out.

    Exits the program with status 1 when the directory does not exist or
    no complete classifier is found.
    """
    if not os.path.isdir(CLASSIFIER_DIRECTORY):
        print("Cannot load classifiers, directory "
              f"'{CLASSIFIER_DIRECTORY}' does not exist.")
        sys.exit(1)

    keywords = ('birds', 'insects', 'plants')
    fixed_names = {
        '.csv': {'taxonomy_v2_13.csv': 'v2_13',
                 'taxonomy_v1.csv': 'Seek'},
        '.tflite': {'optimized_model_v2_13.tflite': 'v2_13',
                    'optimized_model_v1.tflite': 'Seek'},
    }

    models = {}
    for entry in os.listdir(CLASSIFIER_DIRECTORY):
        if entry.endswith(".csv"):
            suffix = '.csv'
        elif entry.endswith(".tflite"):
            suffix = '.tflite'
        else:
            continue

        key = fixed_names[suffix].get(entry)
        if key is None:
            key = next((word for word in keywords if word in entry), None)
        if not key:
            continue

        path = os.path.join(CLASSIFIER_DIRECTORY, entry)
        tflite_path, csv_path = models.get(key, (None, None))
        if suffix == '.tflite':
            models[key] = (path, csv_path)
        elif not csv_path or csv_path.endswith('labelmap.csv'):
            # a taxonomy file that is already registered is kept
            models[key] = (tflite_path, path)

    incomplete = [name for name, pair in models.items()
                  if not pair[0] or not pair[1]]
    for name in incomplete:
        if models[name][1]:
            reason = ".csv file but no .tflite file"
        else:
            reason = ".tflite file but no .csv file"
        print("Installation issue: Excluding incomplete classifier for"
              f" '{name}': {reason}.")
        del models[name]

    if not models:
        print(f"No classifiers found in directory '{CLASSIFIER_DIRECTORY}'; "
              "follow instructions in "
              f"'{os.path.join(CLASSIFIER_DIRECTORY,'README.md')}'"
              " to install them.", file=sys.stderr)
        sys.exit(1)
    return models
def identify_species(classifier, filename):
    """Classify the image `filename` and print the result line by line.

    Entries of length 2 are (score, label) pairs; for longer entries the
    score, element 2 (padded to 11 characters) and element 3 are shown.
    Scores are printed as percentages. Nothing is printed for an empty
    result.
    """
    entries = classifier.classify_image(filename)
    if not entries:
        return
    for entry in entries:
        percent = 100 * entry[0]
        if len(entry) == 2:  # labels only
            print(f'{percent:5.1f}% {entry[1]}')
        else:
            print(f'{percent:5.1f}% {entry[2]:11s} {entry[3]}')
# Command-line parsing.
# The installed classifiers are looked up when this module is loaded;
# model_parameter_check() below validates its argument against `models`.
models = get_installed_models()
def model_parameter_check(arg):
    """argparse type check: accept `arg` only if it names an installed model.

    Returns `arg` unchanged when it is a key of the module-level `models`;
    otherwise raises argparse.ArgumentTypeError with a message that lists
    the available models.
    """
    if arg in models:
        return arg
    plural = '' if len(models) == 1 else 's'
    msg = f"Model '{arg}' not available. Available model{plural}:"
    listing = ', '.join(f"'{name}'" for name in models)
    if listing:
        msg += ' ' + listing
    msg += '.'
    raise argparse.ArgumentTypeError(msg)
def result_size_check(arg):
    """argparse type check: parse `arg` as an integer between 1 and 100.

    Returns the integer; raises argparse.ArgumentTypeError for anything
    else.
    """
    # isdecimal(), not isdigit(): isdigit() also accepts characters such
    # as superscript digits, which int() rejects with a ValueError.
    if arg.isdecimal():
        value = int(arg)
        if 1 <= value <= 100:
            return value
    raise argparse.ArgumentTypeError(f"'{arg}' is not a number "
                                     "between 1 and 100.")
def file_directory_check(arg):
    """argparse type check: accept `arg` only if it is an existing path.

    Returns `arg` unchanged when it is a directory or a regular file;
    otherwise raises argparse.ArgumentTypeError.
    """
    exists = os.path.isdir(arg) or os.path.isfile(arg)
    if not exists:
        raise argparse.ArgumentTypeError(
            f"'{arg}' is not a file or directory.")
    return arg
#
# Identify species for picture files and directories given as command line args
#
if __name__ == '__main__':
    # Script entry point: parse the command line, load the selected
    # classifier and identify every image file given directly or found in
    # a given directory (directories are not searched recursively).
    # argparse is bound at module level here, so the argument validators
    # defined above can refer to it.
    import argparse

    preferred1 = 'v2_13'  # default if this model is available
    preferred2 = 'Seek'   # second preference

    parser = argparse.ArgumentParser()
    if len(models) == 1 or preferred1 in models or preferred2 in models:
        default_model = preferred1 if preferred1 in models else \
                        preferred2 if preferred2 in models else \
                        next(iter(models))
        model_options = {'default': default_model}
    else:  # no default for classification model
        model_options = {'required': True}
    parser.add_argument("-m", "--model", type=model_parameter_check,
                        help="Model to load to identify organisms.",
                        **model_options)
    parser.add_argument('-a', '--all_common_names', action="store_true",
                        help='Show all common names and not just one.')
    parser.add_argument('-l', '--label_scores_only', action="store_true",
                        help='Compute and display only label scores, '
                        'do not propagate scores up the hierarchy.')
    parser.add_argument('-s', '--scientific_names_only', action="store_true",
                        help='Only use scientific names, do not load common '
                        'names.')
    parser.add_argument('-r', '--result_size', type=result_size_check,
                        default=result_sz, help='Number of labels and their '
                        'scores to report in results.')
    parser.add_argument('files_dirs', metavar='file/directory',
                        type=file_directory_check, nargs='+',
                        help='Image files or directories with images.')
    args = parser.parse_args()

    # These are module-level settings read by the code above, so they are
    # assigned here at module scope rather than inside a function.
    scientific_names_only = args.scientific_names_only
    label_scores_only = args.label_scores_only
    all_common_names = args.all_common_names
    result_sz = args.result_size

    # make classifier instance
    classifier = OfflineClassifier(models[args.model])

    # process photos
    image_extensions = ('.jpg', '.jpeg', '.png')  # was misspelled '.jepg'
    for arg in args.files_dirs:
        if os.path.isfile(arg):
            identify_species(classifier, arg)
        elif os.path.isdir(arg):
            for file in os.listdir(arg):
                ext = os.path.splitext(file)[1].lower()
                if ext in image_extensions:
                    identify_species(classifier, os.path.join(arg, file))

Binary file not shown.

After

Width:  |  Height:  |  Size: 196 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 399 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 257 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 254 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 189 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 168 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 198 KiB

View File

@@ -0,0 +1,3 @@
Pillow
requests
tflite-runtime