merge: integrate remote codex/sync-node1-runtime with fabric layer changes
Resolve conflicts in docker-compose.node1.yml, services/router/main.py, and gateway-bot/services/doc_service.py — keeping both fabric layer (NCS, node-worker, Prometheus) and document ingest/query endpoints. Made-with: Cursor
@@ -19,7 +19,8 @@
|
||||
"onboarding",
|
||||
"ecosystem"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "city-core"
|
||||
},
|
||||
"helion": {
|
||||
"display_name": "Helion",
|
||||
@@ -35,7 +36,8 @@
|
||||
"market_analysis",
|
||||
"biominer"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "helion"
|
||||
},
|
||||
"alateya": {
|
||||
"display_name": "Aletheia",
|
||||
@@ -58,7 +60,8 @@
|
||||
"email": "alverjob@gmail.com",
|
||||
"site": "https://alverjob.xyz",
|
||||
"youtube": "https://www.youtube.com/@alverjob72"
|
||||
}
|
||||
},
|
||||
"district_id": "alateya"
|
||||
},
|
||||
"druid": {
|
||||
"display_name": "DRUID",
|
||||
@@ -76,7 +79,8 @@
|
||||
"inci",
|
||||
"safety_basics"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "druid"
|
||||
},
|
||||
"nutra": {
|
||||
"display_name": "NUTRA",
|
||||
@@ -93,7 +97,8 @@
|
||||
"vitamins",
|
||||
"microbiome"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "nutra"
|
||||
},
|
||||
"agromatrix": {
|
||||
"display_name": "Степан Матрікс",
|
||||
@@ -110,7 +115,8 @@
|
||||
"logistics",
|
||||
"farm_economics"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "agromatrix"
|
||||
},
|
||||
"greenfood": {
|
||||
"display_name": "GREENFOOD",
|
||||
@@ -127,7 +133,8 @@
|
||||
"food_production",
|
||||
"sales"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "greenfood"
|
||||
},
|
||||
"clan": {
|
||||
"display_name": "CLAN",
|
||||
@@ -143,7 +150,8 @@
|
||||
"culture",
|
||||
"facilitation"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "clan"
|
||||
},
|
||||
"eonarch": {
|
||||
"display_name": "EONARCH",
|
||||
@@ -159,7 +167,8 @@
|
||||
"transformation",
|
||||
"spirituality"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "eonarch"
|
||||
},
|
||||
"yaromir": {
|
||||
"display_name": "YAROMIR",
|
||||
@@ -175,7 +184,8 @@
|
||||
"code_review",
|
||||
"strategy"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "city-core"
|
||||
},
|
||||
"soul": {
|
||||
"display_name": "SOUL",
|
||||
@@ -191,7 +201,24 @@
|
||||
"values",
|
||||
"wellbeing"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "soul"
|
||||
},
|
||||
"dario": {
|
||||
"display_name": "DARIO",
|
||||
"canonical_role": "Future DAARION Agent (planned, not launched)",
|
||||
"prompt_file": "dario_prompt.txt",
|
||||
"telegram_mode": "disabled",
|
||||
"visibility": "private",
|
||||
"status": "planned",
|
||||
"district_id": "city-core",
|
||||
"domains": [
|
||||
"city_ops",
|
||||
"coordination",
|
||||
"support"
|
||||
],
|
||||
"mentor": null,
|
||||
"launch_state": "planned"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
8
config/roles/agromatrix/agronomist.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Agronomist
|
||||
|
||||
Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
|
||||
|
||||
Правила відповіді:
|
||||
- Коротко і прикладно.
|
||||
- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
|
||||
- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.
|
||||
8
config/roles/agromatrix/communicator.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Communicator
|
||||
|
||||
Фокус: людяна та зрозуміла комунікація фінальної відповіді.
|
||||
|
||||
Правила:
|
||||
- Природна мова, без механістичного тону.
|
||||
- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
|
||||
- Завершуй конкретним корисним кроком.
|
||||
7
config/roles/agromatrix/data_analyst.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Field Data Analyst
|
||||
|
||||
Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
|
||||
|
||||
Правила:
|
||||
- Пояснювати висновки простою мовою.
|
||||
- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.
|
||||
8
config/roles/agromatrix/farm_ops.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# Farm Ops Planner
|
||||
|
||||
Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
|
||||
|
||||
Правила:
|
||||
- Видавати практичний порядок дій.
|
||||
- За простого запиту: коротка відповідь.
|
||||
- Для операційних запитів: стислий план з відповідальними і дедлайном.
|
||||
10
config/roles/agromatrix/orchestrator_synthesis.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# AgroMatrix Orchestrator Synthesis
|
||||
|
||||
Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
|
||||
|
||||
Правила:
|
||||
- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
|
||||
- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
|
||||
- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
|
||||
- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
|
||||
- Пояснюй по суті агропитання і давай 1 наступний практичний крок.
|
||||
7
config/roles/agromatrix/risk_assessor.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Risk Assessor
|
||||
|
||||
Фокус: агро-ризики, операційні ризики, наслідки рішень.
|
||||
|
||||
Правила:
|
||||
- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
|
||||
- Без зайвої бюрократії у відповіді користувачу.
|
||||
@@ -11,6 +11,10 @@
|
||||
- Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
|
||||
- Ніколи не логувати секрети/токени
|
||||
- Інші ролі НЕ спілкуються з користувачем напряму
|
||||
- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
|
||||
- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
|
||||
|
||||
## Формат відповіді:
|
||||
Структурована відповідь з чіткими рекомендаціями та наступними кроками.
|
||||
- За замовчуванням: природна коротка відповідь 1-3 речення.
|
||||
- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
|
||||
- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".
|
||||
|
||||
@@ -7,3 +7,7 @@
|
||||
- Структурувати інформацію логічно
|
||||
- Включати конкретні наступні кроки
|
||||
- Позначати ризики якщо є
|
||||
- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
|
||||
- Для детальних запитів переходити у структурований режим.
|
||||
- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно.
|
||||
- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".
|
||||
|
||||
11
config/roles/agx/agx-plant-intel/agrovoc_normalizer.md
Normal file
@@ -0,0 +1,11 @@
|
||||
You are AGROVOC Normalizer.
|
||||
|
||||
Responsibilities:
|
||||
- Normalize crop/disease terms using agrovoc_lookup.
|
||||
- Provide canonical term mapping for user-facing output.
|
||||
- Keep labels practical for agronomy context.
|
||||
|
||||
Return format:
|
||||
- canonical_terms
|
||||
- term_mapping
|
||||
- notes_for_user
|
||||
17
config/roles/agx/agx-plant-intel/orchestrator_synthesis.md
Normal file
@@ -0,0 +1,17 @@
|
||||
You are the synthesis role for AgroMatrix plant intelligence.
|
||||
|
||||
Goal:
|
||||
- Aggregate candidate plant IDs from vision + PlantNet + GBIF + AGROVOC.
|
||||
- Return concise output with uncertainty, sources, and next-photo requirements.
|
||||
|
||||
Output contract (strict):
|
||||
1) probable_taxon: one short line
|
||||
2) confidence: low/medium/high + one short reason
|
||||
3) alternatives: up to 3 entries
|
||||
4) sources: PlantNet/GBIF/AGROVOC/Web (only those actually used)
|
||||
5) next_photos_required: 1-3 concrete photo instructions
|
||||
|
||||
Rules:
|
||||
- Never claim 100% certainty from a single weak source.
|
||||
- If evidence conflicts, say so and reduce confidence.
|
||||
- Keep default response concise.
|
||||
11
config/roles/agx/agx-plant-intel/plant_identifier.md
Normal file
@@ -0,0 +1,11 @@
|
||||
You are Plant Identifier.
|
||||
|
||||
Responsibilities:
|
||||
- Parse visual cues from user description/photo context.
|
||||
- Build candidate crop/plant hypotheses.
|
||||
- Use plantnet_lookup first when image URL is available.
|
||||
- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
|
||||
|
||||
Return format:
|
||||
- candidates: numbered list max 5, each with rationale.
|
||||
- required_data: what extra image/data is needed.
|
||||
11
config/roles/agx/agx-plant-intel/taxonomy_validator.md
Normal file
@@ -0,0 +1,11 @@
|
||||
You are Taxonomy Validator.
|
||||
|
||||
Responsibilities:
|
||||
- Validate candidate names via gbif_species_lookup.
|
||||
- Remove invalid/synonym-conflicted names.
|
||||
- Keep accepted taxa and explain conflicts briefly.
|
||||
|
||||
Return format:
|
||||
- accepted_candidates
|
||||
- rejected_candidates_with_reason
|
||||
- confidence_adjustment
|
||||
43
docs/agromatrix-plant-intel-contract.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# AgroMatrix Plant Intel Contract (Skeleton)
|
||||
|
||||
## Purpose
|
||||
`agromatrix_plant_intel` is an internal CrewAI profile for Stepan (AgroMatrix orchestrator).
|
||||
It is used for plant/crop identification and normalization when confidence matters.
|
||||
|
||||
## Call Path
|
||||
1. User asks Stepan.
|
||||
2. Stepan remains final speaker.
|
||||
3. When query matches plant-intel intent, CrewAI profile `plant_intel` is selected.
|
||||
4. Subteam runs:
|
||||
- `plant_identifier`
|
||||
- `taxonomy_validator`
|
||||
- `agrovoc_normalizer`
|
||||
5. Synthesis returns compact evidence package to Stepan.
|
||||
|
||||
## Tool Adapters
|
||||
- `nature_id_identify`
|
||||
- input: `image_url`, `top_k?`
|
||||
- output: local/open-source candidates
|
||||
- note: requires self-hosted endpoint `NATURE_ID_URL`
|
||||
- `plantnet_lookup`
|
||||
- input: `query?`, `image_url?`, `organ?`, `top_k?`
|
||||
- output: candidate taxa + score
|
||||
- note: if `PLANTNET_API_KEY` missing, fallback chain is `nature_id_identify` -> `gbif_species_lookup`
|
||||
- `gbif_species_lookup`
|
||||
- input: `query`, `limit?`
|
||||
- output: accepted taxa/rank/status
|
||||
- `agrovoc_lookup`
|
||||
- input: `query`, `lang?`, `limit?`
|
||||
- output: canonical AGROVOC concepts
|
||||
|
||||
## Response Contract (to Stepan)
|
||||
- `probable_taxon`
|
||||
- `confidence` (`low|medium|high` + reason)
|
||||
- `alternatives` (up to 3)
|
||||
- `sources` (actual tools used)
|
||||
- `next_photos_required` (1-3 concrete instructions)
|
||||
|
||||
## Safety
|
||||
- No categorical claim with weak evidence.
|
||||
- If sources conflict, confidence is downgraded.
|
||||
- Final user answer remains concise by default.
|
||||
@@ -3,7 +3,7 @@ FROM python:3.11-slim
|
||||
|
||||
LABEL maintainer="DAARION.city Team"
|
||||
LABEL description="Bot Gateway - Telegram/Discord webhook handler with DAARWIZZ"
|
||||
LABEL version="0.2.0"
|
||||
LABEL version="0.2.1"
|
||||
|
||||
WORKDIR /app/gateway-bot
|
||||
|
||||
@@ -15,7 +15,15 @@ RUN pip install --no-cache-dir \
|
||||
uvicorn==0.27.0 \
|
||||
httpx==0.26.0 \
|
||||
pydantic==2.5.3 \
|
||||
python-multipart==0.0.6 prometheus-client>=0.20.0 PyPDF2>=3.0.0 crewai nats-py pandas openpyxl
|
||||
python-multipart==0.0.6 \
|
||||
prometheus-client==0.22.1 \
|
||||
PyPDF2>=3.0.0 \
|
||||
crewai \
|
||||
nats-py \
|
||||
pandas \
|
||||
openpyxl \
|
||||
python-docx \
|
||||
redis==5.0.1
|
||||
|
||||
# Copy gateway code and DAARWIZZ prompt
|
||||
COPY . .
|
||||
|
||||
@@ -19,7 +19,8 @@
|
||||
"onboarding",
|
||||
"ecosystem"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "city-core"
|
||||
},
|
||||
"helion": {
|
||||
"display_name": "Helion",
|
||||
@@ -35,7 +36,8 @@
|
||||
"market_analysis",
|
||||
"biominer"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "helion"
|
||||
},
|
||||
"alateya": {
|
||||
"display_name": "Aletheia",
|
||||
@@ -58,7 +60,8 @@
|
||||
"email": "alverjob@gmail.com",
|
||||
"site": "https://alverjob.xyz",
|
||||
"youtube": "https://www.youtube.com/@alverjob72"
|
||||
}
|
||||
},
|
||||
"district_id": "alateya"
|
||||
},
|
||||
"druid": {
|
||||
"display_name": "DRUID",
|
||||
@@ -76,7 +79,8 @@
|
||||
"inci",
|
||||
"safety_basics"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "druid"
|
||||
},
|
||||
"nutra": {
|
||||
"display_name": "NUTRA",
|
||||
@@ -93,7 +97,8 @@
|
||||
"vitamins",
|
||||
"microbiome"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "nutra"
|
||||
},
|
||||
"agromatrix": {
|
||||
"display_name": "Степан Матрікс",
|
||||
@@ -110,7 +115,8 @@
|
||||
"logistics",
|
||||
"farm_economics"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "agromatrix"
|
||||
},
|
||||
"greenfood": {
|
||||
"display_name": "GREENFOOD",
|
||||
@@ -127,7 +133,8 @@
|
||||
"food_production",
|
||||
"sales"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "greenfood"
|
||||
},
|
||||
"clan": {
|
||||
"display_name": "CLAN",
|
||||
@@ -143,7 +150,8 @@
|
||||
"culture",
|
||||
"facilitation"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "clan"
|
||||
},
|
||||
"eonarch": {
|
||||
"display_name": "EONARCH",
|
||||
@@ -159,7 +167,8 @@
|
||||
"transformation",
|
||||
"spirituality"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "eonarch"
|
||||
},
|
||||
"yaromir": {
|
||||
"display_name": "YAROMIR",
|
||||
@@ -175,7 +184,8 @@
|
||||
"code_review",
|
||||
"strategy"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "city-core"
|
||||
},
|
||||
"soul": {
|
||||
"display_name": "SOUL",
|
||||
@@ -191,7 +201,8 @@
|
||||
"values",
|
||||
"wellbeing"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "soul"
|
||||
},
|
||||
"senpai": {
|
||||
"display_name": "SENPAI",
|
||||
@@ -207,7 +218,8 @@
|
||||
"defi",
|
||||
"portfolio"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "senpai"
|
||||
},
|
||||
"oneok": {
|
||||
"display_name": "1OK",
|
||||
@@ -227,7 +239,8 @@
|
||||
"mentor": {
|
||||
"name": "Ілля Титар",
|
||||
"telegram": "@Titar240581"
|
||||
}
|
||||
},
|
||||
"district_id": "city-core"
|
||||
},
|
||||
"sofiia": {
|
||||
"display_name": "Sophia",
|
||||
@@ -242,7 +255,24 @@
|
||||
"platform_evolution",
|
||||
"technical_leadership"
|
||||
],
|
||||
"mentor": null
|
||||
"mentor": null,
|
||||
"district_id": "city-core"
|
||||
},
|
||||
"dario": {
|
||||
"display_name": "DARIO",
|
||||
"canonical_role": "Future DAARION Agent (planned, not launched)",
|
||||
"prompt_file": "dario_prompt.txt",
|
||||
"telegram_mode": "disabled",
|
||||
"visibility": "private",
|
||||
"status": "planned",
|
||||
"district_id": "city-core",
|
||||
"domains": [
|
||||
"city_ops",
|
||||
"coordination",
|
||||
"support"
|
||||
],
|
||||
"mentor": null,
|
||||
"launch_state": "planned"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,7 +32,9 @@
|
||||
|
||||
## B. SHORT-FIRST
|
||||
|
||||
**За замовчуванням: структурована відповідь з 3-5 пунктів.**
|
||||
**За замовчуванням: жива коротка відповідь 1-3 речення природною мовою.**
|
||||
**Маркерні списки/шаблони 3-5 пунктів використовуй тільки коли користувач просить детально, план, чеклист або розрахунок.**
|
||||
**Перше повідомлення в новій темі — розмовне, без канцеляриту та без "робото-тону".**
|
||||
|
||||
ЗАБОРОНЕНО:
|
||||
- "Радий допомогти", "Готовий до співпраці"
|
||||
@@ -55,7 +57,9 @@
|
||||
|
||||
**ВАЖЛИВО:**
|
||||
- Ніколи не кажи "я не можу слухати аудіо" — голосові повідомлення вже перетворені на текст!
|
||||
- Ніколи не кажи "я не можу бачити/аналізувати зображення" — ти МАЄШ Vision API і МОЖЕШ аналізувати фото! Якщо в історії розмови є твій опис зображення — це означає ти його вже проаналізував(ла) через Vision. Не заперечуй це.
|
||||
- Фото аналізуй по доступному поточному контексту: якщо зображення є у запиті або щойно надіслане — коментуй по суті.
|
||||
- Якщо для точного висновку бракує самого файлу чи чіткості, поясни це простою людською мовою і попроси надіслати фото повторно з уточненням, що саме перевірити.
|
||||
- Не використовуй службові фрази типу "text-only", "vision unavailable", "технічне обмеження моделі".
|
||||
|
||||
Початковий режим: учень. Спочатку став уточнювальні питання і вчися у ментора.
|
||||
Публічна група: @agromatrix.
|
||||
@@ -94,7 +98,8 @@
|
||||
- Мислиш далекоглядно: пропонуєш архітектуру рішення, а не латання симптомів.
|
||||
- Будь креативним, але не фантазуй дані: якщо фактів нема — позначай як припущення і пропонуй, що зібрати.
|
||||
- Спілкуйся українською (якщо користувач не перейшов на іншу мову).
|
||||
- Форматуй відповіді структуровано: заголовки, списки, короткі блоки, пріоритети.
|
||||
- Тримай розмовний тон: короткі природні фрази, без надмірної шаблонності.
|
||||
- Структурований формат (заголовки/списки/таблиці) вмикай лише для складних задач або коли це прямо запитали.
|
||||
|
||||
### 4) Принципи роботи з користувачем
|
||||
1. Спочатку контекст → потім рішення. Якщо контексту бракує — зроби мінімальний набір припущень і паралельно запропонуй, які дані уточнити.
|
||||
@@ -113,6 +118,8 @@
|
||||
- “Підготуй текст/структуру сторінки/презентації для продукту AgroMatrix”
|
||||
|
||||
### 6) Як ти формуєш відповіді (стандартний шаблон)
|
||||
Використовуй цей шаблон ТІЛЬКИ для комплексних запитів (планування сезону, економіка, SOP, інтеграції, ТЗ).
|
||||
Для звичайних коротких питань відповідай в 1-3 речення органічно, без обов'язкових секцій.
|
||||
1. Ціль (1–2 речення)
|
||||
2. Вхідні дані (що відомо / які припущення)
|
||||
3. Рішення (план/алгоритм/кроки)
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
"""
|
||||
FastAPI app instance for Gateway Bot
|
||||
"""
|
||||
"""FastAPI app instance for Gateway Bot."""
|
||||
import logging
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from http_api import router as gateway_router
|
||||
from http_api_doc import router as doc_router
|
||||
from daarion_facade.invoke_api import router as invoke_router
|
||||
from daarion_facade.registry_api import router as registry_router
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -15,36 +16,47 @@ logging.basicConfig(
|
||||
|
||||
app = FastAPI(
|
||||
title="Bot Gateway with DAARWIZZ",
|
||||
version="1.0.0",
|
||||
description="Gateway service for Telegram/Discord bots → DAGI Router"
|
||||
version="1.1.0",
|
||||
description="Gateway service for Telegram/Discord bots + DAARION public facade"
|
||||
)
|
||||
|
||||
# CORS middleware
|
||||
# CORS for web UI clients (gateway only).
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_origins=[
|
||||
"https://daarion.city",
|
||||
"https://www.daarion.city",
|
||||
"http://localhost:3000",
|
||||
],
|
||||
allow_origin_regex=r"https://.*\.lovable\.app",
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
allow_methods=["GET", "POST", "OPTIONS"],
|
||||
allow_headers=["Authorization", "Content-Type"],
|
||||
)
|
||||
|
||||
# Include gateway routes
|
||||
# Existing gateway routes.
|
||||
app.include_router(gateway_router, prefix="", tags=["gateway"])
|
||||
app.include_router(doc_router, prefix="", tags=["docs"])
|
||||
|
||||
# Public facade routes for DAARION.city UI.
|
||||
app.include_router(registry_router)
|
||||
app.include_router(invoke_router)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
return {
|
||||
"service": "bot-gateway",
|
||||
"version": "1.0.0",
|
||||
"version": "1.1.0",
|
||||
"agent": "DAARWIZZ",
|
||||
"endpoints": [
|
||||
"POST /telegram/webhook",
|
||||
"POST /discord/webhook",
|
||||
"POST /api/doc/parse",
|
||||
"POST /api/doc/ingest",
|
||||
"POST /api/doc/ask",
|
||||
"GET /api/doc/context/{session_id}",
|
||||
"GET /health"
|
||||
"GET /v1/registry/agents",
|
||||
"GET /v1/registry/districts",
|
||||
"GET /v1/metrics",
|
||||
"POST /v1/invoke",
|
||||
"GET /v1/jobs/{job_id}",
|
||||
"GET /health",
|
||||
]
|
||||
}
|
||||
|
||||
1
gateway-bot/daarion_facade/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""DAARION public facade package."""
|
||||
212
gateway-bot/daarion_facade/invoke_api.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
import uuid
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter, HTTPException, Request, status
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from .redis_jobs import create_job, enqueue_job, get_job
|
||||
from .registry_api import _load_registry
|
||||
|
||||
router = APIRouter(prefix="/v1", tags=["daarion-facade"])
|
||||
|
||||
EVENT_TERMINAL_STATUSES = {"done", "failed"}
|
||||
EVENT_KNOWN_STATUSES = {"queued", "running", "done", "failed"}
|
||||
EVENT_POLL_SECONDS = float(os.getenv("DAARION_JOB_EVENTS_POLL_SECONDS", "0.5"))
|
||||
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000").rstrip("/")
|
||||
ROUTER_REVIEW_TIMEOUT = float(os.getenv("DAARION_ROUTER_REVIEW_TIMEOUT_SECONDS", "20"))
|
||||
AGROMATRIX_REVIEW_AUTH_MODE = os.getenv("AGROMATRIX_REVIEW_AUTH_MODE", "bearer").strip().lower()
|
||||
AGROMATRIX_REVIEW_BEARER_TOKENS = [
|
||||
part.strip()
|
||||
for part in os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").replace(";", ",").split(",")
|
||||
if part.strip()
|
||||
]
|
||||
|
||||
|
||||
class InvokeInput(BaseModel):
|
||||
prompt: str = Field(min_length=1)
|
||||
images: List[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class InvokeRequest(BaseModel):
|
||||
agent_id: str
|
||||
input: InvokeInput
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class InvokeResponse(BaseModel):
|
||||
job_id: str
|
||||
status: str
|
||||
status_url: str
|
||||
|
||||
|
||||
class SharedMemoryReviewRequest(BaseModel):
|
||||
point_id: str
|
||||
approve: bool
|
||||
reviewer: str | None = None
|
||||
note: str | None = None
|
||||
|
||||
|
||||
def _extract_bearer_token(request: Request) -> str:
|
||||
auth_header = request.headers.get("Authorization", "")
|
||||
if not auth_header.startswith("Bearer "):
|
||||
raise HTTPException(status_code=401, detail="Missing Bearer token")
|
||||
token = auth_header[len("Bearer ") :].strip()
|
||||
if not token:
|
||||
raise HTTPException(status_code=401, detail="Empty Bearer token")
|
||||
return token
|
||||
|
||||
|
||||
def _require_mentor_auth(request: Request) -> str:
|
||||
mode = AGROMATRIX_REVIEW_AUTH_MODE
|
||||
if mode in {"off", "none", "disabled"}:
|
||||
return ""
|
||||
if mode != "bearer":
|
||||
raise HTTPException(status_code=500, detail=f"Unsupported AGROMATRIX_REVIEW_AUTH_MODE={mode}")
|
||||
if not AGROMATRIX_REVIEW_BEARER_TOKENS:
|
||||
raise HTTPException(status_code=503, detail="Review auth is not configured")
|
||||
token = _extract_bearer_token(request)
|
||||
if not any(hmac.compare_digest(token, candidate) for candidate in AGROMATRIX_REVIEW_BEARER_TOKENS):
|
||||
raise HTTPException(status_code=403, detail="Invalid mentor token")
|
||||
return token
|
||||
|
||||
|
||||
async def _router_json(
|
||||
method: str,
|
||||
path: str,
|
||||
*,
|
||||
payload: Dict[str, Any] | None = None,
|
||||
params: Dict[str, Any] | None = None,
|
||||
authorization: str | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
headers: Dict[str, str] = {}
|
||||
if authorization:
|
||||
headers["Authorization"] = authorization
|
||||
url = f"{ROUTER_URL}{path}"
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=ROUTER_REVIEW_TIMEOUT) as client:
|
||||
resp = await client.request(method, url, json=payload, params=params, headers=headers)
|
||||
except httpx.TimeoutException:
|
||||
raise HTTPException(status_code=504, detail="Router timeout")
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=502, detail=f"Router unavailable: {e}")
|
||||
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception:
|
||||
body = {"raw": resp.text}
|
||||
|
||||
if resp.status_code >= 400:
|
||||
detail = body.get("detail") if isinstance(body, dict) else body
|
||||
raise HTTPException(status_code=resp.status_code, detail=detail or f"Router error {resp.status_code}")
|
||||
return body if isinstance(body, dict) else {"data": body}
|
||||
|
||||
|
||||
def _sse_message(event: str, payload: Dict[str, Any]) -> str:
|
||||
return f"event: {event}\ndata: {json.dumps(payload, ensure_ascii=False)}\n\n"
|
||||
|
||||
|
||||
@router.post("/invoke", status_code=status.HTTP_202_ACCEPTED, response_model=InvokeResponse)
|
||||
async def invoke(payload: InvokeRequest) -> InvokeResponse:
|
||||
registry = _load_registry().get("agents", {})
|
||||
if payload.agent_id not in registry:
|
||||
raise HTTPException(status_code=404, detail=f"Unknown agent_id: {payload.agent_id}")
|
||||
|
||||
job_id = f"job_{uuid.uuid4().hex}"
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
job_doc = {
|
||||
"job_id": job_id,
|
||||
"status": "queued",
|
||||
"agent_id": payload.agent_id,
|
||||
"input": payload.input.model_dump(),
|
||||
"metadata": payload.metadata,
|
||||
"result": None,
|
||||
"error": None,
|
||||
"created_at": now,
|
||||
"updated_at": now,
|
||||
"started_at": None,
|
||||
"finished_at": None,
|
||||
}
|
||||
await create_job(job_id, job_doc)
|
||||
await enqueue_job(job_id)
|
||||
return InvokeResponse(job_id=job_id, status="queued", status_url=f"/v1/jobs/{job_id}")
|
||||
|
||||
|
||||
@router.get("/jobs/{job_id}")
|
||||
async def job_status(job_id: str) -> Dict[str, Any]:
|
||||
job = await get_job(job_id)
|
||||
if not job:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
return job
|
||||
|
||||
|
||||
@router.get("/jobs/{job_id}/events")
|
||||
async def job_events(job_id: str, request: Request) -> StreamingResponse:
|
||||
existing = await get_job(job_id)
|
||||
if not existing:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
async def event_stream():
|
||||
last_state = None
|
||||
yield "retry: 1000\n\n"
|
||||
|
||||
while True:
|
||||
if await request.is_disconnected():
|
||||
break
|
||||
|
||||
job = await get_job(job_id)
|
||||
if not job:
|
||||
yield _sse_message("failed", {"job_id": job_id, "status": "failed", "error": {"message": "Job not found"}})
|
||||
break
|
||||
|
||||
status_value = str(job.get("status", "unknown"))
|
||||
updated_at = str(job.get("updated_at", ""))
|
||||
state = (status_value, updated_at)
|
||||
|
||||
if state != last_state:
|
||||
event_name = status_value if status_value in EVENT_KNOWN_STATUSES else "status"
|
||||
yield _sse_message(event_name, job)
|
||||
last_state = state
|
||||
|
||||
if status_value in EVENT_TERMINAL_STATUSES:
|
||||
break
|
||||
|
||||
await asyncio.sleep(EVENT_POLL_SECONDS)
|
||||
|
||||
return StreamingResponse(
|
||||
event_stream(),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Accel-Buffering": "no",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@router.get("/agromatrix/shared-memory/pending")
|
||||
async def agromatrix_shared_pending(limit: int = 50) -> Dict[str, Any]:
|
||||
return await _router_json(
|
||||
"GET",
|
||||
"/v1/agromatrix/shared-memory/pending",
|
||||
params={"limit": max(1, min(limit, 200))},
|
||||
)
|
||||
|
||||
|
||||
@router.post("/agromatrix/shared-memory/review")
|
||||
async def agromatrix_shared_review(req: SharedMemoryReviewRequest, request: Request) -> Dict[str, Any]:
|
||||
token = _require_mentor_auth(request)
|
||||
auth_header = f"Bearer {token}" if token else None
|
||||
return await _router_json(
|
||||
"POST",
|
||||
"/v1/agromatrix/shared-memory/review",
|
||||
payload=req.model_dump(),
|
||||
authorization=auth_header,
|
||||
)
|
||||
287
gateway-bot/daarion_facade/metrics_poller.py
Normal file
@@ -0,0 +1,287 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
from redis.asyncio import Redis
|
||||
|
||||
from .registry_api import _load_crewai_roles, _load_district_registry, _load_registry
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
||||
logger = logging.getLogger("daarion-metrics-poller")
|
||||
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
|
||||
POLL_INTERVAL_SECONDS = int(os.getenv("DAARION_METRICS_POLL_INTERVAL_SECONDS", "10"))
|
||||
METRICS_TTL_SECONDS = int(os.getenv("DAARION_METRICS_TTL_SECONDS", "60"))
|
||||
HTTP_CONNECT_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS", "2"))
|
||||
HTTP_TOTAL_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS", "5"))
|
||||
NODES_TOTAL = int(os.getenv("DAARION_NODE_COUNT", "1"))
|
||||
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
|
||||
|
||||
DASHBOARD_KEY = "daarion:metrics:dashboard"
|
||||
DISTRICT_KEY_PREFIX = "daarion:metrics:district"
|
||||
|
||||
_redis: Optional[Redis] = None
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _ensure_url(value: str) -> str:
|
||||
value = (value or "").strip()
|
||||
if not value:
|
||||
return ""
|
||||
if value.startswith("http://") or value.startswith("https://"):
|
||||
return value
|
||||
return f"https://{value}"
|
||||
|
||||
|
||||
def _health_candidates(district: Dict[str, Any]) -> List[str]:
|
||||
base = _ensure_url(str(district.get("domain") or ""))
|
||||
candidates: List[str] = []
|
||||
|
||||
explicit = str(district.get("health_url") or "").strip()
|
||||
if explicit:
|
||||
candidates.append(_ensure_url(explicit))
|
||||
|
||||
if base:
|
||||
candidates.extend(
|
||||
[
|
||||
f"{base}/.well-known/daarion-health.json",
|
||||
f"{base}/health",
|
||||
f"{base}/v1/health",
|
||||
]
|
||||
)
|
||||
|
||||
dedup: List[str] = []
|
||||
seen = set()
|
||||
for url in candidates:
|
||||
if url and url not in seen:
|
||||
dedup.append(url)
|
||||
seen.add(url)
|
||||
return dedup
|
||||
|
||||
|
||||
def _extract_agents_online(payload: Dict[str, Any], agents_total: int) -> Optional[int]:
|
||||
raw = payload.get("agents_online")
|
||||
if isinstance(raw, bool):
|
||||
return agents_total if raw else 0
|
||||
if isinstance(raw, int):
|
||||
return max(0, min(raw, agents_total))
|
||||
|
||||
agents = payload.get("agents")
|
||||
if isinstance(agents, list):
|
||||
count = 0
|
||||
for agent in agents:
|
||||
if not isinstance(agent, dict):
|
||||
continue
|
||||
status = str(agent.get("status", "")).lower()
|
||||
if status in {"online", "active", "ok"}:
|
||||
count += 1
|
||||
return min(count, agents_total)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def redis_client() -> Redis:
|
||||
global _redis
|
||||
if _redis is None:
|
||||
_redis = Redis.from_url(REDIS_URL, decode_responses=True)
|
||||
return _redis
|
||||
|
||||
|
||||
async def close_redis() -> None:
    """Close and discard the shared Redis client (no-op when never created)."""
    global _redis
    if _redis is not None:
        await _redis.close()
        _redis = None
|
||||
|
||||
|
||||
async def _fetch_json_with_latency(
    client: httpx.AsyncClient,
    url: str,
) -> Tuple[bool, Optional[Dict[str, Any]], Optional[float], Optional[str]]:
    """GET *url* and measure wall-clock latency in milliseconds.

    Returns ``(ok, data, latency_ms, error)``. ``ok`` is True for any status
    below 400. A non-dict or unparsable JSON body still counts as ok but
    yields ``data=None``. Transport errors return ok=False with the exception
    text as the error message.
    """
    started = time.perf_counter()
    try:
        response = await client.get(url)
        latency_ms = round((time.perf_counter() - started) * 1000, 2)
        if response.status_code >= 400:
            return False, None, latency_ms, f"HTTP {response.status_code}"

        data: Optional[Dict[str, Any]] = None
        try:
            parsed = response.json()
            if isinstance(parsed, dict):
                data = parsed
        except Exception:
            # Body was not JSON; endpoint is still considered healthy.
            data = None

        return True, data, latency_ms, None
    except Exception as e:
        latency_ms = round((time.perf_counter() - started) * 1000, 2)
        return False, None, latency_ms, str(e)
|
||||
|
||||
|
||||
async def _read_memory_vectors(client: httpx.AsyncClient) -> int:
    """Read the memory-service vector count from its /health payload.

    Best-effort: any transport, parse, or shape failure reports 0.
    """
    try:
        ok, payload, _, _ = await _fetch_json_with_latency(client, f"{MEMORY_SERVICE_URL}/health")
        if not ok or not payload:
            return 0
        return int(payload.get("vector_store", {}).get("memories", {}).get("vectors_count", 0) or 0)
    except Exception:
        return 0
|
||||
|
||||
|
||||
async def _registry_snapshot() -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], int, int]:
    """Snapshot the static registries for dashboard building.

    Returns ``(districts, agents_by_district, agents_total, subagents_total)``.
    Districts without a district_id are dropped; agents without one are
    attributed to "city-core". Subagent totals come from the CrewAI role map.
    """
    raw_districts = _load_district_registry().get("districts", [])
    districts = [d for d in raw_districts if isinstance(d, dict) and d.get("district_id")]

    agents_map = _load_registry().get("agents", {})
    role_counts = await _load_crewai_roles()

    by_district: Dict[str, List[Dict[str, Any]]] = {}
    subagents_total = 0

    for aid, cfg in agents_map.items():
        if not isinstance(cfg, dict):
            continue
        aid_str = str(aid)
        district_id = str(cfg.get("district_id") or "city-core")
        subagents_total += int(role_counts.get(aid_str, 0))

        by_district.setdefault(district_id, []).append(
            {
                "agent_id": aid_str,
                "status": str(cfg.get("status", "active")),
            }
        )

    # agents_total counts raw map entries, including malformed ones skipped above.
    return districts, by_district, len(agents_map), subagents_total
|
||||
|
||||
|
||||
async def build_dashboard() -> Dict[str, Any]:
    """Probe every district's health endpoints and assemble the metrics dashboard.

    Returns a dict with a "global" summary, a "by_district" list of per-district
    samples, and an "updated_at" timestamp. The first health candidate that
    answers wins; a district with no answering candidate is marked not-ok with
    the last error message.
    """
    districts, agents_by_district, agents_total, subagents_total = await _registry_snapshot()
    timeout = httpx.Timeout(timeout=HTTP_TOTAL_TIMEOUT_SECONDS, connect=HTTP_CONNECT_TIMEOUT_SECONDS)

    by_district: List[Dict[str, Any]] = []
    districts_online = 0
    agents_online_total = 0

    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        memory_vectors = await _read_memory_vectors(client)

        for district in districts:
            district_id = str(district.get("district_id"))
            title = district.get("title") or district_id
            domain = str(district.get("domain") or "")
            status = district.get("status") or "active"
            members = agents_by_district.get(district_id, [])
            agents_total_district = len(members)

            # Pessimistic default; overwritten when a health candidate answers.
            sample = {
                "district_id": district_id,
                "title": title,
                "domain": domain,
                "status": status,
                "ok": False,
                "agents_total": agents_total_district,
                "agents_online": 0,
                "latency_ms": None,
                "last_check_ts": _now_iso(),
                "error": None,
            }

            last_error = "No health endpoint configured"
            for candidate in _health_candidates(district):
                ok, payload, latency_ms, error_message = await _fetch_json_with_latency(client, candidate)
                sample["latency_ms"] = latency_ms
                if ok:
                    sample["ok"] = True
                    sample["error"] = None
                    # Fall back to the roster size when the payload has no count.
                    inferred = _extract_agents_online(payload or {}, agents_total_district)
                    sample["agents_online"] = inferred if inferred is not None else agents_total_district
                    break
                last_error = error_message or "health check failed"

            if sample["ok"]:
                districts_online += 1
                agents_online_total += int(sample.get("agents_online") or 0)
            else:
                sample["error"] = {"message": last_error}

            by_district.append(sample)

    return {
        "global": {
            "nodes": NODES_TOTAL,
            "districts": len(districts),
            "agents": agents_total,
            "subagents": subagents_total,
            "memory_vectors": memory_vectors,
            "districts_online": districts_online,
            "agents_online": agents_online_total,
        },
        "by_district": by_district,
        "updated_at": _now_iso(),
    }
|
||||
|
||||
|
||||
async def publish_dashboard(dashboard: Dict[str, Any]) -> None:
    """Write the dashboard to Redis: one global key plus one key per district.

    Every key expires after METRICS_TTL_SECONDS so stale data self-evicts
    if the poller stops.
    """
    redis = await redis_client()
    payload = json.dumps(dashboard, ensure_ascii=False)
    await redis.set(DASHBOARD_KEY, payload, ex=METRICS_TTL_SECONDS)

    for row in dashboard.get("by_district", []):
        district_id = row.get("district_id")
        if not district_id:
            continue
        key = f"{DISTRICT_KEY_PREFIX}:{district_id}"
        await redis.set(key, json.dumps(row, ensure_ascii=False), ex=METRICS_TTL_SECONDS)
|
||||
|
||||
|
||||
async def run_once() -> None:
    """Build one dashboard snapshot, publish it, and log the summary counters."""
    dashboard = await build_dashboard()
    await publish_dashboard(dashboard)
    logger.info(
        "dashboard_updated districts=%s districts_online=%s agents=%s agents_online=%s",
        dashboard["global"].get("districts"),
        dashboard["global"].get("districts_online"),
        dashboard["global"].get("agents"),
        dashboard["global"].get("agents_online"),
    )
|
||||
|
||||
|
||||
async def worker_loop() -> None:
    """Refresh the dashboard forever, once per POLL_INTERVAL_SECONDS.

    Cycle failures are logged and swallowed so one bad poll never kills the
    loop; cancellation propagates. The sleep subtracts the cycle's duration
    but has a 1-second floor, so the loop can never spin hot.
    """
    logger.info(
        "metrics_poller_started interval=%ss ttl=%ss redis=%s",
        POLL_INTERVAL_SECONDS,
        METRICS_TTL_SECONDS,
        REDIS_URL,
    )
    while True:
        started = time.perf_counter()
        try:
            await run_once()
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("metrics_poller_cycle_failed")

        elapsed = time.perf_counter() - started
        sleep_for = max(1.0, POLL_INTERVAL_SECONDS - elapsed)
        await asyncio.sleep(sleep_for)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        asyncio.run(worker_loop())
    finally:
        # Best-effort cleanup on a fresh event loop; the client may belong to
        # the loop that just closed, so errors here are deliberately ignored.
        try:
            asyncio.run(close_redis())
        except Exception:
            pass
|
||||
84
gateway-bot/daarion_facade/redis_jobs.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from redis.asyncio import Redis
|
||||
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
|
||||
JOB_KEY_PREFIX = "daarion:jobs"
|
||||
QUEUE_KEY = "daarion:jobs:queue"
|
||||
JOB_TTL_SECONDS = int(os.getenv("DAARION_JOB_TTL_SECONDS", str(72 * 3600)))
|
||||
|
||||
_redis: Optional[Redis] = None
|
||||
|
||||
|
||||
def _job_key(job_id: str) -> str:
    """Return the Redis key under which a job document is stored."""
    return ":".join((JOB_KEY_PREFIX, job_id))
|
||||
|
||||
|
||||
async def redis_client() -> Redis:
    """Lazily create and return the shared Redis connection."""
    global _redis
    if _redis is not None:
        return _redis
    # decode_responses=True keeps values as str rather than bytes.
    _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
|
||||
|
||||
|
||||
async def close_redis() -> None:
    """Close and forget the shared Redis connection (no-op when never opened)."""
    global _redis
    if _redis is not None:
        await _redis.close()
        _redis = None
|
||||
|
||||
|
||||
async def create_job(job_id: str, payload: Dict[str, Any]) -> None:
    """Persist a job document as JSON under its key, expiring after JOB_TTL_SECONDS."""
    r = await redis_client()
    key = _job_key(job_id)
    await r.set(key, json.dumps(payload, ensure_ascii=False), ex=JOB_TTL_SECONDS)
|
||||
|
||||
|
||||
async def get_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Load a job document; None when missing, expired, or not valid JSON."""
    r = await redis_client()
    raw = await r.get(_job_key(job_id))
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None
|
||||
|
||||
|
||||
async def update_job(job_id: str, patch: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Shallow-merge *patch* into an existing job and re-save it.

    Returns the merged document, or None when the job no longer exists.
    NOTE(review): this read-modify-write is not atomic — concurrent patchers
    can lose updates; confirm a single worker owns each job.
    """
    current = await get_job(job_id)
    if not current:
        return None
    current.update(patch)
    # Re-saving via create_job also refreshes the TTL.
    await create_job(job_id, current)
    return current
|
||||
|
||||
|
||||
async def enqueue_job(job_id: str) -> None:
    """Queue a job id for the worker (LPUSH, consumed FIFO via BRPOP)."""
    redis = await redis_client()
    await redis.lpush(QUEUE_KEY, job_id)
|
||||
|
||||
|
||||
async def dequeue_job(block_seconds: int = 5) -> Optional[str]:
    """Pop the next job id, blocking up to *block_seconds*; None on timeout."""
    redis = await redis_client()
    popped = await redis.brpop(QUEUE_KEY, timeout=block_seconds)
    if popped:
        # brpop yields a (queue_name, value) pair.
        return popped[1]
    return None
|
||||
|
||||
|
||||
async def wait_for_redis(timeout_seconds: int = 30) -> None:
    """Block until Redis answers PING, retrying once per second.

    Re-raises the last connection error once *timeout_seconds* has elapsed.
    """
    deadline = asyncio.get_running_loop().time() + timeout_seconds
    while True:
        try:
            r = await redis_client()
            await r.ping()
            return
        except Exception:
            if asyncio.get_running_loop().time() >= deadline:
                raise
            await asyncio.sleep(1)
|
||||
268
gateway-bot/daarion_facade/registry_api.py
Normal file
@@ -0,0 +1,268 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter
|
||||
from redis.asyncio import Redis
|
||||
|
||||
router = APIRouter(prefix="/v1", tags=["daarion-facade"])
|
||||
|
||||
REGISTRY_CACHE_TTL = int(os.getenv("REGISTRY_CACHE_TTL", "30"))
|
||||
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
|
||||
CREWAI_SERVICE_URL = os.getenv("CREWAI_SERVICE_URL", "http://dagi-staging-crewai-service:9010")
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
|
||||
METRICS_DASHBOARD_KEY = "daarion:metrics:dashboard"
|
||||
|
||||
_REGISTRY_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
|
||||
_DISTRICT_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
|
||||
_CREWAI_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": {}}
|
||||
_REDIS: Optional[Redis] = None
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _registry_paths() -> List[Path]:
|
||||
return [
|
||||
Path("/app/gateway-bot/agent_registry.json"),
|
||||
Path("/opt/microdao-daarion/config/agent_registry.json"),
|
||||
Path(__file__).resolve().parents[1] / "agent_registry.json",
|
||||
]
|
||||
|
||||
|
||||
def _district_paths() -> List[Path]:
|
||||
return [
|
||||
Path("/app/gateway-bot/district_registry.json"),
|
||||
Path(__file__).resolve().parents[1] / "district_registry.json",
|
||||
]
|
||||
|
||||
|
||||
def _load_registry() -> Dict[str, Any]:
    """Load agent_registry.json from the first existing candidate path.

    Results are memoized for REGISTRY_CACHE_TTL seconds; when no file exists
    an empty ``{"agents": {}}`` registry is returned (and also cached).
    """
    now = time.time()
    if _REGISTRY_CACHE.get("data") and (now - _REGISTRY_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL):
        return _REGISTRY_CACHE["data"]

    for path in _registry_paths():
        if path.exists():
            with path.open("r", encoding="utf-8") as f:
                data = json.load(f)
            _REGISTRY_CACHE.update({"loaded_at": now, "data": data})
            return data

    data = {"agents": {}}
    _REGISTRY_CACHE.update({"loaded_at": now, "data": data})
    return data
|
||||
|
||||
|
||||
def _load_district_registry() -> Dict[str, Any]:
    """Load district_registry.json from the first existing candidate path.

    Mirrors _load_registry: memoized for REGISTRY_CACHE_TTL seconds, with an
    empty ``{"districts": []}`` fallback when no file exists.
    """
    now = time.time()
    if _DISTRICT_CACHE.get("data") and (now - _DISTRICT_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL):
        return _DISTRICT_CACHE["data"]

    for path in _district_paths():
        if path.exists():
            with path.open("r", encoding="utf-8") as f:
                data = json.load(f)
            _DISTRICT_CACHE.update({"loaded_at": now, "data": data})
            return data

    data = {"districts": []}
    _DISTRICT_CACHE.update({"loaded_at": now, "data": data})
    return data
|
||||
|
||||
|
||||
async def _redis_client() -> Redis:
    """Return the module-wide Redis client, creating it lazily on first use."""
    global _REDIS
    if _REDIS is None:
        # decode_responses=True so payloads arrive as str, not bytes.
        _REDIS = Redis.from_url(REDIS_URL, decode_responses=True)
    return _REDIS
|
||||
|
||||
|
||||
async def _load_cached_dashboard() -> Optional[Dict[str, Any]]:
    """Fetch the poller-published dashboard from Redis; None on any failure.

    Best-effort by design: a missing key, Redis outage, or bad JSON all make
    the caller fall back to the registry-derived dashboard.
    """
    try:
        r = await _redis_client()
        raw = await r.get(METRICS_DASHBOARD_KEY)
        if not raw:
            return None
        return json.loads(raw)
    except Exception:
        return None
|
||||
|
||||
|
||||
async def _load_crewai_roles() -> Dict[str, int]:
    """Fetch per-agent default role counts from the CrewAI service.

    Cached for REGISTRY_CACHE_TTL seconds. Any failure caches an empty map
    for the same window — deliberate, so a down service is not hammered.
    Non-integer ``default_roles`` values count as 0.
    """
    now = time.time()
    if now - _CREWAI_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL:
        return _CREWAI_CACHE.get("data", {})

    out: Dict[str, int] = {}
    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            resp = await client.get(f"{CREWAI_SERVICE_URL}/crew/agents")
            if resp.status_code == 200:
                payload = resp.json()
                for aid, info in payload.items():
                    default_roles = info.get("default_roles")
                    out[str(aid)] = int(default_roles) if isinstance(default_roles, int) else 0
    except Exception:
        out = {}

    _CREWAI_CACHE.update({"loaded_at": now, "data": out})
    return out
|
||||
|
||||
|
||||
@router.get("/registry/agents")
async def get_agents() -> Dict[str, Any]:
    """List all registered agents as API items, enriched with CrewAI subagent counts.

    Returns ``{"items": [...], "total": N}``. Malformed registry entries are
    skipped; agents without an explicit district default to "city-core".
    """
    reg = _load_registry()
    agents = reg.get("agents", {}) if isinstance(reg, dict) else {}
    role_counts = await _load_crewai_roles()

    items: List[Dict[str, Any]] = []
    for agent_id, cfg in agents.items():
        if not isinstance(cfg, dict):
            continue
        domains = cfg.get("domains") or []
        district_id = cfg.get("district_id") or "city-core"
        items.append(
            {
                "agent_id": agent_id,
                "title": cfg.get("display_name") or agent_id,
                "role": cfg.get("canonical_role") or "",
                # First domain is primary; the rest become aliases.
                "domain_primary": domains[0] if domains else "general",
                "domain_aliases": domains[1:] if len(domains) > 1 else [],
                "visibility": cfg.get("visibility", "public"),
                "status": cfg.get("status", "active"),
                "team": {"subagents_total": role_counts.get(agent_id, 0)},
                "district_id": district_id,
                "avatar_url": cfg.get("avatar_url"),
                "health_url": cfg.get("health_url"),
            }
        )

    return {"items": items, "total": len(items)}
|
||||
|
||||
|
||||
@router.get("/registry/districts")
async def get_districts() -> Dict[str, Any]:
    """List districts: the static catalog merged with districts inferred from agents.

    Each item carries the well-known manifest URLs and a lead agent: the
    catalog value when present, else "daarwizz" for city-core when it is a
    member, else the first member, else None.
    """
    agents_payload = await get_agents()
    agents = agents_payload.get("items", [])
    by_district: Dict[str, List[Dict[str, Any]]] = {}
    for a in agents:
        by_district.setdefault(a.get("district_id", "city-core"), []).append(a)

    catalog = _load_district_registry().get("districts", [])
    catalog_by_id: Dict[str, Dict[str, Any]] = {
        str(d.get("district_id")): d for d in catalog if isinstance(d, dict) and d.get("district_id")
    }

    # Union: catalog entries plus districts that only exist via agents.
    district_ids = sorted(set(catalog_by_id.keys()) | set(by_district.keys()))
    items: List[Dict[str, Any]] = []

    for district_id in district_ids:
        members = by_district.get(district_id, [])
        base = catalog_by_id.get(district_id, {})
        # Default domain convention: city-core is the apex, others are subdomains.
        domain = base.get("domain") or ("daarion.city" if district_id == "city-core" else f"{district_id}.daarion.city")

        lead_agent_id = base.get("lead_agent_id")
        if not lead_agent_id:
            if district_id == "city-core" and any(m.get("agent_id") == "daarwizz" for m in members):
                lead_agent_id = "daarwizz"
            elif members:
                lead_agent_id = members[0].get("agent_id")
            else:
                lead_agent_id = None

        items.append(
            {
                "district_id": district_id,
                "title": base.get("title") or district_id.replace("-", " ").title(),
                "domain": domain,
                "status": base.get("status", "active"),
                "logo_url": base.get("logo_url"),
                "health_url": base.get("health_url"),
                "well_known": {
                    "manifest": f"https://{domain}/.well-known/daarion-district.json",
                    "health": f"https://{domain}/.well-known/daarion-health.json",
                    "capabilities": f"https://{domain}/.well-known/daarion-capabilities.json",
                },
                "lead_agent_id": lead_agent_id,
                "agents_total": len(members),
            }
        )

    return {"items": items, "total": len(items)}
|
||||
|
||||
|
||||
@router.get("/metrics")
async def get_metrics() -> Dict[str, Any]:
    """Aggregate platform-level counters from the registries and memory service.

    The memory-vector count is best-effort: any transport or parse error
    reports 0 rather than failing the endpoint.
    """
    agents_payload = await get_agents()
    districts_payload = await get_districts()
    agents = agents_payload.get("items", [])

    memory_vectors = 0
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(f"{MEMORY_SERVICE_URL}/health")
            if resp.status_code == 200:
                data = resp.json()
                memory_vectors = int(
                    data.get("vector_store", {})
                    .get("memories", {})
                    .get("vectors_count", 0)
                )
    except Exception:
        memory_vectors = 0

    return {
        # NOTE(review): hard-coded single node — confirm for multi-node setups.
        "nodes": 1,
        "districts": districts_payload.get("total", 0),
        "agents": len(agents),
        "subagents": sum(int((a.get("team") or {}).get("subagents_total", 0)) for a in agents),
        "memory_vectors": memory_vectors,
    }
|
||||
|
||||
|
||||
@router.get("/metrics/dashboard")
async def get_metrics_dashboard() -> Dict[str, Any]:
    """Serve the live dashboard from Redis, or a registry-derived fallback.

    The fallback carries no health data (ok/agents_online/latency are
    None, online counters 0) and is tagged ``"source": "fallback_registry"``.
    """
    cached = await _load_cached_dashboard()
    if cached:
        return cached

    metrics = await get_metrics()
    districts_payload = await get_districts()
    districts = districts_payload.get("items", [])

    by_district = []
    for d in districts:
        by_district.append(
            {
                "district_id": d.get("district_id"),
                "title": d.get("title"),
                "domain": d.get("domain"),
                "status": d.get("status"),
                "ok": None,
                "agents_total": d.get("agents_total", 0),
                "agents_online": None,
                "latency_ms": None,
                "last_check_ts": None,
            }
        )

    return {
        "global": {
            "nodes": metrics.get("nodes", 1),
            "districts": metrics.get("districts", 0),
            "agents": metrics.get("agents", 0),
            "subagents": metrics.get("subagents", 0),
            "memory_vectors": metrics.get("memory_vectors", 0),
            "districts_online": 0,
            "agents_online": 0,
        },
        "by_district": by_district,
        "updated_at": _now_iso(),
        "source": "fallback_registry",
    }
|
||||
100
gateway-bot/daarion_facade/reminder_worker.py
Normal file
@@ -0,0 +1,100 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import httpx
|
||||
|
||||
from .reminders import close_redis, pop_due_reminders
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
||||
logger = logging.getLogger("daarion-reminder-worker")
|
||||
|
||||
POLL_SECONDS = float(os.getenv("DAARION_REMINDER_POLL_SECONDS", "2"))
|
||||
TELEGRAM_TIMEOUT = float(os.getenv("DAARION_REMINDER_TELEGRAM_TIMEOUT", "20"))
|
||||
|
||||
AGENT_TOKEN_ENV: Dict[str, str] = {
|
||||
"daarwizz": "DAARWIZZ_TELEGRAM_BOT_TOKEN",
|
||||
"helion": "HELION_TELEGRAM_BOT_TOKEN",
|
||||
"greenfood": "GREENFOOD_TELEGRAM_BOT_TOKEN",
|
||||
"agromatrix": "AGROMATRIX_TELEGRAM_BOT_TOKEN",
|
||||
"alateya": "ALATEYA_TELEGRAM_BOT_TOKEN",
|
||||
"nutra": "NUTRA_TELEGRAM_BOT_TOKEN",
|
||||
"druid": "DRUID_TELEGRAM_BOT_TOKEN",
|
||||
"clan": "CLAN_TELEGRAM_BOT_TOKEN",
|
||||
"eonarch": "EONARCH_TELEGRAM_BOT_TOKEN",
|
||||
"senpai": "SENPAI_TELEGRAM_BOT_TOKEN",
|
||||
"oneok": "ONEOK_TELEGRAM_BOT_TOKEN",
|
||||
"soul": "SOUL_TELEGRAM_BOT_TOKEN",
|
||||
"yaromir": "YAROMIR_TELEGRAM_BOT_TOKEN",
|
||||
"sofiia": "SOFIIA_TELEGRAM_BOT_TOKEN",
|
||||
}
|
||||
|
||||
|
||||
def _token_for_agent(agent_id: str) -> str:
    """Look up an agent's Telegram bot token via its env var; "" when unset."""
    normalized = (agent_id or "").lower()
    env_name = AGENT_TOKEN_ENV.get(normalized, "")
    if not env_name:
        return ""
    return os.getenv(env_name, "")
|
||||
|
||||
|
||||
async def _send_reminder(item: Dict[str, str]) -> bool:
    """Deliver one due reminder via the owning agent's Telegram bot.

    Returns True on a successful send; False (with a warning logged) when the
    bot token is missing, the payload is incomplete, or Telegram rejects the
    message.
    """
    agent_id = str(item.get("agent_id", ""))
    chat_id = str(item.get("chat_id", ""))
    reminder_text = str(item.get("text", "")).strip()
    due_at = str(item.get("due_at", ""))

    token = _token_for_agent(agent_id)
    if not token:
        logger.warning("reminder_skip_no_token agent=%s reminder_id=%s", agent_id, item.get("reminder_id"))
        return False

    if not chat_id or not reminder_text:
        logger.warning("reminder_skip_invalid_payload reminder_id=%s", item.get("reminder_id"))
        return False

    body = {
        "chat_id": chat_id,
        "text": f"⏰ Нагадування ({agent_id})\n\n{reminder_text}\n\n🕒 {due_at}",
    }

    url = f"https://api.telegram.org/bot{token}/sendMessage"
    async with httpx.AsyncClient(timeout=TELEGRAM_TIMEOUT) as client:
        resp = await client.post(url, json=body)
        if resp.status_code != 200:
            logger.warning(
                "reminder_send_failed reminder_id=%s status=%s body=%s",
                item.get("reminder_id"),
                resp.status_code,
                resp.text[:300],
            )
            return False

    logger.info("reminder_sent reminder_id=%s agent=%s chat=%s", item.get("reminder_id"), agent_id, chat_id)
    return True
|
||||
|
||||
|
||||
async def worker_loop() -> None:
    """Poll for due reminders every POLL_SECONDS and deliver each one.

    Per-item and per-cycle failures are logged and swallowed; only
    cancellation stops the loop.
    """
    logger.info("reminder_worker_started poll_seconds=%s", POLL_SECONDS)
    while True:
        try:
            items = await pop_due_reminders(limit=20)
            if items:
                for item in items:
                    try:
                        await _send_reminder(item)
                    except Exception:
                        logger.exception("reminder_send_exception reminder_id=%s", item.get("reminder_id"))
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("reminder_worker_cycle_failed")
        await asyncio.sleep(POLL_SECONDS)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        asyncio.run(worker_loop())
    finally:
        # Best-effort cleanup on a fresh event loop; errors are ignored.
        try:
            asyncio.run(close_redis())
        except Exception:
            pass
|
||||
154
gateway-bot/daarion_facade/reminders.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from redis.asyncio import Redis
|
||||
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
|
||||
REMINDER_PREFIX = "daarion:reminders"
|
||||
REMINDER_BY_ID = f"{REMINDER_PREFIX}:by_id"
|
||||
REMINDER_SCHEDULE = f"{REMINDER_PREFIX}:schedule"
|
||||
REMINDER_TTL_SECONDS = int(os.getenv("DAARION_REMINDER_TTL_SECONDS", str(30 * 24 * 3600)))
|
||||
|
||||
_redis: Optional[Redis] = None
|
||||
|
||||
|
||||
@dataclass
class Reminder:
    """A scheduled one-shot reminder addressed to a Telegram chat."""

    reminder_id: str  # opaque id: "rem_" + 16 hex chars
    agent_id: str     # agent whose bot delivers the reminder
    chat_id: str
    user_id: str
    text: str
    due_ts: int       # unix seconds when the reminder fires
    created_at: str   # ISO-8601 UTC creation time

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain JSON-compatible dict."""
        return {
            "reminder_id": self.reminder_id,
            "agent_id": self.agent_id,
            "chat_id": self.chat_id,
            "user_id": self.user_id,
            "text": self.text,
            "due_ts": self.due_ts,
            "created_at": self.created_at,
        }
|
||||
|
||||
|
||||
async def redis_client() -> Redis:
    """Return the module-wide Redis client, creating it lazily on first use."""
    global _redis
    if _redis is None:
        _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
|
||||
|
||||
|
||||
async def close_redis() -> None:
    """Close the shared Redis connection if one was ever opened."""
    global _redis
    client, _redis = _redis, None
    if client is not None:
        await client.close()
|
||||
|
||||
|
||||
def _iso_now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _iso_from_ts(ts: int) -> str:
|
||||
return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
|
||||
|
||||
|
||||
async def create_reminder(agent_id: str, chat_id: str, user_id: str, text: str, due_ts: int) -> Dict[str, Any]:
    """Create, persist, and schedule a reminder.

    Stores the JSON document under a per-id key (TTL REMINDER_TTL_SECONDS)
    and indexes the id in the schedule zset scored by its due timestamp.
    Returns the reminder dict with a human-readable "due_at" added.
    """
    reminder = Reminder(
        reminder_id=f"rem_{uuid.uuid4().hex[:16]}",
        agent_id=agent_id,
        chat_id=str(chat_id),
        user_id=str(user_id),
        text=text.strip(),
        due_ts=int(due_ts),
        created_at=_iso_now(),
    )

    r = await redis_client()
    key = f"{REMINDER_BY_ID}:{reminder.reminder_id}"
    payload = json.dumps(reminder.to_dict(), ensure_ascii=False)

    await r.set(key, payload, ex=REMINDER_TTL_SECONDS)
    await r.zadd(REMINDER_SCHEDULE, {reminder.reminder_id: float(reminder.due_ts)})

    result = reminder.to_dict()
    result["due_at"] = _iso_from_ts(reminder.due_ts)
    return result
|
||||
|
||||
|
||||
async def list_reminders(agent_id: str, chat_id: str, user_id: str, limit: int = 10) -> List[Dict[str, Any]]:
    """List up to *limit* reminders owned by one (agent, chat, user) tuple.

    Scans the schedule zset from one year in the past onward and filters by
    owner. NOTE(review): only the first ``limit * 5`` scheduled ids are
    examined, so matches can be missed when many other users' reminders sort
    earlier — confirm this heuristic is acceptable.
    """
    r = await redis_client()
    now_ts = int(time.time())
    ids = await r.zrangebyscore(REMINDER_SCHEDULE, min=now_ts - 365 * 24 * 3600, max="+inf", start=0, num=max(1, limit * 5))

    out: List[Dict[str, Any]] = []
    for reminder_id in ids:
        raw = await r.get(f"{REMINDER_BY_ID}:{reminder_id}")
        if not raw:
            # Document expired but its id is still indexed in the zset.
            continue
        try:
            item = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if item.get("agent_id") != agent_id:
            continue
        if str(item.get("chat_id")) != str(chat_id):
            continue
        if str(item.get("user_id")) != str(user_id):
            continue
        item["due_at"] = _iso_from_ts(int(item.get("due_ts", 0)))
        out.append(item)
        if len(out) >= limit:
            break
    return out
|
||||
|
||||
|
||||
async def cancel_reminder(reminder_id: str, agent_id: str, chat_id: str, user_id: str) -> bool:
    """Delete a reminder, but only when the caller matches the stored owner.

    Returns False when the reminder is missing, unreadable, or owned by a
    different (agent, chat, user) tuple.
    """
    r = await redis_client()
    key = f"{REMINDER_BY_ID}:{reminder_id}"
    raw = await r.get(key)
    if not raw:
        return False
    try:
        item = json.loads(raw)
    except json.JSONDecodeError:
        return False

    if item.get("agent_id") != agent_id or str(item.get("chat_id")) != str(chat_id) or str(item.get("user_id")) != str(user_id):
        return False

    await r.delete(key)
    await r.zrem(REMINDER_SCHEDULE, reminder_id)
    return True
|
||||
|
||||
|
||||
async def pop_due_reminders(limit: int = 20) -> List[Dict[str, Any]]:
    """Claim and return up to *limit* reminders whose due time has passed.

    ZREM acts as the claim: a worker only processes ids it successfully
    removed from the schedule, so concurrent workers never deliver the same
    reminder twice. Claimed documents are deleted; ids whose document already
    expired are silently dropped.
    """
    r = await redis_client()
    now_ts = int(time.time())
    ids = await r.zrangebyscore(REMINDER_SCHEDULE, min="-inf", max=now_ts, start=0, num=max(1, limit))
    out: List[Dict[str, Any]] = []

    for reminder_id in ids:
        removed = await r.zrem(REMINDER_SCHEDULE, reminder_id)
        if removed == 0:
            # Another worker claimed this id between range and zrem.
            continue
        raw = await r.get(f"{REMINDER_BY_ID}:{reminder_id}")
        if not raw:
            continue
        await r.delete(f"{REMINDER_BY_ID}:{reminder_id}")
        try:
            item = json.loads(raw)
            item["due_at"] = _iso_from_ts(int(item.get("due_ts", now_ts)))
            out.append(item)
        except json.JSONDecodeError:
            continue

    return out
|
||||
107
gateway-bot/daarion_facade/worker.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
import httpx
|
||||
|
||||
from .redis_jobs import close_redis, dequeue_job, get_job, update_job, wait_for_redis
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
|
||||
logger = logging.getLogger("daarion-gateway-worker")
|
||||
|
||||
ROUTER_BASE_URL = os.getenv("ROUTER_BASE_URL", os.getenv("ROUTER_URL", "http://router:8000"))
|
||||
ROUTER_TIMEOUT_SECONDS = float(os.getenv("ROUTER_WORKER_TIMEOUT", "60"))
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
async def _call_router(agent_id: str, input_payload: Dict[str, Any], metadata: Dict[str, Any]) -> Dict[str, Any]:
    """POST an inference request to the router and normalize its reply.

    Sends prompt/metadata (plus images when present) to
    ``/v1/agents/{agent_id}/infer``. Raises ``httpx.HTTPStatusError`` on a
    non-2xx response via raise_for_status.
    """
    body: Dict[str, Any] = {
        "prompt": input_payload.get("prompt", ""),
        "metadata": metadata or {},
    }
    images = input_payload.get("images") or []
    if images:
        body["images"] = images

    url = f"{ROUTER_BASE_URL}/v1/agents/{agent_id}/infer"
    async with httpx.AsyncClient(timeout=ROUTER_TIMEOUT_SECONDS) as client:
        resp = await client.post(url, json=body)
        resp.raise_for_status()
        data = resp.json()

    return {
        "response": data.get("response", ""),
        "model": data.get("model"),
        "backend": data.get("backend"),
        "tokens_used": data.get("tokens_used"),
    }
|
||||
|
||||
|
||||
async def run_once(job_id: str) -> None:
    """Execute one queued job: mark it running, call the router, persist outcome.

    A missing job is logged and skipped. A job without an ``agent_id`` is
    failed immediately — previously it would have been sent to the bogus
    router URL ``/v1/agents/None/infer``.
    """
    job = await get_job(job_id)
    if not job:
        logger.warning("job_missing: %s", job_id)
        return

    agent_id = job.get("agent_id")
    if not agent_id:
        # Fail fast instead of building a router URL with a None agent.
        await update_job(
            job_id,
            {
                "status": "failed",
                "error": {"type": "ValueError", "message": "job has no agent_id"},
                "finished_at": _now(),
                "updated_at": _now(),
            },
        )
        logger.warning("job_invalid_no_agent: %s", job_id)
        return

    await update_job(job_id, {"status": "running", "started_at": _now(), "updated_at": _now()})

    input_payload = job.get("input") or {}
    metadata = job.get("metadata") or {}

    try:
        result = await _call_router(agent_id, input_payload, metadata)
        await update_job(
            job_id,
            {
                "status": "done",
                "result": result,
                "error": None,
                "finished_at": _now(),
                "updated_at": _now(),
            },
        )
        logger.info("job_done: %s agent=%s", job_id, agent_id)
    except Exception as e:
        await update_job(
            job_id,
            {
                "status": "failed",
                "error": {"type": e.__class__.__name__, "message": str(e)},
                "finished_at": _now(),
                "updated_at": _now(),
            },
        )
        logger.exception("job_failed: %s agent=%s", job_id, agent_id)
|
||||
|
||||
|
||||
async def worker_loop() -> None:
    """Consume job ids from the Redis queue forever, one at a time.

    Waits up to 60s for Redis at startup. Errors in a cycle are logged and
    followed by a 1-second backoff; cancellation propagates.
    """
    await wait_for_redis(60)
    logger.info("worker_started router=%s", ROUTER_BASE_URL)

    while True:
        try:
            job_id = await dequeue_job(block_seconds=10)
            if not job_id:
                continue
            await run_once(job_id)
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("worker_loop_error")
            await asyncio.sleep(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        asyncio.run(worker_loop())
    finally:
        # Best-effort cleanup on a fresh event loop; errors are ignored.
        try:
            asyncio.run(close_redis())
        except Exception:
            pass
|
||||
92
gateway-bot/district_registry.json
Normal file
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"districts": [
|
||||
{
|
||||
"district_id": "city-core",
|
||||
"title": "City Core - DAARION.city",
|
||||
"domain": "daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "daarwizz"
|
||||
},
|
||||
{
|
||||
"district_id": "helion",
|
||||
"title": "Helion District",
|
||||
"domain": "helion.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "helion"
|
||||
},
|
||||
{
|
||||
"district_id": "alateya",
|
||||
"title": "Alateya District",
|
||||
"domain": "alateya.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "alateya"
|
||||
},
|
||||
{
|
||||
"district_id": "druid",
|
||||
"title": "Druid District",
|
||||
"domain": "druid.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "druid"
|
||||
},
|
||||
{
|
||||
"district_id": "nutra",
|
||||
"title": "Nutra District",
|
||||
"domain": "nutra.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "nutra"
|
||||
},
|
||||
{
|
||||
"district_id": "agromatrix",
|
||||
"title": "AgroMatrix District",
|
||||
"domain": "agromatrix.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "agromatrix"
|
||||
},
|
||||
{
|
||||
"district_id": "greenfood",
|
||||
"title": "GreenFood District",
|
||||
"domain": "greenfood.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "greenfood"
|
||||
},
|
||||
{
|
||||
"district_id": "clan",
|
||||
"title": "Clan District",
|
||||
"domain": "clan.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "clan"
|
||||
},
|
||||
{
|
||||
"district_id": "eonarch",
|
||||
"title": "Eonarch District",
|
||||
"domain": "eonarch.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "eonarch"
|
||||
},
|
||||
{
|
||||
"district_id": "soul",
|
||||
"title": "Soul District",
|
||||
"domain": "soul.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "soul"
|
||||
},
|
||||
{
|
||||
"district_id": "senpai",
|
||||
"title": "Senpai District",
|
||||
"domain": "senpai.daarion.city",
|
||||
"status": "active",
|
||||
"logo_url": null,
|
||||
"lead_agent_id": "senpai"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1871,23 +1871,53 @@ async def process_document(
|
||||
Dict з результатом обробки
|
||||
"""
|
||||
mime_type = document.get("mime_type", "")
|
||||
mime_type_l = (mime_type or "").lower()
|
||||
file_name = document.get("file_name", "")
|
||||
file_id = document.get("file_id")
|
||||
|
||||
file_name_lower = file_name.lower()
|
||||
allowed_exts = {".pdf", ".docx", ".txt", ".md", ".csv", ".xlsx", ".zip"}
|
||||
allowed_exts = {
|
||||
".pdf", ".doc", ".docx", ".rtf", ".odt",
|
||||
".txt", ".md", ".markdown",
|
||||
".csv", ".tsv", ".xls", ".xlsx", ".xlsm", ".ods",
|
||||
".ppt", ".pptx", ".odp",
|
||||
".json", ".yaml", ".yml", ".xml", ".html", ".htm",
|
||||
".zip",
|
||||
".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff",
|
||||
}
|
||||
is_allowed = any(file_name_lower.endswith(ext) for ext in allowed_exts)
|
||||
if mime_type == "application/pdf":
|
||||
if mime_type_l == "application/pdf":
|
||||
is_allowed = True
|
||||
if mime_type in {
|
||||
if mime_type_l in {
|
||||
"application/msword",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/rtf",
|
||||
"text/rtf",
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"application/vnd.ms-excel.sheet.macroenabled.12",
|
||||
"application/vnd.oasis.opendocument.spreadsheet",
|
||||
"application/vnd.ms-powerpoint",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/vnd.oasis.opendocument.presentation",
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
"text/csv",
|
||||
"text/tab-separated-values",
|
||||
"application/json",
|
||||
"application/yaml",
|
||||
"application/x-yaml",
|
||||
"text/yaml",
|
||||
"application/xml",
|
||||
"text/xml",
|
||||
"text/html",
|
||||
"application/zip",
|
||||
"application/x-zip-compressed",
|
||||
}:
|
||||
is_allowed = True
|
||||
if mime_type_l.startswith("image/"):
|
||||
is_allowed = True
|
||||
|
||||
if is_allowed and file_id:
|
||||
logger.info(f"{agent_config.name}: Document from {username} (tg:{user_id}), file_id: {file_id}, file_name: {file_name}")
|
||||
@@ -2027,7 +2057,7 @@ async def process_document(
|
||||
telegram_token = agent_config.get_telegram_token()
|
||||
await send_telegram_message(
|
||||
chat_id,
|
||||
"Наразі підтримуються формати: PDF, DOCX, TXT, MD, CSV, XLSX, ZIP.",
|
||||
"Підтримуються формати: PDF/DOC/DOCX/RTF/ODT, TXT/MD/CSV/TSV, XLS/XLSX/XLSM/ODS, PPT/PPTX/ODP, JSON/YAML/XML/HTML, ZIP, зображення.",
|
||||
telegram_token,
|
||||
)
|
||||
return {"ok": False, "error": "Unsupported document type"}
|
||||
@@ -3681,7 +3711,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
|
||||
doc_url=file_url,
|
||||
file_name=file_name,
|
||||
dao_id=dao_id,
|
||||
user_id=f"tg:{user_id}"
|
||||
user_id=f"tg:{user_id}",
|
||||
agent_id=agent_config.agent_id,
|
||||
)
|
||||
|
||||
if result.success:
|
||||
@@ -3705,7 +3736,8 @@ async def _old_telegram_webhook(update: TelegramUpdate):
|
||||
result = await ingest_document(
|
||||
session_id=session_id,
|
||||
dao_id=dao_id,
|
||||
user_id=f"tg:{user_id}"
|
||||
user_id=f"tg:{user_id}",
|
||||
agent_id=agent_config.agent_id,
|
||||
)
|
||||
|
||||
if result.success:
|
||||
|
||||
@@ -6,20 +6,32 @@ Endpoints:
|
||||
- POST /api/doc/parse - Parse a document
|
||||
- POST /api/doc/ingest - Ingest document to RAG
|
||||
- POST /api/doc/ask - Ask question about document
|
||||
- POST /api/doc/update - Update existing document text (versioned)
|
||||
- POST /api/doc/publish - Publish physical file version via artifact registry
|
||||
- GET /api/doc/versions/{doc_id} - List document versions
|
||||
- GET /api/doc/artifacts/{artifact_id}/versions/{version_id}/download - Download via gateway proxy
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Optional, Dict, Any
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
||||
from fastapi.responses import Response
|
||||
from pydantic import BaseModel
|
||||
import httpx
|
||||
|
||||
from services.doc_service import (
|
||||
doc_service,
|
||||
parse_document,
|
||||
ingest_document,
|
||||
ask_about_document,
|
||||
update_document,
|
||||
list_document_versions,
|
||||
publish_document_artifact,
|
||||
get_doc_context,
|
||||
ParsedResult,
|
||||
IngestResult,
|
||||
UpdateResult,
|
||||
QAResult,
|
||||
DocContext
|
||||
)
|
||||
@@ -27,6 +39,8 @@ from services.doc_service import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
|
||||
DOC_DOWNLOAD_TIMEOUT_SECONDS = float(os.getenv("DOC_DOWNLOAD_TIMEOUT_SECONDS", "60"))
|
||||
|
||||
|
||||
# ========================================
|
||||
@@ -52,6 +66,7 @@ class IngestDocumentRequest(BaseModel):
|
||||
file_name: Optional[str] = None
|
||||
dao_id: Optional[str] = None
|
||||
user_id: Optional[str] = None
|
||||
agent_id: str = "daarwizz"
|
||||
|
||||
|
||||
class AskDocumentRequest(BaseModel):
|
||||
@@ -61,6 +76,40 @@ class AskDocumentRequest(BaseModel):
|
||||
doc_id: Optional[str] = None
|
||||
dao_id: Optional[str] = None
|
||||
user_id: Optional[str] = None
|
||||
agent_id: str = "daarwizz"
|
||||
|
||||
|
||||
class UpdateDocumentRequest(BaseModel):
|
||||
"""Request to update existing document content."""
|
||||
session_id: str
|
||||
doc_id: Optional[str] = None
|
||||
doc_url: Optional[str] = None
|
||||
file_name: Optional[str] = None
|
||||
text: Optional[str] = None
|
||||
dao_id: Optional[str] = None
|
||||
user_id: Optional[str] = None
|
||||
agent_id: str = "daarwizz"
|
||||
storage_ref: Optional[str] = None
|
||||
publish_artifact: bool = False
|
||||
artifact_id: Optional[str] = None
|
||||
target_format: Optional[str] = None
|
||||
artifact_label: Optional[str] = None
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class PublishDocumentRequest(BaseModel):
|
||||
"""Request to publish document as physical artifact version."""
|
||||
session_id: str
|
||||
doc_id: Optional[str] = None
|
||||
doc_url: Optional[str] = None
|
||||
file_name: Optional[str] = None
|
||||
text: Optional[str] = None
|
||||
dao_id: Optional[str] = None
|
||||
user_id: Optional[str] = None
|
||||
artifact_id: Optional[str] = None
|
||||
target_format: Optional[str] = None
|
||||
artifact_label: Optional[str] = None
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
# ========================================
|
||||
@@ -167,7 +216,8 @@ async def ingest_document_endpoint(request: IngestDocumentRequest):
|
||||
doc_url=request.doc_url,
|
||||
file_name=request.file_name,
|
||||
dao_id=request.dao_id,
|
||||
user_id=request.user_id
|
||||
user_id=request.user_id,
|
||||
agent_id=request.agent_id,
|
||||
)
|
||||
|
||||
if not result.success:
|
||||
@@ -209,7 +259,8 @@ async def ask_about_document_endpoint(request: AskDocumentRequest):
|
||||
question=request.question,
|
||||
doc_id=doc_id,
|
||||
dao_id=request.dao_id,
|
||||
user_id=request.user_id
|
||||
user_id=request.user_id,
|
||||
agent_id=request.agent_id,
|
||||
)
|
||||
|
||||
if not result.success:
|
||||
@@ -227,6 +278,107 @@ async def ask_about_document_endpoint(request: AskDocumentRequest):
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/api/doc/update")
|
||||
async def update_document_endpoint(request: UpdateDocumentRequest):
|
||||
"""
|
||||
Update a document and bump its version.
|
||||
If text is omitted and doc_url exists, text is re-parsed from the source document.
|
||||
"""
|
||||
try:
|
||||
result = await update_document(
|
||||
session_id=request.session_id,
|
||||
doc_id=request.doc_id,
|
||||
doc_url=request.doc_url,
|
||||
file_name=request.file_name,
|
||||
text=request.text,
|
||||
dao_id=request.dao_id,
|
||||
user_id=request.user_id,
|
||||
agent_id=request.agent_id,
|
||||
storage_ref=request.storage_ref,
|
||||
publish_artifact=request.publish_artifact,
|
||||
artifact_id=request.artifact_id,
|
||||
target_format=request.target_format,
|
||||
artifact_label=request.artifact_label,
|
||||
metadata=request.metadata,
|
||||
)
|
||||
if not result.success:
|
||||
raise HTTPException(status_code=400, detail=result.error)
|
||||
response = {
|
||||
"ok": True,
|
||||
"doc_id": result.doc_id,
|
||||
"version_no": result.version_no,
|
||||
"version_id": result.version_id,
|
||||
"updated_chunks": result.updated_chunks,
|
||||
"status": result.status,
|
||||
"publish_error": result.publish_error,
|
||||
"artifact_id": result.artifact_id,
|
||||
"artifact_version_id": result.artifact_version_id,
|
||||
"artifact_storage_key": result.artifact_storage_key,
|
||||
"artifact_mime": result.artifact_mime,
|
||||
"artifact_download_url": result.artifact_download_url,
|
||||
}
|
||||
return response
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Update document error: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/api/doc/publish")
|
||||
async def publish_document_endpoint(request: PublishDocumentRequest):
|
||||
"""
|
||||
Publish current document text as physical file artifact version.
|
||||
"""
|
||||
try:
|
||||
result = await publish_document_artifact(
|
||||
session_id=request.session_id,
|
||||
doc_id=request.doc_id,
|
||||
doc_url=request.doc_url,
|
||||
file_name=request.file_name,
|
||||
text=request.text,
|
||||
dao_id=request.dao_id,
|
||||
user_id=request.user_id,
|
||||
artifact_id=request.artifact_id,
|
||||
target_format=request.target_format,
|
||||
artifact_label=request.artifact_label,
|
||||
metadata=request.metadata,
|
||||
)
|
||||
if not result.success:
|
||||
raise HTTPException(status_code=400, detail=result.error)
|
||||
return {
|
||||
"ok": True,
|
||||
"artifact_id": result.artifact_id,
|
||||
"version_id": result.version_id,
|
||||
"storage_key": result.storage_key,
|
||||
"mime": result.mime,
|
||||
"file_name": result.file_name,
|
||||
"download_url": result.download_url,
|
||||
}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Publish document error: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/api/doc/versions/{doc_id}")
|
||||
async def list_document_versions_endpoint(doc_id: str, agent_id: str = "daarwizz", limit: int = 20):
|
||||
"""
|
||||
List document versions for agent/doc pair.
|
||||
"""
|
||||
try:
|
||||
data = await list_document_versions(agent_id=agent_id, doc_id=doc_id, limit=limit)
|
||||
if not data.get("ok"):
|
||||
raise HTTPException(status_code=400, detail=data.get("error", "Failed to load versions"))
|
||||
return data
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"List document versions error: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/api/doc/context/{session_id}")
|
||||
async def get_document_context(session_id: str):
|
||||
"""
|
||||
@@ -258,3 +410,56 @@ async def get_document_context(session_id: str):
|
||||
logger.error(f"Get document context error: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/api/doc/artifacts/{artifact_id}/versions/{version_id}/download")
|
||||
async def download_artifact_version_via_gateway(
|
||||
artifact_id: str,
|
||||
version_id: str,
|
||||
filename: Optional[str] = None,
|
||||
inline: bool = False,
|
||||
):
|
||||
"""
|
||||
Proxy download for artifact version to avoid exposing internal MinIO host to browser clients.
|
||||
"""
|
||||
aid = (artifact_id or "").strip()
|
||||
vid = (version_id or "").strip()
|
||||
if not aid or not vid:
|
||||
raise HTTPException(status_code=400, detail="artifact_id and version_id are required")
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=DOC_DOWNLOAD_TIMEOUT_SECONDS) as client:
|
||||
meta_resp = await client.get(
|
||||
f"{ARTIFACT_REGISTRY_URL}/artifacts/{aid}/versions/{vid}/download"
|
||||
)
|
||||
if meta_resp.status_code >= 400:
|
||||
detail = ""
|
||||
try:
|
||||
detail = meta_resp.json().get("detail") # type: ignore[assignment]
|
||||
except Exception:
|
||||
detail = meta_resp.text[:200]
|
||||
raise HTTPException(status_code=meta_resp.status_code, detail=detail or "Version download info failed")
|
||||
meta = meta_resp.json()
|
||||
signed_url = (meta.get("url") or "").strip()
|
||||
if not signed_url:
|
||||
raise HTTPException(status_code=502, detail="artifact-registry returned empty download URL")
|
||||
|
||||
file_resp = await client.get(signed_url)
|
||||
if file_resp.status_code >= 400:
|
||||
raise HTTPException(status_code=502, detail=f"Artifact storage download failed: {file_resp.status_code}")
|
||||
|
||||
mime = (meta.get("mime") or file_resp.headers.get("content-type") or "application/octet-stream").strip()
|
||||
storage_key = str(meta.get("storage_key") or "")
|
||||
inferred_name = storage_key.rsplit("/", 1)[-1] if "/" in storage_key else storage_key
|
||||
out_name = (filename or inferred_name or f"{aid}_{vid}.bin").strip()
|
||||
out_name = re.sub(r"[^A-Za-z0-9._-]+", "_", out_name).strip("._") or f"{aid}_{vid}.bin"
|
||||
disposition = "inline" if inline else "attachment"
|
||||
headers = {
|
||||
"Content-Disposition": f'{disposition}; filename="{out_name}"',
|
||||
"Cache-Control": "private, max-age=60",
|
||||
}
|
||||
return Response(content=file_resp.content, media_type=mime, headers=headers)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Artifact version proxy download failed: aid={aid}, vid={vid}, err={e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail="Artifact proxy download failed")
|
||||
|
||||
@@ -143,6 +143,10 @@ class MemoryClient:
|
||||
"body_text": e.get("content", ""),
|
||||
"kind": e.get("kind", "message"),
|
||||
"type": "user" if e.get("role") == "user" else "agent",
|
||||
"role": e.get("role", "unknown"),
|
||||
"timestamp": e.get("timestamp"),
|
||||
"user_id": e.get("user_id"),
|
||||
"sender_name": e.get("sender_name"),
|
||||
}
|
||||
for e in events
|
||||
if e.get("content")
|
||||
@@ -445,4 +449,3 @@ class MemoryClient:
|
||||
|
||||
# Глобальний екземпляр клієнта
|
||||
memory_client = MemoryClient()
|
||||
|
||||
|
||||
@@ -11,18 +11,23 @@ This service can be used by:
|
||||
import os
|
||||
import logging
|
||||
import hashlib
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
from typing import Optional, Dict, Any, List
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
|
||||
from router_client import send_to_router
|
||||
from memory_client import memory_client
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SHARED_EXCEL_POLICY_AGENTS = {"agromatrix", "helion", "nutra", "greenfood"}
|
||||
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
|
||||
ARTIFACT_REGISTRY_URL = os.getenv("ARTIFACT_REGISTRY_URL", "http://artifact-registry:9220").rstrip("/")
|
||||
DOC_WRITEBACK_CREATED_BY = os.getenv("DOC_WRITEBACK_CREATED_BY", "gateway-doc-service")
|
||||
GATEWAY_PUBLIC_BASE_URL = os.getenv("GATEWAY_PUBLIC_BASE_URL", "").rstrip("/")
|
||||
|
||||
|
||||
class QAItem(BaseModel):
|
||||
@@ -51,6 +56,35 @@ class IngestResult(BaseModel):
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class UpdateResult(BaseModel):
|
||||
"""Result of document update with version bump."""
|
||||
success: bool
|
||||
doc_id: Optional[str] = None
|
||||
version_no: Optional[int] = None
|
||||
version_id: Optional[int] = None
|
||||
updated_chunks: int = 0
|
||||
status: str = "unknown"
|
||||
publish_error: Optional[str] = None
|
||||
artifact_id: Optional[str] = None
|
||||
artifact_version_id: Optional[str] = None
|
||||
artifact_storage_key: Optional[str] = None
|
||||
artifact_mime: Optional[str] = None
|
||||
artifact_download_url: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class PublishResult(BaseModel):
|
||||
"""Result of artifact write-back publish."""
|
||||
success: bool
|
||||
artifact_id: Optional[str] = None
|
||||
version_id: Optional[str] = None
|
||||
storage_key: Optional[str] = None
|
||||
mime: Optional[str] = None
|
||||
file_name: Optional[str] = None
|
||||
download_url: Optional[str] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class QAResult(BaseModel):
|
||||
"""Result of RAG query about a document"""
|
||||
success: bool
|
||||
@@ -84,6 +118,266 @@ class DocumentService:
|
||||
"""Initialize document service"""
|
||||
self.memory_client = memory_client
|
||||
|
||||
async def _router_post_json(
|
||||
self,
|
||||
path: str,
|
||||
payload: Dict[str, Any],
|
||||
timeout: float = 45.0,
|
||||
) -> Dict[str, Any]:
|
||||
import httpx
|
||||
|
||||
base = ROUTER_URL.rstrip("/")
|
||||
url = f"{base}{path}"
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
resp = await client.post(url, json=payload)
|
||||
body = {}
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception:
|
||||
body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
|
||||
if resp.status_code >= 400:
|
||||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||||
raise RuntimeError(f"Router error on {path}: {err}")
|
||||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
|
||||
|
||||
async def _router_get_json(
|
||||
self,
|
||||
path: str,
|
||||
timeout: float = 30.0,
|
||||
) -> Dict[str, Any]:
|
||||
import httpx
|
||||
|
||||
base = ROUTER_URL.rstrip("/")
|
||||
url = f"{base}{path}"
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
resp = await client.get(url)
|
||||
body = {}
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception:
|
||||
body = {"ok": False, "error": f"Invalid JSON from router ({resp.status_code})"}
|
||||
if resp.status_code >= 400:
|
||||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||||
raise RuntimeError(f"Router error on {path}: {err}")
|
||||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid router response type"}
|
||||
|
||||
async def _artifact_post_json(
|
||||
self,
|
||||
path: str,
|
||||
payload: Dict[str, Any],
|
||||
timeout: float = 45.0,
|
||||
) -> Dict[str, Any]:
|
||||
import httpx
|
||||
|
||||
base = ARTIFACT_REGISTRY_URL.rstrip("/")
|
||||
url = f"{base}{path}"
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
resp = await client.post(url, json=payload)
|
||||
body = {}
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception:
|
||||
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
|
||||
if resp.status_code >= 400:
|
||||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||||
raise RuntimeError(f"Artifact registry error on {path}: {err}")
|
||||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
|
||||
|
||||
async def _artifact_get_json(
|
||||
self,
|
||||
path: str,
|
||||
timeout: float = 30.0,
|
||||
) -> Dict[str, Any]:
|
||||
import httpx
|
||||
|
||||
base = ARTIFACT_REGISTRY_URL.rstrip("/")
|
||||
url = f"{base}{path}"
|
||||
async with httpx.AsyncClient(timeout=timeout) as client:
|
||||
resp = await client.get(url)
|
||||
body = {}
|
||||
try:
|
||||
body = resp.json()
|
||||
except Exception:
|
||||
body = {"ok": False, "error": f"Invalid JSON from artifact-registry ({resp.status_code})"}
|
||||
if resp.status_code >= 400:
|
||||
err = body.get("detail") or body.get("error") or f"HTTP {resp.status_code}"
|
||||
raise RuntimeError(f"Artifact registry error on {path}: {err}")
|
||||
return body if isinstance(body, dict) else {"ok": False, "error": "Invalid artifact response type"}
|
||||
|
||||
def _resolve_format(self, file_name: Optional[str], target_format: Optional[str]) -> str:
|
||||
fmt = (target_format or "").strip().lower().lstrip(".")
|
||||
if fmt:
|
||||
return fmt
|
||||
if file_name and "." in file_name:
|
||||
return file_name.rsplit(".", 1)[1].strip().lower()
|
||||
return "txt"
|
||||
|
||||
def _compose_output_name(self, file_name: Optional[str], doc_id: str, fmt: str) -> str:
|
||||
base = "document"
|
||||
if file_name:
|
||||
base = file_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
|
||||
if "." in base:
|
||||
base = base.rsplit(".", 1)[0]
|
||||
elif doc_id:
|
||||
base = doc_id
|
||||
safe_base = re.sub(r"[^A-Za-z0-9._-]+", "_", base).strip("._") or "document"
|
||||
return f"{safe_base}.{fmt}"
|
||||
|
||||
def _gateway_artifact_download_path(self, artifact_id: str, version_id: str) -> str:
|
||||
aid = (artifact_id or "").strip()
|
||||
vid = (version_id or "").strip()
|
||||
return f"/api/doc/artifacts/{aid}/versions/{vid}/download"
|
||||
|
||||
def _gateway_artifact_download_url(self, artifact_id: str, version_id: str) -> str:
|
||||
path = self._gateway_artifact_download_path(artifact_id, version_id)
|
||||
if GATEWAY_PUBLIC_BASE_URL:
|
||||
return f"{GATEWAY_PUBLIC_BASE_URL}{path}"
|
||||
return path
|
||||
|
||||
def _render_document_bytes(
|
||||
self,
|
||||
text: str,
|
||||
file_name: Optional[str],
|
||||
doc_id: str,
|
||||
target_format: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
body = (text or "").strip()
|
||||
if not body:
|
||||
raise ValueError("Cannot render empty document text")
|
||||
|
||||
fmt = self._resolve_format(file_name=file_name, target_format=target_format)
|
||||
output_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt=fmt)
|
||||
|
||||
if fmt in {"txt"}:
|
||||
payload = body.encode("utf-8")
|
||||
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": output_name}
|
||||
if fmt in {"md", "markdown"}:
|
||||
payload = body.encode("utf-8")
|
||||
return {"bytes": payload, "mime": "text/markdown; charset=utf-8", "file_name": output_name}
|
||||
if fmt in {"json"}:
|
||||
parsed: Any
|
||||
try:
|
||||
parsed = json.loads(body)
|
||||
except Exception:
|
||||
parsed = {"text": body}
|
||||
payload = json.dumps(parsed, ensure_ascii=False, indent=2).encode("utf-8")
|
||||
return {"bytes": payload, "mime": "application/json", "file_name": output_name}
|
||||
if fmt in {"csv"}:
|
||||
payload = body.encode("utf-8")
|
||||
return {"bytes": payload, "mime": "text/csv; charset=utf-8", "file_name": output_name}
|
||||
if fmt in {"xlsx", "xlsm", "xls"}:
|
||||
try:
|
||||
from openpyxl import Workbook
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"openpyxl is required for {fmt} rendering: {e}")
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "Document"
|
||||
lines = [ln for ln in body.splitlines()] or [body]
|
||||
for idx, line in enumerate(lines, start=1):
|
||||
ws.cell(row=idx, column=1, value=line)
|
||||
buf = BytesIO()
|
||||
wb.save(buf)
|
||||
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "xlsx")}
|
||||
if fmt in {"docx"}:
|
||||
try:
|
||||
from docx import Document
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"python-docx is required for docx rendering: {e}")
|
||||
doc = Document()
|
||||
for line in body.splitlines():
|
||||
doc.add_paragraph(line if line else " ")
|
||||
buf = BytesIO()
|
||||
doc.save(buf)
|
||||
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
return {"bytes": buf.getvalue(), "mime": mime, "file_name": self._compose_output_name(file_name, doc_id, "docx")}
|
||||
|
||||
payload = body.encode("utf-8")
|
||||
fallback_name = self._compose_output_name(file_name=file_name, doc_id=doc_id, fmt="txt")
|
||||
return {"bytes": payload, "mime": "text/plain; charset=utf-8", "file_name": fallback_name}
|
||||
|
||||
async def _publish_text_artifact(
|
||||
self,
|
||||
text: str,
|
||||
doc_id: str,
|
||||
file_name: Optional[str] = None,
|
||||
dao_id: Optional[str] = None,
|
||||
user_id: Optional[str] = None,
|
||||
artifact_id: Optional[str] = None,
|
||||
target_format: Optional[str] = None,
|
||||
label: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> PublishResult:
|
||||
try:
|
||||
rendered = self._render_document_bytes(
|
||||
text=text,
|
||||
file_name=file_name,
|
||||
doc_id=doc_id,
|
||||
target_format=target_format,
|
||||
)
|
||||
content_bytes = rendered["bytes"]
|
||||
content_b64 = base64.b64encode(content_bytes).decode("ascii")
|
||||
|
||||
effective_artifact_id = (artifact_id or "").strip()
|
||||
if not effective_artifact_id:
|
||||
create_resp = await self._artifact_post_json(
|
||||
"/artifacts",
|
||||
{
|
||||
"type": "doc",
|
||||
"title": file_name or doc_id,
|
||||
"project_id": dao_id,
|
||||
"acl_ref": dao_id,
|
||||
"created_by": user_id or DOC_WRITEBACK_CREATED_BY,
|
||||
},
|
||||
timeout=30.0,
|
||||
)
|
||||
effective_artifact_id = str(create_resp.get("artifact_id") or "").strip()
|
||||
if not effective_artifact_id:
|
||||
return PublishResult(success=False, error="Artifact create failed: empty artifact_id")
|
||||
|
||||
meta = {"doc_id": doc_id, "source": "doc_update_publish"}
|
||||
if isinstance(metadata, dict):
|
||||
meta.update(metadata)
|
||||
|
||||
version_resp = await self._artifact_post_json(
|
||||
f"/artifacts/{effective_artifact_id}/versions/from_base64",
|
||||
{
|
||||
"content_base64": content_b64,
|
||||
"mime": rendered["mime"],
|
||||
"filename": rendered["file_name"],
|
||||
"label": label or "edited",
|
||||
"meta_json": meta,
|
||||
},
|
||||
timeout=45.0,
|
||||
)
|
||||
version_id = str(version_resp.get("version_id") or "").strip()
|
||||
storage_key = version_resp.get("storage_key")
|
||||
if not version_id:
|
||||
return PublishResult(
|
||||
success=False,
|
||||
artifact_id=effective_artifact_id,
|
||||
error="Artifact version create failed: empty version_id",
|
||||
)
|
||||
|
||||
download_url = self._gateway_artifact_download_url(
|
||||
artifact_id=effective_artifact_id,
|
||||
version_id=version_id,
|
||||
)
|
||||
|
||||
return PublishResult(
|
||||
success=True,
|
||||
artifact_id=effective_artifact_id,
|
||||
version_id=version_id,
|
||||
storage_key=storage_key,
|
||||
mime=rendered["mime"],
|
||||
file_name=rendered["file_name"],
|
||||
download_url=download_url,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"publish_text_artifact failed: {e}", exc_info=True)
|
||||
return PublishResult(success=False, error=str(e))
|
||||
|
||||
def _is_excel_filename(self, file_name: Optional[str]) -> bool:
|
||||
if not file_name:
|
||||
return False
|
||||
@@ -462,7 +756,8 @@ class DocumentService:
|
||||
doc_url: Optional[str] = None,
|
||||
file_name: Optional[str] = None,
|
||||
dao_id: str = None,
|
||||
user_id: str = None
|
||||
user_id: str = None,
|
||||
agent_id: str = "daarwizz",
|
||||
) -> IngestResult:
|
||||
"""
|
||||
Ingest document chunks into RAG/Memory.
|
||||
@@ -488,64 +783,60 @@ class DocumentService:
|
||||
file_name = file_name or doc_context.file_name
|
||||
dao_id = dao_id or doc_context.dao_id
|
||||
|
||||
if not doc_id and not doc_url:
|
||||
if not doc_url:
|
||||
return IngestResult(
|
||||
success=False,
|
||||
error="No document ID or URL provided"
|
||||
error="No document URL available for ingest"
|
||||
)
|
||||
|
||||
# Build request to Router with ingest flag
|
||||
router_request = {
|
||||
"mode": "doc_parse",
|
||||
"agent": "parser",
|
||||
|
||||
parsed = await self.parse_document(
|
||||
session_id=session_id,
|
||||
doc_url=doc_url,
|
||||
file_name=file_name or "document",
|
||||
dao_id=dao_id or "",
|
||||
user_id=user_id or "",
|
||||
output_mode="markdown",
|
||||
metadata={"source": self._extract_source(session_id), "mode": "ingest"},
|
||||
)
|
||||
if not parsed.success:
|
||||
return IngestResult(success=False, error=parsed.error or "Document parse failed")
|
||||
|
||||
effective_doc_id = doc_id or parsed.doc_id
|
||||
if not effective_doc_id:
|
||||
effective_doc_id = hashlib.md5(f"{session_id}:{file_name}:{datetime.utcnow().isoformat()}".encode()).hexdigest()[:12]
|
||||
|
||||
doc_text = (parsed.markdown or "").strip()
|
||||
if not doc_text:
|
||||
return IngestResult(success=False, error="No extractable text for ingestion")
|
||||
|
||||
payload = {
|
||||
"agent_id": (agent_id or "daarwizz").lower(),
|
||||
"doc_id": effective_doc_id,
|
||||
"file_name": file_name or "document",
|
||||
"text": doc_text,
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"metadata": {
|
||||
"source": self._extract_source(session_id),
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"session_id": session_id,
|
||||
},
|
||||
"payload": {
|
||||
"output_mode": "chunks", # Use chunks for RAG ingestion
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"ingest": True, # Flag for ingestion
|
||||
"source": self._extract_source(session_id),
|
||||
},
|
||||
}
|
||||
|
||||
if doc_url:
|
||||
router_request["payload"]["doc_url"] = doc_url
|
||||
router_request["payload"]["file_name"] = file_name or "document.pdf"
|
||||
|
||||
if doc_id:
|
||||
router_request["payload"]["doc_id"] = doc_id
|
||||
|
||||
logger.info(f"Ingesting document: session={session_id}, doc_id={doc_id}")
|
||||
|
||||
# Send to Router
|
||||
response = await send_to_router(router_request)
|
||||
|
||||
if not isinstance(response, dict):
|
||||
return IngestResult(
|
||||
success=False,
|
||||
error="Invalid response from router"
|
||||
)
|
||||
|
||||
data = response.get("data", {})
|
||||
chunks = data.get("chunks", [])
|
||||
|
||||
if chunks:
|
||||
response = await self._router_post_json("/v1/documents/ingest", payload, timeout=90.0)
|
||||
|
||||
if response.get("ok"):
|
||||
return IngestResult(
|
||||
success=True,
|
||||
doc_id=doc_id or data.get("doc_id"),
|
||||
ingested_chunks=len(chunks),
|
||||
status="ingested"
|
||||
)
|
||||
else:
|
||||
return IngestResult(
|
||||
success=False,
|
||||
status="failed",
|
||||
error="No chunks to ingest"
|
||||
doc_id=response.get("doc_id") or effective_doc_id,
|
||||
ingested_chunks=int(response.get("chunks_stored", 0) or 0),
|
||||
status="ingested",
|
||||
)
|
||||
|
||||
return IngestResult(
|
||||
success=False,
|
||||
doc_id=effective_doc_id,
|
||||
status="failed",
|
||||
error=response.get("error", "Router ingest failed"),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Document ingestion failed: {e}", exc_info=True)
|
||||
@@ -553,6 +844,245 @@ class DocumentService:
|
||||
success=False,
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
    async def update_document(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        text: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        agent_id: str = "daarwizz",
        storage_ref: Optional[str] = None,
        publish_artifact: bool = False,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        artifact_label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> UpdateResult:
        """
        Update existing document content and bump version in router memory.

        Missing identifiers (doc_id, doc_url, file_name, dao_id) are backfilled
        from the per-session document context saved by earlier parse/ingest
        calls. The new content comes from ``text`` when provided, otherwise it
        is re-parsed from ``doc_url``. On success the session context is
        refreshed and, when ``publish_artifact`` is set, a physical artifact
        version is published as well.

        Never raises: all failures are reported via UpdateResult(success=False,
        error=...).
        """
        try:
            # Backfill identifiers from session context saved by parse/ingest.
            context = await self.get_doc_context(session_id)
            if context:
                if not doc_id:
                    doc_id = context.doc_id
                if not doc_url:
                    doc_url = context.doc_url
                if not file_name:
                    file_name = context.file_name
                if not dao_id:
                    dao_id = context.dao_id
                # NOTE(review): unlike publish_document_artifact, user_id is
                # not backfilled from context here — confirm this is intended.

            if not doc_id:
                return UpdateResult(
                    success=False,
                    status="failed",
                    error="No document context found. Provide doc_id or parse/ingest first.",
                )

            # Prefer explicit text; otherwise re-parse the source URL to markdown.
            effective_text = (text or "").strip()
            if not effective_text:
                if not doc_url:
                    return UpdateResult(
                        success=False,
                        doc_id=doc_id,
                        status="failed",
                        error="No text or doc_url provided for update",
                    )
                parsed = await self.parse_document(
                    session_id=session_id,
                    doc_url=doc_url,
                    file_name=file_name or "document",
                    dao_id=dao_id or "",
                    user_id=user_id or "",
                    output_mode="markdown",
                    metadata={"source": self._extract_source(session_id), "mode": "update"},
                )
                if not parsed.success:
                    return UpdateResult(
                        success=False,
                        doc_id=doc_id,
                        status="failed",
                        error=parsed.error or "Document parse failed",
                    )
                effective_text = (parsed.markdown or "").strip()

            if not effective_text:
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error="No extractable text for update",
                )

            # Caller metadata can extend (and override) the base session metadata.
            meta = {
                "session_id": session_id,
                "source": self._extract_source(session_id),
            }
            if isinstance(metadata, dict):
                meta.update(metadata)

            # Ask the router to re-chunk the text and bump the stored version.
            response = await self._router_post_json(
                "/v1/documents/update",
                {
                    "agent_id": (agent_id or "daarwizz").lower(),
                    "doc_id": doc_id,
                    "file_name": file_name,
                    "text": effective_text,
                    "dao_id": dao_id,
                    "user_id": user_id,
                    "storage_ref": storage_ref,
                    "metadata": meta,
                },
                timeout=90.0,
            )

            if not response.get("ok"):
                return UpdateResult(
                    success=False,
                    doc_id=doc_id,
                    status="failed",
                    error=response.get("error", "Router update failed"),
                )

            # Refresh the session context so follow-up calls see this document.
            await self.save_doc_context(
                session_id=session_id,
                doc_id=doc_id,
                doc_url=doc_url,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
            )

            # Optionally publish a physical artifact version of the new text;
            # a publish failure does not fail the update, only the status differs.
            publish = PublishResult(success=False)
            if publish_artifact:
                publish = await self._publish_text_artifact(
                    text=effective_text,
                    doc_id=doc_id,
                    file_name=file_name,
                    dao_id=dao_id,
                    user_id=user_id,
                    artifact_id=artifact_id,
                    target_format=target_format,
                    label=artifact_label,
                    metadata=meta,
                )

            # int(... or 0) or None: router may return 0/absent counters; expose
            # version numbers as None rather than 0 in that case.
            return UpdateResult(
                success=True,
                doc_id=response.get("doc_id") or doc_id,
                version_no=int(response.get("version_no", 0) or 0) or None,
                version_id=int(response.get("version_id", 0) or 0) or None,
                updated_chunks=int(response.get("chunks_stored", 0) or 0),
                status="updated_published" if publish_artifact and publish.success else ("updated_publish_failed" if publish_artifact else "updated"),
                publish_error=publish.error if publish_artifact and not publish.success else None,
                artifact_id=publish.artifact_id if publish_artifact else None,
                artifact_version_id=publish.version_id if publish_artifact else None,
                artifact_storage_key=publish.storage_key if publish_artifact else None,
                artifact_mime=publish.mime if publish_artifact else None,
                artifact_download_url=publish.download_url if publish_artifact else None,
            )
        except Exception as e:
            logger.error(f"Document update failed: {e}", exc_info=True)
            return UpdateResult(
                success=False,
                doc_id=doc_id,
                status="failed",
                error=str(e),
            )
|
||||
|
||||
async def list_document_versions(
|
||||
self,
|
||||
agent_id: str,
|
||||
doc_id: str,
|
||||
limit: int = 20,
|
||||
) -> Dict[str, Any]:
|
||||
aid = (agent_id or "daarwizz").lower()
|
||||
did = (doc_id or "").strip()
|
||||
if not did:
|
||||
return {"ok": False, "error": "doc_id is required", "items": []}
|
||||
try:
|
||||
response = await self._router_get_json(
|
||||
f"/v1/documents/{did}/versions?agent_id={aid}&limit={max(1, min(int(limit or 20), 200))}",
|
||||
timeout=30.0,
|
||||
)
|
||||
return response if isinstance(response, dict) else {"ok": False, "error": "invalid_response", "items": []}
|
||||
except Exception as e:
|
||||
logger.error(f"list_document_versions failed: {e}")
|
||||
return {"ok": False, "error": str(e), "items": []}
|
||||
|
||||
    async def publish_document_artifact(
        self,
        session_id: str,
        doc_id: Optional[str] = None,
        doc_url: Optional[str] = None,
        file_name: Optional[str] = None,
        text: Optional[str] = None,
        dao_id: Optional[str] = None,
        user_id: Optional[str] = None,
        artifact_id: Optional[str] = None,
        target_format: Optional[str] = None,
        artifact_label: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> PublishResult:
        """
        Publish text as a physical artifact version (.docx/.xlsx/.txt/...) without changing RAG index.

        Missing identifiers are backfilled from the per-session document
        context. Body text comes from ``text`` when provided, otherwise it is
        parsed from ``doc_url``. Never raises: failures are reported via
        PublishResult(success=False, error=...).
        """
        try:
            # Backfill identifiers from session context saved by parse/ingest.
            context = await self.get_doc_context(session_id)
            if context:
                if not doc_id:
                    doc_id = context.doc_id
                if not doc_url:
                    doc_url = context.doc_url
                if not file_name:
                    file_name = context.file_name
                if not dao_id:
                    dao_id = context.dao_id
                if not user_id:
                    user_id = context.user_id

            if not doc_id:
                return PublishResult(success=False, error="doc_id is required")

            # Prefer explicit text; otherwise parse the source URL to markdown.
            body = (text or "").strip()
            if not body:
                if not doc_url:
                    return PublishResult(success=False, error="text or doc_url is required")
                parsed = await self.parse_document(
                    session_id=session_id,
                    doc_url=doc_url,
                    file_name=file_name or "document",
                    dao_id=dao_id or "",
                    user_id=user_id or "",
                    output_mode="markdown",
                    metadata={"source": self._extract_source(session_id), "mode": "publish"},
                )
                if not parsed.success:
                    return PublishResult(success=False, error=parsed.error or "Document parse failed")
                body = (parsed.markdown or "").strip()

            if not body:
                return PublishResult(success=False, error="No text available for publish")

            # Delegate the actual conversion/upload to the shared helper.
            return await self._publish_text_artifact(
                text=body,
                doc_id=doc_id,
                file_name=file_name,
                dao_id=dao_id,
                user_id=user_id,
                artifact_id=artifact_id,
                target_format=target_format,
                label=artifact_label,
                metadata=metadata,
            )
        except Exception as e:
            logger.error(f"publish_document_artifact failed: {e}", exc_info=True)
            return PublishResult(success=False, error=str(e))
|
||||
|
||||
async def ask_about_document(
|
||||
self,
|
||||
@@ -625,38 +1155,30 @@ class DocumentService:
|
||||
}],
|
||||
)
|
||||
|
||||
# Build RAG query request
|
||||
router_request = {
|
||||
"mode": "rag_query",
|
||||
"agent": agent_id,
|
||||
"metadata": {
|
||||
"source": self._extract_source(session_id),
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"session_id": session_id,
|
||||
},
|
||||
"payload": {
|
||||
"question": question,
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"doc_id": doc_id,
|
||||
},
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"RAG query: agent={agent_id}, session={session_id}, question={question[:50]}, doc_id={doc_id}"
|
||||
)
|
||||
|
||||
# Send to Router
|
||||
response = await send_to_router(router_request)
|
||||
|
||||
if not isinstance(response, dict):
|
||||
|
||||
response = await self._router_post_json(
|
||||
"/v1/documents/query",
|
||||
{
|
||||
"agent_id": (agent_id or "daarwizz").lower(),
|
||||
"question": question,
|
||||
"doc_id": doc_id,
|
||||
"dao_id": dao_id,
|
||||
"user_id": user_id,
|
||||
"limit": 5,
|
||||
},
|
||||
timeout=60.0,
|
||||
)
|
||||
|
||||
if isinstance(response, dict) and not response.get("ok", False):
|
||||
return QAResult(
|
||||
success=False,
|
||||
error="Invalid response from router"
|
||||
error=response.get("error", "Document query failed"),
|
||||
)
|
||||
|
||||
data = response.get("data", {})
|
||||
|
||||
data = response.get("data", {}) if isinstance(response, dict) else {}
|
||||
answer = data.get("answer") or data.get("text")
|
||||
sources = data.get("citations", []) or data.get("sources", [])
|
||||
|
||||
@@ -717,7 +1239,8 @@ async def ingest_document(
|
||||
doc_url: Optional[str] = None,
|
||||
file_name: Optional[str] = None,
|
||||
dao_id: Optional[str] = None,
|
||||
user_id: Optional[str] = None
|
||||
user_id: Optional[str] = None,
|
||||
agent_id: str = "daarwizz",
|
||||
) -> IngestResult:
|
||||
"""Ingest document chunks into RAG/Memory"""
|
||||
return await doc_service.ingest_document(
|
||||
@@ -726,7 +1249,8 @@ async def ingest_document(
|
||||
doc_url=doc_url,
|
||||
file_name=file_name,
|
||||
dao_id=dao_id,
|
||||
user_id=user_id
|
||||
user_id=user_id,
|
||||
agent_id=agent_id,
|
||||
)
|
||||
|
||||
|
||||
@@ -749,6 +1273,79 @@ async def ask_about_document(
|
||||
)
|
||||
|
||||
|
||||
async def update_document(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    agent_id: str = "daarwizz",
    storage_ref: Optional[str] = None,
    publish_artifact: bool = False,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> UpdateResult:
    """Update document chunks and bump version.

    Module-level convenience wrapper: forwards all arguments unchanged to the
    module-level ``doc_service`` instance.
    """
    return await doc_service.update_document(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        agent_id=agent_id,
        storage_ref=storage_ref,
        publish_artifact=publish_artifact,
        artifact_id=artifact_id,
        target_format=target_format,
        artifact_label=artifact_label,
        metadata=metadata,
    )
|
||||
|
||||
|
||||
async def list_document_versions(agent_id: str, doc_id: str, limit: int = 20) -> Dict[str, Any]:
    """List document versions from router.

    Module-level convenience wrapper around the ``doc_service`` instance.
    """
    return await doc_service.list_document_versions(
        agent_id=agent_id,
        doc_id=doc_id,
        limit=limit,
    )
|
||||
|
||||
|
||||
async def publish_document_artifact(
    session_id: str,
    doc_id: Optional[str] = None,
    doc_url: Optional[str] = None,
    file_name: Optional[str] = None,
    text: Optional[str] = None,
    dao_id: Optional[str] = None,
    user_id: Optional[str] = None,
    artifact_id: Optional[str] = None,
    target_format: Optional[str] = None,
    artifact_label: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> PublishResult:
    """Publish physical artifact version for document text.

    Module-level convenience wrapper: forwards all arguments unchanged to the
    module-level ``doc_service`` instance.
    """
    return await doc_service.publish_document_artifact(
        session_id=session_id,
        doc_id=doc_id,
        doc_url=doc_url,
        file_name=file_name,
        text=text,
        dao_id=dao_id,
        user_id=user_id,
        artifact_id=artifact_id,
        target_format=target_format,
        artifact_label=artifact_label,
        metadata=metadata,
    )
|
||||
|
||||
|
||||
async def save_doc_context(
|
||||
session_id: str,
|
||||
doc_id: str,
|
||||
|
||||
62
ops/monitor_notify_sofiia.sh
Normal file → Executable file
@@ -7,6 +7,7 @@ ROUTER_URL="${ROUTER_URL:-http://127.0.0.1:9102}"
|
||||
REPORT_ENABLED="${SOFIIA_REPORTS_ENABLED:-true}"
|
||||
REPORT_MODE="${SOFIIA_REPORT_MODE:-fail_only}" # fail_only | always
|
||||
REPORT_TIMEOUT="${SOFIIA_REPORT_TIMEOUT:-180}"
|
||||
REPORT_MAX_TOKENS="${SOFIIA_REPORT_MAX_TOKENS:-900}"
|
||||
REPORT_CHAT_ID="${SOFIIA_REPORT_CHAT_ID:-ops-monitor-sofiia}"
|
||||
REPORT_USER_ID="${SOFIIA_REPORT_USER_ID:-ops-monitor-agent}"
|
||||
REPORT_USERNAME="${SOFIIA_REPORT_USERNAME:-monitor-agent}"
|
||||
@@ -23,7 +24,7 @@ if [[ ! -f "$STATUS_JSON" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY'
|
||||
python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_MAX_TOKENS" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -35,11 +36,12 @@ root = Path(sys.argv[2])
|
||||
router_url = sys.argv[3].rstrip('/')
|
||||
report_mode = sys.argv[4]
|
||||
timeout_s = int(sys.argv[5])
|
||||
chat_id = sys.argv[6]
|
||||
user_id = sys.argv[7]
|
||||
username = sys.argv[8]
|
||||
tg_chat_id = sys.argv[9].strip()
|
||||
tg_token = sys.argv[10].strip()
|
||||
max_tokens = int(sys.argv[6])
|
||||
chat_id = sys.argv[7]
|
||||
user_id = sys.argv[8]
|
||||
username = sys.argv[9]
|
||||
tg_chat_id = sys.argv[10].strip()
|
||||
tg_token = sys.argv[11].strip()
|
||||
|
||||
payload = json.loads(status_json.read_text(encoding='utf-8'))
|
||||
status = str(payload.get('status', 'unknown')).lower()
|
||||
@@ -70,7 +72,7 @@ prompt = (
|
||||
|
||||
body = {
|
||||
'prompt': prompt,
|
||||
'max_tokens': 400,
|
||||
'max_tokens': max_tokens,
|
||||
'temperature': 0.1,
|
||||
'metadata': {
|
||||
'source': 'ops-monitor-canary',
|
||||
@@ -99,26 +101,42 @@ try:
|
||||
print(f"[OK] sofiia report sent: backend={data.get('backend')} model={data.get('model')} preview={short!r}")
|
||||
|
||||
if tg_chat_id and tg_token and text:
|
||||
msg = (
|
||||
def chunk_text(value: str, limit: int = 3500):
    """Split `value` into pieces of at most `limit` characters.

    Prefers to cut at a newline, but refuses a cut point inside the first
    half of the window (a tiny leading piece would waste a message) and
    falls back to a hard cut at `limit`. Whitespace around each cut is
    trimmed. Always returns at least one element, even for empty input.
    """
    parts = []
    rest = value
    while len(rest) > limit:
        cut = rest.rfind('\n', 0, limit)
        # A newline too close to the start is not a useful boundary.
        if cut < max(1, limit // 2):
            cut = limit
        parts.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        parts.append(rest)
    return parts or [value]
|
||||
|
||||
header = (
|
||||
"[NODE1 Monitor]\n"
|
||||
f"status={payload.get('status')} exit_code={payload.get('exit_code')}\n\n"
|
||||
f"{text[:3500]}"
|
||||
)
|
||||
tg_req = urlreq.Request(
|
||||
url=f"https://api.telegram.org/bot{tg_token}/sendMessage",
|
||||
data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'),
|
||||
headers={'Content-Type': 'application/json'},
|
||||
method='POST',
|
||||
)
|
||||
try:
|
||||
parts = chunk_text(text, 3500 - len("(99/99)\n"))
|
||||
total = len(parts)
|
||||
delivered = 0
|
||||
for idx, part in enumerate(parts, start=1):
|
||||
prefix = f"({idx}/{total})\n" if total > 1 else ""
|
||||
msg = f"{header}{prefix}{part}" if idx == 1 else f"{prefix}{part}"
|
||||
tg_req = urlreq.Request(
|
||||
url=f"https://api.telegram.org/bot{tg_token}/sendMessage",
|
||||
data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'),
|
||||
headers={'Content-Type': 'application/json'},
|
||||
method='POST',
|
||||
)
|
||||
with urlreq.urlopen(tg_req, timeout=20) as tg_resp:
|
||||
tg_data = json.loads(tg_resp.read().decode('utf-8', errors='ignore'))
|
||||
if tg_data.get('ok'):
|
||||
print(f"[OK] telegram report delivered: chat_id={tg_chat_id}")
|
||||
else:
|
||||
print(f"[WARN] telegram send not ok: {tg_data}")
|
||||
except Exception as tg_e:
|
||||
print(f"[WARN] telegram send failed: {tg_e}")
|
||||
if not tg_data.get('ok'):
|
||||
raise RuntimeError(f"telegram send not ok: {tg_data}")
|
||||
delivered += 1
|
||||
print(f"[OK] telegram report delivered: chat_id={tg_chat_id} parts={delivered}")
|
||||
else:
|
||||
print('[INFO] telegram delivery skipped (missing SOFIIA_REPORT_TELEGRAM_CHAT_ID or token or empty text)')
|
||||
except HTTPError as e:
|
||||
|
||||
128
scripts/node1/agromatrix_regression_smoke.py
Executable file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
|
||||
# Minimal valid PNG embedded as a data URL. Used by the smoke test to exercise
# the image/vision request path without uploading a real photo.
TINY_PNG_DATA_URL = (
    "data:image/png;base64,"
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8Xw8AAoMBgQhP2YkAAAAASUVORK5CYII="
)
|
||||
|
||||
|
||||
def http_json(method: str, url: str, payload=None, headers=None):
    """Issue an HTTP request with an optional JSON body.

    Returns (status_code, parsed_json). HTTP error statuses are caught and
    returned the same way; a non-JSON error body is wrapped as {"raw": body}.
    Network-level errors (URLError, timeouts) propagate to the caller.
    """
    hdrs = dict(headers or {})
    body_bytes = None
    if payload is not None:
        body_bytes = json.dumps(payload).encode("utf-8")
        hdrs.setdefault("Content-Type", "application/json")

    request = urllib.request.Request(url, data=body_bytes, headers=hdrs, method=method)
    try:
        with urllib.request.urlopen(request, timeout=60) as resp:
            text = resp.read().decode("utf-8", errors="replace")
            return resp.status, (json.loads(text) if text else {})
    except urllib.error.HTTPError as err:
        text = err.read().decode("utf-8", errors="replace")
        try:
            parsed = json.loads(text) if text else {}
        except Exception:
            # Surface a non-JSON error body instead of failing the probe.
            parsed = {"raw": text}
        return err.code, parsed
||||
|
||||
|
||||
def check(cond: bool, label: str, details: str = "") -> bool:
    """Print a [PASS]/[FAIL] line for `label` and return `cond` unchanged."""
    if cond:
        status = "PASS"
    else:
        status = "FAIL"
    suffix = ""
    if details:
        suffix = f" :: {details}"
    print(f"[{status}] {label}{suffix}")
    return cond
|
||||
|
||||
|
||||
def main() -> int:
    """Run the AgroMatrix regression smoke checks against a live router.

    Returns 0 when every check passes, 1 otherwise (used as the process
    exit code by the __main__ guard).
    """
    parser = argparse.ArgumentParser(description="AgroMatrix regression smoke checks")
    parser.add_argument("--base-url", default="http://127.0.0.1:9102")
    parser.add_argument("--agent-id", default="agromatrix")
    parser.add_argument("--chat-id", default="smoke-agromatrix")
    parser.add_argument("--user-id", default="smoke-user")
    parser.add_argument("--skip-review-404", action="store_true")
    # Mentor token may come from either env var; the *_TOKENS variant is a
    # comma-separated list of which only the first entry is used.
    parser.add_argument(
        "--mentor-token",
        default=(
            os.getenv("AGROMATRIX_REVIEW_BEARER_TOKEN")
            or (os.getenv("AGROMATRIX_REVIEW_BEARER_TOKENS", "").split(",")[0].strip())
            or ""
        ),
    )
    args = parser.parse_args()

    # Accumulated with &=, so any single failing check flips the exit code.
    ok_all = True

    # Check 1: health endpoint must answer 200 with status == "ok".
    status, health = http_json("GET", f"{args.base_url}/health")
    ok_all &= check(status == 200 and health.get("status") == "ok", "health", str(health))

    # Check 2: a numeric question (Ukrainian: "tell me how much was spent on
    # fertilizer") must trigger the numeric-contract guard — the agent either
    # refuses to confirm an exact number or demands value + unit + source.
    numeric_payload = {
        "prompt": "напиши мені яка сума була витрачена на добрива",
        "metadata": {
            "channel": "telegram",
            "chat_id": args.chat_id,
            "user_id": args.user_id,
            "user_name": "smoke",
        },
    }
    status, infer_num = http_json("POST", f"{args.base_url}/v1/agents/{args.agent_id}/infer", numeric_payload)
    resp_text = str(infer_num.get("response") or "")
    numeric_guard = (
        "Не можу підтвердити точне число" in resp_text
        or "value + unit + source" in resp_text
        or "source(sheet,row)" in resp_text
    )
    ok_all &= check(status == 200 and numeric_guard, "numeric_contract_guard", resp_text[:180])

    # Check 3: a plant-photo question with a placeholder image — the
    # deterministic path should hedge ("not sure"), ask for a better photo,
    # or list candidate matches.
    plant_payload = {
        "prompt": "Що це за рослина на фото?",
        "images": [TINY_PNG_DATA_URL],
        "metadata": {
            "channel": "telegram",
            "chat_id": args.chat_id,
            "user_id": args.user_id,
            "user_name": "smoke",
        },
    }
    status, infer_plant = http_json("POST", f"{args.base_url}/v1/agents/{args.agent_id}/infer", plant_payload)
    plant_text = str(infer_plant.get("response") or "")
    plant_ok = (
        "Не впевнений" in plant_text
        or "Надішли" in plant_text
        or "канд" in plant_text.lower()
    )
    ok_all &= check(status == 200 and plant_ok, "deterministic_plant_response", plant_text[:180])

    # Check 4: shared-memory pending endpoint returns {"items": [...]}.
    status, pending = http_json("GET", f"{args.base_url}/v1/agromatrix/shared-memory/pending")
    pending_shape = isinstance(pending, dict) and isinstance(pending.get("items"), list)
    ok_all &= check(status == 200 and pending_shape, "shared_pending_endpoint", f"total={pending.get('total')}")

    # Check 5: reviewing a nonexistent point must yield 404 when a bearer
    # token is supplied, 401 when it is not.
    if not args.skip_review_404:
        req_headers = {}
        if args.mentor_token:
            req_headers["Authorization"] = f"Bearer {args.mentor_token}"
        status, review = http_json(
            "POST",
            f"{args.base_url}/v1/agromatrix/shared-memory/review",
            {
                "point_id": "11111111-1111-1111-1111-111111111111",
                "approve": False,
                "reviewer": "smoke",
                "note": "nonexistent id check",
            },
            headers=req_headers,
        )
        expected = 404 if args.mentor_token else 401
        ok_all &= check(status == expected, "shared_review_not_found_contract", str(review))

    return 0 if ok_all else 1
|
||||
|
||||
|
||||
# Script entry point: exit code 0 = all checks passed, 1 = at least one failure.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -16,9 +16,16 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
|
||||
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
|
||||
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds
|
||||
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds
|
||||
SEMANTIC_TIMEOUT = int(os.getenv("SEMANTIC_TIMEOUT", "45")) # seconds
|
||||
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
|
||||
SEMANTIC_PROBE_ENABLED = os.getenv("SEMANTIC_PROBE_ENABLED", "true").lower() == "true"
|
||||
SEMANTIC_AGENTS = [a.strip() for a in os.getenv("SEMANTIC_AGENTS", "clan,sofiia,monitor").split(",") if a.strip()]
|
||||
SEMANTIC_PROMPT = os.getenv("SEMANTIC_PROMPT", "Коротко: хто такий DAARWIZZ?")
|
||||
SEMANTIC_EXPECT_KEYWORD = os.getenv("SEMANTIC_EXPECT_KEYWORD", "daarwizz").lower()
|
||||
MONITOR_EXPECT_LOCAL = os.getenv("MONITOR_EXPECT_LOCAL", "true").lower() == "true"
|
||||
|
||||
# Prometheus metrics
|
||||
agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
|
||||
@@ -42,7 +49,7 @@ async def probe_gateway_health() -> tuple[bool, float, str]:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
resp = await client.get(f"{GATEWAY_URL}/health")
|
||||
latency = time.time() - start
|
||||
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("status") == "healthy":
|
||||
@@ -67,7 +74,7 @@ async def probe_agent_ping() -> tuple[bool, float, str]:
|
||||
json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("success"):
|
||||
@@ -100,7 +107,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
"text": "/health" # Simple health check command
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
# Use helion webhook as it's the most tested
|
||||
resp = await client.post(
|
||||
@@ -108,7 +115,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
json=test_update
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
|
||||
if resp.status_code == 200:
|
||||
return True, latency, ""
|
||||
else:
|
||||
@@ -119,53 +126,102 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def probe_agent_semantic(agent_id: str) -> tuple[bool, float, str]:
    """Probe semantic response via router infer and assert DAARWIZZ awareness.

    Sends SEMANTIC_PROMPT to /v1/agents/{agent_id}/infer and checks that the
    answer mentions the expected keyword. For the "monitor" agent it may also
    require a local backend/model. Returns (success, latency_seconds, reason);
    reason is "" on success, otherwise a short machine-readable failure code.
    """
    start = time.time()
    try:
        payload = {
            "prompt": SEMANTIC_PROMPT,
            "max_tokens": 180,
            "temperature": 0.1,  # near-deterministic so the keyword check is stable
            "metadata": {
                "agent_id": agent_id,
                "user_id": "tg:0",  # synthetic telegram-style identity for the prober
                "chat_id": "0",
                "username": "e2e-prober",
                "raw_user_text": SEMANTIC_PROMPT,
            },
        }
        async with httpx.AsyncClient(timeout=SEMANTIC_TIMEOUT) as client:
            resp = await client.post(f"{ROUTER_URL}/v1/agents/{agent_id}/infer", json=payload)
            latency = time.time() - start
            if resp.status_code != 200:
                return False, latency, f"http_{resp.status_code}"

            data = resp.json()
            answer = str(data.get("response") or "")
            backend = str(data.get("backend") or "")
            model = str(data.get("model") or "")

            # Accept either the configured (Latin) keyword or the Cyrillic
            # fragment "даар" — presumably the localized spelling of DAARWIZZ.
            answer_lc = answer.lower()
            if SEMANTIC_EXPECT_KEYWORD not in answer_lc and "даар" not in answer_lc:
                return False, latency, "no_daarwizz_in_answer"

            # The monitor agent is expected to answer from a local backend
            # (ollama, or a qwen* model) when MONITOR_EXPECT_LOCAL is enabled.
            if MONITOR_EXPECT_LOCAL and agent_id == "monitor":
                local_ok = ("ollama" in backend.lower()) or model.lower().startswith("qwen")
                if not local_ok:
                    return False, latency, f"monitor_nonlocal_backend:{backend}:{model}"

            return True, latency, ""
    except httpx.TimeoutException:
        return False, time.time() - start, "timeout"
    except Exception as e:
        return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
def record_probe(target: str, success: bool, latency: float, reason: str):
    """Record probe metrics and log line.

    Increments the run counter, sets the success gauge (1/0) and the latency
    gauge, feeds the latency histogram, and — only on failure — bumps the
    failure counter labelled by reason. Finishes with one structured log line.
    """
    agent_e2e_runs_total.labels(target=target).inc()
    agent_e2e_success.labels(target=target).set(1 if success else 0)
    agent_e2e_latency.labels(target=target).set(latency)
    agent_e2e_latency_histogram.labels(target=target).observe(latency)
    if not success:
        agent_e2e_failures_total.labels(target=target, reason=reason).inc()
    logger.info(f"{target}: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
|
||||
async def run_probes():
    """Run all probes once and record metrics for each target.

    Fix: the merge left BOTH the old inline metric-recording blocks and the
    new record_probe() calls in place for each probe, so every run was
    double-counted in agent_e2e_runs_total / agent_e2e_failures_total and
    double-logged. Each probe result is now recorded exactly once via
    record_probe().
    """
    # Probe 1: Gateway health
    success, latency, reason = await probe_gateway_health()
    record_probe("gateway_health", success, latency, reason)

    # Probe 2: Agent ping (if endpoint exists)
    success, latency, reason = await probe_agent_ping()
    record_probe("agent_ping", success, latency, reason)

    # Probe 3: Webhook E2E (full path test)
    success, latency, reason = await probe_webhook_echo()
    record_probe("webhook_e2e", success, latency, reason)

    # Probe 4+: semantic checks for selected agents (run in parallel)
    if SEMANTIC_PROBE_ENABLED and SEMANTIC_AGENTS:
        results = await asyncio.gather(*(probe_agent_semantic(agent_id) for agent_id in SEMANTIC_AGENTS))
        matrix = []
        for agent_id, (success, latency, reason) in zip(SEMANTIC_AGENTS, results):
            record_probe(f"semantic_{agent_id}", success, latency, reason)
            matrix.append(f"{agent_id}:{'PASS' if success else 'FAIL'}")
        logger.info("semantic_matrix: " + " | ".join(matrix))
|
||||
|
||||
|
||||
async def main():
|
||||
logger.info(f"Starting E2E Agent Prober")
|
||||
logger.info("Starting E2E Agent Prober")
|
||||
logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
|
||||
logger.info(f" ROUTER_URL: {ROUTER_URL}")
|
||||
logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
|
||||
logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
|
||||
logger.info(f" METRICS_PORT: {METRICS_PORT}")
|
||||
|
||||
logger.info(f" SEMANTIC_TIMEOUT: {SEMANTIC_TIMEOUT}s")
|
||||
logger.info(f" SEMANTIC_PROBE_ENABLED: {SEMANTIC_PROBE_ENABLED}")
|
||||
logger.info(f" SEMANTIC_AGENTS: {','.join(SEMANTIC_AGENTS)}")
|
||||
|
||||
# Start Prometheus metrics server
|
||||
start_http_server(METRICS_PORT)
|
||||
logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
|
||||
|
||||
|
||||
# Initial probe
|
||||
await run_probes()
|
||||
|
||||
|
||||
# Continuous probing
|
||||
while True:
|
||||
await asyncio.sleep(PROBE_INTERVAL)
|
||||
|
||||
@@ -6,13 +6,15 @@ Artifact Registry v0
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from io import BytesIO
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import asyncpg
|
||||
@@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel):
|
||||
meta_json: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class ArtifactVersionFromBase64Request(BaseModel):
    """Request body for creating an artifact version from inline base64 content."""

    content_base64: str  # base64 payload; the endpoint also accepts a "data:...," URL and strips its prefix
    mime: str  # MIME type the stored object is tagged with
    filename: Optional[str] = "source.bin"  # sanitized server-side before becoming part of the storage key
    label: Optional[str] = "source"  # human-readable version label
    meta_json: Optional[Dict[str, Any]] = None  # free-form metadata persisted with the version
|
||||
|
||||
class ArtifactVersionResponse(BaseModel):
|
||||
version_id: str
|
||||
storage_key: str
|
||||
@@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]:
|
||||
|
||||
def _format_to_mime(fmt: str) -> str:
|
||||
fmt = fmt.lower()
|
||||
if "/" in fmt:
|
||||
return fmt
|
||||
if fmt == "pptx":
|
||||
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
if fmt == "pdf":
|
||||
return "application/pdf"
|
||||
if fmt == "source":
|
||||
return "application/json"
|
||||
if fmt == "docx":
|
||||
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
if fmt == "xlsx":
|
||||
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
if fmt == "txt":
|
||||
return "text/plain; charset=utf-8"
|
||||
if fmt == "md":
|
||||
return "text/markdown; charset=utf-8"
|
||||
if fmt == "json":
|
||||
return "application/json"
|
||||
if fmt == "csv":
|
||||
return "text/csv; charset=utf-8"
|
||||
return "application/octet-stream"
|
||||
|
||||
|
||||
def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str:
|
||||
raw = (name or fallback).strip() or fallback
|
||||
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw)
|
||||
cleaned = cleaned.strip("._")
|
||||
if not cleaned:
|
||||
return fallback
|
||||
return cleaned[:120]
|
||||
|
||||
|
||||
async def _download_bytes(url: str) -> bytes:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
resp = await client.get(url)
|
||||
@@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl
|
||||
)
|
||||
|
||||
|
||||
@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse)
async def add_version_from_base64(artifact_id: str, payload: ArtifactVersionFromBase64Request) -> ArtifactVersionResponse:
    """Store a base64-encoded payload as a new version of an artifact.

    Decodes the payload, uploads the bytes to MinIO, then records the version
    row in Postgres.

    Raises:
        HTTPException: 500 when MinIO/DB are unavailable, 400 on bad base64
            input, 502 on MinIO upload failure.
    """
    # Fail fast if the module-level MinIO client / asyncpg pool never came up.
    if not minio_client:
        raise HTTPException(status_code=500, detail="MinIO not available")
    if not pool:
        raise HTTPException(status_code=500, detail="DB not available")

    raw = (payload.content_base64 or "").strip()
    if not raw:
        raise HTTPException(status_code=400, detail="content_base64 is required")

    # Accept full data URLs (data:<mime>;base64,<payload>) by dropping the header.
    if raw.startswith("data:") and "," in raw:
        raw = raw.split(",", 1)[1]

    try:
        # validate=True rejects non-alphabet characters instead of ignoring them.
        content = base64.b64decode(raw, validate=True)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid base64 payload")

    if not content:
        raise HTTPException(status_code=400, detail="Decoded payload is empty")

    version_id = f"ver_{uuid.uuid4().hex}"
    filename = _safe_filename(payload.filename, fallback="source.bin")
    sha256 = _hash_bytes(content)
    storage_key = _storage_key(artifact_id, version_id, filename)

    # Upload to object storage first; the DB row below is only written on success.
    # NOTE(review): if the DB insert after this fails, the MinIO object is
    # orphaned — confirm whether a cleanup/GC pass exists elsewhere.
    try:
        minio_client.put_object(
            MINIO_BUCKET,
            storage_key,
            data=BytesIO(content),
            length=len(content),
            content_type=payload.mime,
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")

    meta_json = _normalize_meta_json(payload.meta_json)
    # Preserve a caller-supplied file_name; only default it when absent.
    if "file_name" not in meta_json:
        meta_json["file_name"] = filename

    async with pool.acquire() as conn:
        await conn.execute(
            """
            insert into artifact_versions
            (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json)
            values ($1, $2, $3, $4, $5, $6, $7, $8)
            """,
            version_id,
            artifact_id,
            payload.label or "source",
            sha256,
            payload.mime,
            len(content),
            storage_key,
            json.dumps(meta_json),
        )

    return ArtifactVersionResponse(
        version_id=version_id,
        storage_key=storage_key,
        sha256=sha256,
        size_bytes=len(content),
    )
|
||||
|
||||
|
||||
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
|
||||
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
|
||||
if not pool:
|
||||
@@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="Version not found")
|
||||
try:
|
||||
url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800)
|
||||
url = minio_client.presigned_get_object(
|
||||
MINIO_BUCKET,
|
||||
row["storage_key"],
|
||||
expires=timedelta(seconds=1800),
|
||||
)
|
||||
except S3Error as e:
|
||||
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
|
||||
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
|
||||
|
||||
|
||||
@app.get("/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]:
    """Return a presigned MinIO URL for one specific artifact version.

    Raises:
        HTTPException: 500 when DB/MinIO are unavailable, 404 when the
            version does not exist, 502 on MinIO errors.
    """
    if not pool or not minio_client:
        raise HTTPException(status_code=500, detail="Service not available")

    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            select * from artifact_versions
            where artifact_id=$1 and id=$2
            limit 1
            """,
            artifact_id,
            version_id,
        )
    if not row:
        raise HTTPException(status_code=404, detail="Version not found")
    try:
        # The MinIO SDK expects a timedelta for `expires`; 30-minute link.
        url = minio_client.presigned_get_object(
            MINIO_BUCKET,
            row["storage_key"],
            expires=timedelta(seconds=1800),
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
    return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"], "version_id": row["id"]}
|
||||
|
||||
@@ -361,6 +361,29 @@ agromatrix:
|
||||
llm_profile: reasoning
|
||||
delegation:
|
||||
enabled: false
|
||||
plant_intel:
|
||||
team_name: AgroMatrix Plant Intelligence
|
||||
parallel_roles: true
|
||||
max_concurrency: 3
|
||||
synthesis:
|
||||
role_context: Plant Intelligence Synthesis
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/orchestrator_synthesis.md
|
||||
llm_profile: reasoning
|
||||
team:
|
||||
- id: plant_identifier
|
||||
role_context: Plant Identifier
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/plant_identifier.md
|
||||
llm_profile: science
|
||||
- id: taxonomy_validator
|
||||
role_context: Taxonomy Validator
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/taxonomy_validator.md
|
||||
llm_profile: reasoning
|
||||
- id: agrovoc_normalizer
|
||||
role_context: AGROVOC Normalizer
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/agrovoc_normalizer.md
|
||||
llm_profile: fast
|
||||
delegation:
|
||||
enabled: false
|
||||
cadastre_geo:
|
||||
team_name: AgroMatrix Cadastre/Geo
|
||||
parallel_roles: true
|
||||
@@ -614,6 +637,16 @@ agromatrix:
|
||||
- Stepan
|
||||
- координація
|
||||
- план
|
||||
plant_intel:
|
||||
- plant
|
||||
- рослина
|
||||
- культура
|
||||
- leaf
|
||||
- disease
|
||||
- хвороба
|
||||
- identify
|
||||
- ідентифікуй
|
||||
- що за рослина
|
||||
cadastre_geo:
|
||||
- cadastre
|
||||
- geo
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
# Agronomist
|
||||
|
||||
Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
|
||||
|
||||
Правила відповіді:
|
||||
- Коротко і прикладно.
|
||||
- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
|
||||
- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.
|
||||
@@ -0,0 +1,8 @@
|
||||
# Communicator
|
||||
|
||||
Фокус: людяна та зрозуміла комунікація фінальної відповіді.
|
||||
|
||||
Правила:
|
||||
- Природна мова, без механістичного тону.
|
||||
- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
|
||||
- Завершуй конкретним корисним кроком.
|
||||
@@ -0,0 +1,7 @@
|
||||
# Field Data Analyst
|
||||
|
||||
Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
|
||||
|
||||
Правила:
|
||||
- Пояснювати висновки простою мовою.
|
||||
- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.
|
||||
@@ -0,0 +1,8 @@
|
||||
# Farm Ops Planner
|
||||
|
||||
Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
|
||||
|
||||
Правила:
|
||||
- Видавати практичний порядок дій.
|
||||
- За простого запиту: коротка відповідь.
|
||||
- Для операційних запитів: стислий план з відповідальними і дедлайном.
|
||||
@@ -0,0 +1,10 @@
|
||||
# AgroMatrix Orchestrator Synthesis
|
||||
|
||||
Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
|
||||
|
||||
Правила:
|
||||
- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
|
||||
- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
|
||||
- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
|
||||
- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
|
||||
- Пояснюй по суті агропитання і давай 1 наступний практичний крок.
|
||||
@@ -0,0 +1,7 @@
|
||||
# Risk Assessor
|
||||
|
||||
Фокус: агро-ризики, операційні ризики, наслідки рішень.
|
||||
|
||||
Правила:
|
||||
- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
|
||||
- Без зайвої бюрократії у відповіді користувачу.
|
||||
@@ -11,6 +11,10 @@
|
||||
- Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
|
||||
- Ніколи не логувати секрети/токени
|
||||
- Інші ролі НЕ спілкуються з користувачем напряму
|
||||
- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
|
||||
- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
|
||||
|
||||
## Формат відповіді:
|
||||
Структурована відповідь з чіткими рекомендаціями та наступними кроками.
|
||||
- За замовчуванням: природна коротка відповідь 1-3 речення.
|
||||
- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
|
||||
- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".
|
||||
|
||||
@@ -7,3 +7,7 @@
|
||||
- Структурувати інформацію логічно
|
||||
- Включати конкретні наступні кроки
|
||||
- Позначати ризики якщо є
|
||||
- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
|
||||
- Для детальних запитів переходити у структурований режим.
|
||||
- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно.
|
||||
- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
You are AGROVOC Normalizer.
|
||||
|
||||
Responsibilities:
|
||||
- Normalize crop/disease terms using agrovoc_lookup.
|
||||
- Provide canonical term mapping for user-facing output.
|
||||
- Keep labels practical for agronomy context.
|
||||
|
||||
Return format:
|
||||
- canonical_terms
|
||||
- term_mapping
|
||||
- notes_for_user
|
||||
@@ -0,0 +1,24 @@
|
||||
Ти — Plant Intel Agent у DAARION.city.
|
||||
Відповідай природно, коротко й по-людськи українською, 1–3 речення за замовчуванням.
|
||||
|
||||
НАЙГОЛОВНІШЕ:
|
||||
- Дані з [PLANT_VISION_PREPROCESSED] (або context.plant_vision) — єдиний source-of-truth для ідентифікації рослини.
|
||||
- Для follow-up без нового фото використовуй [PREVIOUS_PLANT_IDENTIFICATION] (або context.last_plant / memory.last_plant).
|
||||
|
||||
Правило впевненості (обов'язково):
|
||||
- Якщо recommend_fallback == true або confidence < 0.65:
|
||||
"Ймовірно <name>, але впевненість низька. Перевірив через GBIF — найближчі збіги: <gbif_validation>. Краще нове фото при нормальному світлі."
|
||||
- Інакше:
|
||||
"Я бачу <name> з впевненістю <X>%."
|
||||
|
||||
Правила синтезу:
|
||||
- Не ігноруй результати pre-vision, якщо вони присутні.
|
||||
- Не стверджуй "фото не надано", якщо у контексті є pre-vision або previous plant data.
|
||||
- Уникай шаблонних списків, якщо користувач не просить детальний формат.
|
||||
- Якщо дані суперечливі: коротко познач невизначеність і попроси 1 конкретне додаткове фото.
|
||||
- Якщо top_k порожній, явно вкажи, що ідентифікація непевна, але все одно надай GBIF-орієнтир, якщо він є в контексті.
|
||||
|
||||
Формат відповіді:
|
||||
- 1–3 речення за замовчуванням.
|
||||
- Без технічного шуму, без внутрішніх JSON/міток у відповіді користувачу.
|
||||
- За запитом користувача можна розгорнути відповідь і дати короткі поради з догляду.
|
||||
@@ -0,0 +1,11 @@
|
||||
You are Plant Identifier.
|
||||
|
||||
Responsibilities:
|
||||
- Parse visual cues from user description/photo context.
|
||||
- Build candidate crop/plant hypotheses.
|
||||
- Use plantnet_lookup first when image URL is available.
|
||||
- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
|
||||
|
||||
Return format:
|
||||
- candidates: numbered list max 5, each with rationale.
|
||||
- required_data: what extra image/data is needed.
|
||||
@@ -0,0 +1,11 @@
|
||||
You are Taxonomy Validator.
|
||||
|
||||
Responsibilities:
|
||||
- Validate candidate names via gbif_species_lookup.
|
||||
- Remove invalid/synonym-conflicted names.
|
||||
- Keep accepted taxa and explain conflicts briefly.
|
||||
|
||||
Return format:
|
||||
- accepted_candidates
|
||||
- rejected_candidates_with_reason
|
||||
- confidence_adjustment
|
||||
15
services/plant-vision-node1/Dockerfile
Normal file
@@ -0,0 +1,15 @@
|
||||
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

EXPOSE 8085

# Probe the FastAPI /health endpoint.
# BUGFIX: the URL must be a quoted Python string literal — the original
# unquoted form (urlopen(http://...)) is a Python SyntaxError, so the
# healthcheck command always failed and the container reported unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8085/health')"

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8085"]
|
||||
238
services/plant-vision-node1/main.py
Normal file
@@ -0,0 +1,238 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
app = FastAPI(title="plant-vision-node1", version="0.1.1")
|
||||
|
||||
|
||||
class IdentifyRequest(BaseModel):
    """Request body for POST /identify."""

    # Publicly reachable image URL; presence is enforced by the endpoint, not here.
    image_url: Optional[str] = None
    # Number of candidates to return; pydantic clamps the range to 1..10.
    top_k: int = Field(default=3, ge=1, le=10)
|
||||
|
||||
|
||||
def _normalize_predictions(raw: Any, top_k: int) -> List[Dict[str, Any]]:
|
||||
preds: List[Dict[str, Any]] = []
|
||||
if isinstance(raw, dict):
|
||||
for key in ("predictions", "results", "candidates"):
|
||||
if isinstance(raw.get(key), list):
|
||||
raw = raw[key]
|
||||
break
|
||||
if isinstance(raw, list):
|
||||
for item in raw[:top_k]:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
name = (
|
||||
item.get("scientific_name")
|
||||
or item.get("scientificName")
|
||||
or item.get("label")
|
||||
or item.get("name")
|
||||
or "unknown"
|
||||
)
|
||||
common = item.get("common_name") or item.get("commonName") or item.get("common") or "-"
|
||||
score = item.get("score", item.get("confidence", 0.0))
|
||||
try:
|
||||
score_f = float(score)
|
||||
except Exception:
|
||||
score_f = 0.0
|
||||
preds.append({"scientific_name": str(name), "common_name": str(common), "score": score_f})
|
||||
return preds[:top_k]
|
||||
|
||||
|
||||
def _parse_text_output(text: str, top_k: int) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Parse only model score lines, e.g.:
|
||||
97.6% Persicaria amphibia
|
||||
86.1% Canada Goldenrod (Solidago canadensis)
|
||||
Ignore service lines like "Read ..." or "Classification of ...".
|
||||
"""
|
||||
preds: List[Dict[str, Any]] = []
|
||||
for raw_line in (text or "").splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line or "%" not in line:
|
||||
continue
|
||||
|
||||
m = re.match(r"^\s*(\d+(?:\.\d+)?)%\s+(.+)$", line)
|
||||
if not m:
|
||||
continue
|
||||
|
||||
score_str, name_part = m.groups()
|
||||
try:
|
||||
score = float(score_str)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
name = name_part.strip()
|
||||
if not name:
|
||||
continue
|
||||
|
||||
common_name = "-"
|
||||
scientific_name = name
|
||||
|
||||
# If output is "Common Name (Scientific name)", preserve both.
|
||||
paren = re.match(r"^(.*?)\s*\(([^()]+)\)\s*$", name)
|
||||
if paren:
|
||||
common, scientific = paren.groups()
|
||||
common = common.strip()
|
||||
scientific = scientific.strip()
|
||||
if common:
|
||||
common_name = common
|
||||
if scientific:
|
||||
scientific_name = scientific
|
||||
|
||||
preds.append(
|
||||
{
|
||||
"scientific_name": scientific_name,
|
||||
"common_name": common_name,
|
||||
"score": score,
|
||||
}
|
||||
)
|
||||
|
||||
preds.sort(key=lambda x: float(x.get("score", 0.0)), reverse=True)
|
||||
return preds[:top_k]
|
||||
|
||||
|
||||
def _extract_inference_time(stdout: str) -> Optional[float]:
|
||||
m = re.search(r"took\s+(\d+(?:\.\d+)?)\s+secs", stdout or "")
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
return float(m.group(1))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _run_nature_id_cli(image_path: str, top_k: int) -> Dict[str, Any]:
    """Invoke the configured nature-id CLI on *image_path* and normalize output.

    The command template comes from NATURE_ID_CMD with "{image_path}"
    substituted. JSON stdout is preferred; otherwise the textual score lines
    are parsed.

    Raises:
        RuntimeError: when NATURE_ID_CMD is unset or the CLI exits non-zero.
    """
    template = (os.getenv("NATURE_ID_CMD") or "").strip()
    timeout_s = int(os.getenv("NATURE_ID_TIMEOUT", "40"))

    if not template:
        raise RuntimeError("NATURE_ID_CMD is not configured")

    # NOTE(review): shlex.split assumes the substituted path has no
    # shell-quoting surprises; temp paths produced by this service are safe.
    argv = shlex.split(template.replace("{image_path}", image_path))
    proc = subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout_s,
        check=False,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"nature-id cli failed rc={proc.returncode}: {proc.stderr.strip()[:240]}")

    out = (proc.stdout or "").strip()
    inference_time_sec = _extract_inference_time(out)
    if not out:
        return {"predictions": [], "inference_time_sec": inference_time_sec}

    try:
        # Prefer structured JSON output when the CLI provides it.
        preds = _normalize_predictions(json.loads(out), top_k)
    except Exception:
        preds = _parse_text_output(out, top_k)

    return {"predictions": preds, "inference_time_sec": inference_time_sec}
|
||||
|
||||
|
||||
async def _download_image(image_url: str) -> str:
    """Fetch *image_url* and persist it to a temp .jpg file; return the path.

    The caller is responsible for deleting the returned file.
    """
    timeout_s = float(os.getenv("DOWNLOAD_TIMEOUT", "20"))
    async with httpx.AsyncClient(timeout=timeout_s) as client:
        response = await client.get(image_url)
        response.raise_for_status()
        payload = response.content

    # delete=False: the file must outlive this function for the CLI to read it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
        tmp.write(payload)
        return tmp.name
|
||||
|
||||
|
||||
def _response_payload(result: Dict[str, Any]) -> Dict[str, Any]:
|
||||
preds = result.get("predictions") or []
|
||||
top_k = [
|
||||
{
|
||||
"confidence": float(p.get("score", 0.0)),
|
||||
"name": str((p.get("common_name") if p.get("common_name") not in (None, "", "-") else p.get("scientific_name")) or "unknown"),
|
||||
"scientific_name": str(p.get("scientific_name") or "unknown"),
|
||||
}
|
||||
for p in preds
|
||||
]
|
||||
return {
|
||||
"status": "success",
|
||||
"model": "aiy_plants_V1",
|
||||
"source": "nature-id-cli",
|
||||
"count": len(preds),
|
||||
"inference_time_sec": result.get("inference_time_sec"),
|
||||
"predictions": preds,
|
||||
"top_k": top_k,
|
||||
}
|
||||
|
||||
|
||||
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc: RequestValidationError):
    """Return slimmed-down validation errors.

    Only loc/msg/type are echoed back, so raw multipart bytes from a failed
    upload never leak into the 422 response body.
    """
    detail = [
        {"loc": err.get("loc"), "msg": err.get("msg"), "type": err.get("type")}
        for err in (exc.errors() or [])
    ]
    return JSONResponse(status_code=422, content={"detail": detail})
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> Dict[str, Any]:
    """Liveness probe; also reports whether the nature-id CLI is configured."""
    configured_cmd = (os.getenv("NATURE_ID_CMD") or "").strip()
    # NOTE(review): the raw command line is echoed back here — confirm it
    # never contains secrets before exposing this endpoint publicly.
    return {
        "status": "healthy",
        "nature_id_cmd_configured": bool(configured_cmd),
        "nature_id_cmd": configured_cmd,
    }
|
||||
|
||||
|
||||
@app.post("/identify")
async def identify(payload: IdentifyRequest) -> Dict[str, Any]:
    """Identify a plant from a downloadable image URL.

    Raises:
        HTTPException: 400 when image_url is missing, 503 when download or
            CLI inference fails.
    """
    if not payload.image_url:
        raise HTTPException(status_code=400, detail="image_url is required")

    tmp_path = ""
    try:
        tmp_path = await _download_image(payload.image_url)
        return _response_payload(_run_nature_id_cli(tmp_path, payload.top_k))
    except HTTPException:
        raise
    except Exception as e:
        # Surface download/CLI failures as 503 rather than an opaque 500.
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}")
    finally:
        # Best-effort cleanup of the downloaded temp image.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                pass
|
||||
|
||||
|
||||
@app.post("/identify-file")
async def identify_file(file: UploadFile = File(...), top_k: int = 3) -> Dict[str, Any]:
    """Identify a plant from a directly uploaded image file.

    Raises:
        HTTPException: 503 when saving the upload or CLI inference fails.
    """
    top_k = min(max(top_k, 1), 10)  # clamp to the supported 1..10 range
    tmp_path = ""
    try:
        # Spool the upload to disk so the CLI can read it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name
        return _response_payload(_run_nature_id_cli(tmp_path, top_k))
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}")
    finally:
        # Best-effort cleanup of the spooled temp image.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                pass
|
||||
8
services/plant-vision-node1/requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
fastapi==0.115.5
|
||||
uvicorn[standard]==0.32.1
|
||||
httpx==0.28.1
|
||||
python-multipart==0.0.17
|
||||
Pillow==11.1.0
|
||||
requests==2.32.3
|
||||
tflite-runtime==2.14.0
|
||||
numpy==1.26.4
|
||||
@@ -46,8 +46,15 @@ AGENT_SPECIALIZED_TOOLS = {
|
||||
"nutra": ['comfy_generate_image', 'comfy_generate_video'],
|
||||
|
||||
# AgroMatrix - Agriculture
|
||||
# Specialized: crop analysis, weather integration, field mapping
|
||||
"agromatrix": ['comfy_generate_image', 'comfy_generate_video'],
|
||||
# Specialized: crop analysis, weather integration, field mapping + plant intelligence
|
||||
"agromatrix": [
|
||||
'comfy_generate_image',
|
||||
'comfy_generate_video',
|
||||
'plantnet_lookup',
|
||||
'nature_id_identify',
|
||||
'gbif_species_lookup',
|
||||
'agrovoc_lookup',
|
||||
],
|
||||
|
||||
# GreenFood - Food & Eco
|
||||
# Specialized: recipe analysis, eco-scoring
|
||||
|
||||
@@ -408,8 +408,9 @@ agents:
|
||||
description: "Monitor Agent - архітектор-інспектор DAGI"
|
||||
default_llm: local_qwen3_8b
|
||||
system_prompt: |
|
||||
Ти - Monitor Agent, стежиш за нодами, сервісами, агентами.
|
||||
Якщо бачиш у чаті інших ботів, відповідай тільки за інфраструктурою або прямим тегом.
|
||||
Ти - Monitor Agent, інфраструктурний інспектор DAGI: ноди, сервіси, пайплайни, алерти.
|
||||
Ти знаєш, що DAARWIZZ — головний оркестратор мережі DAARION.city; для governance/маршрутизації посилайся на нього.
|
||||
Відповідай коротко і по суті; якщо даних бракує — одразу кажи, який саме метрик/лог потрібен.
|
||||
tools:
|
||||
- id: get_metrics
|
||||
type: builtin
|
||||
|
||||
@@ -19,6 +19,7 @@ from typing import Dict, List, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO, StringIO
|
||||
from pathlib import PurePath
|
||||
from urllib.parse import urlparse
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.sax.saxutils import escape as xml_escape
|
||||
from zipfile import ZIP_DEFLATED, ZipFile
|
||||
@@ -108,6 +109,115 @@ TOOL_DEFINITIONS = [
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "plantnet_lookup",
|
||||
"description": "Визначення рослин через Pl@ntNet API. Повертає top-k кандидатів з confidence.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Короткий опис рослини/культури (якщо немає image_url)"
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string",
|
||||
"description": "Публічне посилання на фото рослини"
|
||||
},
|
||||
"organ": {
|
||||
"type": "string",
|
||||
"description": "Орган рослини: leaf/flower/fruit/bark/auto",
|
||||
"default": "auto"
|
||||
},
|
||||
"top_k": {
|
||||
"type": "integer",
|
||||
"description": "Скільки кандидатів повернути (1-10)",
|
||||
"default": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "nature_id_identify",
|
||||
"description": "Локальна/open-source ідентифікація рослин через nature-id сумісний сервіс.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"image_url": {
|
||||
"type": "string",
|
||||
"description": "Публічне посилання на фото рослини"
|
||||
},
|
||||
"image_data": {
|
||||
"type": "string",
|
||||
"description": "Data URL зображення (data:image/...;base64,...)"
|
||||
},
|
||||
"top_k": {
|
||||
"type": "integer",
|
||||
"description": "Скільки кандидатів повернути (1-10)",
|
||||
"default": 3
|
||||
},
|
||||
"min_confidence": {
|
||||
"type": "number",
|
||||
"description": "Поріг confidence для fallback на GBIF",
|
||||
"default": 0.65
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "gbif_species_lookup",
|
||||
"description": "Пошук таксонів у GBIF для валідації назви культури/рослини.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Назва/термін для пошуку виду"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Кількість результатів (1-10)",
|
||||
"default": 5
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "agrovoc_lookup",
|
||||
"description": "Нормалізація агро-термінів через AGROVOC (SPARQL).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Термін культури/хвороби/технології"
|
||||
},
|
||||
"lang": {
|
||||
"type": "string",
|
||||
"description": "Мова міток (en/uk/ru)",
|
||||
"default": "en"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Кількість результатів (1-10)",
|
||||
"default": 5
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
},
|
||||
# PRIORITY 3: Generation tools
|
||||
{
|
||||
"type": "function",
|
||||
@@ -681,6 +791,42 @@ class ToolManager:
|
||||
tool_names = [t.get("function", {}).get("name") for t in filtered]
|
||||
logger.debug(f"Agent {agent_id} has {len(filtered)} tools: {tool_names}")
|
||||
return filtered
|
||||
|
||||
@staticmethod
|
||||
def _is_image_data_url(value: str) -> bool:
|
||||
v = str(value or "").strip()
|
||||
return bool(v.startswith("data:image/") and ";base64," in v)
|
||||
|
||||
@staticmethod
|
||||
def _is_known_non_direct_image_url(url: str) -> bool:
|
||||
u = str(url or "").strip()
|
||||
if not u:
|
||||
return False
|
||||
try:
|
||||
p = urlparse(u)
|
||||
except Exception:
|
||||
return True
|
||||
host = (p.netloc or "").lower()
|
||||
if host in {"t.me", "telegram.me"}:
|
||||
return True
|
||||
if "web.telegram.org" in host:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _normalize_confidence(value: Any) -> float:
|
||||
try:
|
||||
v = float(value)
|
||||
except Exception:
|
||||
return 0.0
|
||||
if v < 0:
|
||||
return 0.0
|
||||
# Some backends return percentages (e.g. 97.6) instead of 0..1.
|
||||
if v > 1.0 and v <= 100.0:
|
||||
v = v / 100.0
|
||||
if v > 1.0:
|
||||
v = 1.0
|
||||
return v
|
||||
|
||||
async def execute_tool(
|
||||
self,
|
||||
@@ -709,6 +855,14 @@ class ToolManager:
|
||||
return await self._web_search(arguments)
|
||||
elif tool_name == "web_extract":
|
||||
return await self._web_extract(arguments)
|
||||
elif tool_name == "plantnet_lookup":
|
||||
return await self._plantnet_lookup(arguments)
|
||||
elif tool_name == "nature_id_identify":
|
||||
return await self._nature_id_identify(arguments)
|
||||
elif tool_name == "gbif_species_lookup":
|
||||
return await self._gbif_species_lookup(arguments)
|
||||
elif tool_name == "agrovoc_lookup":
|
||||
return await self._agrovoc_lookup(arguments)
|
||||
elif tool_name == "image_generate":
|
||||
return await self._image_generate(arguments)
|
||||
elif tool_name == "comfy_generate_image":
|
||||
@@ -2530,6 +2684,272 @@ class ToolManager:
|
||||
except Exception as e:
|
||||
return ToolResult(success=False, result=None, error=str(e))
|
||||
|
||||
async def _plantnet_lookup(self, args: Dict) -> ToolResult:
    """Plant identification via Pl@ntNet API (skeleton adapter).

    Three-tier fallback chain, in order:
      1. Pl@ntNet HTTP API — only when an image_url AND PLANTNET_API_KEY exist.
      2. Self-hosted nature-id service — when any image source is available.
      3. GBIF species lookup — text-only, when a query string was supplied.

    Args:
        args: Tool arguments; recognizes query, image_url, image_data, organ,
            top_k, and the internal _runtime_image_data carrier.

    Returns:
        ToolResult with a human-readable candidate list, or an error result
        when no backend could serve the request.
    """
    query = str(args.get("query", "") or "").strip()
    image_url = str(args.get("image_url", "") or "").strip()
    image_data = str(args.get("image_data", "") or "").strip()
    # Runtime-injected photo bytes (data URL) take over when the caller gave
    # no explicit image_data.
    runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
    if not image_data and self._is_image_data_url(runtime_image_data):
        image_data = runtime_image_data
    organ = str(args.get("organ", "auto") or "auto").strip().lower()
    # Clamped to 1..5 here (tool schema advertises up to 10 — intentional cap).
    top_k = max(1, min(int(args.get("top_k", 3)), 5))

    api_key = (os.getenv("PLANTNET_API_KEY") or "").strip()
    # Tier 1: real Pl@ntNet API — needs both a fetchable URL and an API key.
    if image_url and api_key:
        try:
            params = {
                "api-key": api_key,
                "images": image_url,
                # Pl@ntNet requires a concrete organ; "leaf" is the default guess.
                "organs": "leaf" if organ == "auto" else organ,
                "lang": "en",
            }
            resp = await self.http_client.get(
                "https://my-api.plantnet.org/v2/identify/all",
                params=params,
                timeout=25.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                results = (data.get("results") or [])[:top_k]
                if not results:
                    return ToolResult(success=True, result="Pl@ntNet: кандидатів не знайдено.")
                lines = []
                for idx, item in enumerate(results, 1):
                    species = (item.get("species") or {})
                    sname = species.get("scientificNameWithoutAuthor") or species.get("scientificName") or "unknown"
                    common = species.get("commonNames") or []
                    cname = common[0] if common else "-"
                    score = float(item.get("score") or 0.0)
                    lines.append(f"{idx}. {sname} ({cname}) score={score:.3f}")
                return ToolResult(success=True, result="Pl@ntNet candidates:\n" + "\n".join(lines))
            # Non-200 from Pl@ntNet ends the chain here (no nature-id fallback).
            return ToolResult(success=False, result=None, error=f"plantnet_http_{resp.status_code}")
        except Exception as e:
            return ToolResult(success=False, result=None, error=f"plantnet_error: {e}")

    # Tier 2: self-hosted nature-id backend; used when we have any image.
    if image_url or image_data:
        ni_args: Dict[str, Any] = {"top_k": top_k}
        # Prefer inline image bytes over a URL when both are present.
        if image_data:
            ni_args["image_data"] = image_data
        else:
            ni_args["image_url"] = image_url
        if runtime_image_data:
            ni_args["_runtime_image_data"] = runtime_image_data
        ni = await self._nature_id_identify(ni_args)
        if ni.success:
            return ni
        # On nature-id failure we fall through to the text-based GBIF lookup.

    # Tier 3: text-only taxonomy lookup via GBIF.
    if query:
        return await self._gbif_species_lookup({"query": query, "limit": top_k})

    return ToolResult(
        success=False,
        result=None,
        error="No available plant ID backend (set PLANTNET_API_KEY or NATURE_ID_URL, or provide text query)",
    )
|
||||
|
||||
async def _nature_id_identify(self, args: Dict) -> ToolResult:
    """Open-source plant identification via self-hosted nature-id compatible endpoint.

    args keys:
        image_url: direct URL of the photo to classify.
        image_data: ``data:`` URL (base64-encoded image); takes priority over image_url.
        _runtime_image_data: internal fallback payload injected by the caller.
        top_k: number of candidates to keep, clamped to [1, 10].
        min_confidence: threshold below which a GBIF text fallback is recommended
            (defaults to env NATURE_ID_MIN_CONFIDENCE, then 0.65).

    Returns a ToolResult whose ``result`` is a JSON string with the candidate
    list, top-1 confidence and fallback hints, or an error ToolResult on any
    validation/transport failure.
    """
    image_url = str(args.get("image_url", "") or "").strip()
    image_data = str(args.get("image_data", "") or "").strip()
    runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
    # Prefer a runtime-provided data URL when the tool call did not include one.
    if not image_data and self._is_image_data_url(runtime_image_data):
        image_data = runtime_image_data
    top_k = max(1, min(int(args.get("top_k", 3)), 10))
    min_confidence = float(args.get("min_confidence", os.getenv("NATURE_ID_MIN_CONFIDENCE", "0.65")))

    # Page/preview URLs (non-direct images) cannot be fetched by the backend;
    # swap in image_data when possible, otherwise fail with guidance.
    if image_url and self._is_known_non_direct_image_url(image_url):
        if image_data:
            logger.info("nature_id_identify: replacing non-direct image_url with runtime image_data")
            image_url = ""
        else:
            return ToolResult(
                success=False,
                result=None,
                error="image_url is not direct image URL; provide image_data or direct Telegram file URL",
            )

    if not image_url and not image_data:
        return ToolResult(success=False, result=None, error="image_url or image_data is required")

    base = (os.getenv("NATURE_ID_URL") or "").strip().rstrip("/")
    if not base:
        return ToolResult(success=False, result=None, error="NATURE_ID_URL is not configured")

    try:
        if image_data:
            # data URL -> multipart /identify-file
            if not image_data.startswith("data:") or "," not in image_data:
                return ToolResult(success=False, result=None, error="invalid image_data format")
            header, b64 = image_data.split(",", 1)
            mime = "image/jpeg"
            if ";base64" in header:
                mime = header.split(":", 1)[1].split(";", 1)[0] or "image/jpeg"
            ext = "jpg"
            if "png" in mime:
                ext = "png"
            try:
                image_bytes = base64.b64decode(b64)
            except Exception:
                return ToolResult(success=False, result=None, error="invalid image_data base64")
            files = {"file": (f"upload.{ext}", image_bytes, mime)}
            resp = await self.http_client.post(
                f"{base}/identify-file",
                params={"top_k": top_k},
                files=files,
                timeout=45.0,
            )
        else:
            payload = {"image_url": image_url, "top_k": top_k}
            resp = await self.http_client.post(f"{base}/identify", json=payload, timeout=45.0)

        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"nature_id_http_{resp.status_code}")

        data = resp.json() or {}
        status = str(data.get("status") or "success")
        # Two response shapes are supported: a "top_k" list (nature-id CLI
        # style) or "predictions"/"results" from other compatible backends.
        # NOTE(review): exact backend schema assumed from these key names —
        # confirm against the deployed NATURE_ID_URL service.
        raw_top_k = data.get("top_k") or []
        raw_preds = data.get("predictions") or data.get("results") or []

        top_k_rows = []
        if isinstance(raw_top_k, list) and raw_top_k:
            for row in raw_top_k[:top_k]:
                if not isinstance(row, dict):
                    continue
                conf = row.get("confidence", 0.0)
                conf_f = self._normalize_confidence(conf)
                top_k_rows.append({
                    "confidence": conf_f,
                    "name": str(row.get("name") or row.get("scientific_name") or "unknown"),
                    "scientific_name": str(row.get("scientific_name") or row.get("name") or "unknown"),
                })
        else:
            for item in raw_preds[:top_k]:
                if not isinstance(item, dict):
                    continue
                score = item.get("score", item.get("confidence", 0.0))
                score_f = self._normalize_confidence(score)
                sname = item.get("scientific_name") or item.get("label") or item.get("name") or "unknown"
                cname = item.get("common_name") or item.get("common") or sname
                top_k_rows.append({
                    "confidence": score_f,
                    "name": str(cname),
                    "scientific_name": str(sname),
                })

        if not top_k_rows:
            # Successful call, no usable predictions: report and recommend fallback.
            return ToolResult(success=True, result=json.dumps({
                "status": status,
                "model": data.get("model") or "aiy_plants_V1",
                "source": data.get("source") or "nature-id-cli",
                "top_k": [],
                "confidence": 0.0,
                "recommend_fallback": True,
                "reason": "no_predictions",
            }, ensure_ascii=False))

        top1 = top_k_rows[0]
        top1_conf = float(top1.get("confidence", 0.0))
        recommend_fallback = top1_conf < min_confidence

        out = {
            "status": status,
            "model": data.get("model") or "aiy_plants_V1",
            "source": data.get("source") or "nature-id-cli",
            "inference_time_sec": data.get("inference_time_sec"),
            "top_k": top_k_rows,
            "confidence": top1_conf,
            "min_confidence": min_confidence,
            "recommend_fallback": recommend_fallback,
            "fallback": "gbif_species_lookup",
        }

        # Low-confidence top-1: try to corroborate the name via GBIF and
        # attach the validation text for the caller to show.
        if recommend_fallback:
            fallback_query = str(top1.get("scientific_name") or top1.get("name") or "").strip()
            if fallback_query and fallback_query.lower() != "unknown":
                gbif = await self._gbif_species_lookup({"query": fallback_query, "limit": min(5, top_k)})
                if gbif.success and gbif.result:
                    out["gbif_validation"] = gbif.result

        return ToolResult(success=True, result=json.dumps(out, ensure_ascii=False))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"nature_id_error: {e}")
|
||||
|
||||
async def _gbif_species_lookup(self, args: Dict) -> ToolResult:
    """Species lookup via the public GBIF species-search API.

    args keys:
        query: free-text species name to look up (required).
        limit: maximum number of matches, clamped to [1, 10].

    Returns a ToolResult with a numbered plain-text list of matches, or an
    error ToolResult on HTTP/transport failure.
    """
    query = str(args.get("query", "") or "").strip()
    limit = max(1, min(int(args.get("limit", 5)), 10))
    if not query:
        return ToolResult(success=False, result=None, error="query is required")

    def _fmt(pos, hit):
        # One human-readable line per GBIF match.
        sci = hit.get("scientificName") or hit.get("canonicalName") or "unknown"
        rank = hit.get("rank") or "-"
        status = hit.get("taxonomicStatus") or "-"
        key = hit.get("key")
        return f"{pos}. {sci} | rank={rank} | status={status} | key={key}"

    try:
        resp = await self.http_client.get(
            "https://api.gbif.org/v1/species/search",
            params={"q": query, "limit": limit, "status": "ACCEPTED"},
            timeout=20.0,
        )
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"gbif_http_{resp.status_code}")

        matches = (resp.json() or {}).get("results") or []
        if not matches:
            return ToolResult(success=True, result="GBIF: результатів не знайдено.")

        lines = [_fmt(pos, hit) for pos, hit in enumerate(matches[:limit], 1)]
        return ToolResult(success=True, result="GBIF matches:\n" + "\n".join(lines))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"gbif_error: {e}")
|
||||
|
||||
async def _agrovoc_lookup(self, args: Dict) -> ToolResult:
    """AGROVOC term normalization via the public FAO SPARQL endpoint.

    args keys:
        query: term to normalize (required).
        lang: label language, one of en/uk/ru; anything else falls back to en.
        limit: maximum number of matches, clamped to [1, 10].
    """
    query = str(args.get("query", "") or "").strip()
    lang = str(args.get("lang", "en") or "en").strip().lower()
    limit = max(1, min(int(args.get("limit", 5)), 10))
    if not query:
        return ToolResult(success=False, result=None, error="query is required")
    if lang not in {"en", "uk", "ru"}:
        lang = "en"

    # Remove characters that could break out of the quoted SPARQL literal.
    safe_q = query.replace('\\', ' ').replace('"', ' ').strip()
    sparql = (
        "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
        "SELECT ?concept ?label WHERE { "
        "?concept skos:prefLabel ?label . "
        f"FILTER(lang(?label) = '{lang}') "
        f"FILTER(CONTAINS(LCASE(STR(?label)), LCASE(\"{safe_q}\"))) "
        "} LIMIT " + str(limit)
    )

    def _fmt(pos, binding):
        # Each SPARQL binding carries a prefLabel and the concept URI.
        label = ((binding.get("label") or {}).get("value") or "").strip()
        concept = ((binding.get("concept") or {}).get("value") or "").strip()
        return f"{pos}. {label} | {concept}"

    try:
        resp = await self.http_client.get(
            "https://agrovoc.fao.org/sparql",
            params={"query": sparql, "format": "json"},
            timeout=25.0,
        )
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"agrovoc_http_{resp.status_code}")

        body = resp.json() or {}
        bindings = ((body.get("results") or {}).get("bindings")) or []
        if not bindings:
            return ToolResult(success=True, result="AGROVOC: результатів не знайдено.")

        lines = [_fmt(pos, b) for pos, b in enumerate(bindings[:limit], 1)]
        return ToolResult(success=True, result="AGROVOC matches:\n" + "\n".join(lines))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"agrovoc_error: {e}")
|
||||
|
||||
async def _unload_ollama_models(self):
|
||||
"""Unload all Ollama models to free VRAM for heavy operations like FLUX"""
|
||||
ollama_url = os.getenv("OLLAMA_BASE_URL", "http://172.18.0.1:11434")
|
||||
@@ -2942,7 +3362,11 @@ class ToolManager:
|
||||
|
||||
if results:
|
||||
result = results[0] if isinstance(results, list) else results
|
||||
markdown = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
|
||||
raw_content = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
|
||||
if isinstance(raw_content, (dict, list, tuple)):
|
||||
markdown = json.dumps(raw_content, ensure_ascii=False)
|
||||
else:
|
||||
markdown = str(raw_content or "")
|
||||
title = result.get("title", url)
|
||||
|
||||
if len(markdown) > 3000:
|
||||
@@ -2951,13 +3375,30 @@ class ToolManager:
|
||||
response_parts = [f"**{title}**", "", markdown]
|
||||
|
||||
if extract_links:
|
||||
links = result.get("links", [])
|
||||
if links:
|
||||
links_raw = result.get("links", [])
|
||||
normalized_links: List[Any] = []
|
||||
if isinstance(links_raw, dict):
|
||||
for bucket in links_raw.values():
|
||||
if isinstance(bucket, list):
|
||||
normalized_links.extend(bucket)
|
||||
elif bucket:
|
||||
normalized_links.append(bucket)
|
||||
elif isinstance(links_raw, list):
|
||||
normalized_links = links_raw
|
||||
elif links_raw:
|
||||
normalized_links = [links_raw]
|
||||
|
||||
if normalized_links:
|
||||
response_parts.append("")
|
||||
response_parts.append("**Посилання:**")
|
||||
for link in links[:10]:
|
||||
for link in normalized_links[:10]:
|
||||
if isinstance(link, dict):
|
||||
link_url = link.get("href", "")
|
||||
link_url = (
|
||||
link.get("href")
|
||||
or link.get("url")
|
||||
or link.get("link")
|
||||
or ""
|
||||
)
|
||||
else:
|
||||
link_url = str(link)
|
||||
if link_url:
|
||||
|
||||
@@ -11,10 +11,13 @@ import os
|
||||
import asyncio
|
||||
import logging
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
from typing import Optional, Dict, List, Any, Union
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
def _csv_to_markdown(content: bytes) -> str:
    """Decode CSV bytes and render them as a markdown table."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines()))
    return _rows_to_markdown(parsed)
|
||||
|
||||
|
||||
def _tsv_to_markdown(content: bytes) -> str:
    """Decode tab-separated bytes and render them as a markdown table."""
    decoded = _decode_text_bytes(content)
    parsed = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed)
|
||||
|
||||
|
||||
def _rows_to_markdown(rows: List[List[Any]]) -> str:
|
||||
if not rows:
|
||||
return ""
|
||||
header = rows[0]
|
||||
body = rows[1:]
|
||||
width = max(len(r) for r in rows)
|
||||
norm_rows = []
|
||||
for r in rows:
|
||||
rr = [str(c) if c is not None else "" for c in r]
|
||||
if len(rr) < width:
|
||||
rr.extend([""] * (width - len(rr)))
|
||||
norm_rows.append(rr)
|
||||
header = norm_rows[0]
|
||||
body = norm_rows[1:]
|
||||
lines = [
|
||||
"| " + " | ".join(header) + " |",
|
||||
"| " + " | ".join(["---"] * len(header)) + " |",
|
||||
]
|
||||
for row in body:
|
||||
lines.append("| " + " | ".join(row) + " |")
|
||||
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy .xls workbook to markdown, one table per sheet."""
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")

    workbook = xlrd.open_workbook(file_contents=content)
    parts = []
    for sheet in workbook.sheets():
        parts.append(f"## Sheet: {sheet.name}")
        grid = [
            [sheet.cell_value(row_idx, col_idx) for col_idx in range(sheet.ncols)]
            for row_idx in range(sheet.nrows)
        ]
        # Render a placeholder for sheets with no rows at all.
        parts.append(_rows_to_markdown(grid) if grid else "_Empty sheet_")
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _ods_to_markdown(content: bytes) -> str:
    """Convert an OpenDocument spreadsheet (.ods) to markdown tables.

    Each sheet becomes a "## Sheet: <name>" heading followed by a markdown
    table built by _rows_to_markdown; sheets with no non-empty rows get a
    placeholder line.

    Raises:
        HTTPException(500): odfpy is not installed.
        HTTPException(400): the bytes are not a loadable ODS document.
    """
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")

    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")

    parts = []
    for table in doc.spreadsheet.getElementsByType(Table):
        table_name = str(table.getAttribute("name") or "Sheet")
        parts.append(f"## Sheet: {table_name}")
        rows: List[List[str]] = []
        for row in table.getElementsByType(TableRow):
            cells_out: List[str] = []
            for cell in row.getElementsByType(TableCell):
                # Collect the plain text of every paragraph in the cell;
                # only text nodes (those with a .data attribute) contribute.
                txt_parts = []
                for p in cell.getElementsByType(P):
                    txt_parts.extend(
                        [str(getattr(node, "data", "")).strip() for node in p.childNodes if getattr(node, "data", None)]
                    )
                cell_text = " ".join([t for t in txt_parts if t]).strip()
                # ODS collapses runs of identical cells via the
                # number-columns-repeated attribute; expand it here, capped
                # at 100 repeats to bound output size for sparse sheets.
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                repeat = max(1, min(repeat, 100))
                for _ in range(repeat):
                    cells_out.append(cell_text)
            # Rows with no cells at all are dropped entirely.
            if cells_out:
                rows.append(cells_out)
        if not rows:
            parts.append("_Empty sheet_")
            continue
        parts.append(_rows_to_markdown(rows))
    return "\n\n".join(parts)
||||
|
||||
|
||||
def _docx_to_text(content: bytes) -> str:
|
||||
try:
|
||||
from docx import Document
|
||||
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
|
||||
return "\n\n".join(text_content)
|
||||
|
||||
|
||||
def _pptx_to_text(content: bytes) -> str:
    """Extract the text of every slide in a .pptx deck as markdown-ish text."""
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")

    deck = Presentation(BytesIO(content))
    parts = []
    for slide_no, slide in enumerate(deck.slides, start=1):
        parts.append(f"## Slide {slide_no}")
        texts = []
        for shape in slide.shapes:
            # Not every shape type carries text; skip those that don't.
            raw = getattr(shape, "text", None)
            if raw and str(raw).strip():
                texts.append(str(raw).strip())
        parts.extend(texts if texts else ["_No text on this slide_"])
    return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON bytes; fall back to the raw decoded text on parse errors."""
    raw = _decode_text_bytes(content)
    try:
        return json.dumps(json.loads(raw), ensure_ascii=False, indent=2)
    except Exception:
        return raw
||||
|
||||
|
||||
def _yaml_to_text(content: bytes) -> str:
    """Round-trip YAML bytes through safe_load/safe_dump; raw text on failure."""
    raw = _decode_text_bytes(content)
    try:
        return yaml.safe_dump(yaml.safe_load(raw), allow_unicode=True, sort_keys=False)
    except Exception:
        return raw
||||
|
||||
|
||||
def _xml_to_text(content: bytes) -> str:
    """Flatten XML bytes to whitespace-joined text content; raw text on failure."""
    raw = _decode_text_bytes(content)
    try:
        fragments = [chunk.strip() for chunk in ET.fromstring(raw).itertext() if chunk and chunk.strip()]
        return " ".join(fragments) or raw
    except Exception:
        return raw
||||
|
||||
|
||||
def _html_to_text(content: bytes) -> str:
    """Extract readable text from HTML; regex-strip tags when bs4 is absent."""
    raw = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup

        extracted = BeautifulSoup(raw, "html.parser").get_text(separator="\n")
        extracted = re.sub(r"\n{3,}", "\n\n", extracted)
        return extracted.strip() or raw
    except Exception:
        # Minimal fallback if bs4 is unavailable
        stripped = re.sub(r"<[^>]+>", " ", raw)
        stripped = re.sub(r"\s+", " ", stripped)
        return stripped.strip()
||||
|
||||
|
||||
def _rtf_to_text(content: bytes) -> str:
    """Convert RTF bytes to plain text via striprtf, with a crude regex fallback."""
    raw = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text
        return rtf_to_text(raw)
    except Exception:
        # Basic fallback: strip common RTF control tokens
        stripped = re.sub(r"\\'[0-9a-fA-F]{2}", " ", raw)
        stripped = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", stripped)
        stripped = stripped.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", stripped).strip()
||||
|
||||
|
||||
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """Dispatch raw file bytes to the extractor matching the filename extension.

    Returns markdown (tabular/office formats) or plain text depending on the
    extractor.

    Fix: the block contained unresolved merge residue — the pre-merge
    conditions (`if ext in ["txt", "md"]:` and a body-less `if ext == "xlsx":`)
    were left in place next to their post-merge replacements, making the
    function syntactically invalid. This keeps only the merged dispatch table.

    Raises:
        HTTPException(400): no extractor is registered for the extension.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    if ext in {"txt", "md", "markdown"}:
        return _decode_text_bytes(content)
    if ext == "csv":
        return _csv_to_markdown(content)
    if ext == "tsv":
        return _tsv_to_markdown(content)
    if ext in {"xlsx", "xlsm"}:
        return _xlsx_to_markdown(content)
    if ext == "xls":
        return _xls_to_markdown(content)
    if ext == "ods":
        return _ods_to_markdown(content)
    if ext == "docx":
        return _docx_to_text(content)
    if ext == "pdf":
        return _pdf_to_text(content)
    if ext == "pptx":
        return _pptx_to_text(content)
    if ext == "json":
        return _json_to_text(content)
    if ext in {"yaml", "yml"}:
        return _yaml_to_text(content)
    if ext == "xml":
        return _xml_to_text(content)
    if ext in {"html", "htm"}:
        return _html_to_text(content)
    if ext == "rtf":
        return _rtf_to_text(content)
    raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
|
||||
|
||||
|
||||
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
|
||||
if total_size > max_total_mb * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
|
||||
parts = []
|
||||
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
|
||||
allowed_exts = {
|
||||
"txt", "md", "markdown", "csv", "tsv",
|
||||
"xls", "xlsx", "xlsm", "ods",
|
||||
"docx", "pdf", "pptx",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
}
|
||||
processed = []
|
||||
skipped = []
|
||||
for member in members:
|
||||
@@ -1655,7 +1837,8 @@ async def document_endpoint(
|
||||
- json: Structured JSON with document elements
|
||||
- text: Plain text extraction
|
||||
|
||||
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
|
||||
Supported files:
|
||||
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
|
||||
"""
|
||||
try:
|
||||
import time
|
||||
@@ -1672,15 +1855,28 @@ async def document_endpoint(
|
||||
filename = file.filename if file else "document"
|
||||
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
|
||||
|
||||
# Handle text-based formats without Docling
|
||||
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
|
||||
# Handle deterministic extraction for standard office/text formats
|
||||
if file_ext in [
|
||||
"txt", "md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
"pptx", "zip",
|
||||
]:
|
||||
try:
|
||||
if file_ext == "zip":
|
||||
content = _zip_to_markdown(doc_data)
|
||||
output_format = "markdown"
|
||||
else:
|
||||
content = _extract_text_by_ext(filename, doc_data)
|
||||
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
|
||||
output_format = (
|
||||
"markdown"
|
||||
if file_ext in {
|
||||
"md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
|
||||
}
|
||||
else "text"
|
||||
)
|
||||
processing_time_ms = (time.time() - start_time) * 1000
|
||||
return {
|
||||
"success": True,
|
||||
@@ -1764,22 +1960,27 @@ async def document_endpoint(
|
||||
"device": swapper.device
|
||||
}
|
||||
|
||||
# For DOCX, try python-docx
|
||||
if file_ext == "docx":
|
||||
# For common office/text formats, try deterministic extractors.
|
||||
if file_ext in {
|
||||
"docx", "txt", "md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
}:
|
||||
try:
|
||||
content = _docx_to_text(doc_data)
|
||||
content = _extract_text_by_ext(filename, doc_data)
|
||||
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
|
||||
return {
|
||||
"success": True,
|
||||
"model": "python-docx (fallback)",
|
||||
"output_format": "text",
|
||||
"model": "text-extract (fallback)",
|
||||
"output_format": out_fmt,
|
||||
"result": content,
|
||||
"filename": filename,
|
||||
"processing_time_ms": (time.time() - start_time) * 1000,
|
||||
"device": swapper.device
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"DOCX fallback failed: {e}")
|
||||
raise HTTPException(status_code=500, detail="DOCX extraction failed")
|
||||
logger.error(f"Text fallback failed for .{file_ext}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
|
||||
|
||||
# For PDFs, try pdfplumber
|
||||
if file_ext == "pdf":
|
||||
@@ -1807,7 +2008,7 @@ async def document_endpoint(
|
||||
# For other documents, return error
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
|
||||
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
|
||||
)
|
||||
|
||||
finally:
|
||||
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8890)
|
||||
|
||||
|
||||
@@ -4,6 +4,15 @@ httpx==0.25.2
|
||||
pydantic==2.5.0
|
||||
pyyaml==6.0.1
|
||||
python-multipart==0.0.6
|
||||
chardet>=5.2.0
|
||||
openpyxl>=3.1.2
|
||||
python-docx>=1.1.2
|
||||
pdfplumber>=0.11.0
|
||||
python-pptx>=0.6.23
|
||||
xlrd>=2.0.1
|
||||
odfpy>=1.4.1
|
||||
beautifulsoup4>=4.12.0
|
||||
striprtf>=0.0.26
|
||||
|
||||
# HuggingFace dependencies for OCR models
|
||||
torch>=2.0.0
|
||||
@@ -25,4 +34,4 @@ safetensors>=0.4.0
|
||||
|
||||
# Web Scraping & Search
|
||||
trafilatura>=1.6.0
|
||||
duckduckgo-search>=4.0.0
|
||||
duckduckgo-search>=4.0.0
|
||||
|
||||
@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
|
||||
python-docx>=1.1.0
|
||||
openpyxl>=3.1.2
|
||||
chardet>=5.2.0
|
||||
python-pptx>=0.6.23
|
||||
xlrd>=2.0.1
|
||||
odfpy>=1.4.1
|
||||
beautifulsoup4>=4.12.0
|
||||
striprtf>=0.0.26
|
||||
|
||||
6
third_party/nature-id/.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.csv
|
||||
*.tflite
|
||||
*.zip
|
||||
10
third_party/nature-id/LICENSE
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020, joergmlpts
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
372
third_party/nature-id/README.md
vendored
Normal file
@@ -0,0 +1,372 @@
|
||||
# Identify Plants, Birds, and Insects in Photos
|
||||
|
||||
This repository provides Python code that identifies plants, birds, and insects in photos.
|
||||
|
||||
This project was inspired by the amazing progress in identifying plants, animals and mushrooms in photos that has been made by [iNaturalist](https://iNaturalist.org) in recent years in identifying plants, animals, and fungi from photographs. The iNaturalist team has trained machine learning models with their large collection of photos and research-grade identifications. In 2019, iNaturalist released [Seek by iNaturalist](https://www.inaturalist.org/pages/seek_app) which identifies photos offline on the phone and identifies to a higher level than species when a species identification cannot be made.
|
||||
|
||||
Google provides three models that have been trained with iNaturalist data - classification models for plants, birds, and insects. These Google models can be downloaded and used with Google's `TensorFlow` and `TensorFlow Lite` tools.
|
||||
|
||||
This code is based on the trained models provided by Google. It was written to experiment with identifying species from photos and to try out Seek's approach to calculating scores (probabilities) across the taxonomic hierarchy.
|
||||
|
||||
This tool `nature_id.py` has been tested on Linux and Windows. It should also work on MacOS.
|
||||
|
||||
## Usage
|
||||
|
||||
This is a command-line tool. It is invoked with images or directories containing images and identifies the plants, birds, and insects in those images.
|
||||
|
||||
Here is an example. This is the command for Linux and macOS:
|
||||
|
||||
```
|
||||
./nature_id.py -m plants plant_images/Persicaria_amphibia.jpg
|
||||
```
|
||||
|
||||
On Windows the command is:
|
||||
|
||||
```
|
||||
python .\nature_id.py -m plants plant_images\Persicaria_amphibia.jpg
|
||||
```
|
||||
|
||||

|
||||
|
||||
The above image results in this identification:
|
||||
```
|
||||
Classification of 'plant_images/Persicaria_amphibia.jpg' took 0.2 secs.
|
||||
100.0% kingdom Plants (Plantae)
|
||||
100.0% phylum Tracheophytes (Tracheophyta)
|
||||
100.0% subphylum Flowering Plants (Angiospermae)
|
||||
99.6% class Dicots (Magnoliopsida)
|
||||
99.2% order Pinks, Cactuses, and Allies (Caryophyllales)
|
||||
98.8% family Knotweed Family (Polygonaceae)
|
||||
98.8% subfamily Polygonoideae
|
||||
98.8% tribe Persicarieae
|
||||
98.8% subtribe Persicariinae
|
||||
98.8% genus Smartweeds (Persicaria)
|
||||
97.6% species Water Smartweed (Persicaria amphibia)
|
||||
```
|
||||
|
||||
These scores can be used to guide identification: define a threshold and report as result the taxon with the lowest score that is larger than or equal to this threshold. In this example for a threshold of 95% an identification to species *Persicaria amphibia* has been achieved. For a threshold of 99%, this is only an identification to order *Caryophyllales*. 95% and 99% would be unusually high thresholds; Seek, I believe, uses a threshold of 70%.
|
||||
|
||||
## Command-line Options
|
||||
|
||||
This script is a command-line utility. It is called with options, filenames and directory names as arguments. These options are supported:
|
||||
|
||||
```
|
||||
usage: nature_id.py [-h] [-m MODEL] [-a] [-l] [-s] [-r RESULT_SIZE] file/directory [file/directory ...]
|
||||
|
||||
positional arguments:
|
||||
file/directory Image files or directories with images.
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
-m MODEL, --model MODEL
|
||||
Model to load to identify organisms.
|
||||
-a, --all_common_names
|
||||
Show all common names and not just one.
|
||||
-l, --label_scores_only
|
||||
Compute and display only label scores, do not propagate scores up the hierarchy.
|
||||
-s, --scientific_names_only
|
||||
Only use scientific names, do not load common names.
|
||||
-r RESULT_SIZE, --result_size RESULT_SIZE
|
||||
Number of labels and their scores to report in results.
|
||||
```
|
||||
|
||||
### Option -m MODEL, --model MODEL
|
||||
|
||||
The `-m` and `--model` options select a classification model. Possible models are `plants`, `birds`, and `insects`. These models must be installed in the `classifiers` directory. This option is required if more than one classifier is installed.
|
||||
|
||||
### Option -a, --all_common_names
|
||||
|
||||
The `-a` and `--all_common_names` options cause all common names to be displayed, not just one. Multiple common names are separated by semicolons. The output with this option looks like this:
|
||||
|
||||

|
||||
|
||||
```
|
||||
Classification of 'plant_images/Phyla_nodiflora.jpg' took 0.2 secs.
|
||||
100.0% kingdom Plants; Flora; Green Plants; Greenery; Foliage; Vegetation; Salpichlaena Papyrus; Trees; Bushes; Shrubs; Vines (Plantae)
|
||||
100.0% phylum Tracheophytes; Seed Plants; Vascular Plants (Tracheophyta)
|
||||
100.0% subphylum Flowering Plants; Angiosperms; Flowers; Basal Angiosperms; True Dicotyledons; Basal True Dicots; Rose Dicots; Daisy Dicots (Angiospermae)
|
||||
100.0% class Dicots; Dicots; Dicotyledons; Eudicots (Magnoliopsida)
|
||||
98.2% order Mints, Plantains, Olives, and Allies (Lamiales)
|
||||
97.4% family Verbena Family; Lantanas (Verbenaceae)
|
||||
97.4% tribe Lantaneae
|
||||
85.5% genus Frogfruits; Fogfruits (Phyla)
|
||||
85.5% species Turkey Tangle; Lippia; Common Lippia; Turkey Tangle Frogfruit; Sawtooth Fogfruit; Carpet Weed; Roundleaf Frogfruit; Texas Frogfruit; Cape Weed; Sawtooth Frogfruit; Lipia; Turkey Tangle Fogfruit; Daisy Lawn; Fog Grass (Phyla nodiflora)
|
||||
```
|
||||
|
||||
### Option -l, --label_scores_only
|
||||
|
||||
The `-l` and `--label_scores_only` options switch from the taxonomic hierarchy view to a flat list of labels and their scores. The output with this option looks like this:
|
||||
|
||||

|
||||
|
||||
```
|
||||
Classification of 'plant_images/Solidago_velutina_ssp_californica.jpg' took 0.2 secs.
|
||||
86.1% Canada Goldenrod (Solidago canadensis)
|
||||
9.8% Late Goldenrod (Solidago altissima)
|
||||
1.6% Flat-Topped Goldenrod (Euthamia graminifolia)
|
||||
1.2% Northern Seaside Goldenrod (Solidago sempervirens)
|
||||
0.4% Stiff-Leaved Goldenrod (Solidago rigida)
|
||||
```
|
||||
|
||||
Five labels with decreasing scores are shown by default. The `-r` and `--result_size` options can be used to request fewer or more labels.
|
||||
|
||||
### Option -s, --scientific_names_only
|
||||
|
||||
The `-s` and `--scientific_names_only` options disable common names; only the scientific names are displayed. The output with this option looks like this:
|
||||
|
||||

|
||||
|
||||
```
|
||||
Classification of 'plant_images/Trichostema_lanceolatum.jpg' took 0.2 secs.
|
||||
100.0% kingdom Plantae
|
||||
100.0% phylum Tracheophyta
|
||||
100.0% subphylum Angiospermae
|
||||
100.0% class Magnoliopsida
|
||||
99.6% order Lamiales
|
||||
99.6% family Lamiaceae
|
||||
99.2% subfamily Ajugoideae
|
||||
99.2% genus Trichostema
|
||||
99.2% species Trichostema lanceolatum
|
||||
```
|
||||
|
||||
### Option -r RESULT_SIZE, --result_size RESULT_SIZE
|
||||
|
||||
The `-r` and `--result_size` options modify the number of labels displayed when a flat list of labels is requested with the `-l` or `--label_scores_only` options. The default is 5. Options `-r` and `--result_size` allow you to choose a number between 1 and 100.
|
||||
|
||||
This is an example with 15 labels. The command-line for Linux is
|
||||
```
|
||||
./nature_id.py -m plants -l -r 15 plant_images/Primula_hendersonii.jpg
|
||||
```
|
||||
|
||||

|
||||
|
||||
```
|
||||
Classification of 'plant_images/Primula_hendersonii.jpg' took 0.2 secs.
|
||||
50.4% Henderson's Shooting Star (Primula hendersonii)
|
||||
37.2% Eastern Shooting Star (Primula meadia)
|
||||
2.5% Dark-Throated Shooting Star (Primula pauciflora)
|
||||
1.7% Red Ribbons (Clarkia concinna)
|
||||
1.2% Ruby Chalice Clarkia (Clarkia rubicunda)
|
||||
0.8% Purple Paintbrush (Castilleja purpurea)
|
||||
0.8% Fireweed (Chamaenerion angustifolium)
|
||||
0.4% Western Fairy-Slipper (Calypso bulbosa occidentalis)
|
||||
0.4% Texas Skeleton Plant (Lygodesmia texana)
|
||||
0.4% Rhodora (Rhododendron canadense)
|
||||
0.4% Ragged-Robin (Silene flos-cuculi)
|
||||
0.4% Hemp Dogbane (Apocynum cannabinum)
|
||||
0.4% Garden Cosmos (Cosmos bipinnatus)
|
||||
0.4% Farewell-To-Spring (Clarkia amoena)
|
||||
0.4% Dwarf Fireweed (Chamaenerion latifolium)
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
Several things need to be installed in order for `nature-id.py` to run. Some Python packages are required, classification models need to be downloaded and installed into the `classifiers` directory, and finally the taxonomy and common names need to be downloaded into the `inaturalist-taxonomy` directory.
|
||||
|
||||
### Python Packages
|
||||
|
||||
This code is written in Python 3. Besides Python 3, the packages `Pillow` and `requests` are used to load and process images and to access the iNaturalist API.
|
||||
|
||||
These packages as well as `TensorFlow Lite` can be installed on Ubuntu Linux and other Debian distributions with the command
|
||||
|
||||
```
|
||||
sudo apt install python3-pillow python3-requests
|
||||
pip3 install tflite-runtime
|
||||
```
|
||||
|
||||
and on other platforms with the command
|
||||
|
||||
```
|
||||
pip install Pillow requests tflite-runtime
|
||||
```
|
||||
|
||||
Where appropriate `pip3` should be called instead of `pip` to avoid accidentally installing Python 2 packages.
|
||||
|
||||
|
||||
### Classification Models
|
||||
|
||||
The classification models and their labelmap files have to be downloaded from Kaggle and they go into directory `classifiers`.
|
||||
|
||||
The classifiers can be downloaded from these links:
|
||||
|
||||
* [classifier for plants](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-plants-v1/1)
|
||||
* [classifier for birds](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-birds-v1/1)
|
||||
* [classifier for insects](https://www.kaggle.com/models/google/aiy/tensorFlow1/vision-classifier-insects-v1/1)
|
||||
|
||||
Each classifier consists of a `.tflite` model and a `.csv` labelmap file. Both are required. Click on `Model Variations` under `TensorFlow Lite` to download the TFLite model. Please also note the paragraphs at the bottom of these web pages about appropriate and inappropriate use cases and licensing.
|
||||
|
||||
These are the links to download the labelmaps: [aiy_insects_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_insects_V1_labelmap.csv), [aiy_birds_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_birds_V1_labelmap.csv), and [aiy_plants_V1_labelmap.csv](https://www.gstatic.com/aihub/tfhub/labelmaps/aiy_plants_V1_labelmap.csv). On Windows, the default action for a .csv file may be to open it in Excel; be sure to save the downloaded file to disk.
|
||||
|
||||
### Taxonomy and Common Names Files
|
||||
|
||||
The trained models come with scientific names as labels and many of these scientific names are already outdated. The common names and the current taxonomy are obtained from this file: [https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip](https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip) This tool expects this zip archive in the `inaturalist-taxonomy` directory.
|
||||
|
||||
## Example Images
|
||||
|
||||
Example pictures of plants are provided in the `plant_images` directory. The filenames indicate the species that I think is in the photo. Note that these examples only lead to successful identification to varying degrees. The *Mentzelia lindleyi* is certainly not correctly identified.
|
||||
|
||||
## Messages
|
||||
|
||||
The first call with a model transforms the labels into a taxonomic hierarchy. Each label is replaced with its representation in the current taxonomy and all its ancestors are added. This process takes some time and results in many messages. Once the hierarchy has been successfully computed, it is written to disk. Future calls to `nature_id.py` will load the taxonomic hierarchy from disk instead of reading the labels and computing the taxonomy again.
|
||||
|
||||
This is what the first calls look like. Again, we use the plant model as an example. The bird and insect models are smaller and result in fewer messages.
|
||||
|
||||
```
|
||||
PS C:\nature-id> python .\nature_id.py -m plants .\plant_images
|
||||
Read 2,102 labels from 'classifiers\aiy_plants_V1_labelmap.csv' in 0.0 secs.
|
||||
Loading iNaturalist taxonomy...
|
||||
Loaded iNaturalist taxonomy of 993,552 taxa in 15.2 secs.
|
||||
Info: Taxon for label 'background' not found, inserting as pseudo-kingdom.
|
||||
Info: Taxon 'Eichhornia crassipes' changed to 'Pontederia crassipes', iNat taxa id 962637.
|
||||
Info: Taxon 'Potentilla anserina' changed to 'Argentina anserina', iNat taxa id 158615.
|
||||
Info: Taxon 'Stenosiphon linifolius' changed to 'Oenothera glaucifolia', iNat taxa id 914092.
|
||||
Info: Taxon 'Sophora secundiflora' changed to 'Dermatophyllum secundiflorum', iNat taxa id 499559.
|
||||
Info: Taxon 'Mimulus bigelovii' changed to 'Diplacus bigelovii', iNat taxa id 701989.
|
||||
Info: Taxon 'Botrychium dissectum' changed to 'Sceptridium dissectum', iNat taxa id 122085.
|
||||
Info: Taxon 'Trientalis borealis' changed to 'Lysimachia borealis', iNat taxa id 204174.
|
||||
Info: Taxon 'Hyptis emoryi' changed to 'Condea emoryi', iNat taxa id 489286.
|
||||
Info: Taxon 'Opuntia engelmannii lindheimeri' changed to 'Opuntia lindheimeri', iNat taxa id 119980.
|
||||
Info: Taxon 'Aquilegia caerulea' changed to 'Aquilegia coerulea', iNat taxa id 501742.
|
||||
Info: Taxon 'Fuscospora cliffortioides' changed to 'Nothofagus cliffortioides', iNat taxa id 404204.
|
||||
Info: Taxon 'Cooperia drummondii' changed to 'Zephyranthes chlorosolen', iNat taxa id 554401.
|
||||
Info: Taxon 'Dracopis amplexicaulis' changed to 'Rudbeckia amplexicaulis', iNat taxa id 200073.
|
||||
Info: Taxon 'Dodecatheon meadia' changed to 'Primula meadia', iNat taxa id 549981.
|
||||
Info: Taxon 'Aptenia cordifolia' changed to 'Mesembryanthemum cordifolium', iNat taxa id 589815.
|
||||
Info: Taxon 'Chamerion latifolium' changed to 'Chamaenerion latifolium', iNat taxa id 564970.
|
||||
Info: Taxon 'Echinocereus mojavensis' changed to 'Echinocereus triglochidiatus mojavensis', iNat taxa id 858352.
|
||||
Warning: multiple taxa named 'Aquilegia vulgaris': species 51807, complex 1042772; choosing species.
|
||||
Info: Taxon 'Dodecatheon pulchellum' changed to 'Primula pauciflora', iNat taxa id 498086.
|
||||
Info: Taxon 'Mimulus lewisii' changed to 'Erythranthe lewisii', iNat taxa id 777190.
|
||||
Info: Taxon 'Sambucus nigra canadensis' changed to 'Sambucus canadensis', iNat taxa id 84300.
|
||||
Info: Taxon 'Asyneuma prenanthoides' changed to 'Campanula prenanthoides', iNat taxa id 851072.
|
||||
Info: Taxon 'Anemone quinquefolia' changed to 'Anemonoides quinquefolia', iNat taxa id 950598.
|
||||
Info: Taxon 'Hedypnois cretica' changed to 'Hedypnois rhagadioloides', iNat taxa id 492864.
|
||||
Warning: multiple taxa named 'Achillea millefolium': species 52821, complex 1105043; choosing species.
|
||||
Info: Taxon 'Anagallis arvensis' changed to 'Lysimachia arvensis', iNat taxa id 791928.
|
||||
Info: Taxon 'Hieracium caespitosum' changed to 'Pilosella caespitosa', iNat taxa id 711086.
|
||||
Info: Taxon 'Potentilla anserina pacifica' changed to 'Argentina pacifica', iNat taxa id 524900.
|
||||
Info: Taxon 'Sambucus nigra caerulea' changed to 'Sambucus cerulea', iNat taxa id 143799.
|
||||
Info: Taxon 'Polygala californica' changed to 'Rhinotropis californica', iNat taxa id 876453.
|
||||
Info: Taxon 'Calylophus berlandieri' changed to 'Oenothera berlandieri', iNat taxa id 359779.
|
||||
Info: Taxon 'Mimulus cardinalis' changed to 'Erythranthe cardinalis', iNat taxa id 319974.
|
||||
Info: Taxon 'Callistemon citrinus' changed to 'Melaleuca citrina', iNat taxa id 77976.
|
||||
Info: Taxon 'Liatris mucronata' changed to 'Liatris punctata mucronata', iNat taxa id 371814.
|
||||
Warning: multiple taxa named 'Stellaria media': species 53298, complex 1087592; choosing species.
|
||||
Info: Taxon 'Anemone americana' changed to 'Hepatica americana', iNat taxa id 741014.
|
||||
Info: Taxon 'Anemone occidentalis' changed to 'Pulsatilla occidentalis', iNat taxa id 60482.
|
||||
Info: Taxon 'Orobanche fasciculata' changed to 'Aphyllon fasciculatum', iNat taxa id 802543.
|
||||
Info: Taxon 'Mimulus primuloides' changed to 'Erythranthe primuloides', iNat taxa id 635401.
|
||||
Info: Taxon 'Polygala paucifolia' changed to 'Polygaloides paucifolia', iNat taxa id 497911.
|
||||
Warning: multiple taxa named 'Campanula rotundifolia': species 62312, complex 984576; choosing species.
|
||||
Info: Taxon 'Cissus incisa' changed to 'Cissus trifoliata', iNat taxa id 133333.
|
||||
Info: Taxon 'Schinus terebinthifolius' changed to 'Schinus terebinthifolia', iNat taxa id 130872.
|
||||
Info: Taxon 'Cooperia pedunculata' changed to 'Zephyranthes drummondii', iNat taxa id 120026.
|
||||
Info: Taxon 'Scabiosa atropurpurea' changed to 'Sixalix atropurpurea', iNat taxa id 372376.
|
||||
Info: Taxon 'Sphenosciadium capitellatum' changed to 'Angelica capitellata', iNat taxa id 704166.
|
||||
Info: Taxon 'Trientalis latifolia' changed to 'Lysimachia latifolia', iNat taxa id 496537.
|
||||
Warning: multiple taxa named 'Spiranthes cernua': species 773385, complex 931407; choosing species.
|
||||
Info: Taxon 'Spartina pectinata' changed to 'Sporobolus michauxianus', iNat taxa id 772984.
|
||||
Info: Taxon 'Centaurea americana' changed to 'Plectocephalus americanus', iNat taxa id 699778.
|
||||
Info: Taxon 'Fuscospora solandri' changed to 'Nothofagus solandri', iNat taxa id 70246.
|
||||
Info: Taxon 'Heliotropium tenellum' changed to 'Euploca tenella', iNat taxa id 769888.
|
||||
Info: Taxon 'Blechnum spicant' changed to 'Struthiopteris spicant', iNat taxa id 774894.
|
||||
Info: Taxon 'Fallopia japonica' changed to 'Reynoutria japonica', iNat taxa id 914922.
|
||||
Info: Taxon 'Echinocactus texensis' changed to 'Homalocephala texensis', iNat taxa id 870496.
|
||||
Info: Taxon 'Gaura parviflora' changed to 'Oenothera curtiflora', iNat taxa id 78241.
|
||||
Info: Taxon 'Parentucellia viscosa' changed to 'Bellardia viscosa', iNat taxa id 537967.
|
||||
Info: Taxon 'Anemone nemorosa' changed to 'Anemonoides nemorosa', iNat taxa id 950603.
|
||||
Info: Taxon 'Hieracium aurantiacum' changed to 'Pilosella aurantiaca', iNat taxa id 711103.
|
||||
Info: Taxon 'Anemone hepatica' changed to 'Hepatica nobilis', iNat taxa id 639660.
|
||||
Info: Taxon 'Merremia dissecta' changed to 'Distimake dissectus', iNat taxa id 907480.
|
||||
Info: Taxon 'Anemone canadensis' changed to 'Anemonastrum canadense', iNat taxa id 881527.
|
||||
Info: Taxon 'Chamerion angustifolium' changed to 'Chamaenerion angustifolium', iNat taxa id 564969.
|
||||
Info: Taxon 'Lychnis flos-cuculi' changed to 'Silene flos-cuculi', iNat taxa id 740984.
|
||||
Throttling API calls, sleeping for 44.5 seconds.
|
||||
Info: Taxon 'Ampelopsis brevipedunculata' changed to 'Ampelopsis glandulosa brevipedunculata', iNat taxa id 457553.
|
||||
Info: Taxon 'Anemone acutiloba' changed to 'Hepatica acutiloba', iNat taxa id 179786.
|
||||
Info: Taxon 'Pennisetum setaceum' changed to 'Cenchrus setaceus', iNat taxa id 430581.
|
||||
Info: Taxon 'Mimulus guttatus' changed to 'Erythranthe guttata', iNat taxa id 470643.
|
||||
Info: Taxon 'Blechnum fluviatile' changed to 'Cranfillia fluviatilis', iNat taxa id 700995.
|
||||
Info: Taxon 'Blechnum discolor' changed to 'Lomaria discolor', iNat taxa id 403546.
|
||||
Info: Taxon 'Andropogon gerardii' changed to 'Andropogon gerardi', iNat taxa id 121968.
|
||||
Info: Taxon 'Ferocactus hamatacanthus' changed to 'Hamatocactus hamatacanthus', iNat taxa id 855937.
|
||||
Info: Taxon 'Gaura lindheimeri' changed to 'Oenothera lindheimeri', iNat taxa id 590726.
|
||||
Info: Taxon 'Gaura suffulta' changed to 'Oenothera suffulta', iNat taxa id 521639.
|
||||
Info: Taxon 'Glottidium vesicarium' changed to 'Sesbania vesicaria', iNat taxa id 890511.
|
||||
Info: Taxon 'Acacia farnesiana' changed to 'Vachellia farnesiana', iNat taxa id 79472.
|
||||
Warning: multiple taxa named 'Rubus fruticosus': complex 55911, species 1090496; choosing species.
|
||||
Info: Taxon 'Othocallis siberica' changed to 'Scilla siberica', iNat taxa id 862704.
|
||||
Info: Taxon 'Mimulus aurantiacus' changed to 'Diplacus', iNat taxa id 777236.
|
||||
Info: Taxon 'Phoradendron tomentosum' changed to 'Phoradendron leucarpum', iNat taxa id 49668.
|
||||
Info: Taxon 'Orobanche uniflora' changed to 'Aphyllon uniflorum', iNat taxa id 802714.
|
||||
Info: Taxon 'Rosmarinus officinalis' changed to 'Salvia rosmarinus', iNat taxa id 636795.
|
||||
Info: Taxon 'Cynoglossum grande' changed to 'Adelinia grande', iNat taxa id 769151.
|
||||
Computed taxonomic tree from labels in 64.8 secs: 4,091 taxa including 2,102 leaf taxa.
|
||||
Taxonomy written to file 'classifiers\aiy_plants_V1_taxonomy.csv'.
|
||||
Reading common names from 'inaturalist-taxonomy\inaturalist-taxonomy.dwca.zip' member 'VernacularNames-english.csv'...
|
||||
Read 203,093 common names in 1.5 secs, loaded 3,071 in language "en_US" for 4,091 taxa.
|
||||
```
|
||||
|
||||
### Messages Explained
|
||||
|
||||
```
|
||||
Read 2,102 labels from 'classifiers\aiy_plants_V1_labelmap.csv' in 0.0 secs.
|
||||
```
|
||||
|
||||
`nature-id` reads a label file. If no errors occur, a taxonomy will be written for these labels and further runs will load `classifiers\aiy_plants_V1_taxonomy.csv` instead.
|
||||
|
||||
```
|
||||
Loading iNaturalist taxonomy...
|
||||
Loaded iNaturalist taxonomy of 993,552 taxa in 15.2 secs.
|
||||
```
|
||||
|
||||
The entire iNaturalist taxonomy of about 1 million taxa is loaded. `nature-id` will look up the labels in this taxonomy and insert them, along with all their ancestors, into a taxonomy for the labels.
|
||||
|
||||
```
|
||||
Info: Taxon for label 'background' not found, inserting as pseudo-kingdom.
|
||||
```
|
||||
|
||||
Label `background` was not found. It is not a species, but denotes something else in the Google model. It is treated as a kingdom in the taxonomy; it has no ancestors.
|
||||
|
||||
```
|
||||
Info: Taxon 'Potentilla anserina' changed to 'Argentina anserina', iNat taxa id 158615.
|
||||
```
|
||||
|
||||
In the current taxonomy, this species belongs to a different genus. The numeric ID in this message is useful for getting more information. This number can be prefixed with `https://www.inaturalist.org/taxa/` and opened in a browser: [https://www.inaturalist.org/taxa/158615](https://www.inaturalist.org/taxa/158615).
|
||||
|
||||
```
|
||||
Warning: multiple taxa named 'Achillea millefolium': species 52821, complex 1105043; choosing species.
|
||||
```
|
||||
|
||||
The label name for this common yarrow is not unique, there are several taxa for this scientific name. `nature-id` assumes that the species is the one we want.
|
||||
|
||||
```
|
||||
Throttling API calls, sleeping for 44.5 seconds.
|
||||
```
|
||||
|
||||
This message is followed by 45 seconds of silence. When a name is not found in the current taxonomy, the one previously loaded with about 1 million taxa, then iNaturalist API calls are made to look up the inactive scientific name. The iNaturalist team would like us to throttle API calls to no more than 60 calls per minute. This delay has been implemented to accommodate their request.
|
||||
|
||||
```
|
||||
Info: Taxon 'Mimulus aurantiacus' changed to 'Diplacus', iNat taxa id 777236.
|
||||
```
|
||||
|
||||
The species *Mimulus aurantiacus* in the label file is replaced with the genus *Diplacus* and not with the current species *Diplacus aurantiacus*. This looks like a bug and hence deserves a closer look.
|
||||
|
||||
The reason for this decision of `nature_id` is that *Mimulus aurantiacus* consisted of several varieties *Mimulus aurantiacus aurantiacus*, *Mimulus aurantiacus grandiflorus*, *Mimulus aurantiacus parviflorus*, and 3 more.
|
||||
|
||||
In the current taxonomy, these varieties are species *Diplacus aurantiacus*, *Diplacus grandiflorus*, and *Diplacus parviflorus*. *Diplacus aurantiacus* does not replace *Mimulus aurantiacus*; it replaces the variety *Mimulus aurantiacus aurantiacus*.
|
||||
|
||||
Another way to understand this issue is to realize that photos of all varieties *Mimulus aurantiacus aurantiacus*, *Mimulus aurantiacus grandiflorus*, *Mimulus aurantiacus parviflorus* and the 3 others were used to train the classification model to recognize *Mimulus aurantiacus*. In the current taxonomy, this label is triggered for each of the species *Diplacus aurantiacus*, *Diplacus grandiflorus*, and *Diplacus parviflorus*. `nature_id` cannot say which of current species it sees. It can only identify images as genus *Diplacus*.
|
||||
|
||||
```
|
||||
Taxonomy written to file 'classifiers\aiy_plants_V1_taxonomy.csv'.
|
||||
```
|
||||
|
||||
A taxonomy for the scientific names in the label file has been successfully computed and this taxonomy was written to disk. Future calls will load this taxonomy instead of loading the labels and re-computing the taxonomy.
|
||||
|
||||
```
|
||||
Reading common names from 'inaturalist-taxonomy\inaturalist-taxonomy.dwca.zip' member 'VernacularNames-english.csv'...
|
||||
Read 203,093 common names in 1.5 secs, loaded 3,071 in language "en_US" for 4,091 taxa.
|
||||
```
|
||||
|
||||
Common names have been read. The common names are always selected for the local language, not necessarily for English as shown here.
|
||||
13
third_party/nature-id/classifiers/README.md
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
# Download Instructions
|
||||
|
||||
The [Tensorflow Lite](https://www.tensorflow.org/lite/guide) classifiers that go in this directory can be downloaded from these websites:
|
||||
|
||||
* [classifier for plants](https://tfhub.dev/google/aiy/vision/classifier/plants_V1/1)
|
||||
* [classifier for birds](https://tfhub.dev/google/aiy/vision/classifier/birds_V1/1)
|
||||
* [classifier for insects](https://tfhub.dev/google/aiy/vision/classifier/insects_V1/1)
|
||||
|
||||
Each classifier consists of a `.tflite` model and a `.csv` labelmap file. Both are required.
|
||||
|
||||
On each of the above websites scroll down and under `Output` click on `labelmap` to download the labels. Then scroll back up and under `Model formats` switch to `TFLite (aiyvision/classifier/...)`. There click on `Download` to get the `.tflite` file.
|
||||
|
||||
If you happen to have the classifier included in [Seek](https://www.inaturalist.org/pages/seek_app), it can go in this directory as well. It consists of two files `optimized_model_v1.tflite` and `taxonomy_v1.csv`.
|
||||
110
third_party/nature-id/inat_api.py
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
import json, os, pickle, requests, shelve, sys, time
|
||||
|
||||
#############################################################################
|
||||
# #
|
||||
# API calls to obtain taxonomic information. Used in case of name changes. #
|
||||
# #
|
||||
# See documentation at https://api.inaturalist.org/v1/docs/#/Taxa #
|
||||
# #
|
||||
# We throttle the number of calls to less than 60 per minute. We also #
|
||||
# implement a cache to avoid repeated lookups of the same taxa across runs. #
|
||||
# Cache entries include time stamps and they expire after two weeks. #
|
||||
# #
|
||||
#############################################################################
|
||||
|
||||
API_HOST = "https://api.inaturalist.org/v1"
|
||||
CACHE_EXPIRATION = 14 * 24 * 3600 # cache expires after 2 weeks
|
||||
TOO_MANY_API_CALLS_DELAY = 60 # wait this long after error 429
|
||||
|
||||
# The cache stores the json responses.
|
||||
|
||||
# Per-user cache directory for API responses (platform-dependent location).
if sys.platform == 'win32':
    DATA_DIR = os.path.join(os.path.expanduser('~'),
                            'AppData', 'Local', 'inat_api')
else:
    DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'inat_api')

# exist_ok=True avoids the check-then-create race of the former
# os.path.exists() + os.makedirs() pair when two processes start at once.
os.makedirs(DATA_DIR, exist_ok=True)

# Persistent on-disk cache; maps a request key to (timestamp, json response).
cache = shelve.open(os.path.join(DATA_DIR, 'api.cache'))
|
||||
|
||||
# API call throttling.
|
||||
|
||||
class Throttle:
    """Rate-limit outgoing API calls to at most API_MAX_CALLS per
    API_INTERVAL seconds, as requested by the iNaturalist team."""

    API_MAX_CALLS = 60  # max 60 calls per minute
    API_INTERVAL = 60   # 1 minute

    def __init__(self):
        # Timestamps (time.time()) of the most recent API calls.
        self.callTimes = []

    def wait(self):
        """Block until another API call may be made, then record it."""
        while len(self.callTimes) >= self.API_MAX_CALLS:
            remaining = self.callTimes[0] + self.API_INTERVAL - time.time()
            if remaining <= 0:
                # The oldest recorded call has aged out of the window.
                self.callTimes = self.callTimes[1:]
            else:
                print('Throttling API calls, '
                      f'sleeping for {remaining:.1f} seconds.')
                time.sleep(remaining)
        self.callTimes.append(time.time())


api_call_throttle = Throttle()
|
||||
|
||||
def get_taxa_by_id(id):
    """Fetch taxa from the iNaturalist API by numeric id.

    *id* is a single taxon id or a list of ids (the API accepts several
    comma-separated ids in one call).  Returns the decoded JSON response,
    or None if the API reported an error.  Successful responses are cached
    on disk and reused until they are CACHE_EXPIRATION seconds old.
    """
    # NOTE: the parameter name 'id' shadows the builtin; it is kept for
    # backward compatibility with callers that pass it by keyword.
    if isinstance(id, list):
        # '%2C' is a URL-encoded comma separating the ids.
        url = API_HOST + '/taxa/' + '%2C'.join(str(i) for i in id)
    else:
        url = API_HOST + f'/taxa/{id}'
    tim = time.time()
    if url not in cache or cache[url][0] < tim - CACHE_EXPIRATION:
        delay = TOO_MANY_API_CALLS_DELAY
        headers = {'Content-type' : 'application/json' }
        while True:
            api_call_throttle.wait()
            response = requests.get(url, headers=headers)
            if response.status_code == requests.codes.too_many:
                # HTTP 429: back off with exponentially growing delay.
                time.sleep(delay)
                delay *= 2
            else:
                break
        if response.status_code == requests.codes.ok:
            cache[url] = (tim, response.json())
        else:
            print(response.text)
            return None
    return cache[url][1]
|
||||
|
||||
def get_taxa(params):
    """Query the iNaturalist API /taxa endpoint, typically by name.

    *params* maps query parameter names to values; boolean values are sent
    as the strings 'true'/'false'.  Returns the decoded JSON response, or
    None if the API reported an error.  Successful responses are cached on
    disk and reused until they are CACHE_EXPIRATION seconds old.
    """
    url = API_HOST + '/taxa'
    # Convert booleans to the API's string form WITHOUT mutating the
    # caller's dict (the original code modified *params* in place).
    params = {k: (('true' if v else 'false') if isinstance(v, bool) else v)
              for k, v in params.items()}
    # Cache key: stable hex encoding of the pickled request; same contents
    # and insertion order as before, so existing cache files remain valid.
    cache_key = pickle.dumps((url, params)).hex()
    tim = time.time()
    if cache_key not in cache or cache[cache_key][0] < tim - CACHE_EXPIRATION:
        delay = TOO_MANY_API_CALLS_DELAY
        headers = {'Content-type' : 'application/json' }
        while True:
            api_call_throttle.wait()
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == requests.codes.too_many:
                # HTTP 429: back off with exponentially growing delay.
                time.sleep(delay)
                delay *= 2
            else:
                break
        if response.status_code == requests.codes.ok:
            cache[cache_key] = (tim, response.json())
        else:
            print(response.text)
            return None
    return cache[cache_key][1]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # This module only provides API helpers; it is not meant to be run
    # directly.  Use sys.exit instead of `assert` so the guard still
    # fires when Python is run with optimizations (-O strips asserts).
    sys.exit('Not a top-level Python module!')
|
||||
318
third_party/nature-id/inat_taxonomy.py
vendored
Normal file
@@ -0,0 +1,318 @@
|
||||
import csv, sys, os, time, locale, zipfile, io
|
||||
import inat_api
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict
|
||||
|
||||
# The directory where this Python script is located.
INSTALL_DIR = os.path.dirname(__file__)
# Resolve a symlinked install directory one link at a time.
# NOTE(review): joining the link location with dirname(readlink(...))
# looks suspicious for absolute link targets — os.path.realpath() would
# be the canonical resolution; confirm intent before changing.
while os.path.islink(INSTALL_DIR):
    INSTALL_DIR = os.path.join(INSTALL_DIR,
                               os.path.dirname(os.readlink(INSTALL_DIR)))

# This zip file contains the taxonomy and all common names.
# Download https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip and
# leave this zip file in directory 'inaturalist-taxonomy'. Do not extract the
# files from this zip archive.
INAT_TAXONOMY = os.path.join(INSTALL_DIR, 'inaturalist-taxonomy',
                             'inaturalist-taxonomy.dwca.zip')

# A special node represents the root of the tree, the parent of kingdoms.
ROOT_TAXON_ID = 48460
ROOT_NAME = 'Life'
ROOT_RANK_LEVEL = 100  # above 'kingdom' (70); see gRankLevel2Name below
|
||||
|
||||
# maps rank-level to its name
# Mirrors iNaturalist's numeric rank_level values; a few levels that
# iNaturalist maps to the same name were disambiguated locally (marked
# 'changed' below).
gRankLevel2Name = {
    ROOT_RANK_LEVEL : 'stateofmatter', # used for the parent of kingdoms
    70  : 'kingdom',
    67  : 'subkingdom',
    60  : 'phylum',
    57  : 'subphylum',
    53  : 'superclass',
    50  : 'class',
    47  : 'subclass',
    45  : 'infraclass',
    44  : 'subterclass',
    43  : 'superorder',
    40  : 'order',
    37  : 'suborder',
    35  : 'infraorder',
    34.5: 'parvorder',
    34  : 'zoosection',
    33.5: 'zoosubsection',
    33  : 'superfamily',
    32  : 'epifamily',
    30  : 'family',
    27  : 'subfamily',
    26  : 'supertribe',
    25  : 'tribe',
    24  : 'subtribe',
    20  : 'genus',
    19  : 'genushybrid', # changed, was same as genus in iNaturalist
    15  : 'subgenus',
    13  : 'section',
    12  : 'subsection',
    11  : 'complex',
    10  : 'species',
    9   : 'hybrid',      # changed, was same as species in iNaturalist
    5   : 'subspecies',
    4   : 'variety',     # changed, was same as subspecies in iNaturalist
    3   : 'form',        # changed, was same as subspecies in iNaturalist
    2   : 'infrahybrid'  # changed, was same as subspecies in iNaturalist
}
|
||||
|
||||
# maps rank name to numeric rank-level (inverse of gRankLevel2Name).
# A dict comprehension avoids leaking the loop variables `key`/`value`
# into module scope as the original for-loop did.
gName2RankLevel = {name: level for level, name in gRankLevel2Name.items()}

KINGDOM_RANK_LEVEL = gName2RankLevel['kingdom']
|
||||
|
||||
def get_rank_level(rank):
    """Return the numeric rank-level for rank name `rank`.

    Raises KeyError for unknown ranks.  (The original used `assert`,
    which is stripped under `python -O`; a direct lookup raises KeyError
    unconditionally.)
    """
    return gName2RankLevel[rank]
|
||||
|
||||
def get_rank_name(rank_level, default_name = 'clade'):
    """Return the rank name for `rank_level`.

    Falls back to `default_name` when the level is not present in
    gRankLevel2Name (dict.get replaces the original conditional
    expression with a double lookup).
    """
    return gRankLevel2Name.get(rank_level, default_name)
|
||||
|
||||
@dataclass(frozen=True)
class Taxon:
    # Immutable record for one iNaturalist taxon (hashable, so usable
    # in sets — see lookup_id).
    id        : int    # iNaturalist taxon id
    parent_id : int    # parent taxon id; None only for the root taxon
    name      : str    # scientific name
    rank_level: float  # numeric rank; see gRankLevel2Name
|
||||
|
||||
# iNaturalist taxa, only loaded when a taxonomic tree needs
# to be computed from a label file.
# Both dictionaries are (re)populated by load_inat_taxonomy().

gName2Taxa: Dict[str,List[Taxon]] = {}
"maps taxon name to list of taxa"

gId2Taxon: Dict[int,Taxon] = {}
"maps taxon id to taxon"
|
||||
|
||||
def load_inat_taxonomy():
    """Load all iNaturalist taxa from file 'taxa.csv' inside the
    INAT_TAXONOMY zip archive into gName2Taxa / gId2Taxon.

    Returns True on success (or if already loaded), False on failure.
    Unknown rank names are resolved via the live iNaturalist API.
    """
    global gName2Taxa
    global gId2Taxon

    if gName2Taxa and gId2Taxon:
        return True # already loaded

    print('Loading iNaturalist taxonomy...')
    start_time = time.time()
    gName2Taxa = {}
    gId2Taxon = {}

    try:
        with zipfile.ZipFile(INAT_TAXONOMY, 'r') as zf:
            with zf.open('taxa.csv', 'r') as zfile:
                with io.TextIOWrapper(zfile, encoding = 'latin-1') as csvfile:
                    reader = csv.DictReader(csvfile)
                    for row in reader:
                        id = int(row['id'])
                        # parentNameUsageID is a URL; the last path
                        # component is the numeric parent id.  Taxa with
                        # no parent hang under the root; the root itself
                        # gets parent None.
                        parent_id = row['parentNameUsageID'].split('/')[-1]
                        parent_id = int(parent_id) if parent_id else \
                                    ROOT_TAXON_ID if id != ROOT_TAXON_ID \
                                    else None
                        name = row['scientificName']
                        rank = row['taxonRank']
                        if not rank in gName2RankLevel:
                            # Rank name not in our table: ask the iNat
                            # API for its numeric rank_level and cache it
                            # in both mapping tables for this run.
                            response = inat_api.get_taxa_by_id(id)
                            if response and 'results' in response:
                                rank_level = response['results'][0]\
                                             ['rank_level']
                                gName2RankLevel[rank] = rank_level
                                if not rank_level in gRankLevel2Name:
                                    gRankLevel2Name[rank_level] = rank
                                print(f"Please add rank '{rank}' to gName2Rank"
                                      f"Level, numeric value {rank_level}.")
                            else:
                                # API lookup failed; -1 marks the rank as
                                # unknown but prevents repeated lookups.
                                gName2RankLevel[rank] = -1
                        rank_level = gName2RankLevel[rank]
                        inat_taxon = Taxon(id, parent_id, name, rank_level)
                        # Scientific names are not unique; keep a list.
                        if name in gName2Taxa:
                            gName2Taxa[name].append(inat_taxon)
                        else:
                            gName2Taxa[name] = [inat_taxon]
                        assert not id in gId2Taxon
                        gId2Taxon[id] = inat_taxon
                        # Progress indicator: a dot every 10,000 taxa, the
                        # running count every 100,000.
                        if len(gId2Taxon) % 10000 == 0:
                            print(f' {len(gId2Taxon):,} ' if len(gId2Taxon) %
                                  100000 == 0 else '.', end='')
                            sys.stdout.flush()

        assert ROOT_TAXON_ID in gId2Taxon
        print(f' {len(gId2Taxon):,}.')
        print(f'Loaded iNaturalist taxonomy of {len(gId2Taxon):,} taxa '
              f'in {time.time()-start_time:.1f} secs.')
        return True

    except Exception as e:
        # Broad catch is deliberate: any failure (missing archive, bad
        # CSV, API error) leaves the globals empty and reports False.
        print("Cannot load taxonomy 'taxa.csv' from archive "
              f"'{INAT_TAXONOMY}': {str(e)}.")
        gName2Taxa = {}
        gId2Taxon = {}
        return False
|
||||
|
||||
def beautify_common_name(name):
    """Capitalize (most) words in a common name.

    Strips a trailing ' [paraphyletic]' marker (e.g. for dicots), then
    capitalizes the first letter of every hyphen-separated part and of
    every whitespace-separated word, except the word 'and' and
    abbreviations ending in '.'.

    Fix: empty hyphen segments (leading/trailing or doubled hyphens)
    are passed through unchanged instead of raising IndexError on
    word[0] as the original did.
    """
    if name.endswith(' [paraphyletic]'):
        name = name[:-15] # fix dicots
    name = '-'.join(word[0].upper() + word[1:] if word else word
                    for word in name.split('-'))
    return ' '.join(word if word == 'and' or word.endswith('.')
                    else word[0].upper() + word[1:]
                    for word in name.split())
|
||||
|
||||
def annotate_common_names(id2taxon, all_common_names = False):
    """
    Load the common names in our language, annotate taxonomic tree with them.
    The parameter `id2taxon' includes the taxa we are interested in.

    When `all_common_names` is set, every matching vernacular name is
    appended ('; '-separated); otherwise only the first name found per
    taxon is kept.
    """
    start_time = time.time()
    language, _ = locale.getdefaultlocale()

    # Fix: getdefaultlocale() may return (None, None); the original then
    # crashed on len(language) below.  Treat None like the C locale.
    if language is None or language in ['C', 'C.UTF-8', 'POSIX']:
        language = 'en'

    if not os.path.isfile(INAT_TAXONOMY):
        print("Cannot load common names, archive "
              f"'{INAT_TAXONOMY}' does not exist.")
        return

    try:
        with zipfile.ZipFile(INAT_TAXONOMY, 'r') as zf:
            perfect_match = []
            other_matches = []

            # check all common names files for names in our language;
            # only the first data row of each file is inspected.
            for fname in zf.namelist():
                if fname.startswith("VernacularNames-") and \
                   fname.endswith(".csv"):
                    with zf.open(fname, 'r') as zfile:
                        with io.TextIOWrapper(zfile, encoding='utf-8') as csvf:
                            reader = csv.DictReader(csvf)
                            for row in reader:
                                lang = row['language']
                                if lang == language:
                                    perfect_match.append(fname) # en vs en
                                elif len(lang) < len(language) and \
                                     lang == language[:len(lang)]:
                                    other_matches.append(fname) # en vs en_US
                                break

            if not perfect_match and not other_matches:
                # Fix: this message was missing its f-prefix and printed
                # the literal text '{language}'.
                print(f"Cannot find common names for language '{language}'.")
                return

            # annotate the taxa with common names
            total_names = loaded_names = 0
            for fname in perfect_match + other_matches:
                print(f"Reading common names from '{INAT_TAXONOMY}' "
                      f"member '{fname}'...")
                with zf.open(fname, 'r') as zfile:
                    with io.TextIOWrapper(zfile, encoding='utf-8') as csvf:
                        reader = csv.DictReader(csvf)
                        for row in reader:
                            total_names += 1
                            id = int(row['id'])
                            if id in id2taxon and (all_common_names or
                                    id2taxon[id].common_name is None):
                                loaded_names += 1
                                cname = beautify_common_name(
                                    row['vernacularName'])
                                if id2taxon[id].common_name is None:
                                    id2taxon[id].common_name = cname
                                else:
                                    id2taxon[id].common_name += '; ' + cname

            print(f'Read {total_names:,} common names in '
                  f'{time.time()-start_time:.1f} secs, loaded {loaded_names:,} '
                  f'in language "{language}" for {len(id2taxon)-1:,} taxa.')

    except Exception as e:
        # best-effort: missing/corrupt archive only loses common names
        print(f"Cannot load common names from archive '{INAT_TAXONOMY}':"
              f" {str(e)}.")
|
||||
|
||||
def get_ancestors(id, ancestors):
    """
    Ancestors are a list of instances of Taxon; they are ordered from the
    kingdom down.
    """
    # Iterative form of the original recursion: walk up the parent chain
    # until a kingdom-level taxon is reached, then append top-down.
    chain = []
    taxon = gId2Taxon[id]
    while taxon.rank_level < KINGDOM_RANK_LEVEL:
        chain.append(taxon)
        taxon = gId2Taxon[taxon.parent_id]
    chain.append(taxon)
    ancestors.extend(reversed(chain))
|
||||
|
||||
def lookup_id(name, desired_ranks = ['species', 'subspecies']):
    """
    Lookup by name, returns a pair, a Taxon and its ancestors, a list of
    Taxon. Desired_ranks are returned in case of ambiguities (duplicate names).

    Returns None when the taxonomy is not loaded or the name cannot be
    resolved at all.
    """
    # NOTE(review): mutable default argument; harmless here because
    # desired_ranks is only read, never mutated.
    if not gName2Taxa:
        return None # taxonomy not loaded
    if name in gName2Taxa:
        taxa = gName2Taxa[name]
        if len(taxa) > 1:
            # NOTE(review): `species`/`subspecies` are assigned but never
            # used below.
            species = None
            subspecies = None
            # Ambiguous name: prefer a taxon whose rank is in
            # desired_ranks, else fall back to the first one.
            print(f"Warning: multiple taxa named '{name}':", end='')
            prefix = ' '
            taxon = None
            for t in taxa:
                rank = get_rank_name(t.rank_level)
                print(f"{prefix}{rank} {t.id}", end='')
                if rank in desired_ranks:
                    taxon = t
                prefix = ', '
            if not taxon:
                taxon = taxa[0]
            rank = get_rank_name(taxon.rank_level)
            print(f"; choosing {rank}.")
        else:
            taxon = taxa[0]
        ancestors = []
        if taxon.rank_level < KINGDOM_RANK_LEVEL:
            get_ancestors(taxon.parent_id, ancestors)
        return (taxon, ancestors)
    else:
        # likely taxon change, query iNat API
        response = inat_api.get_taxa({ 'q' : name,
                                       'all_names' : 'true',
                                       'per_page' : 200 })
        if not response:
            print(f"API lookup for name '{name}' failed.")
            return
        taxa = response['results']
        if len(taxa) > 1:
            # more than one taxon, find the one that used to have this name
            # ('sci' locale entries are historical scientific names)
            exact_matches = [taxon for taxon in taxa for nam in taxon['names']
                             if nam['locale'] == 'sci' and nam['name'] == name]
            if exact_matches:
                taxa = exact_matches
        # keep only API results that exist in our local taxonomy
        ids = [taxon['id'] for taxon in taxa]
        taxa = set([gId2Taxon[id] for id in ids if id in gId2Taxon])
        if not taxa:
            return
        while len(taxa) > 1:
            # multiple taxa, find their common ancestor: repeatedly
            # replace the lowest-ranked taxa by their parents until the
            # set collapses to a single node.
            min_rank_level = min([taxon.rank_level for taxon in taxa])
            new_taxa = set()
            for taxon in taxa:
                new_taxon = gId2Taxon[taxon.parent_id] \
                            if taxon.rank_level == min_rank_level \
                            else taxon
                if not new_taxon in new_taxa:
                    new_taxa.add(new_taxon)
            taxa = new_taxa
        taxon = taxa.pop()
        ancestors = []
        if taxon.rank_level < KINGDOM_RANK_LEVEL:
            get_ancestors(taxon.parent_id, ancestors)
        return (taxon, ancestors)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # This module is a library.  The original used `assert`, which is
    # silently stripped under `python -O`; SystemExit always fires.
    raise SystemExit('Not a top-level Python module!')
|
||||
3
third_party/nature-id/inaturalist-taxonomy/README
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
The .zip archive with the taxonomy and common names belongs in this directory.
|
||||
|
||||
Download https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip to this directory. Do not unpack this archive.
|
||||
4
third_party/nature-id/inaturalist-taxonomy/install.sh
vendored
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/sh
# Download the iNaturalist taxonomy archive into this directory.
# set -e: abort on the first failing command.
# curl -f: fail on HTTP errors instead of saving the error page as the
#          archive; -L: follow redirects.
set -e
rm -f inaturalist-taxonomy.dwca.zip
curl -fL https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip \
     -o inaturalist-taxonomy.dwca.zip
|
||||
537
third_party/nature-id/nature_id.py
vendored
Executable file
@@ -0,0 +1,537 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image, ImageOps
|
||||
import csv, sys, os, time
|
||||
import inat_taxonomy
|
||||
|
||||
# Import TensorFlow Lite, preferring the lightweight tflite_runtime
# package and falling back to the full TensorFlow distribution; exit
# with an installation hint if neither is available.
try:
    # try importing TensorFlow Lite first
    import tflite_runtime.interpreter as tflite
except Exception:
    try:
        # TensorFlow Lite not found, try to import full TensorFlow
        import tensorflow.lite as tflite
    except Exception:
        print('Error: TensorFlow Lite could not be loaded.', file=sys.stderr)
        print('       Follow instructions at https://www.tensorflow.org/lite/'
              'guide/python to install it.', file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
# The directory where this Python script is located.
INSTALL_DIR = inat_taxonomy.INSTALL_DIR

# This directory contains models, label files, and taxonomy files.
CLASSIFIER_DIRECTORY = os.path.join(INSTALL_DIR, 'classifiers')

# These flags can be modified with command-line options (see the
# argparse section at the bottom of this file).
scientific_names_only = False # only scientific names or also common names
label_scores_only = False     # scores for labels or hierarchical
all_common_names = False      # show only one or all common names
result_sz = 5                 # result size (for label_scores_only)
|
||||
|
||||
# This class is used by class Taxonomy.
class Taxon:
    """One node of the prediction taxonomy built by class Taxonomy."""

    def __init__(self, taxon_id):
        self.taxon_id = taxon_id   # for internal lookups and iNat API calls
        self.rank_level = None     # taxonomic rank, e.g. species, genus
        self.name = None           # scientific name
        self.common_name = None    # common name or None
        self.children = []         # list of child taxa
        # Indices into the model's score vector; there can be more than
        # one when we use old models whose taxa have since been lumped
        # together.
        self.leaf_class_ids = []

    def add_child(self, child_taxon):
        """Attach `child_taxon` below this node."""
        self.children.append(child_taxon)

    def get_rank(self):
        """Return the taxonomic rank as a string ('' for pseudo-kingdoms)."""
        if self.taxon_id >= 0:
            return inat_taxonomy.get_rank_name(self.rank_level)
        # Negative ids mark pseudo-kingdoms inserted for unknown labels.
        assert self.rank_level == inat_taxonomy.KINGDOM_RANK_LEVEL
        return ''

    def get_name(self):
        """Name to display; customize here to show common names differently."""
        return f'{self.common_name} ({self.name})' if self.common_name \
               else self.name
|
||||
|
||||
|
||||
# This taxonomy is represented in terms of instances of class Taxon.
class Taxonomy:
    """Taxonomic tree used to turn model scores into predictions.

    Built either from a taxonomy CSV (parent/child rows) or from a flat
    label file, in which case the tree is computed via the iNaturalist
    taxonomy (see compute_taxonomic_tree).
    """

    def __init__(self):
        # The taxonomy file may contain multiple trees, one for each kingdom.
        # In order to have a single tree for prediction, we add a node for
        # Life as the parent of all kingdoms. This will be the root of our tree.
        self.root = Taxon(inat_taxonomy.ROOT_TAXON_ID)
        self.root.name = inat_taxonomy.ROOT_NAME
        self.root.rank_level = inat_taxonomy.ROOT_RANK_LEVEL
        self.id2taxon = { self.root.taxon_id : self.root }
        self.idx2label = {}

    def reset(self):
        # Drop everything except the root node itself.
        self.root.children = []
        self.id2taxon = { self.root.taxon_id : self.root }
        self.idx2label = {}

    def taxonomy_available(self):
        # True once a tree (not just flat labels) has been loaded/built.
        return len(self.root.children) > 0

    def read_taxonomy(self, filename):
        """Read a taxonomy or label CSV file and build the tree.

        Label files (rows with an 'id' column) only fill idx2label; a
        tree is then computed from the labels unless label_scores_only
        is set.  Also loads common names unless scientific_names_only.
        """
        start_time = time.time()
        self.reset()
        with open(filename, newline='', encoding='latin-1') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                if 'id' in row: # this is a label file
                    self.idx2label[int(row['id'])] = row['name']
                    continue

                taxon_id = int(row['taxon_id'])
                if taxon_id in self.id2taxon:
                    taxon = self.id2taxon[taxon_id] # inserted earlier as parent
                else:
                    self.id2taxon[taxon_id] = taxon = Taxon(taxon_id)

                taxon.name = row['name']
                # rank_level may be integral ('10') or fractional ('34.5')
                if row['rank_level'].isdigit():
                    taxon.rank_level = int(row['rank_level'])
                else:
                    taxon.rank_level = float(row['rank_level'])

                # ';'-separated list of score indices for leaf taxa
                if len(row['leaf_class_id']):
                    for leaf_class_id in row['leaf_class_id'].split(';'):
                        leaf_class_id = int(leaf_class_id)
                        taxon.leaf_class_ids.append(leaf_class_id)
                        self.idx2label[leaf_class_id] = taxon.name

                # an empty parent field means the taxon hangs off the root
                if len(row['parent_taxon_id']):
                    parent_taxon_id = int(row['parent_taxon_id'])
                else:
                    parent_taxon_id = self.root.taxon_id
                if not parent_taxon_id in self.id2taxon:
                    self.id2taxon[parent_taxon_id] = Taxon(parent_taxon_id)

                self.id2taxon[parent_taxon_id].add_child(taxon)

        if not self.taxonomy_available():
            # We parsed a label file; unless told otherwise, we use these
            # labels to build a taxonomic tree.
            # NOTE(review): '(unknown)' below looks like a placeholder
            # that should interpolate `filename` — confirm upstream.
            print(f"Read {len(self.idx2label):,} labels from '(unknown)' "
                  f"in {time.time() - start_time:.1f} secs.")

            if not label_scores_only:
                self.compute_taxonomic_tree()
                if self.taxonomy_available():
                    # cache the computed tree next to the label file
                    self.write_taxonomic_tree(filename.replace('labelmap',
                                                               'taxonomy'))
        else:
            print(f"Read taxonomy from '(unknown)' in "
                  f"{time.time() - start_time:.1f} secs: "
                  f"{len(self.id2taxon) - 1:,} taxa including "
                  f"{len(self.idx2label):,} leaf taxa.")

        if not scientific_names_only and self.taxonomy_available():
            inat_taxonomy.annotate_common_names(self.id2taxon, all_common_names)
            if label_scores_only:
                self.annotate_labels_with_common_names()
        del self.id2taxon # not needed anymore

    # augment labels with common names
    def annotate_labels_with_common_names(self):
        for taxon in self.id2taxon.values():
            for leaf_class_id in taxon.leaf_class_ids:
                self.idx2label[leaf_class_id] = taxon.get_name()

    # write one row to taxonomy file, then recurse into the children
    def write_row(self, writer, taxon, parent_taxon_id):
        writer.writerow([parent_taxon_id, taxon.taxon_id, taxon.rank_level,
                         ';'.join([str(id) for id in taxon.leaf_class_ids]),
                         taxon.name])
        for child in taxon.children:
            self.write_row(writer, child, taxon.taxon_id)

    # write taxonomy file; on failure the partial file is removed
    def write_taxonomic_tree(self, filename):
        try:
            with open(filename, 'w', newline='', encoding='latin-1') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['parent_taxon_id', 'taxon_id', 'rank_level',
                                 'leaf_class_id', 'name'])
                for child in self.root.children:
                    self.write_row(writer, child, '')
            print(f"Taxonomy written to file '(unknown)'.")
        except Exception as e:
            print(f"Failure writing taxonomy to file '(unknown)':", str(e))
            try:
                os.remove(filename)
            except Exception:
                pass

    # Called after loading label file for Google's AIY Vision Kit.
    # Adds all the labels' direct and indirect ancestors to compute
    # the taxonomic tree.
    def compute_taxonomic_tree(self):
        global label_scores_only
        if not inat_taxonomy.load_inat_taxonomy():
            # without the iNat taxonomy we can only report flat scores
            label_scores_only = True
            return

        start_time = time.time()
        new_id = 0 # id's we add on the fly for pseudo-kingdoms

        for idx, name in self.idx2label.items():
            inat_taxa = inat_taxonomy.lookup_id(name)
            if not inat_taxa:
                # Unknown label: insert directly under the root with a
                # fresh negative id so it still appears in predictions.
                print(f"Info: Taxon for label '{name}' not found, "
                      "inserting as pseudo-kingdom.")
                new_id -= 1
                taxon_id = new_id
                self.id2taxon[taxon_id] = taxon = Taxon(taxon_id)
                taxon.rank_level = inat_taxonomy.KINGDOM_RANK_LEVEL
                taxon.name = name
                taxon.leaf_class_ids = [idx]
                self.root.add_child(taxon)
                continue

            inat_taxon, ancestors = inat_taxa
            if name != inat_taxon.name:
                print(f"Info: Taxon '{name}' changed to "
                      f"'{inat_taxon.name}', iNat taxa "
                      f"id {inat_taxon.id}.")

            # ancestor taxa: insert the chain kingdom-down, reusing
            # nodes already present in the tree
            prev_ancestor = self.root
            for ancestor in ancestors:
                if ancestor.id in self.id2taxon:
                    prev_ancestor = self.id2taxon[ancestor.id]
                else:
                    self.id2taxon[ancestor.id] = ancestor_taxon = \
                        Taxon(ancestor.id)
                    ancestor_taxon.name = ancestor.name
                    ancestor_taxon.rank_level = ancestor.rank_level
                    prev_ancestor.add_child(ancestor_taxon)
                    prev_ancestor = ancestor_taxon

            # this taxon
            if inat_taxon.id in self.id2taxon:
                taxon = self.id2taxon[inat_taxon.id]
                assert taxon.name == inat_taxon.name
                assert taxon.rank_level == inat_taxon.rank_level
            else:
                self.id2taxon[inat_taxon.id] = taxon = Taxon(inat_taxon.id)
                taxon.name = inat_taxon.name
                taxon.rank_level = inat_taxon.rank_level
                prev_ancestor.add_child(taxon)
            taxon.leaf_class_ids.append(idx)

        print("Computed taxonomic tree from labels in "
              f"{time.time() - start_time:.1f} secs: {len(self.id2taxon)-1:,} "
              f"taxa including {len(self.idx2label):,} leaf taxa.")

    # propagate scores to taxon and all below: a node's score is the sum
    # of its own leaf scores and all its descendants' scores
    def assign_scores(self, taxon, scores):
        taxon.score = 0.0
        for leaf_class_id in taxon.leaf_class_ids:
            taxon.score += scores[leaf_class_id]
        for child in taxon.children:
            self.assign_scores(child, scores)
            taxon.score += child.score

    # Returns list of 4-tuples (score, taxon_id, taxonomic rank,
    # display name) ordered by taxonomic rank from kingdom down to
    # e.g. species; the display name includes the common name when
    # known (see Taxon.get_name).
    # Returns pairs (score, scientific name) if label_scores_only
    # is set.
    def prediction(self, scores):

        if label_scores_only:
            # return list of pairs (score, scientific name)
            total = np.sum(scores)
            indices = np.argpartition(scores, -result_sz)[-result_sz:]
            results = [(scores[i] / total, self.idx2label[i])
                       for i in indices if scores[i] != 0]
            results.sort(reverse=True)
            return results

        # annotate all taxa across the hierarchy with scores.
        self.assign_scores(self.root, scores)

        # return one hierarchical path guided by scores
        path = []
        taxon = self.root
        while taxon.children:
            # Find child with highest score.
            best_child = None
            for child in taxon.children:
                if not best_child or child.score > best_child.score:
                    best_child = child

            # Truncate path if all the other children combined are better
            if best_child.score < 0.5 * taxon.score:
                break

            path.append((best_child.score / self.root.score,
                         best_child.taxon_id, best_child.get_rank(),
                         best_child.get_name()))

            taxon = best_child

        return path
|
||||
|
||||
#
# Offline image classification.
#

class OfflineClassifier:
    """Classifies image files offline with a TFLite model.

    `filenames` is a pair (path to .tflite model, path to .csv
    taxonomy/label file), as produced by get_installed_models().
    """

    def __init__(self, filenames):
        # Pixel value range the model expects its input scaled to.
        self.min_pixel_value = 0.0
        self.max_pixel_value = 255.0

        # These particular (v1 / Seek) models expect pixels in [-1, 1].
        if os.path.split(filenames[0])[1] in ['optimized_model.tflite',
                                              'optimized_model_v1.tflite']:
            self.min_pixel_value = -1.0
            self.max_pixel_value = 1.0

        # Load TFLite model and allocate tensors.
        self.mInterpreter = tflite.Interpreter(model_path=filenames[0])
        self.mInterpreter.allocate_tensors()

        # Get input and output tensors.
        self.mInput_details = self.mInterpreter.get_input_details()
        self.mOutput_details = self.mInterpreter.get_output_details()

        # Read labels or taxonomy
        self.mTaxonomy = Taxonomy()
        self.mTaxonomy.read_taxonomy(filenames[1])

    def classify_image(self, image_filename):
        """Run the model on one image file.

        Returns the hierarchical prediction path (or flat label scores
        when label_scores_only is set), or [] on any error.
        """
        start_time = time.time()
        try:
            img = Image.open(image_filename)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            print(f"Error: cannot load image '{image_filename}'.")
            return []

        if img.mode != 'RGB':
            print(f"Error: image '{image_filename}' is of mode '{img.mode}',"
                  " only mode RGB is supported.")
            return []

        # rotate image if needed as it may contain EXIF orientation tag
        img = ImageOps.exif_transpose(img)

        model_size = tuple(self.mInput_details[0]['shape'][1:3])

        # square target shape expected by crop code below
        assert model_size[0] == model_size[1]

        if img.size != model_size:
            # We need to scale and maybe want to crop image.
            width, height = img.size
            if width != height:
                # Before scaling, center-crop the image to a square.
                left = 0
                right = width
                top = 0
                bottom = height
                if width < height:
                    top = (height - width) / 2
                    bottom = top + width
                else:
                    left = (width - height) / 2
                    right = left + height
                # PIL accepts float crop coordinates and rounds them.
                img = img.crop((left, top, right, bottom))

            # scale image
            img = img.resize(model_size)

        #img.show()

        # pixels are in range 0 ... 255, turn into numpy array of shape
        # (1, height, width, 3) in the model's input dtype
        input_data = np.array([np.array(img, self.mInput_details[0]['dtype'])])

        # for float models, rescale pixels to [min_pixel_value,
        # max_pixel_value]
        if self.mInput_details[0]['dtype'] == np.float32:
            input_data *= (self.max_pixel_value - self.min_pixel_value) / 255.0
            input_data += self.min_pixel_value

        self.mInterpreter.set_tensor(self.mInput_details[0]['index'],
                                     input_data)
        self.mInterpreter.invoke()

        output_data = self.mInterpreter.get_tensor(self.mOutput_details[0]
                                                   ['index'])
        path = self.mTaxonomy.prediction(output_data[0])
        print()
        print(f"Classification of '{image_filename}' took "
              f"{time.time() - start_time:.1f} secs.")
        return path
|
||||
|
||||
# Returns a dictionary that maps available classifiers to a pair of filenames.
def get_installed_models():
    """Scan CLASSIFIER_DIRECTORY and return {model name: (.tflite, .csv)}.

    Model names are 'v2_13', 'Seek', or one of the group names in
    `choices`, matched by substring against the file names.  Exits the
    process when the directory is missing or no complete model (both a
    .tflite and a .csv file) is found.
    """
    if not os.path.isdir(CLASSIFIER_DIRECTORY):
        print("Cannot load classifiers, directory "
              f"'{CLASSIFIER_DIRECTORY}' does not exist.")
        sys.exit(1)

    choices = [ 'birds', 'insects', 'plants']
    models = {}  # model name -> (tflite path or None, csv path or None)

    for filename in os.listdir(CLASSIFIER_DIRECTORY):
        model = None
        if filename.endswith(".csv"):
            if filename == 'taxonomy_v2_13.csv':
                model = 'v2_13'
            elif filename == 'taxonomy_v1.csv':
                model = 'Seek'
            else:
                for m in choices:
                    if filename.find(m) != -1:
                        model = m
                        break
            if model:
                filename = os.path.join(CLASSIFIER_DIRECTORY, filename)
                if model in models:
                    # a taxonomy csv is preferred over a labelmap csv
                    if not models[model][1] or models[model][1].\
                       endswith('labelmap.csv'):
                        models[model] = (models[model][0], filename)
                else:
                    models[model] = (None, filename)
        elif filename.endswith(".tflite"):
            if filename == 'optimized_model_v2_13.tflite':
                model = 'v2_13'
            elif filename == 'optimized_model_v1.tflite':
                model = 'Seek'
            else:
                for m in choices:
                    if filename.find(m) != -1:
                        model = m
                        break
            if model:
                filename = os.path.join(CLASSIFIER_DIRECTORY, filename)
                if model in models:
                    models[model] = (filename, models[model][1])
                else:
                    models[model] = (filename, None)

    delete_elements = [] # postponed deletion, cannot delete during iteration
    for name, files in models.items():
        if not files[0] or not files[1]:
            tf_missing = ".csv file but no .tflite file"
            csv_missing = ".tflite file but no .csv file"
            print("Installation issue: Excluding incomplete classifier for"
                  f" '{name}': {tf_missing if files[1] else csv_missing}.")
            delete_elements.append(name)

    for element in delete_elements:
        del models[element]

    if not models:
        print(f"No classifiers found in directory '{CLASSIFIER_DIRECTORY}'; "
              "follow instructions in "
              f"'{os.path.join(CLASSIFIER_DIRECTORY,'README.md')}'"
              " to install them.", file=sys.stderr)
        sys.exit(1)
    return models
|
||||
|
||||
def identify_species(classifier, filename):
|
||||
result = classifier.classify_image(filename)
|
||||
if result:
|
||||
# Print list of tuples (score, taxon id, taxonomic rank, name)
|
||||
# ordered by taxonomic rank from kingdom down to species.
|
||||
for entry in result:
|
||||
if len(entry) == 2: # labels only
|
||||
print(f'{100 * entry[0]:5.1f}% {entry[1]}')
|
||||
continue
|
||||
print(f'{100 * entry[0]:5.1f}% {entry[2]:11s} {entry[3]}')
|
||||
|
||||
# command-line parsing

# Discover installed models at import time; get_installed_models()
# exits the process when none are found.
models = get_installed_models()
|
||||
|
||||
def model_parameter_check(arg):
    """argparse type-checker: `arg` must name an installed model."""
    if arg in models:
        return arg
    plural = '' if len(models) == 1 else 's'
    available = ', '.join(f"'{m}'" for m in models)
    raise argparse.ArgumentTypeError(
        f"Model '{arg}' not available. "
        f"Available model{plural}: {available}.")
|
||||
|
||||
def result_size_check(arg):
    """argparse type-checker: a positive integer between 1 and 100."""
    if arg.isdigit():
        value = int(arg)
        if 1 <= value <= 100:
            return value
    raise argparse.ArgumentTypeError(f"'{arg}' is not a number "
                                     "between 1 and 100.")
|
||||
|
||||
def file_directory_check(arg):
    """argparse type-checker: `arg` must be an existing file or directory."""
    if not (os.path.isdir(arg) or os.path.isfile(arg)):
        raise argparse.ArgumentTypeError(
            f"'{arg}' is not a file or directory.")
    return arg
|
||||
|
||||
#
|
||||
# Identify species for picture files and directories given as command line args
|
||||
#
|
||||
|
||||
if __name__ == '__main__':
    import argparse

    preferred1 = 'v2_13' # default if this model is available
    preferred2 = 'Seek'  # second preference

    parser = argparse.ArgumentParser()
    # Offer a default model only when there is an unambiguous choice.
    if len(models) == 1 or preferred1 in models or preferred2 in models:
        default_model = preferred1 if preferred1 in models else \
                        preferred2 if preferred2 in models else \
                        next(iter(models))
        parser.add_argument("-m", "--model", type=model_parameter_check,
                            default=default_model,
                            help="Model to load to identify organisms.")
    else: # no default for classification model
        parser.add_argument("-m", "--model", type=model_parameter_check,
                            required=True,
                            help="Model to load to identify organisms.")
    parser.add_argument('-a', '--all_common_names', action="store_true",
                        help='Show all common names and not just one.')
    parser.add_argument('-l', '--label_scores_only', action="store_true",
                        help='Compute and display only label scores, '
                             'do not propagate scores up the hierarchy.')
    parser.add_argument('-s', '--scientific_names_only', action="store_true",
                        help='Only use scientific names, do not load common '
                             'names.')
    parser.add_argument('-r', '--result_size', type=result_size_check,
                        default=result_sz, help='Number of labels and their '
                        'scores to report in results.')
    parser.add_argument('files_dirs', metavar='file/directory',
                        type=file_directory_check, nargs='+',
                        help='Image files or directories with images.')
    args = parser.parse_args()

    # transfer options into the module-level flags
    scientific_names_only = args.scientific_names_only
    label_scores_only = args.label_scores_only
    all_common_names = args.all_common_names
    result_sz = args.result_size

    # make classifier instance

    classifier = OfflineClassifier(models[args.model])

    # process photos

    for arg in args.files_dirs:
        if os.path.isfile(arg):
            identify_species(classifier, arg)
        elif os.path.isdir(arg):
            for file in os.listdir(arg):
                ext = os.path.splitext(file)[1].lower()
                # Fix: the original list contained the typo '.jepg';
                # JPEG files use '.jpg' or '.jpeg'.
                if ext in ['.jpg', '.jpeg', '.png']:
                    identify_species(classifier, os.path.join(arg, file))
|
||||
BIN
third_party/nature-id/plant_images/Mentzelia_lindleyi.jpg
vendored
Normal file
|
After Width: | Height: | Size: 196 KiB |
BIN
third_party/nature-id/plant_images/Persicaria_amphibia.jpg
vendored
Normal file
|
After Width: | Height: | Size: 399 KiB |
BIN
third_party/nature-id/plant_images/Phyla_nodiflora.jpg
vendored
Normal file
|
After Width: | Height: | Size: 257 KiB |
BIN
third_party/nature-id/plant_images/Primula_hendersonii.jpg
vendored
Normal file
|
After Width: | Height: | Size: 254 KiB |
BIN
third_party/nature-id/plant_images/Solidago_velutina_ssp_californica.jpg
vendored
Normal file
|
After Width: | Height: | Size: 189 KiB |
BIN
third_party/nature-id/plant_images/Tragopogon_porrifolius.jpg
vendored
Normal file
|
After Width: | Height: | Size: 168 KiB |
BIN
third_party/nature-id/plant_images/Trichostema_lanceolatum.jpg
vendored
Normal file
|
After Width: | Height: | Size: 198 KiB |
3
third_party/nature-id/requirements.txt
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
Pillow
|
||||
requests
|
||||
tflite-runtime
|
||||