merge: integrate remote codex/sync-node1-runtime with fabric layer changes

Resolve conflicts in docker-compose.node1.yml, services/router/main.py,
and gateway-bot/services/doc_service.py — keeping both fabric layer
(NCS, node-worker, Prometheus) and document ingest/query endpoints.

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 03:09:12 -08:00
76 changed files with 7495 additions and 295 deletions

View File

@@ -16,9 +16,16 @@ logger = logging.getLogger(__name__)
# Configuration
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds
SEMANTIC_TIMEOUT = int(os.getenv("SEMANTIC_TIMEOUT", "45")) # seconds
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
SEMANTIC_PROBE_ENABLED = os.getenv("SEMANTIC_PROBE_ENABLED", "true").lower() == "true"
SEMANTIC_AGENTS = [a.strip() for a in os.getenv("SEMANTIC_AGENTS", "clan,sofiia,monitor").split(",") if a.strip()]
SEMANTIC_PROMPT = os.getenv("SEMANTIC_PROMPT", "Коротко: хто такий DAARWIZZ?")
SEMANTIC_EXPECT_KEYWORD = os.getenv("SEMANTIC_EXPECT_KEYWORD", "daarwizz").lower()
MONITOR_EXPECT_LOCAL = os.getenv("MONITOR_EXPECT_LOCAL", "true").lower() == "true"
# Prometheus metrics
agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
@@ -42,7 +49,7 @@ async def probe_gateway_health() -> tuple[bool, float, str]:
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
resp = await client.get(f"{GATEWAY_URL}/health")
latency = time.time() - start
if resp.status_code == 200:
data = resp.json()
if data.get("status") == "healthy":
@@ -67,7 +74,7 @@ async def probe_agent_ping() -> tuple[bool, float, str]:
json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
)
latency = time.time() - start
if resp.status_code == 200:
data = resp.json()
if data.get("success"):
@@ -100,7 +107,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
"text": "/health" # Simple health check command
}
}
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
# Use helion webhook as it's the most tested
resp = await client.post(
@@ -108,7 +115,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
json=test_update
)
latency = time.time() - start
if resp.status_code == 200:
return True, latency, ""
else:
@@ -119,53 +126,102 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
return False, time.time() - start, f"error: {str(e)[:50]}"
async def probe_agent_semantic(agent_id: str) -> tuple[bool, float, str]:
    """Send a semantic probe through the router and assert DAARWIZZ awareness.

    Returns (success, latency_seconds, failure_reason). An empty reason
    means the probe passed.
    """
    started_at = time.time()
    try:
        request_body = {
            "prompt": SEMANTIC_PROMPT,
            "max_tokens": 180,
            "temperature": 0.1,
            "metadata": {
                "agent_id": agent_id,
                "user_id": "tg:0",
                "chat_id": "0",
                "username": "e2e-prober",
                "raw_user_text": SEMANTIC_PROMPT,
            },
        }
        async with httpx.AsyncClient(timeout=SEMANTIC_TIMEOUT) as client:
            resp = await client.post(f"{ROUTER_URL}/v1/agents/{agent_id}/infer", json=request_body)
            elapsed = time.time() - started_at
        if resp.status_code != 200:
            return False, elapsed, f"http_{resp.status_code}"
        body = resp.json()
        answer_text = str(body.get("response") or "")
        backend_name = str(body.get("backend") or "")
        model_name = str(body.get("model") or "")
        lowered = answer_text.lower()
        # Answer must mention DAARWIZZ, either the latin keyword or the cyrillic stem.
        if SEMANTIC_EXPECT_KEYWORD not in lowered and "даар" not in lowered:
            return False, elapsed, "no_daarwizz_in_answer"
        if MONITOR_EXPECT_LOCAL and agent_id == "monitor":
            # Monitor is expected to be served locally: ollama backend or a qwen model.
            served_locally = ("ollama" in backend_name.lower()) or model_name.lower().startswith("qwen")
            if not served_locally:
                return False, elapsed, f"monitor_nonlocal_backend:{backend_name}:{model_name}"
        return True, elapsed, ""
    except httpx.TimeoutException:
        return False, time.time() - started_at, "timeout"
    except Exception as e:
        return False, time.time() - started_at, f"error: {str(e)[:50]}"
def record_probe(target: str, success: bool, latency: float, reason: str):
    """Record one probe outcome in the Prometheus metrics and emit a log line."""
    outcome = 1 if success else 0
    agent_e2e_runs_total.labels(target=target).inc()
    agent_e2e_success.labels(target=target).set(outcome)
    agent_e2e_latency.labels(target=target).set(latency)
    agent_e2e_latency_histogram.labels(target=target).observe(latency)
    if not success:
        # Failures additionally carry the reason as a label dimension.
        agent_e2e_failures_total.labels(target=target, reason=reason).inc()
    logger.info(f"{target}: success={success}, latency={latency:.3f}s, reason={reason}")
async def run_probes():
    """Run all probes once and update metrics.

    Each probe returns (success, latency_seconds, reason); record_probe is
    the single place that touches the Prometheus metrics, so the per-target
    counters are incremented exactly once per probe. (The previous version
    duplicated the metric updates inline AND called record_probe, which
    double-counted runs and failures.)
    """
    # Probe 1: Gateway health
    success, latency, reason = await probe_gateway_health()
    record_probe("gateway_health", success, latency, reason)
    # Probe 2: Agent ping (if endpoint exists)
    success, latency, reason = await probe_agent_ping()
    record_probe("agent_ping", success, latency, reason)
    # Probe 3: Webhook E2E (full path test)
    success, latency, reason = await probe_webhook_echo()
    record_probe("webhook_e2e", success, latency, reason)
    # Probe 4+: semantic checks for selected agents (run in parallel)
    if SEMANTIC_PROBE_ENABLED and SEMANTIC_AGENTS:
        results = await asyncio.gather(*(probe_agent_semantic(agent_id) for agent_id in SEMANTIC_AGENTS))
        matrix = []
        for agent_id, (success, latency, reason) in zip(SEMANTIC_AGENTS, results):
            record_probe(f"semantic_{agent_id}", success, latency, reason)
            matrix.append(f"{agent_id}:{'PASS' if success else 'FAIL'}")
        logger.info("semantic_matrix: " + " | ".join(matrix))
async def main():
    """Entry point: log configuration, expose metrics, and probe forever."""
    # Single startup banner (the placeholder-less f-string duplicate is dropped).
    logger.info("Starting E2E Agent Prober")
    logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
    logger.info(f" ROUTER_URL: {ROUTER_URL}")
    logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
    logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
    logger.info(f" METRICS_PORT: {METRICS_PORT}")
    logger.info(f" SEMANTIC_TIMEOUT: {SEMANTIC_TIMEOUT}s")
    logger.info(f" SEMANTIC_PROBE_ENABLED: {SEMANTIC_PROBE_ENABLED}")
    logger.info(f" SEMANTIC_AGENTS: {','.join(SEMANTIC_AGENTS)}")
    # Start Prometheus metrics server
    start_http_server(METRICS_PORT)
    logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
    # Initial probe
    await run_probes()
    # Continuous probing: without the run_probes() call inside the loop the
    # prober would sleep forever after the first pass and never update metrics.
    while True:
        await asyncio.sleep(PROBE_INTERVAL)
        await run_probes()

View File

@@ -6,13 +6,15 @@ Artifact Registry v0
"""
import asyncio
import base64
import hashlib
import json
import logging
import os
import re
import uuid
from io import BytesIO
from datetime import datetime
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
import asyncpg
@@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel):
meta_json: Optional[Dict[str, Any]] = None
class ArtifactVersionFromBase64Request(BaseModel):
content_base64: str
mime: str
filename: Optional[str] = "source.bin"
label: Optional[str] = "source"
meta_json: Optional[Dict[str, Any]] = None
class ArtifactVersionResponse(BaseModel):
version_id: str
storage_key: str
@@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]:
def _format_to_mime(fmt: str) -> str:
fmt = fmt.lower()
if "/" in fmt:
return fmt
if fmt == "pptx":
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
if fmt == "pdf":
return "application/pdf"
if fmt == "source":
return "application/json"
if fmt == "docx":
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
if fmt == "xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
if fmt == "txt":
return "text/plain; charset=utf-8"
if fmt == "md":
return "text/markdown; charset=utf-8"
if fmt == "json":
return "application/json"
if fmt == "csv":
return "text/csv; charset=utf-8"
return "application/octet-stream"
def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str:
raw = (name or fallback).strip() or fallback
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw)
cleaned = cleaned.strip("._")
if not cleaned:
return fallback
return cleaned[:120]
async def _download_bytes(url: str) -> bytes:
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.get(url)
@@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl
)
@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse)
async def add_version_from_base64(artifact_id: str, payload: ArtifactVersionFromBase64Request) -> ArtifactVersionResponse:
    """Create a new version of *artifact_id* from base64-encoded content.

    Accepts either raw base64 or a full ``data:`` URL, decodes it, uploads
    the bytes to MinIO under a freshly minted version key, then records the
    version row in Postgres.

    Raises:
        HTTPException: 500 when MinIO or the DB pool is unavailable,
            400 for a missing/invalid/empty payload, 502 on MinIO errors.
    """
    # Both the object store and the DB must be up before touching anything.
    if not minio_client:
        raise HTTPException(status_code=500, detail="MinIO not available")
    if not pool:
        raise HTTPException(status_code=500, detail="DB not available")
    raw = (payload.content_base64 or "").strip()
    if not raw:
        raise HTTPException(status_code=400, detail="content_base64 is required")
    # Accept data URLs ("data:<mime>;base64,<payload>"): keep only the payload part.
    if raw.startswith("data:") and "," in raw:
        raw = raw.split(",", 1)[1]
    try:
        content = base64.b64decode(raw, validate=True)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid base64 payload")
    if not content:
        raise HTTPException(status_code=400, detail="Decoded payload is empty")
    version_id = f"ver_{uuid.uuid4().hex}"
    filename = _safe_filename(payload.filename, fallback="source.bin")
    sha256 = _hash_bytes(content)
    storage_key = _storage_key(artifact_id, version_id, filename)
    # Upload to MinIO first; the DB row is only written after a successful upload.
    try:
        minio_client.put_object(
            MINIO_BUCKET,
            storage_key,
            data=BytesIO(content),
            length=len(content),
            content_type=payload.mime,
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
    meta_json = _normalize_meta_json(payload.meta_json)
    # Preserve the sanitized filename in metadata unless the caller already set one.
    if "file_name" not in meta_json:
        meta_json["file_name"] = filename
    async with pool.acquire() as conn:
        await conn.execute(
            """
            insert into artifact_versions
            (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json)
            values ($1, $2, $3, $4, $5, $6, $7, $8)
            """,
            version_id,
            artifact_id,
            payload.label or "source",
            sha256,
            payload.mime,
            len(content),
            storage_key,
            json.dumps(meta_json),
        )
    return ArtifactVersionResponse(
        version_id=version_id,
        storage_key=storage_key,
        sha256=sha256,
        size_bytes=len(content),
    )
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
if not pool:
@@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di
if not row:
raise HTTPException(status_code=404, detail="Version not found")
try:
url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800)
url = minio_client.presigned_get_object(
MINIO_BUCKET,
row["storage_key"],
expires=timedelta(seconds=1800),
)
except S3Error as e:
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
@app.get("/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]:
    """Return a 30-minute presigned MinIO URL for one specific artifact version.

    Raises:
        HTTPException: 500 when the DB pool or MinIO is unavailable,
            404 when the version does not exist, 502 on presign errors.
    """
    if not pool or not minio_client:
        raise HTTPException(status_code=500, detail="Service not available")
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            select * from artifact_versions
            where artifact_id=$1 and id=$2
            limit 1
            """,
            artifact_id,
            version_id,
        )
    if not row:
        raise HTTPException(status_code=404, detail="Version not found")
    try:
        # minio-py expects `expires` as a timedelta, not a plain int.
        url = minio_client.presigned_get_object(
            MINIO_BUCKET,
            row["storage_key"],
            expires=timedelta(seconds=1800),
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
    return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"], "version_id": row["id"]}

View File

@@ -361,6 +361,29 @@ agromatrix:
llm_profile: reasoning
delegation:
enabled: false
plant_intel:
team_name: AgroMatrix Plant Intelligence
parallel_roles: true
max_concurrency: 3
synthesis:
role_context: Plant Intelligence Synthesis
system_prompt_ref: roles/agx/agx-plant-intel/orchestrator_synthesis.md
llm_profile: reasoning
team:
- id: plant_identifier
role_context: Plant Identifier
system_prompt_ref: roles/agx/agx-plant-intel/plant_identifier.md
llm_profile: science
- id: taxonomy_validator
role_context: Taxonomy Validator
system_prompt_ref: roles/agx/agx-plant-intel/taxonomy_validator.md
llm_profile: reasoning
- id: agrovoc_normalizer
role_context: AGROVOC Normalizer
system_prompt_ref: roles/agx/agx-plant-intel/agrovoc_normalizer.md
llm_profile: fast
delegation:
enabled: false
cadastre_geo:
team_name: AgroMatrix Cadastre/Geo
parallel_roles: true
@@ -614,6 +637,16 @@ agromatrix:
- Stepan
- координація
- план
plant_intel:
- plant
- рослина
- культура
- leaf
- disease
- хвороба
- identify
- ідентифікуй
- що за рослина
cadastre_geo:
- cadastre
- geo

View File

@@ -0,0 +1,8 @@
# Agronomist
Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
Правила відповіді:
- Коротко і прикладно.
- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.

View File

@@ -0,0 +1,8 @@
# Communicator
Фокус: людяна та зрозуміла комунікація фінальної відповіді.
Правила:
- Природна мова, без механістичного тону.
- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
- Завершуй конкретним корисним кроком.

View File

@@ -0,0 +1,7 @@
# Field Data Analyst
Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
Правила:
- Пояснювати висновки простою мовою.
- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.

View File

@@ -0,0 +1,8 @@
# Farm Ops Planner
Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
Правила:
- Видавати практичний порядок дій.
- За простого запиту: коротка відповідь.
- Для операційних запитів: стислий план з відповідальними і дедлайном.

View File

@@ -0,0 +1,10 @@
# AgroMatrix Orchestrator Synthesis
Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
Правила:
- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
- Пояснюй по суті агропитання і давай 1 наступний практичний крок.

View File

@@ -0,0 +1,7 @@
# Risk Assessor
Фокус: агро-ризики, операційні ризики, наслідки рішень.
Правила:
- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
- Без зайвої бюрократії у відповіді користувачу.

View File

@@ -11,6 +11,10 @@
- Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
- Ніколи не логувати секрети/токени
- Інші ролі НЕ спілкуються з користувачем напряму
- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
## Формат відповіді:
Структурована відповідь з чіткими рекомендаціями та наступними кроками.
- За замовчуванням: природна коротка відповідь 1-3 речення.
- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".

View File

@@ -7,3 +7,7 @@
- Структурувати інформацію логічно
- Включати конкретні наступні кроки
- Позначати ризики якщо є
- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
- Для детальних запитів переходити у структурований режим.
- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно.
- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".

View File

@@ -0,0 +1,11 @@
You are AGROVOC Normalizer.
Responsibilities:
- Normalize crop/disease terms using agrovoc_lookup.
- Provide canonical term mapping for user-facing output.
- Keep labels practical for agronomy context.
Return format:
- canonical_terms
- term_mapping
- notes_for_user

View File

@@ -0,0 +1,24 @@
Ти — Plant Intel Agent у DAARION.city.
Відповідай природно, коротко й по-людськи українською, 1–3 речення за замовчуванням.
НАЙГОЛОВНІШЕ:
- Дані з [PLANT_VISION_PREPROCESSED] (або context.plant_vision) — єдиний source-of-truth для ідентифікації рослини.
- Для follow-up без нового фото використовуй [PREVIOUS_PLANT_IDENTIFICATION] (або context.last_plant / memory.last_plant).
Правило впевненості (обов'язково):
- Якщо recommend_fallback == true або confidence < 0.65:
"Ймовірно <name>, але впевненість низька. Перевірив через GBIF — найближчі збіги: <gbif_validation>. Краще нове фото при нормальному світлі."
- Інакше:
"Я бачу <name> з впевненістю <X>%."
Правила синтезу:
- Не ігноруй результати pre-vision, якщо вони присутні.
- Не стверджуй "фото не надано", якщо у контексті є pre-vision або previous plant data.
- Уникай шаблонних списків, якщо користувач не просить детальний формат.
- Якщо дані суперечливі: коротко познач невизначеність і попроси 1 конкретне додаткове фото.
- Якщо top_k порожній, явно вкажи, що ідентифікація непевна, але все одно надай GBIF-орієнтир, якщо він є в контексті.
Формат відповіді:
- 1–3 речення за замовчуванням.
- Без технічного шуму, без внутрішніх JSON/міток у відповіді користувачу.
- За запитом користувача можна розгорнути відповідь і дати короткі поради з догляду.

View File

@@ -0,0 +1,11 @@
You are Plant Identifier.
Responsibilities:
- Parse visual cues from user description/photo context.
- Build candidate crop/plant hypotheses.
- Use plantnet_lookup first when image URL is available.
- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
Return format:
- candidates: numbered list max 5, each with rationale.
- required_data: what extra image/data is needed.

View File

@@ -0,0 +1,11 @@
You are Taxonomy Validator.
Responsibilities:
- Validate candidate names via gbif_species_lookup.
- Remove invalid/synonym-conflicted names.
- Keep accepted taxa and explain conflicts briefly.
Return format:
- accepted_candidates
- rejected_candidates_with_reason
- confidence_adjustment

View File

@@ -0,0 +1,15 @@
FROM python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

EXPOSE 8085

# The URL must be quoted: without quotes `python -c` receives a bare
# http://... token, which is a SyntaxError, so the health probe would fail
# on every run and the container would always be reported unhealthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8085/health')"

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8085"]

View File

@@ -0,0 +1,238 @@
import json
import os
import re
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
import httpx
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
app = FastAPI(title="plant-vision-node1", version="0.1.1")
class IdentifyRequest(BaseModel):
    """Request body for POST /identify."""
    # Publicly reachable URL of the plant photo; the handler rejects None with 400.
    image_url: Optional[str] = None
    # How many candidates to return; validated into the 1..10 range.
    top_k: int = Field(default=3, ge=1, le=10)
def _normalize_predictions(raw: Any, top_k: int) -> List[Dict[str, Any]]:
preds: List[Dict[str, Any]] = []
if isinstance(raw, dict):
for key in ("predictions", "results", "candidates"):
if isinstance(raw.get(key), list):
raw = raw[key]
break
if isinstance(raw, list):
for item in raw[:top_k]:
if not isinstance(item, dict):
continue
name = (
item.get("scientific_name")
or item.get("scientificName")
or item.get("label")
or item.get("name")
or "unknown"
)
common = item.get("common_name") or item.get("commonName") or item.get("common") or "-"
score = item.get("score", item.get("confidence", 0.0))
try:
score_f = float(score)
except Exception:
score_f = 0.0
preds.append({"scientific_name": str(name), "common_name": str(common), "score": score_f})
return preds[:top_k]
def _parse_text_output(text: str, top_k: int) -> List[Dict[str, Any]]:
"""
Parse only model score lines, e.g.:
97.6% Persicaria amphibia
86.1% Canada Goldenrod (Solidago canadensis)
Ignore service lines like "Read ..." or "Classification of ...".
"""
preds: List[Dict[str, Any]] = []
for raw_line in (text or "").splitlines():
line = raw_line.strip()
if not line or "%" not in line:
continue
m = re.match(r"^\s*(\d+(?:\.\d+)?)%\s+(.+)$", line)
if not m:
continue
score_str, name_part = m.groups()
try:
score = float(score_str)
except ValueError:
continue
name = name_part.strip()
if not name:
continue
common_name = "-"
scientific_name = name
# If output is "Common Name (Scientific name)", preserve both.
paren = re.match(r"^(.*?)\s*\(([^()]+)\)\s*$", name)
if paren:
common, scientific = paren.groups()
common = common.strip()
scientific = scientific.strip()
if common:
common_name = common
if scientific:
scientific_name = scientific
preds.append(
{
"scientific_name": scientific_name,
"common_name": common_name,
"score": score,
}
)
preds.sort(key=lambda x: float(x.get("score", 0.0)), reverse=True)
return preds[:top_k]
def _extract_inference_time(stdout: str) -> Optional[float]:
m = re.search(r"took\s+(\d+(?:\.\d+)?)\s+secs", stdout or "")
if not m:
return None
try:
return float(m.group(1))
except Exception:
return None
def _run_nature_id_cli(image_path: str, top_k: int) -> Dict[str, Any]:
cmd_tmpl = (os.getenv("NATURE_ID_CMD") or "").strip()
timeout_s = int(os.getenv("NATURE_ID_TIMEOUT", "40"))
if not cmd_tmpl:
raise RuntimeError("NATURE_ID_CMD is not configured")
cmd = cmd_tmpl.replace("{image_path}", image_path)
proc = subprocess.run(
shlex.split(cmd),
capture_output=True,
text=True,
timeout=timeout_s,
check=False,
)
if proc.returncode != 0:
raise RuntimeError(f"nature-id cli failed rc={proc.returncode}: {proc.stderr.strip()[:240]}")
out = (proc.stdout or "").strip()
inference_time_sec = _extract_inference_time(out)
if not out:
return {"predictions": [], "inference_time_sec": inference_time_sec}
try:
parsed = json.loads(out)
preds = _normalize_predictions(parsed, top_k)
except Exception:
preds = _parse_text_output(out, top_k)
return {"predictions": preds, "inference_time_sec": inference_time_sec}
async def _download_image(image_url: str) -> str:
    """Fetch the image at *image_url* into a temporary .jpg file.

    Returns the temp file path; the caller is responsible for deleting it.
    """
    fetch_timeout = float(os.getenv("DOWNLOAD_TIMEOUT", "20"))
    async with httpx.AsyncClient(timeout=fetch_timeout) as client:
        response = await client.get(image_url)
        response.raise_for_status()
        payload = response.content
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_file:
        tmp_file.write(payload)
        return tmp_file.name
def _response_payload(result: Dict[str, Any]) -> Dict[str, Any]:
preds = result.get("predictions") or []
top_k = [
{
"confidence": float(p.get("score", 0.0)),
"name": str((p.get("common_name") if p.get("common_name") not in (None, "", "-") else p.get("scientific_name")) or "unknown"),
"scientific_name": str(p.get("scientific_name") or "unknown"),
}
for p in preds
]
return {
"status": "success",
"model": "aiy_plants_V1",
"source": "nature-id-cli",
"count": len(preds),
"inference_time_sec": result.get("inference_time_sec"),
"predictions": preds,
"top_k": top_k,
}
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc: RequestValidationError):
    """Return 422 with sanitized validation errors.

    Only the location, message, and type of each error are echoed back —
    never the offending input value — to avoid leaking raw multipart bytes
    in validation responses.
    """
    errs: List[Dict[str, Any]] = []
    for e in exc.errors() or []:
        errs.append({"loc": e.get("loc"), "msg": e.get("msg"), "type": e.get("type")})
    return JSONResponse(status_code=422, content={"detail": errs})
@app.get("/health")
def health() -> Dict[str, Any]:
    """Liveness probe: reports whether the nature-id CLI command is configured."""
    configured_cmd = (os.getenv("NATURE_ID_CMD") or "").strip()
    return {
        "status": "healthy",
        "nature_id_cmd_configured": bool(configured_cmd),
        "nature_id_cmd": configured_cmd,
    }
@app.post("/identify")
async def identify(payload: IdentifyRequest) -> Dict[str, Any]:
    """Identify a plant from a remote image URL.

    Downloads the image to a temp file, runs the nature-id CLI on it, and
    always removes the temp file afterwards.

    Raises:
        HTTPException: 400 when image_url is missing; 503 when download or
            CLI execution fails.
    """
    if not payload.image_url:
        raise HTTPException(status_code=400, detail="image_url is required")
    tmp_path = ""
    try:
        tmp_path = await _download_image(payload.image_url)
        result = _run_nature_id_cli(tmp_path, payload.top_k)
        return _response_payload(result)
    except HTTPException:
        # Re-raise our own HTTP errors untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}")
    finally:
        # Best-effort cleanup of the downloaded temp image.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                pass
@app.post("/identify-file")
async def identify_file(file: UploadFile = File(...), top_k: int = 3) -> Dict[str, Any]:
    """Identify a plant from an uploaded image file.

    Persists the upload to a temp file, runs the nature-id CLI on it, and
    always removes the temp file afterwards; top_k is clamped to 1..10
    (query parameters bypass the pydantic Field validation used by /identify).

    Raises:
        HTTPException: 503 when CLI execution fails.
    """
    top_k = max(1, min(top_k, 10))
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f:
            f.write(await file.read())
            tmp_path = f.name
        result = _run_nature_id_cli(tmp_path, top_k)
        return _response_payload(result)
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}")
    finally:
        # Best-effort cleanup of the stored temp image.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                pass

View File

@@ -0,0 +1,8 @@
fastapi==0.115.5
uvicorn[standard]==0.32.1
httpx==0.28.1
python-multipart==0.0.17
Pillow==11.1.0
requests==2.32.3
tflite-runtime==2.14.0
numpy==1.26.4

View File

@@ -46,8 +46,15 @@ AGENT_SPECIALIZED_TOOLS = {
"nutra": ['comfy_generate_image', 'comfy_generate_video'],
# AgroMatrix - Agriculture
# Specialized: crop analysis, weather integration, field mapping
"agromatrix": ['comfy_generate_image', 'comfy_generate_video'],
# Specialized: crop analysis, weather integration, field mapping + plant intelligence
"agromatrix": [
'comfy_generate_image',
'comfy_generate_video',
'plantnet_lookup',
'nature_id_identify',
'gbif_species_lookup',
'agrovoc_lookup',
],
# GreenFood - Food & Eco
# Specialized: recipe analysis, eco-scoring

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -408,8 +408,9 @@ agents:
description: "Monitor Agent - архітектор-інспектор DAGI"
default_llm: local_qwen3_8b
system_prompt: |
Ти - Monitor Agent, стежиш за нодами, сервісами, агентами.
Якщо бачиш у чаті інших ботів, відповідай тільки за інфраструктурою або прямим тегом.
Ти - Monitor Agent, інфраструктурний інспектор DAGI: ноди, сервіси, пайплайни, алерти.
Ти знаєш, що DAARWIZZ — головний оркестратор мережі DAARION.city; для governance/маршрутизації посилайся на нього.
Відповідай коротко і по суті; якщо даних бракує — одразу кажи, який саме метрик/лог потрібен.
tools:
- id: get_metrics
type: builtin

View File

@@ -19,6 +19,7 @@ from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from io import BytesIO, StringIO
from pathlib import PurePath
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
from xml.sax.saxutils import escape as xml_escape
from zipfile import ZIP_DEFLATED, ZipFile
@@ -108,6 +109,115 @@ TOOL_DEFINITIONS = [
}
}
},
{
"type": "function",
"function": {
"name": "plantnet_lookup",
"description": "Визначення рослин через Pl@ntNet API. Повертає top-k кандидатів з confidence.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Короткий опис рослини/культури (якщо немає image_url)"
},
"image_url": {
"type": "string",
"description": "Публічне посилання на фото рослини"
},
"organ": {
"type": "string",
"description": "Орган рослини: leaf/flower/fruit/bark/auto",
"default": "auto"
},
"top_k": {
"type": "integer",
"description": "Скільки кандидатів повернути (1-10)",
"default": 3
}
}
}
}
},
{
"type": "function",
"function": {
"name": "nature_id_identify",
"description": "Локальна/open-source ідентифікація рослин через nature-id сумісний сервіс.",
"parameters": {
"type": "object",
"properties": {
"image_url": {
"type": "string",
"description": "Публічне посилання на фото рослини"
},
"image_data": {
"type": "string",
"description": "Data URL зображення (data:image/...;base64,...)"
},
"top_k": {
"type": "integer",
"description": "Скільки кандидатів повернути (1-10)",
"default": 3
},
"min_confidence": {
"type": "number",
"description": "Поріг confidence для fallback на GBIF",
"default": 0.65
}
}
}
}
},
{
"type": "function",
"function": {
"name": "gbif_species_lookup",
"description": "Пошук таксонів у GBIF для валідації назви культури/рослини.",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Назва/термін для пошуку виду"
},
"limit": {
"type": "integer",
"description": "Кількість результатів (1-10)",
"default": 5
}
},
"required": ["query"]
}
}
},
{
"type": "function",
"function": {
"name": "agrovoc_lookup",
"description": "Нормалізація агро-термінів через AGROVOC (SPARQL).",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "Термін культури/хвороби/технології"
},
"lang": {
"type": "string",
"description": "Мова міток (en/uk/ru)",
"default": "en"
},
"limit": {
"type": "integer",
"description": "Кількість результатів (1-10)",
"default": 5
}
},
"required": ["query"]
}
}
},
# PRIORITY 3: Generation tools
{
"type": "function",
@@ -681,6 +791,42 @@ class ToolManager:
tool_names = [t.get("function", {}).get("name") for t in filtered]
logger.debug(f"Agent {agent_id} has {len(filtered)} tools: {tool_names}")
return filtered
@staticmethod
def _is_image_data_url(value: str) -> bool:
v = str(value or "").strip()
return bool(v.startswith("data:image/") and ";base64," in v)
@staticmethod
def _is_known_non_direct_image_url(url: str) -> bool:
u = str(url or "").strip()
if not u:
return False
try:
p = urlparse(u)
except Exception:
return True
host = (p.netloc or "").lower()
if host in {"t.me", "telegram.me"}:
return True
if "web.telegram.org" in host:
return True
return False
@staticmethod
def _normalize_confidence(value: Any) -> float:
try:
v = float(value)
except Exception:
return 0.0
if v < 0:
return 0.0
# Some backends return percentages (e.g. 97.6) instead of 0..1.
if v > 1.0 and v <= 100.0:
v = v / 100.0
if v > 1.0:
v = 1.0
return v
async def execute_tool(
self,
@@ -709,6 +855,14 @@ class ToolManager:
return await self._web_search(arguments)
elif tool_name == "web_extract":
return await self._web_extract(arguments)
elif tool_name == "plantnet_lookup":
return await self._plantnet_lookup(arguments)
elif tool_name == "nature_id_identify":
return await self._nature_id_identify(arguments)
elif tool_name == "gbif_species_lookup":
return await self._gbif_species_lookup(arguments)
elif tool_name == "agrovoc_lookup":
return await self._agrovoc_lookup(arguments)
elif tool_name == "image_generate":
return await self._image_generate(arguments)
elif tool_name == "comfy_generate_image":
@@ -2530,6 +2684,272 @@ class ToolManager:
except Exception as e:
return ToolResult(success=False, result=None, error=str(e))
async def _plantnet_lookup(self, args: Dict) -> ToolResult:
    """Plant identification via the Pl@ntNet API with open-source fallbacks.

    Resolution order:
      1. Pl@ntNet HTTP API, when ``image_url`` and PLANTNET_API_KEY are set.
      2. ``_nature_id_identify`` (self-hosted) for any provided image.
      3. ``_gbif_species_lookup`` for a plain-text ``query``.

    Returns a ToolResult; never raises for malformed arguments.
    """
    query = str(args.get("query", "") or "").strip()
    image_url = str(args.get("image_url", "") or "").strip()
    image_data = str(args.get("image_data", "") or "").strip()
    runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
    # The runtime may attach the actual photo out-of-band; prefer it when the
    # model did not pass image_data explicitly.
    if not image_data and self._is_image_data_url(runtime_image_data):
        image_data = runtime_image_data
    organ = str(args.get("organ", "auto") or "auto").strip().lower()
    # Fix: a malformed top_k (e.g. "three" or None) previously raised an
    # uncaught ValueError/TypeError before any backend was tried; fall back
    # to the default instead.
    try:
        top_k = max(1, min(int(args.get("top_k", 3)), 5))
    except (TypeError, ValueError):
        top_k = 3
    api_key = (os.getenv("PLANTNET_API_KEY") or "").strip()
    if image_url and api_key:
        try:
            params = {
                "api-key": api_key,
                "images": image_url,
                # Pl@ntNet requires a concrete organ; "leaf" is the safest default.
                "organs": "leaf" if organ == "auto" else organ,
                "lang": "en",
            }
            resp = await self.http_client.get(
                "https://my-api.plantnet.org/v2/identify/all",
                params=params,
                timeout=25.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                results = (data.get("results") or [])[:top_k]
                if not results:
                    return ToolResult(success=True, result="Pl@ntNet: кандидатів не знайдено.")
                lines = []
                for idx, item in enumerate(results, 1):
                    species = (item.get("species") or {})
                    sname = species.get("scientificNameWithoutAuthor") or species.get("scientificName") or "unknown"
                    common = species.get("commonNames") or []
                    cname = common[0] if common else "-"
                    score = float(item.get("score") or 0.0)
                    lines.append(f"{idx}. {sname} ({cname}) score={score:.3f}")
                return ToolResult(success=True, result="Pl@ntNet candidates:\n" + "\n".join(lines))
            return ToolResult(success=False, result=None, error=f"plantnet_http_{resp.status_code}")
        except Exception as e:
            return ToolResult(success=False, result=None, error=f"plantnet_error: {e}")
    if image_url or image_data:
        ni_args: Dict[str, Any] = {"top_k": top_k}
        if image_data:
            ni_args["image_data"] = image_data
        else:
            ni_args["image_url"] = image_url
        if runtime_image_data:
            ni_args["_runtime_image_data"] = runtime_image_data
        ni = await self._nature_id_identify(ni_args)
        if ni.success:
            return ni
    if query:
        return await self._gbif_species_lookup({"query": query, "limit": top_k})
    return ToolResult(
        success=False,
        result=None,
        error="No available plant ID backend (set PLANTNET_API_KEY or NATURE_ID_URL, or provide text query)",
    )
async def _nature_id_identify(self, args: Dict) -> ToolResult:
    """Open-source plant identification via self-hosted nature-id compatible endpoint.

    Accepts either ``image_url`` (direct image link) or ``image_data`` (a
    ``data:image/...;base64,...`` URL), posts it to the service configured by
    NATURE_ID_URL and returns a JSON payload with normalized top-k candidates.
    When the top-1 confidence is below ``min_confidence`` the payload sets
    ``recommend_fallback`` and, when possible, embeds a GBIF validation lookup
    for the best candidate.
    """
    image_url = str(args.get("image_url", "") or "").strip()
    image_data = str(args.get("image_data", "") or "").strip()
    runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
    # Prefer runtime-attached photo bytes when the model gave no explicit image_data.
    if not image_data and self._is_image_data_url(runtime_image_data):
        image_data = runtime_image_data
    # Clamp the requested candidate count to 1..10.
    top_k = max(1, min(int(args.get("top_k", 3)), 10))
    min_confidence = float(args.get("min_confidence", os.getenv("NATURE_ID_MIN_CONFIDENCE", "0.65")))
    # Telegram page links serve HTML, not pixels: swap in raw data or bail out.
    if image_url and self._is_known_non_direct_image_url(image_url):
        if image_data:
            logger.info("nature_id_identify: replacing non-direct image_url with runtime image_data")
            image_url = ""
        else:
            return ToolResult(
                success=False,
                result=None,
                error="image_url is not direct image URL; provide image_data or direct Telegram file URL",
            )
    if not image_url and not image_data:
        return ToolResult(success=False, result=None, error="image_url or image_data is required")
    base = (os.getenv("NATURE_ID_URL") or "").strip().rstrip("/")
    if not base:
        return ToolResult(success=False, result=None, error="NATURE_ID_URL is not configured")
    try:
        if image_data:
            # data URL -> multipart /identify-file
            if not image_data.startswith("data:") or "," not in image_data:
                return ToolResult(success=False, result=None, error="invalid image_data format")
            header, b64 = image_data.split(",", 1)
            mime = "image/jpeg"
            if ";base64" in header:
                # e.g. "data:image/png;base64" -> "image/png"
                mime = header.split(":", 1)[1].split(";", 1)[0] or "image/jpeg"
            ext = "jpg"
            if "png" in mime:
                ext = "png"
            try:
                image_bytes = base64.b64decode(b64)
            except Exception:
                return ToolResult(success=False, result=None, error="invalid image_data base64")
            files = {"file": (f"upload.{ext}", image_bytes, mime)}
            resp = await self.http_client.post(
                f"{base}/identify-file",
                params={"top_k": top_k},
                files=files,
                timeout=45.0,
            )
        else:
            # Direct URL -> JSON /identify endpoint.
            payload = {"image_url": image_url, "top_k": top_k}
            resp = await self.http_client.post(f"{base}/identify", json=payload, timeout=45.0)
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"nature_id_http_{resp.status_code}")
        data = resp.json() or {}
        status = str(data.get("status") or "success")
        # Newer services return "top_k" rows; older ones "predictions"/"results".
        raw_top_k = data.get("top_k") or []
        raw_preds = data.get("predictions") or data.get("results") or []
        top_k_rows = []
        if isinstance(raw_top_k, list) and raw_top_k:
            for row in raw_top_k[:top_k]:
                if not isinstance(row, dict):
                    continue
                conf = row.get("confidence", 0.0)
                conf_f = self._normalize_confidence(conf)
                top_k_rows.append({
                    "confidence": conf_f,
                    "name": str(row.get("name") or row.get("scientific_name") or "unknown"),
                    "scientific_name": str(row.get("scientific_name") or row.get("name") or "unknown"),
                })
        else:
            # Legacy shape: each prediction may use score/confidence and
            # assorted name keys; normalize them to the same row schema.
            for item in raw_preds[:top_k]:
                if not isinstance(item, dict):
                    continue
                score = item.get("score", item.get("confidence", 0.0))
                score_f = self._normalize_confidence(score)
                sname = item.get("scientific_name") or item.get("label") or item.get("name") or "unknown"
                cname = item.get("common_name") or item.get("common") or sname
                top_k_rows.append({
                    "confidence": score_f,
                    "name": str(cname),
                    "scientific_name": str(sname),
                })
        if not top_k_rows:
            # No candidates at all: still a successful call, but advise fallback.
            return ToolResult(success=True, result=json.dumps({
                "status": status,
                "model": data.get("model") or "aiy_plants_V1",
                "source": data.get("source") or "nature-id-cli",
                "top_k": [],
                "confidence": 0.0,
                "recommend_fallback": True,
                "reason": "no_predictions",
            }, ensure_ascii=False))
        top1 = top_k_rows[0]
        top1_conf = float(top1.get("confidence", 0.0))
        recommend_fallback = top1_conf < min_confidence
        out = {
            "status": status,
            "model": data.get("model") or "aiy_plants_V1",
            "source": data.get("source") or "nature-id-cli",
            "inference_time_sec": data.get("inference_time_sec"),
            "top_k": top_k_rows,
            "confidence": top1_conf,
            "min_confidence": min_confidence,
            "recommend_fallback": recommend_fallback,
            "fallback": "gbif_species_lookup",
        }
        if recommend_fallback:
            # Low confidence: try to corroborate the best candidate via GBIF.
            fallback_query = str(top1.get("scientific_name") or top1.get("name") or "").strip()
            if fallback_query and fallback_query.lower() != "unknown":
                gbif = await self._gbif_species_lookup({"query": fallback_query, "limit": min(5, top_k)})
                if gbif.success and gbif.result:
                    out["gbif_validation"] = gbif.result
        return ToolResult(success=True, result=json.dumps(out, ensure_ascii=False))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"nature_id_error: {e}")
async def _gbif_species_lookup(self, args: Dict) -> ToolResult:
    """Look up species records in the public GBIF search API."""
    term = str(args.get("query", "") or "").strip()
    limit = max(1, min(int(args.get("limit", 5)), 10))
    if not term:
        return ToolResult(success=False, result=None, error="query is required")
    try:
        resp = await self.http_client.get(
            "https://api.gbif.org/v1/species/search",
            params={"q": term, "limit": limit, "status": "ACCEPTED"},
            timeout=20.0,
        )
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"gbif_http_{resp.status_code}")
        payload = resp.json() or {}
        matches = payload.get("results") or []
        if not matches:
            return ToolResult(success=True, result="GBIF: результатів не знайдено.")
        formatted = [
            "{}. {} | rank={} | status={} | key={}".format(
                pos,
                rec.get("scientificName") or rec.get("canonicalName") or "unknown",
                rec.get("rank") or "-",
                rec.get("taxonomicStatus") or "-",
                rec.get("key"),
            )
            for pos, rec in enumerate(matches[:limit], 1)
        ]
        return ToolResult(success=True, result="GBIF matches:\n" + "\n".join(formatted))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"gbif_error: {e}")
async def _agrovoc_lookup(self, args: Dict) -> ToolResult:
    """Normalize an agronomy term against the AGROVOC thesaurus via SPARQL."""
    term = str(args.get("query", "") or "").strip()
    lang = str(args.get("lang", "en") or "en").strip().lower()
    limit = max(1, min(int(args.get("limit", 5)), 10))
    if not term:
        return ToolResult(success=False, result=None, error="query is required")
    if lang not in {"en", "uk", "ru"}:
        lang = "en"
    # Strip characters that could break out of the SPARQL string literal.
    safe_q = term.replace('\\', ' ').replace('"', ' ').strip()
    sparql = (
        "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
        "SELECT ?concept ?label WHERE { "
        "?concept skos:prefLabel ?label . "
        f"FILTER(lang(?label) = '{lang}') "
        f"FILTER(CONTAINS(LCASE(STR(?label)), LCASE(\"{safe_q}\"))) "
        "} LIMIT " + str(limit)
    )
    try:
        resp = await self.http_client.get(
            "https://agrovoc.fao.org/sparql",
            params={"query": sparql, "format": "json"},
            timeout=25.0,
        )
        if resp.status_code != 200:
            return ToolResult(success=False, result=None, error=f"agrovoc_http_{resp.status_code}")
        payload = resp.json() or {}
        bindings = ((payload.get("results") or {}).get("bindings")) or []
        if not bindings:
            return ToolResult(success=True, result="AGROVOC: результатів не знайдено.")
        rows = []
        for pos, binding in enumerate(bindings[:limit], 1):
            label_text = ((binding.get("label") or {}).get("value") or "").strip()
            concept_uri = ((binding.get("concept") or {}).get("value") or "").strip()
            rows.append(f"{pos}. {label_text} | {concept_uri}")
        return ToolResult(success=True, result="AGROVOC matches:\n" + "\n".join(rows))
    except Exception as e:
        return ToolResult(success=False, result=None, error=f"agrovoc_error: {e}")
async def _unload_ollama_models(self):
"""Unload all Ollama models to free VRAM for heavy operations like FLUX"""
ollama_url = os.getenv("OLLAMA_BASE_URL", "http://172.18.0.1:11434")
@@ -2942,7 +3362,11 @@ class ToolManager:
if results:
result = results[0] if isinstance(results, list) else results
markdown = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
raw_content = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
if isinstance(raw_content, (dict, list, tuple)):
markdown = json.dumps(raw_content, ensure_ascii=False)
else:
markdown = str(raw_content or "")
title = result.get("title", url)
if len(markdown) > 3000:
@@ -2951,13 +3375,30 @@ class ToolManager:
response_parts = [f"**{title}**", "", markdown]
if extract_links:
links = result.get("links", [])
if links:
links_raw = result.get("links", [])
normalized_links: List[Any] = []
if isinstance(links_raw, dict):
for bucket in links_raw.values():
if isinstance(bucket, list):
normalized_links.extend(bucket)
elif bucket:
normalized_links.append(bucket)
elif isinstance(links_raw, list):
normalized_links = links_raw
elif links_raw:
normalized_links = [links_raw]
if normalized_links:
response_parts.append("")
response_parts.append("**Посилання:**")
for link in links[:10]:
for link in normalized_links[:10]:
if isinstance(link, dict):
link_url = link.get("href", "")
link_url = (
link.get("href")
or link.get("url")
or link.get("link")
or ""
)
else:
link_url = str(link)
if link_url:

View File

@@ -11,10 +11,13 @@ import os
import asyncio
import logging
import base64
import json
import re
from typing import Optional, Dict, List, Any, Union
from datetime import datetime, timedelta
from enum import Enum
from io import BytesIO
import xml.etree.ElementTree as ET
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
@@ -56,16 +59,34 @@ def _csv_to_markdown(content: bytes) -> str:
text = _decode_text_bytes(content)
reader = csv.reader(text.splitlines())
rows = list(reader)
return _rows_to_markdown(rows)
def _tsv_to_markdown(content: bytes) -> str:
    """Render tab-separated bytes as a Markdown table."""
    decoded = _decode_text_bytes(content)
    parsed_rows = list(csv.reader(decoded.splitlines(), delimiter="\t"))
    return _rows_to_markdown(parsed_rows)
def _rows_to_markdown(rows: List[List[Any]]) -> str:
if not rows:
return ""
header = rows[0]
body = rows[1:]
width = max(len(r) for r in rows)
norm_rows = []
for r in rows:
rr = [str(c) if c is not None else "" for c in r]
if len(rr) < width:
rr.extend([""] * (width - len(rr)))
norm_rows.append(rr)
header = norm_rows[0]
body = norm_rows[1:]
lines = [
"| " + " | ".join(header) + " |",
"| " + " | ".join(["---"] * len(header)) + " |",
]
for row in body:
lines.append("| " + " | ".join(row) + " |")
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
return "\n".join(lines)
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
return "\n".join(parts)
def _xls_to_markdown(content: bytes) -> str:
    """Convert a legacy .xls workbook to Markdown, one table per sheet."""
    try:
        import xlrd
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
    workbook = xlrd.open_workbook(file_contents=content)
    sections = []
    for sheet in workbook.sheets():
        sections.append(f"## Sheet: {sheet.name}")
        grid = [
            [sheet.cell_value(row_idx, col_idx) for col_idx in range(sheet.ncols)]
            for row_idx in range(sheet.nrows)
        ]
        sections.append(_rows_to_markdown(grid) if grid else "_Empty sheet_")
    return "\n\n".join(sections)
def _ods_to_markdown(content: bytes) -> str:
    """Convert an OpenDocument spreadsheet (.ods) to Markdown, one table per sheet.

    Raises HTTPException(500) when odfpy is not installed and
    HTTPException(400) when the bytes are not a valid ODS document.
    """
    try:
        from odf.opendocument import load
        from odf.table import Table, TableRow, TableCell
        from odf.text import P
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")
    try:
        doc = load(BytesIO(content))
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")
    parts = []
    for table in doc.spreadsheet.getElementsByType(Table):
        table_name = str(table.getAttribute("name") or "Sheet")
        parts.append(f"## Sheet: {table_name}")
        rows: List[List[str]] = []
        for row in table.getElementsByType(TableRow):
            cells_out: List[str] = []
            for cell in row.getElementsByType(TableCell):
                # Collect the text content of every paragraph node in the cell.
                txt_parts = []
                for p in cell.getElementsByType(P):
                    txt_parts.extend(
                        [str(getattr(node, "data", "")).strip() for node in p.childNodes if getattr(node, "data", None)]
                    )
                cell_text = " ".join([t for t in txt_parts if t]).strip()
                # ODS compresses identical adjacent cells via the
                # number-columns-repeated attribute; expand them, capped at
                # 100 to bound pathological files.
                repeat_raw = cell.getAttribute("numbercolumnsrepeated")
                try:
                    repeat = int(repeat_raw) if repeat_raw else 1
                except Exception:
                    repeat = 1
                repeat = max(1, min(repeat, 100))
                for _ in range(repeat):
                    cells_out.append(cell_text)
            if cells_out:
                rows.append(cells_out)
        if not rows:
            parts.append("_Empty sheet_")
            continue
        parts.append(_rows_to_markdown(rows))
    return "\n\n".join(parts)
def _docx_to_text(content: bytes) -> str:
try:
from docx import Document
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
return "\n\n".join(text_content)
def _pptx_to_text(content: bytes) -> str:
    """Extract slide text from a .pptx deck as Markdown-style sections."""
    try:
        from pptx import Presentation
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")
    deck = Presentation(BytesIO(content))
    sections = []
    for slide_no, slide in enumerate(deck.slides, start=1):
        sections.append(f"## Slide {slide_no}")
        texts = []
        for shape in slide.shapes:
            shape_text = getattr(shape, "text", None)
            if shape_text and str(shape_text).strip():
                texts.append(str(shape_text).strip())
        if texts:
            sections.extend(texts)
        else:
            sections.append("_No text on this slide_")
    return "\n\n".join(sections)
def _json_to_text(content: bytes) -> str:
    """Pretty-print JSON bytes; fall back to the raw text when parsing fails."""
    text = _decode_text_bytes(content)
    try:
        document = json.loads(text)
        return json.dumps(document, ensure_ascii=False, indent=2)
    except Exception:
        return text
def _yaml_to_text(content: bytes) -> str:
    """Normalize YAML bytes via safe load/dump; return the raw text on failure."""
    text = _decode_text_bytes(content)
    try:
        document = yaml.safe_load(text)
        return yaml.safe_dump(document, allow_unicode=True, sort_keys=False)
    except Exception:
        return text
def _xml_to_text(content: bytes) -> str:
    """Flatten XML bytes into space-joined text; return raw text on parse errors."""
    source = _decode_text_bytes(content)
    try:
        root = ET.fromstring(source)
        pieces = [chunk.strip() for chunk in root.itertext() if chunk and chunk.strip()]
        joined = " ".join(pieces)
        return joined or source
    except Exception:
        return source
def _html_to_text(content: bytes) -> str:
    """Extract readable text from HTML bytes (BeautifulSoup, regex fallback)."""
    markup = _decode_text_bytes(content)
    try:
        from bs4 import BeautifulSoup
        extracted = BeautifulSoup(markup, "html.parser").get_text(separator="\n")
        collapsed = re.sub(r"\n{3,}", "\n\n", extracted)
        return collapsed.strip() or markup
    except Exception:
        # bs4 unavailable (or parsing blew up): crude tag stripping instead.
        stripped = re.sub(r"<[^>]+>", " ", markup)
        return re.sub(r"\s+", " ", stripped).strip()
def _rtf_to_text(content: bytes) -> str:
    """Convert RTF bytes to plain text (striprtf, regex fallback)."""
    source = _decode_text_bytes(content)
    try:
        from striprtf.striprtf import rtf_to_text
        return rtf_to_text(source)
    except Exception:
        # striprtf unavailable: strip hex escapes, control words and braces.
        cleaned = re.sub(r"\\'[0-9a-fA-F]{2}", " ", source)
        cleaned = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", cleaned)
        cleaned = cleaned.replace("{", " ").replace("}", " ")
        return re.sub(r"\s+", " ", cleaned).strip()
def _extract_text_by_ext(filename: str, content: bytes) -> str:
    """Dispatch *content* to the extractor matching *filename*'s extension.

    Raises HTTPException(400) for unsupported extensions.

    NOTE(review): this span was corrupt merged-diff residue (the old
    ``["txt", "md"]`` and ``"xlsx"`` branches coexisted with their
    replacements); this is the reconstructed post-merge dispatcher.
    """
    ext = filename.split(".")[-1].lower() if "." in filename else ""
    if ext in ["txt", "md", "markdown"]:
        return _decode_text_bytes(content)
    if ext == "csv":
        return _csv_to_markdown(content)
    if ext == "tsv":
        return _tsv_to_markdown(content)
    if ext in {"xlsx", "xlsm"}:
        return _xlsx_to_markdown(content)
    if ext == "xls":
        return _xls_to_markdown(content)
    if ext == "ods":
        return _ods_to_markdown(content)
    if ext == "docx":
        return _docx_to_text(content)
    if ext == "pdf":
        return _pdf_to_text(content)
    if ext == "pptx":
        return _pptx_to_text(content)
    if ext == "json":
        return _json_to_text(content)
    if ext in {"yaml", "yml"}:
        return _yaml_to_text(content)
    if ext == "xml":
        return _xml_to_text(content)
    if ext in {"html", "htm"}:
        return _html_to_text(content)
    if ext == "rtf":
        return _rtf_to_text(content)
    raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
if total_size > max_total_mb * 1024 * 1024:
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
parts = []
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
allowed_exts = {
"txt", "md", "markdown", "csv", "tsv",
"xls", "xlsx", "xlsm", "ods",
"docx", "pdf", "pptx",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
}
processed = []
skipped = []
for member in members:
@@ -1655,7 +1837,8 @@ async def document_endpoint(
- json: Structured JSON with document elements
- text: Plain text extraction
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
Supported files:
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
"""
try:
import time
@@ -1672,15 +1855,28 @@ async def document_endpoint(
filename = file.filename if file else "document"
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
# Handle text-based formats without Docling
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
# Handle deterministic extraction for standard office/text formats
if file_ext in [
"txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
"pptx", "zip",
]:
try:
if file_ext == "zip":
content = _zip_to_markdown(doc_data)
output_format = "markdown"
else:
content = _extract_text_by_ext(filename, doc_data)
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
output_format = (
"markdown"
if file_ext in {
"md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
}
else "text"
)
processing_time_ms = (time.time() - start_time) * 1000
return {
"success": True,
@@ -1764,22 +1960,27 @@ async def document_endpoint(
"device": swapper.device
}
# For DOCX, try python-docx
if file_ext == "docx":
# For common office/text formats, try deterministic extractors.
if file_ext in {
"docx", "txt", "md", "markdown", "csv", "tsv",
"xlsx", "xls", "xlsm", "ods",
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
}:
try:
content = _docx_to_text(doc_data)
content = _extract_text_by_ext(filename, doc_data)
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
return {
"success": True,
"model": "python-docx (fallback)",
"output_format": "text",
"model": "text-extract (fallback)",
"output_format": out_fmt,
"result": content,
"filename": filename,
"processing_time_ms": (time.time() - start_time) * 1000,
"device": swapper.device
}
except Exception as e:
logger.error(f"DOCX fallback failed: {e}")
raise HTTPException(status_code=500, detail="DOCX extraction failed")
logger.error(f"Text fallback failed for .{file_ext}: {e}")
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
# For PDFs, try pdfplumber
if file_ext == "pdf":
@@ -1807,7 +2008,7 @@ async def document_endpoint(
# For other documents, return error
raise HTTPException(
status_code=503,
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
)
finally:
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
# Allow running the service directly; production deployments typically start
# uvicorn via a process manager instead.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8890)

View File

@@ -4,6 +4,15 @@ httpx==0.25.2
pydantic==2.5.0
pyyaml==6.0.1
python-multipart==0.0.6
chardet>=5.2.0
openpyxl>=3.1.2
python-docx>=1.1.2
pdfplumber>=0.11.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26
# HuggingFace dependencies for OCR models
torch>=2.0.0
@@ -25,4 +34,4 @@ safetensors>=0.4.0
# Web Scraping & Search
trafilatura>=1.6.0
duckduckgo-search>=4.0.0
duckduckgo-search>=4.0.0

View File

@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
python-docx>=1.1.0
openpyxl>=3.1.2
chardet>=5.2.0
python-pptx>=0.6.23
xlrd>=2.0.1
odfpy>=1.4.1
beautifulsoup4>=4.12.0
striprtf>=0.0.26