merge: integrate remote codex/sync-node1-runtime with fabric layer changes
Resolve conflicts in docker-compose.node1.yml, services/router/main.py, and gateway-bot/services/doc_service.py — keeping both fabric layer (NCS, node-worker, Prometheus) and document ingest/query endpoints. Made-with: Cursor
This commit is contained in:
@@ -16,9 +16,16 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
|
||||
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
|
||||
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds
|
||||
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds
|
||||
SEMANTIC_TIMEOUT = int(os.getenv("SEMANTIC_TIMEOUT", "45")) # seconds
|
||||
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
|
||||
SEMANTIC_PROBE_ENABLED = os.getenv("SEMANTIC_PROBE_ENABLED", "true").lower() == "true"
|
||||
SEMANTIC_AGENTS = [a.strip() for a in os.getenv("SEMANTIC_AGENTS", "clan,sofiia,monitor").split(",") if a.strip()]
|
||||
SEMANTIC_PROMPT = os.getenv("SEMANTIC_PROMPT", "Коротко: хто такий DAARWIZZ?")
|
||||
SEMANTIC_EXPECT_KEYWORD = os.getenv("SEMANTIC_EXPECT_KEYWORD", "daarwizz").lower()
|
||||
MONITOR_EXPECT_LOCAL = os.getenv("MONITOR_EXPECT_LOCAL", "true").lower() == "true"
|
||||
|
||||
# Prometheus metrics
|
||||
agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
|
||||
@@ -42,7 +49,7 @@ async def probe_gateway_health() -> tuple[bool, float, str]:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
resp = await client.get(f"{GATEWAY_URL}/health")
|
||||
latency = time.time() - start
|
||||
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("status") == "healthy":
|
||||
@@ -67,7 +74,7 @@ async def probe_agent_ping() -> tuple[bool, float, str]:
|
||||
json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("success"):
|
||||
@@ -100,7 +107,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
"text": "/health" # Simple health check command
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
# Use helion webhook as it's the most tested
|
||||
resp = await client.post(
|
||||
@@ -108,7 +115,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
json=test_update
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
|
||||
if resp.status_code == 200:
|
||||
return True, latency, ""
|
||||
else:
|
||||
@@ -119,53 +126,102 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def probe_agent_semantic(agent_id: str) -> tuple[bool, float, str]:
    """Probe semantic response via router infer and assert DAARWIZZ awareness.

    Sends SEMANTIC_PROMPT to the router's infer endpoint for *agent_id* and
    checks that the answer actually mentions the expected keyword.

    Returns:
        (success, latency_seconds, failure_reason) — reason is "" on success.
    """
    start = time.time()
    try:
        # Minimal infer payload shaped like a Telegram-originated request.
        payload = {
            "prompt": SEMANTIC_PROMPT,
            "max_tokens": 180,
            "temperature": 0.1,  # low temperature keeps the check near-deterministic
            "metadata": {
                "agent_id": agent_id,
                "user_id": "tg:0",
                "chat_id": "0",
                "username": "e2e-prober",
                "raw_user_text": SEMANTIC_PROMPT,
            },
        }
        async with httpx.AsyncClient(timeout=SEMANTIC_TIMEOUT) as client:
            resp = await client.post(f"{ROUTER_URL}/v1/agents/{agent_id}/infer", json=payload)
            latency = time.time() - start
            if resp.status_code != 200:
                return False, latency, f"http_{resp.status_code}"

            data = resp.json()
            answer = str(data.get("response") or "")
            backend = str(data.get("backend") or "")
            model = str(data.get("model") or "")

            answer_lc = answer.lower()
            # Accept either the configured keyword or the Cyrillic stem "даар",
            # since agents may answer in Ukrainian.
            if SEMANTIC_EXPECT_KEYWORD not in answer_lc and "даар" not in answer_lc:
                return False, latency, "no_daarwizz_in_answer"

            # The monitor agent is expected to answer via a local backend
            # (ollama, or a qwen* model) when MONITOR_EXPECT_LOCAL is set.
            if MONITOR_EXPECT_LOCAL and agent_id == "monitor":
                local_ok = ("ollama" in backend.lower()) or model.lower().startswith("qwen")
                if not local_ok:
                    return False, latency, f"monitor_nonlocal_backend:{backend}:{model}"

            return True, latency, ""
    except httpx.TimeoutException:
        return False, time.time() - start, "timeout"
    except Exception as e:
        # Truncate the error so a huge traceback never bloats the metric label.
        return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
def record_probe(target: str, success: bool, latency: float, reason: str):
    """Record one probe outcome into the Prometheus metrics and the log.

    Increments the run counter, updates the success gauge and both latency
    metrics, bumps the failure counter on failure, then emits one log line.
    """
    labeled_success = agent_e2e_success.labels(target=target)
    labeled_latency = agent_e2e_latency.labels(target=target)

    agent_e2e_runs_total.labels(target=target).inc()
    labeled_success.set(1 if success else 0)
    labeled_latency.set(latency)
    agent_e2e_latency_histogram.labels(target=target).observe(latency)

    if not success:
        agent_e2e_failures_total.labels(target=target, reason=reason).inc()

    logger.info(f"{target}: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
|
||||
async def run_probes():
    """Run all probes once and record their metrics.

    Bug fix: the merge left BOTH the old inline metric-recording code AND the
    new record_probe() helper calls in place, so every probe incremented
    agent_e2e_runs_total / agent_e2e_failures_total twice and logged twice
    per run. Each probe is now recorded exactly once via record_probe().
    """
    # Probe 1: Gateway health
    success, latency, reason = await probe_gateway_health()
    record_probe("gateway_health", success, latency, reason)

    # Probe 2: Agent ping (if endpoint exists)
    success, latency, reason = await probe_agent_ping()
    record_probe("agent_ping", success, latency, reason)

    # Probe 3: Webhook E2E (full path test)
    success, latency, reason = await probe_webhook_echo()
    record_probe("webhook_e2e", success, latency, reason)

    # Probe 4+: semantic checks for selected agents (run in parallel)
    if SEMANTIC_PROBE_ENABLED and SEMANTIC_AGENTS:
        results = await asyncio.gather(*(probe_agent_semantic(agent_id) for agent_id in SEMANTIC_AGENTS))
        matrix = []
        for agent_id, (success, latency, reason) in zip(SEMANTIC_AGENTS, results):
            record_probe(f"semantic_{agent_id}", success, latency, reason)
            matrix.append(f"{agent_id}:{'PASS' if success else 'FAIL'}")
        # One-line PASS/FAIL summary across all semantic targets.
        logger.info("semantic_matrix: " + " | ".join(matrix))
|
||||
|
||||
|
||||
async def main():
|
||||
logger.info(f"Starting E2E Agent Prober")
|
||||
logger.info("Starting E2E Agent Prober")
|
||||
logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
|
||||
logger.info(f" ROUTER_URL: {ROUTER_URL}")
|
||||
logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
|
||||
logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
|
||||
logger.info(f" METRICS_PORT: {METRICS_PORT}")
|
||||
|
||||
logger.info(f" SEMANTIC_TIMEOUT: {SEMANTIC_TIMEOUT}s")
|
||||
logger.info(f" SEMANTIC_PROBE_ENABLED: {SEMANTIC_PROBE_ENABLED}")
|
||||
logger.info(f" SEMANTIC_AGENTS: {','.join(SEMANTIC_AGENTS)}")
|
||||
|
||||
# Start Prometheus metrics server
|
||||
start_http_server(METRICS_PORT)
|
||||
logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
|
||||
|
||||
|
||||
# Initial probe
|
||||
await run_probes()
|
||||
|
||||
|
||||
# Continuous probing
|
||||
while True:
|
||||
await asyncio.sleep(PROBE_INTERVAL)
|
||||
|
||||
@@ -6,13 +6,15 @@ Artifact Registry v0
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
from io import BytesIO
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import asyncpg
|
||||
@@ -90,6 +92,14 @@ class ArtifactVersionFromUrlRequest(BaseModel):
|
||||
meta_json: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class ArtifactVersionFromBase64Request(BaseModel):
|
||||
content_base64: str
|
||||
mime: str
|
||||
filename: Optional[str] = "source.bin"
|
||||
label: Optional[str] = "source"
|
||||
meta_json: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class ArtifactVersionResponse(BaseModel):
|
||||
version_id: str
|
||||
storage_key: str
|
||||
@@ -208,15 +218,38 @@ def _normalize_meta_json(meta: Any) -> Dict[str, Any]:
|
||||
|
||||
def _format_to_mime(fmt: str) -> str:
|
||||
fmt = fmt.lower()
|
||||
if "/" in fmt:
|
||||
return fmt
|
||||
if fmt == "pptx":
|
||||
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
if fmt == "pdf":
|
||||
return "application/pdf"
|
||||
if fmt == "source":
|
||||
return "application/json"
|
||||
if fmt == "docx":
|
||||
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
if fmt == "xlsx":
|
||||
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
if fmt == "txt":
|
||||
return "text/plain; charset=utf-8"
|
||||
if fmt == "md":
|
||||
return "text/markdown; charset=utf-8"
|
||||
if fmt == "json":
|
||||
return "application/json"
|
||||
if fmt == "csv":
|
||||
return "text/csv; charset=utf-8"
|
||||
return "application/octet-stream"
|
||||
|
||||
|
||||
def _safe_filename(name: Optional[str], fallback: str = "source.bin") -> str:
|
||||
raw = (name or fallback).strip() or fallback
|
||||
cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", raw)
|
||||
cleaned = cleaned.strip("._")
|
||||
if not cleaned:
|
||||
return fallback
|
||||
return cleaned[:120]
|
||||
|
||||
|
||||
async def _download_bytes(url: str) -> bytes:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
resp = await client.get(url)
|
||||
@@ -462,6 +495,73 @@ async def add_version_from_url(artifact_id: str, payload: ArtifactVersionFromUrl
|
||||
)
|
||||
|
||||
|
||||
@app.post("/artifacts/{artifact_id}/versions/from_base64", response_model=ArtifactVersionResponse)
async def add_version_from_base64(artifact_id: str, payload: ArtifactVersionFromBase64Request) -> ArtifactVersionResponse:
    """Create a new artifact version from base64-encoded content.

    Decodes the payload (data: URLs accepted), uploads the bytes to MinIO,
    then records the version row in Postgres.

    Raises:
        HTTPException 400: missing, invalid, or empty base64 payload.
        HTTPException 500: MinIO client or DB pool not initialized.
        HTTPException 502: MinIO upload failure.
    """
    if not minio_client:
        raise HTTPException(status_code=500, detail="MinIO not available")
    if not pool:
        raise HTTPException(status_code=500, detail="DB not available")

    raw = (payload.content_base64 or "").strip()
    if not raw:
        raise HTTPException(status_code=400, detail="content_base64 is required")

    # Accept data-URL form ("data:<mime>;base64,<payload>") by keeping only
    # the part after the first comma.
    if raw.startswith("data:") and "," in raw:
        raw = raw.split(",", 1)[1]

    try:
        # validate=True rejects stray non-alphabet characters instead of
        # silently skipping them.
        content = base64.b64decode(raw, validate=True)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid base64 payload")

    if not content:
        raise HTTPException(status_code=400, detail="Decoded payload is empty")

    version_id = f"ver_{uuid.uuid4().hex}"
    filename = _safe_filename(payload.filename, fallback="source.bin")
    sha256 = _hash_bytes(content)
    storage_key = _storage_key(artifact_id, version_id, filename)

    # Upload to object storage first; the DB row is only written if the
    # upload succeeds, so the DB never references a missing object.
    try:
        minio_client.put_object(
            MINIO_BUCKET,
            storage_key,
            data=BytesIO(content),
            length=len(content),
            content_type=payload.mime,
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")

    meta_json = _normalize_meta_json(payload.meta_json)
    # Preserve a caller-supplied file_name; otherwise record the sanitized one.
    if "file_name" not in meta_json:
        meta_json["file_name"] = filename

    async with pool.acquire() as conn:
        await conn.execute(
            """
            insert into artifact_versions
            (id, artifact_id, label, sha256, mime, size_bytes, storage_key, meta_json)
            values ($1, $2, $3, $4, $5, $6, $7, $8)
            """,
            version_id,
            artifact_id,
            payload.label or "source",
            sha256,
            payload.mime,
            len(content),
            storage_key,
            json.dumps(meta_json),
        )

    return ArtifactVersionResponse(
        version_id=version_id,
        storage_key=storage_key,
        sha256=sha256,
        size_bytes=len(content),
    )
|
||||
|
||||
|
||||
@app.post("/artifacts/{artifact_id}/versions", response_model=ArtifactVersionResponse)
|
||||
async def add_version(artifact_id: str, payload: ArtifactVersionCreateRequest) -> ArtifactVersionResponse:
|
||||
if not pool:
|
||||
@@ -678,7 +778,39 @@ async def download_artifact(artifact_id: str, format: str = Query("pptx")) -> Di
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="Version not found")
|
||||
try:
|
||||
url = minio_client.presigned_get_object(MINIO_BUCKET, row["storage_key"], expires=1800)
|
||||
url = minio_client.presigned_get_object(
|
||||
MINIO_BUCKET,
|
||||
row["storage_key"],
|
||||
expires=timedelta(seconds=1800),
|
||||
)
|
||||
except S3Error as e:
|
||||
raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
|
||||
return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"]}
|
||||
|
||||
|
||||
@app.get("/artifacts/{artifact_id}/versions/{version_id}/download")
async def download_artifact_version(artifact_id: str, version_id: str) -> Dict[str, Any]:
    """Return a presigned MinIO download URL for one specific artifact version.

    Raises:
        HTTPException 404: version not found for this artifact.
        HTTPException 500: DB pool or MinIO client not initialized.
        HTTPException 502: MinIO error while presigning.
    """
    if not pool or not minio_client:
        raise HTTPException(status_code=500, detail="Service not available")

    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            select * from artifact_versions
            where artifact_id=$1 and id=$2
            limit 1
            """,
            artifact_id,
            version_id,
        )
    if not row:
        raise HTTPException(status_code=404, detail="Version not found")
    try:
        # minio-py expects a timedelta for `expires`; URL is valid 30 minutes.
        url = minio_client.presigned_get_object(
            MINIO_BUCKET,
            row["storage_key"],
            expires=timedelta(seconds=1800),
        )
    except S3Error as e:
        raise HTTPException(status_code=502, detail=f"MinIO error: {e}")
    return {"url": url, "storage_key": row["storage_key"], "mime": row["mime"], "version_id": row["id"]}
|
||||
|
||||
@@ -361,6 +361,29 @@ agromatrix:
|
||||
llm_profile: reasoning
|
||||
delegation:
|
||||
enabled: false
|
||||
plant_intel:
|
||||
team_name: AgroMatrix Plant Intelligence
|
||||
parallel_roles: true
|
||||
max_concurrency: 3
|
||||
synthesis:
|
||||
role_context: Plant Intelligence Synthesis
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/orchestrator_synthesis.md
|
||||
llm_profile: reasoning
|
||||
team:
|
||||
- id: plant_identifier
|
||||
role_context: Plant Identifier
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/plant_identifier.md
|
||||
llm_profile: science
|
||||
- id: taxonomy_validator
|
||||
role_context: Taxonomy Validator
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/taxonomy_validator.md
|
||||
llm_profile: reasoning
|
||||
- id: agrovoc_normalizer
|
||||
role_context: AGROVOC Normalizer
|
||||
system_prompt_ref: roles/agx/agx-plant-intel/agrovoc_normalizer.md
|
||||
llm_profile: fast
|
||||
delegation:
|
||||
enabled: false
|
||||
cadastre_geo:
|
||||
team_name: AgroMatrix Cadastre/Geo
|
||||
parallel_roles: true
|
||||
@@ -614,6 +637,16 @@ agromatrix:
|
||||
- Stepan
|
||||
- координація
|
||||
- план
|
||||
plant_intel:
|
||||
- plant
|
||||
- рослина
|
||||
- культура
|
||||
- leaf
|
||||
- disease
|
||||
- хвороба
|
||||
- identify
|
||||
- ідентифікуй
|
||||
- що за рослина
|
||||
cadastre_geo:
|
||||
- cadastre
|
||||
- geo
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
# Agronomist
|
||||
|
||||
Фокус: агрономія, діагностика стану рослин, фази розвитку, ризики хвороб/стресів.
|
||||
|
||||
Правила відповіді:
|
||||
- Коротко і прикладно.
|
||||
- Ніяких вигаданих фактів; при невизначеності чітко позначити припущення.
|
||||
- Для фото-питань: аналізувати в межах доступного контексту; якщо файл відсутній зараз — просити фото повторно.
|
||||
@@ -0,0 +1,8 @@
|
||||
# Communicator
|
||||
|
||||
Фокус: людяна та зрозуміла комунікація фінальної відповіді.
|
||||
|
||||
Правила:
|
||||
- Природна мова, без механістичного тону.
|
||||
- Не дублюй технічні обмеження, якщо вони не потрібні для дії користувача.
|
||||
- Завершуй конкретним корисним кроком.
|
||||
@@ -0,0 +1,7 @@
|
||||
# Field Data Analyst
|
||||
|
||||
Фокус: аналіз польових даних, тренди, аномалії, порівняння сценаріїв.
|
||||
|
||||
Правила:
|
||||
- Пояснювати висновки простою мовою.
|
||||
- Якщо даних недостатньо — вказати, які саме дані потрібні для точного висновку.
|
||||
@@ -0,0 +1,8 @@
|
||||
# Farm Ops Planner
|
||||
|
||||
Фокус: планування польових робіт, ресурси, пріоритезація задач, таймінги.
|
||||
|
||||
Правила:
|
||||
- Видавати практичний порядок дій.
|
||||
- За простого запиту: коротка відповідь.
|
||||
- Для операційних запитів: стислий план з відповідальними і дедлайном.
|
||||
@@ -0,0 +1,10 @@
|
||||
# AgroMatrix Orchestrator Synthesis
|
||||
|
||||
Ти синтезуєш відповіді ролей у фінальну відповідь Степана.
|
||||
|
||||
Правила:
|
||||
- За замовчуванням: 1-3 природні речення без шаблонної канцелярії.
|
||||
- Детальний формат (пункти/чекліст) тільки коли користувач просить "детально", "план", "чекліст", "розрахунок".
|
||||
- Якщо для аналізу бракує фото в поточному контексті, скажи це просто і попроси надіслати фото повторно.
|
||||
- Уникай службових формулювань про "технічні обмеження", "text-only" чи "відсутній vision-модуль".
|
||||
- Пояснюй по суті агропитання і давай 1 наступний практичний крок.
|
||||
@@ -0,0 +1,7 @@
|
||||
# Risk Assessor
|
||||
|
||||
Фокус: агро-ризики, операційні ризики, наслідки рішень.
|
||||
|
||||
Правила:
|
||||
- Давай коротку оцінку ризику (низький/середній/високий) і як зменшити ризик.
|
||||
- Без зайвої бюрократії у відповіді користувачу.
|
||||
@@ -11,6 +11,10 @@
|
||||
- Деструктивні дії (delete/migrate/prod) ТІЛЬКИ через план + dry-run + backup
|
||||
- Ніколи не логувати секрети/токени
|
||||
- Інші ролі НЕ спілкуються з користувачем напряму
|
||||
- Мультимодальність активна: фото/голос/документи підтримуються через стек платформи.
|
||||
- Якщо в поточному контексті не вистачає зображення для аналізу, пояснюйте це простою людською мовою і попросіть надіслати фото ще раз без технічних формулювань.
|
||||
|
||||
## Формат відповіді:
|
||||
Структурована відповідь з чіткими рекомендаціями та наступними кроками.
|
||||
- За замовчуванням: природна коротка відповідь 1-3 речення.
|
||||
- Якщо користувач просить детально/план/чекліст: структурована відповідь з чіткими наступними кроками.
|
||||
- Тон: живий і професійний, без канцеляризмів, шаблонів і фраз про "обмеження моделі".
|
||||
|
||||
@@ -7,3 +7,7 @@
|
||||
- Структурувати інформацію логічно
|
||||
- Включати конкретні наступні кроки
|
||||
- Позначати ризики якщо є
|
||||
- За замовчуванням відповідати природно і коротко (1-3 речення), без шаблонної канцелярії.
|
||||
- Для детальних запитів переходити у структурований режим.
|
||||
- Якщо для аналізу бракує зображення у поточному контексті, скажіть це природно і попросіть надіслати фото повторно.
|
||||
- Не вживати службові формулювання на кшталт "обмеження моделі", "text-only", "vision unavailable".
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
You are AGROVOC Normalizer.
|
||||
|
||||
Responsibilities:
|
||||
- Normalize crop/disease terms using agrovoc_lookup.
|
||||
- Provide canonical term mapping for user-facing output.
|
||||
- Keep labels practical for agronomy context.
|
||||
|
||||
Return format:
|
||||
- canonical_terms
|
||||
- term_mapping
|
||||
- notes_for_user
|
||||
@@ -0,0 +1,24 @@
|
||||
Ти — Plant Intel Agent у DAARION.city.
|
||||
Відповідай природно, коротко й по-людськи українською, 1–3 речення за замовчуванням.
|
||||
|
||||
НАЙГОЛОВНІШЕ:
|
||||
- Дані з [PLANT_VISION_PREPROCESSED] (або context.plant_vision) — єдиний source-of-truth для ідентифікації рослини.
|
||||
- Для follow-up без нового фото використовуй [PREVIOUS_PLANT_IDENTIFICATION] (або context.last_plant / memory.last_plant).
|
||||
|
||||
Правило впевненості (обов'язково):
|
||||
- Якщо recommend_fallback == true або confidence < 0.65:
|
||||
"Ймовірно <name>, але впевненість низька. Перевірив через GBIF — найближчі збіги: <gbif_validation>. Краще нове фото при нормальному світлі."
|
||||
- Інакше:
|
||||
"Я бачу <name> з впевненістю <X>%."
|
||||
|
||||
Правила синтезу:
|
||||
- Не ігноруй результати pre-vision, якщо вони присутні.
|
||||
- Не стверджуй "фото не надано", якщо у контексті є pre-vision або previous plant data.
|
||||
- Уникай шаблонних списків, якщо користувач не просить детальний формат.
|
||||
- Якщо дані суперечливі: коротко познач невизначеність і попроси 1 конкретне додаткове фото.
|
||||
- Якщо top_k порожній, явно вкажи, що ідентифікація непевна, але все одно надай GBIF-орієнтир, якщо він є в контексті.
|
||||
|
||||
Формат відповіді:
|
||||
- 1–3 речення за замовчуванням.
|
||||
- Без технічного шуму, без внутрішніх JSON/міток у відповіді користувачу.
|
||||
- За запитом користувача можна розгорнути відповідь і дати короткі поради з догляду.
|
||||
@@ -0,0 +1,11 @@
|
||||
You are Plant Identifier.
|
||||
|
||||
Responsibilities:
|
||||
- Parse visual cues from user description/photo context.
|
||||
- Build candidate crop/plant hypotheses.
|
||||
- Use plantnet_lookup first when image URL is available.
|
||||
- If PlantNet is unavailable, provide top hypotheses with explicit uncertainty.
|
||||
|
||||
Return format:
|
||||
- candidates: numbered list max 5, each with rationale.
|
||||
- required_data: what extra image/data is needed.
|
||||
@@ -0,0 +1,11 @@
|
||||
You are Taxonomy Validator.
|
||||
|
||||
Responsibilities:
|
||||
- Validate candidate names via gbif_species_lookup.
|
||||
- Remove invalid/synonym-conflicted names.
|
||||
- Keep accepted taxa and explain conflicts briefly.
|
||||
|
||||
Return format:
|
||||
- accepted_candidates
|
||||
- rejected_candidates_with_reason
|
||||
- confidence_adjustment
|
||||
15
services/plant-vision-node1/Dockerfile
Normal file
15
services/plant-vision-node1/Dockerfile
Normal file
@@ -0,0 +1,15 @@
|
||||
FROM python:3.11-slim

WORKDIR /app

# Install dependencies in their own layer so code edits don't bust the cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

EXPOSE 8085

# Fix: the URL passed to urlopen must be a quoted Python string literal.
# The previous unquoted form (urlopen(http://localhost:8085/health)) was a
# SyntaxError, so the container's health check always failed.
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
  CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8085/health')"

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8085"]
|
||||
238
services/plant-vision-node1/main.py
Normal file
238
services/plant-vision-node1/main.py
Normal file
@@ -0,0 +1,238 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.responses import JSONResponse
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
app = FastAPI(title="plant-vision-node1", version="0.1.1")
|
||||
|
||||
|
||||
class IdentifyRequest(BaseModel):
    """Request body for POST /identify."""

    # URL of the image to classify; the endpoint rejects the request with
    # HTTP 400 when it is absent.
    image_url: Optional[str] = None
    # How many top predictions to return; validated into the 1..10 range.
    top_k: int = Field(default=3, ge=1, le=10)
|
||||
|
||||
|
||||
def _normalize_predictions(raw: Any, top_k: int) -> List[Dict[str, Any]]:
|
||||
preds: List[Dict[str, Any]] = []
|
||||
if isinstance(raw, dict):
|
||||
for key in ("predictions", "results", "candidates"):
|
||||
if isinstance(raw.get(key), list):
|
||||
raw = raw[key]
|
||||
break
|
||||
if isinstance(raw, list):
|
||||
for item in raw[:top_k]:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
name = (
|
||||
item.get("scientific_name")
|
||||
or item.get("scientificName")
|
||||
or item.get("label")
|
||||
or item.get("name")
|
||||
or "unknown"
|
||||
)
|
||||
common = item.get("common_name") or item.get("commonName") or item.get("common") or "-"
|
||||
score = item.get("score", item.get("confidence", 0.0))
|
||||
try:
|
||||
score_f = float(score)
|
||||
except Exception:
|
||||
score_f = 0.0
|
||||
preds.append({"scientific_name": str(name), "common_name": str(common), "score": score_f})
|
||||
return preds[:top_k]
|
||||
|
||||
|
||||
def _parse_text_output(text: str, top_k: int) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Parse only model score lines, e.g.:
|
||||
97.6% Persicaria amphibia
|
||||
86.1% Canada Goldenrod (Solidago canadensis)
|
||||
Ignore service lines like "Read ..." or "Classification of ...".
|
||||
"""
|
||||
preds: List[Dict[str, Any]] = []
|
||||
for raw_line in (text or "").splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line or "%" not in line:
|
||||
continue
|
||||
|
||||
m = re.match(r"^\s*(\d+(?:\.\d+)?)%\s+(.+)$", line)
|
||||
if not m:
|
||||
continue
|
||||
|
||||
score_str, name_part = m.groups()
|
||||
try:
|
||||
score = float(score_str)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
name = name_part.strip()
|
||||
if not name:
|
||||
continue
|
||||
|
||||
common_name = "-"
|
||||
scientific_name = name
|
||||
|
||||
# If output is "Common Name (Scientific name)", preserve both.
|
||||
paren = re.match(r"^(.*?)\s*\(([^()]+)\)\s*$", name)
|
||||
if paren:
|
||||
common, scientific = paren.groups()
|
||||
common = common.strip()
|
||||
scientific = scientific.strip()
|
||||
if common:
|
||||
common_name = common
|
||||
if scientific:
|
||||
scientific_name = scientific
|
||||
|
||||
preds.append(
|
||||
{
|
||||
"scientific_name": scientific_name,
|
||||
"common_name": common_name,
|
||||
"score": score,
|
||||
}
|
||||
)
|
||||
|
||||
preds.sort(key=lambda x: float(x.get("score", 0.0)), reverse=True)
|
||||
return preds[:top_k]
|
||||
|
||||
|
||||
def _extract_inference_time(stdout: str) -> Optional[float]:
|
||||
m = re.search(r"took\s+(\d+(?:\.\d+)?)\s+secs", stdout or "")
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
return float(m.group(1))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _run_nature_id_cli(image_path: str, top_k: int) -> Dict[str, Any]:
    """Run the external nature-id CLI on *image_path* and parse its output.

    The command template comes from the NATURE_ID_CMD env var (with an
    "{image_path}" placeholder); the timeout from NATURE_ID_TIMEOUT
    (seconds, default 40).

    Returns:
        {"predictions": [...], "inference_time_sec": float | None}

    Raises:
        RuntimeError: NATURE_ID_CMD unset, or the CLI exited non-zero.
        subprocess.TimeoutExpired: the CLI exceeded the timeout.
    """
    cmd_tmpl = (os.getenv("NATURE_ID_CMD") or "").strip()
    timeout_s = int(os.getenv("NATURE_ID_TIMEOUT", "40"))

    if not cmd_tmpl:
        raise RuntimeError("NATURE_ID_CMD is not configured")

    cmd = cmd_tmpl.replace("{image_path}", image_path)
    # Tokenized argv with shell=False (subprocess.run default) avoids shell
    # injection via the command template or the image path.
    proc = subprocess.run(
        shlex.split(cmd),
        capture_output=True,
        text=True,
        timeout=timeout_s,
        check=False,
    )
    if proc.returncode != 0:
        # Truncate stderr so a huge CLI traceback doesn't bloat the error.
        raise RuntimeError(f"nature-id cli failed rc={proc.returncode}: {proc.stderr.strip()[:240]}")

    out = (proc.stdout or "").strip()
    inference_time_sec = _extract_inference_time(out)
    if not out:
        return {"predictions": [], "inference_time_sec": inference_time_sec}

    # Prefer structured JSON output; fall back to the human-readable
    # "NN.N% Species name" text format.
    try:
        parsed = json.loads(out)
        preds = _normalize_predictions(parsed, top_k)
    except Exception:
        preds = _parse_text_output(out, top_k)

    return {"predictions": preds, "inference_time_sec": inference_time_sec}
|
||||
|
||||
|
||||
async def _download_image(image_url: str) -> str:
    """Download *image_url* into a temporary .jpg file and return its path.

    The file is created with delete=False, so the caller owns it and must
    unlink it when finished.
    """
    fetch_timeout = float(os.getenv("DOWNLOAD_TIMEOUT", "20"))
    async with httpx.AsyncClient(timeout=fetch_timeout) as http:
        response = await http.get(image_url)
        response.raise_for_status()
        payload = response.content

    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp:
        tmp.write(payload)
        return tmp.name
|
||||
|
||||
|
||||
def _response_payload(result: Dict[str, Any]) -> Dict[str, Any]:
|
||||
preds = result.get("predictions") or []
|
||||
top_k = [
|
||||
{
|
||||
"confidence": float(p.get("score", 0.0)),
|
||||
"name": str((p.get("common_name") if p.get("common_name") not in (None, "", "-") else p.get("scientific_name")) or "unknown"),
|
||||
"scientific_name": str(p.get("scientific_name") or "unknown"),
|
||||
}
|
||||
for p in preds
|
||||
]
|
||||
return {
|
||||
"status": "success",
|
||||
"model": "aiy_plants_V1",
|
||||
"source": "nature-id-cli",
|
||||
"count": len(preds),
|
||||
"inference_time_sec": result.get("inference_time_sec"),
|
||||
"predictions": preds,
|
||||
"top_k": top_k,
|
||||
}
|
||||
|
||||
|
||||
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc: RequestValidationError):
    """Return 422 validation errors without echoing the request body.

    FastAPI's default handler includes the offending input in the response;
    for multipart file uploads that would dump raw binary bytes into the
    error body, so only loc/msg/type are kept per error.
    """
    # Avoid leaking raw multipart bytes in validation responses.
    errs: List[Dict[str, Any]] = []
    for e in exc.errors() or []:
        errs.append({"loc": e.get("loc"), "msg": e.get("msg"), "type": e.get("type")})
    return JSONResponse(status_code=422, content={"detail": errs})
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> Dict[str, Any]:
    """Liveness probe; also reports whether the nature-id CLI is configured."""
    configured_cmd = (os.getenv("NATURE_ID_CMD") or "").strip()
    return {
        "status": "healthy",
        "nature_id_cmd_configured": bool(configured_cmd),
        "nature_id_cmd": configured_cmd,
    }
|
||||
|
||||
|
||||
@app.post("/identify")
async def identify(payload: IdentifyRequest) -> Dict[str, Any]:
    """Identify a plant from an image URL via the nature-id CLI.

    Downloads the image to a temp file, runs the CLI, and always removes
    the temp file afterwards.

    Raises:
        HTTPException 400: image_url missing from the request.
        HTTPException 503: download or CLI inference failed.
    """
    if not payload.image_url:
        raise HTTPException(status_code=400, detail="image_url is required")

    tmp_path = ""
    try:
        tmp_path = await _download_image(payload.image_url)
        result = _run_nature_id_cli(tmp_path, payload.top_k)
        return _response_payload(result)
    except HTTPException:
        # Re-raise explicit HTTP errors untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}")
    finally:
        # Best-effort temp-file cleanup; never mask the real error.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                pass
|
||||
|
||||
|
||||
@app.post("/identify-file")
async def identify_file(file: UploadFile = File(...), top_k: int = 3) -> Dict[str, Any]:
    """Identify a plant from an uploaded image file via the nature-id CLI.

    Spools the upload to a temp file, runs the CLI, and always removes the
    temp file afterwards.

    Raises:
        HTTPException 503: CLI inference (or temp-file handling) failed.
    """
    # Clamp top_k into the same 1..10 range the JSON endpoint enforces
    # via its pydantic model.
    top_k = max(1, min(top_k, 10))
    tmp_path = ""
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f:
            f.write(await file.read())
            tmp_path = f.name
        result = _run_nature_id_cli(tmp_path, top_k)
        return _response_payload(result)
    except Exception as e:
        raise HTTPException(status_code=503, detail=f"identify_failed: {e}")
    finally:
        # Best-effort temp-file cleanup; never mask the real error.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                pass
|
||||
8
services/plant-vision-node1/requirements.txt
Normal file
8
services/plant-vision-node1/requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
fastapi==0.115.5
|
||||
uvicorn[standard]==0.32.1
|
||||
httpx==0.28.1
|
||||
python-multipart==0.0.17
|
||||
Pillow==11.1.0
|
||||
requests==2.32.3
|
||||
tflite-runtime==2.14.0
|
||||
numpy==1.26.4
|
||||
@@ -46,8 +46,15 @@ AGENT_SPECIALIZED_TOOLS = {
|
||||
"nutra": ['comfy_generate_image', 'comfy_generate_video'],
|
||||
|
||||
# AgroMatrix - Agriculture
|
||||
# Specialized: crop analysis, weather integration, field mapping
|
||||
"agromatrix": ['comfy_generate_image', 'comfy_generate_video'],
|
||||
# Specialized: crop analysis, weather integration, field mapping + plant intelligence
|
||||
"agromatrix": [
|
||||
'comfy_generate_image',
|
||||
'comfy_generate_video',
|
||||
'plantnet_lookup',
|
||||
'nature_id_identify',
|
||||
'gbif_species_lookup',
|
||||
'agrovoc_lookup',
|
||||
],
|
||||
|
||||
# GreenFood - Food & Eco
|
||||
# Specialized: recipe analysis, eco-scoring
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -408,8 +408,9 @@ agents:
|
||||
description: "Monitor Agent - архітектор-інспектор DAGI"
|
||||
default_llm: local_qwen3_8b
|
||||
system_prompt: |
|
||||
Ти - Monitor Agent, стежиш за нодами, сервісами, агентами.
|
||||
Якщо бачиш у чаті інших ботів, відповідай тільки за інфраструктурою або прямим тегом.
|
||||
Ти - Monitor Agent, інфраструктурний інспектор DAGI: ноди, сервіси, пайплайни, алерти.
|
||||
Ти знаєш, що DAARWIZZ — головний оркестратор мережі DAARION.city; для governance/маршрутизації посилайся на нього.
|
||||
Відповідай коротко і по суті; якщо даних бракує — одразу кажи, який саме метрик/лог потрібен.
|
||||
tools:
|
||||
- id: get_metrics
|
||||
type: builtin
|
||||
|
||||
@@ -19,6 +19,7 @@ from typing import Dict, List, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO, StringIO
|
||||
from pathlib import PurePath
|
||||
from urllib.parse import urlparse
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.sax.saxutils import escape as xml_escape
|
||||
from zipfile import ZIP_DEFLATED, ZipFile
|
||||
@@ -108,6 +109,115 @@ TOOL_DEFINITIONS = [
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "plantnet_lookup",
|
||||
"description": "Визначення рослин через Pl@ntNet API. Повертає top-k кандидатів з confidence.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Короткий опис рослини/культури (якщо немає image_url)"
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string",
|
||||
"description": "Публічне посилання на фото рослини"
|
||||
},
|
||||
"organ": {
|
||||
"type": "string",
|
||||
"description": "Орган рослини: leaf/flower/fruit/bark/auto",
|
||||
"default": "auto"
|
||||
},
|
||||
"top_k": {
|
||||
"type": "integer",
|
||||
"description": "Скільки кандидатів повернути (1-10)",
|
||||
"default": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "nature_id_identify",
|
||||
"description": "Локальна/open-source ідентифікація рослин через nature-id сумісний сервіс.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"image_url": {
|
||||
"type": "string",
|
||||
"description": "Публічне посилання на фото рослини"
|
||||
},
|
||||
"image_data": {
|
||||
"type": "string",
|
||||
"description": "Data URL зображення (data:image/...;base64,...)"
|
||||
},
|
||||
"top_k": {
|
||||
"type": "integer",
|
||||
"description": "Скільки кандидатів повернути (1-10)",
|
||||
"default": 3
|
||||
},
|
||||
"min_confidence": {
|
||||
"type": "number",
|
||||
"description": "Поріг confidence для fallback на GBIF",
|
||||
"default": 0.65
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "gbif_species_lookup",
|
||||
"description": "Пошук таксонів у GBIF для валідації назви культури/рослини.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Назва/термін для пошуку виду"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Кількість результатів (1-10)",
|
||||
"default": 5
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "agrovoc_lookup",
|
||||
"description": "Нормалізація агро-термінів через AGROVOC (SPARQL).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Термін культури/хвороби/технології"
|
||||
},
|
||||
"lang": {
|
||||
"type": "string",
|
||||
"description": "Мова міток (en/uk/ru)",
|
||||
"default": "en"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Кількість результатів (1-10)",
|
||||
"default": 5
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
},
|
||||
# PRIORITY 3: Generation tools
|
||||
{
|
||||
"type": "function",
|
||||
@@ -681,6 +791,42 @@ class ToolManager:
|
||||
tool_names = [t.get("function", {}).get("name") for t in filtered]
|
||||
logger.debug(f"Agent {agent_id} has {len(filtered)} tools: {tool_names}")
|
||||
return filtered
|
||||
|
||||
@staticmethod
|
||||
def _is_image_data_url(value: str) -> bool:
|
||||
v = str(value or "").strip()
|
||||
return bool(v.startswith("data:image/") and ";base64," in v)
|
||||
|
||||
@staticmethod
|
||||
def _is_known_non_direct_image_url(url: str) -> bool:
|
||||
u = str(url or "").strip()
|
||||
if not u:
|
||||
return False
|
||||
try:
|
||||
p = urlparse(u)
|
||||
except Exception:
|
||||
return True
|
||||
host = (p.netloc or "").lower()
|
||||
if host in {"t.me", "telegram.me"}:
|
||||
return True
|
||||
if "web.telegram.org" in host:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _normalize_confidence(value: Any) -> float:
|
||||
try:
|
||||
v = float(value)
|
||||
except Exception:
|
||||
return 0.0
|
||||
if v < 0:
|
||||
return 0.0
|
||||
# Some backends return percentages (e.g. 97.6) instead of 0..1.
|
||||
if v > 1.0 and v <= 100.0:
|
||||
v = v / 100.0
|
||||
if v > 1.0:
|
||||
v = 1.0
|
||||
return v
|
||||
|
||||
async def execute_tool(
|
||||
self,
|
||||
@@ -709,6 +855,14 @@ class ToolManager:
|
||||
return await self._web_search(arguments)
|
||||
elif tool_name == "web_extract":
|
||||
return await self._web_extract(arguments)
|
||||
elif tool_name == "plantnet_lookup":
|
||||
return await self._plantnet_lookup(arguments)
|
||||
elif tool_name == "nature_id_identify":
|
||||
return await self._nature_id_identify(arguments)
|
||||
elif tool_name == "gbif_species_lookup":
|
||||
return await self._gbif_species_lookup(arguments)
|
||||
elif tool_name == "agrovoc_lookup":
|
||||
return await self._agrovoc_lookup(arguments)
|
||||
elif tool_name == "image_generate":
|
||||
return await self._image_generate(arguments)
|
||||
elif tool_name == "comfy_generate_image":
|
||||
@@ -2530,6 +2684,272 @@ class ToolManager:
|
||||
except Exception as e:
|
||||
return ToolResult(success=False, result=None, error=str(e))
|
||||
|
||||
async def _plantnet_lookup(self, args: Dict) -> ToolResult:
|
||||
"""Plant identification via Pl@ntNet API (skeleton adapter)."""
|
||||
query = str(args.get("query", "") or "").strip()
|
||||
image_url = str(args.get("image_url", "") or "").strip()
|
||||
image_data = str(args.get("image_data", "") or "").strip()
|
||||
runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
|
||||
if not image_data and self._is_image_data_url(runtime_image_data):
|
||||
image_data = runtime_image_data
|
||||
organ = str(args.get("organ", "auto") or "auto").strip().lower()
|
||||
top_k = max(1, min(int(args.get("top_k", 3)), 5))
|
||||
|
||||
api_key = (os.getenv("PLANTNET_API_KEY") or "").strip()
|
||||
if image_url and api_key:
|
||||
try:
|
||||
params = {
|
||||
"api-key": api_key,
|
||||
"images": image_url,
|
||||
"organs": "leaf" if organ == "auto" else organ,
|
||||
"lang": "en",
|
||||
}
|
||||
resp = await self.http_client.get(
|
||||
"https://my-api.plantnet.org/v2/identify/all",
|
||||
params=params,
|
||||
timeout=25.0,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
results = (data.get("results") or [])[:top_k]
|
||||
if not results:
|
||||
return ToolResult(success=True, result="Pl@ntNet: кандидатів не знайдено.")
|
||||
lines = []
|
||||
for idx, item in enumerate(results, 1):
|
||||
species = (item.get("species") or {})
|
||||
sname = species.get("scientificNameWithoutAuthor") or species.get("scientificName") or "unknown"
|
||||
common = species.get("commonNames") or []
|
||||
cname = common[0] if common else "-"
|
||||
score = float(item.get("score") or 0.0)
|
||||
lines.append(f"{idx}. {sname} ({cname}) score={score:.3f}")
|
||||
return ToolResult(success=True, result="Pl@ntNet candidates:\n" + "\n".join(lines))
|
||||
return ToolResult(success=False, result=None, error=f"plantnet_http_{resp.status_code}")
|
||||
except Exception as e:
|
||||
return ToolResult(success=False, result=None, error=f"plantnet_error: {e}")
|
||||
|
||||
if image_url or image_data:
|
||||
ni_args: Dict[str, Any] = {"top_k": top_k}
|
||||
if image_data:
|
||||
ni_args["image_data"] = image_data
|
||||
else:
|
||||
ni_args["image_url"] = image_url
|
||||
if runtime_image_data:
|
||||
ni_args["_runtime_image_data"] = runtime_image_data
|
||||
ni = await self._nature_id_identify(ni_args)
|
||||
if ni.success:
|
||||
return ni
|
||||
|
||||
if query:
|
||||
return await self._gbif_species_lookup({"query": query, "limit": top_k})
|
||||
|
||||
return ToolResult(
|
||||
success=False,
|
||||
result=None,
|
||||
error="No available plant ID backend (set PLANTNET_API_KEY or NATURE_ID_URL, or provide text query)",
|
||||
)
|
||||
|
||||
async def _nature_id_identify(self, args: Dict) -> ToolResult:
|
||||
"""Open-source plant identification via self-hosted nature-id compatible endpoint."""
|
||||
image_url = str(args.get("image_url", "") or "").strip()
|
||||
image_data = str(args.get("image_data", "") or "").strip()
|
||||
runtime_image_data = str(args.get("_runtime_image_data", "") or "").strip()
|
||||
if not image_data and self._is_image_data_url(runtime_image_data):
|
||||
image_data = runtime_image_data
|
||||
top_k = max(1, min(int(args.get("top_k", 3)), 10))
|
||||
min_confidence = float(args.get("min_confidence", os.getenv("NATURE_ID_MIN_CONFIDENCE", "0.65")))
|
||||
|
||||
if image_url and self._is_known_non_direct_image_url(image_url):
|
||||
if image_data:
|
||||
logger.info("nature_id_identify: replacing non-direct image_url with runtime image_data")
|
||||
image_url = ""
|
||||
else:
|
||||
return ToolResult(
|
||||
success=False,
|
||||
result=None,
|
||||
error="image_url is not direct image URL; provide image_data or direct Telegram file URL",
|
||||
)
|
||||
|
||||
if not image_url and not image_data:
|
||||
return ToolResult(success=False, result=None, error="image_url or image_data is required")
|
||||
|
||||
base = (os.getenv("NATURE_ID_URL") or "").strip().rstrip("/")
|
||||
if not base:
|
||||
return ToolResult(success=False, result=None, error="NATURE_ID_URL is not configured")
|
||||
|
||||
try:
|
||||
if image_data:
|
||||
# data URL -> multipart /identify-file
|
||||
if not image_data.startswith("data:") or "," not in image_data:
|
||||
return ToolResult(success=False, result=None, error="invalid image_data format")
|
||||
header, b64 = image_data.split(",", 1)
|
||||
mime = "image/jpeg"
|
||||
if ";base64" in header:
|
||||
mime = header.split(":", 1)[1].split(";", 1)[0] or "image/jpeg"
|
||||
ext = "jpg"
|
||||
if "png" in mime:
|
||||
ext = "png"
|
||||
try:
|
||||
image_bytes = base64.b64decode(b64)
|
||||
except Exception:
|
||||
return ToolResult(success=False, result=None, error="invalid image_data base64")
|
||||
files = {"file": (f"upload.{ext}", image_bytes, mime)}
|
||||
resp = await self.http_client.post(
|
||||
f"{base}/identify-file",
|
||||
params={"top_k": top_k},
|
||||
files=files,
|
||||
timeout=45.0,
|
||||
)
|
||||
else:
|
||||
payload = {"image_url": image_url, "top_k": top_k}
|
||||
resp = await self.http_client.post(f"{base}/identify", json=payload, timeout=45.0)
|
||||
|
||||
if resp.status_code != 200:
|
||||
return ToolResult(success=False, result=None, error=f"nature_id_http_{resp.status_code}")
|
||||
|
||||
data = resp.json() or {}
|
||||
status = str(data.get("status") or "success")
|
||||
raw_top_k = data.get("top_k") or []
|
||||
raw_preds = data.get("predictions") or data.get("results") or []
|
||||
|
||||
top_k_rows = []
|
||||
if isinstance(raw_top_k, list) and raw_top_k:
|
||||
for row in raw_top_k[:top_k]:
|
||||
if not isinstance(row, dict):
|
||||
continue
|
||||
conf = row.get("confidence", 0.0)
|
||||
conf_f = self._normalize_confidence(conf)
|
||||
top_k_rows.append({
|
||||
"confidence": conf_f,
|
||||
"name": str(row.get("name") or row.get("scientific_name") or "unknown"),
|
||||
"scientific_name": str(row.get("scientific_name") or row.get("name") or "unknown"),
|
||||
})
|
||||
else:
|
||||
for item in raw_preds[:top_k]:
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
score = item.get("score", item.get("confidence", 0.0))
|
||||
score_f = self._normalize_confidence(score)
|
||||
sname = item.get("scientific_name") or item.get("label") or item.get("name") or "unknown"
|
||||
cname = item.get("common_name") or item.get("common") or sname
|
||||
top_k_rows.append({
|
||||
"confidence": score_f,
|
||||
"name": str(cname),
|
||||
"scientific_name": str(sname),
|
||||
})
|
||||
|
||||
if not top_k_rows:
|
||||
return ToolResult(success=True, result=json.dumps({
|
||||
"status": status,
|
||||
"model": data.get("model") or "aiy_plants_V1",
|
||||
"source": data.get("source") or "nature-id-cli",
|
||||
"top_k": [],
|
||||
"confidence": 0.0,
|
||||
"recommend_fallback": True,
|
||||
"reason": "no_predictions",
|
||||
}, ensure_ascii=False))
|
||||
|
||||
top1 = top_k_rows[0]
|
||||
top1_conf = float(top1.get("confidence", 0.0))
|
||||
recommend_fallback = top1_conf < min_confidence
|
||||
|
||||
out = {
|
||||
"status": status,
|
||||
"model": data.get("model") or "aiy_plants_V1",
|
||||
"source": data.get("source") or "nature-id-cli",
|
||||
"inference_time_sec": data.get("inference_time_sec"),
|
||||
"top_k": top_k_rows,
|
||||
"confidence": top1_conf,
|
||||
"min_confidence": min_confidence,
|
||||
"recommend_fallback": recommend_fallback,
|
||||
"fallback": "gbif_species_lookup",
|
||||
}
|
||||
|
||||
if recommend_fallback:
|
||||
fallback_query = str(top1.get("scientific_name") or top1.get("name") or "").strip()
|
||||
if fallback_query and fallback_query.lower() != "unknown":
|
||||
gbif = await self._gbif_species_lookup({"query": fallback_query, "limit": min(5, top_k)})
|
||||
if gbif.success and gbif.result:
|
||||
out["gbif_validation"] = gbif.result
|
||||
|
||||
return ToolResult(success=True, result=json.dumps(out, ensure_ascii=False))
|
||||
except Exception as e:
|
||||
return ToolResult(success=False, result=None, error=f"nature_id_error: {e}")
|
||||
|
||||
async def _gbif_species_lookup(self, args: Dict) -> ToolResult:
|
||||
"""Species lookup via GBIF public API."""
|
||||
query = str(args.get("query", "") or "").strip()
|
||||
limit = max(1, min(int(args.get("limit", 5)), 10))
|
||||
if not query:
|
||||
return ToolResult(success=False, result=None, error="query is required")
|
||||
|
||||
try:
|
||||
resp = await self.http_client.get(
|
||||
"https://api.gbif.org/v1/species/search",
|
||||
params={"q": query, "limit": limit, "status": "ACCEPTED"},
|
||||
timeout=20.0,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
return ToolResult(success=False, result=None, error=f"gbif_http_{resp.status_code}")
|
||||
|
||||
data = resp.json() or {}
|
||||
results = data.get("results") or []
|
||||
if not results:
|
||||
return ToolResult(success=True, result="GBIF: результатів не знайдено.")
|
||||
|
||||
lines = []
|
||||
for idx, item in enumerate(results[:limit], 1):
|
||||
sci = item.get("scientificName") or item.get("canonicalName") or "unknown"
|
||||
rank = item.get("rank") or "-"
|
||||
status = item.get("taxonomicStatus") or "-"
|
||||
key = item.get("key")
|
||||
lines.append(f"{idx}. {sci} | rank={rank} | status={status} | key={key}")
|
||||
return ToolResult(success=True, result="GBIF matches:\n" + "\n".join(lines))
|
||||
except Exception as e:
|
||||
return ToolResult(success=False, result=None, error=f"gbif_error: {e}")
|
||||
|
||||
async def _agrovoc_lookup(self, args: Dict) -> ToolResult:
|
||||
"""AGROVOC term normalization via public SPARQL endpoint."""
|
||||
query = str(args.get("query", "") or "").strip()
|
||||
lang = str(args.get("lang", "en") or "en").strip().lower()
|
||||
limit = max(1, min(int(args.get("limit", 5)), 10))
|
||||
if not query:
|
||||
return ToolResult(success=False, result=None, error="query is required")
|
||||
if lang not in {"en", "uk", "ru"}:
|
||||
lang = "en"
|
||||
|
||||
safe_q = query.replace('\\', ' ').replace('"', ' ').strip()
|
||||
sparql = (
|
||||
"PREFIX skos: <http://www.w3.org/2004/02/skos/core#> "
|
||||
"SELECT ?concept ?label WHERE { "
|
||||
"?concept skos:prefLabel ?label . "
|
||||
f"FILTER(lang(?label) = '{lang}') "
|
||||
f"FILTER(CONTAINS(LCASE(STR(?label)), LCASE(\"{safe_q}\"))) "
|
||||
"} LIMIT " + str(limit)
|
||||
)
|
||||
|
||||
try:
|
||||
resp = await self.http_client.get(
|
||||
"https://agrovoc.fao.org/sparql",
|
||||
params={"query": sparql, "format": "json"},
|
||||
timeout=25.0,
|
||||
)
|
||||
if resp.status_code != 200:
|
||||
return ToolResult(success=False, result=None, error=f"agrovoc_http_{resp.status_code}")
|
||||
|
||||
data = resp.json() or {}
|
||||
bindings = (((data.get("results") or {}).get("bindings")) or [])
|
||||
if not bindings:
|
||||
return ToolResult(success=True, result="AGROVOC: результатів не знайдено.")
|
||||
|
||||
lines = []
|
||||
for idx, b in enumerate(bindings[:limit], 1):
|
||||
label = ((b.get("label") or {}).get("value") or "").strip()
|
||||
concept = ((b.get("concept") or {}).get("value") or "").strip()
|
||||
lines.append(f"{idx}. {label} | {concept}")
|
||||
return ToolResult(success=True, result="AGROVOC matches:\n" + "\n".join(lines))
|
||||
except Exception as e:
|
||||
return ToolResult(success=False, result=None, error=f"agrovoc_error: {e}")
|
||||
|
||||
async def _unload_ollama_models(self):
|
||||
"""Unload all Ollama models to free VRAM for heavy operations like FLUX"""
|
||||
ollama_url = os.getenv("OLLAMA_BASE_URL", "http://172.18.0.1:11434")
|
||||
@@ -2942,7 +3362,11 @@ class ToolManager:
|
||||
|
||||
if results:
|
||||
result = results[0] if isinstance(results, list) else results
|
||||
markdown = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
|
||||
raw_content = result.get("markdown", "") or result.get("cleaned_html", "") or result.get("text", "")
|
||||
if isinstance(raw_content, (dict, list, tuple)):
|
||||
markdown = json.dumps(raw_content, ensure_ascii=False)
|
||||
else:
|
||||
markdown = str(raw_content or "")
|
||||
title = result.get("title", url)
|
||||
|
||||
if len(markdown) > 3000:
|
||||
@@ -2951,13 +3375,30 @@ class ToolManager:
|
||||
response_parts = [f"**{title}**", "", markdown]
|
||||
|
||||
if extract_links:
|
||||
links = result.get("links", [])
|
||||
if links:
|
||||
links_raw = result.get("links", [])
|
||||
normalized_links: List[Any] = []
|
||||
if isinstance(links_raw, dict):
|
||||
for bucket in links_raw.values():
|
||||
if isinstance(bucket, list):
|
||||
normalized_links.extend(bucket)
|
||||
elif bucket:
|
||||
normalized_links.append(bucket)
|
||||
elif isinstance(links_raw, list):
|
||||
normalized_links = links_raw
|
||||
elif links_raw:
|
||||
normalized_links = [links_raw]
|
||||
|
||||
if normalized_links:
|
||||
response_parts.append("")
|
||||
response_parts.append("**Посилання:**")
|
||||
for link in links[:10]:
|
||||
for link in normalized_links[:10]:
|
||||
if isinstance(link, dict):
|
||||
link_url = link.get("href", "")
|
||||
link_url = (
|
||||
link.get("href")
|
||||
or link.get("url")
|
||||
or link.get("link")
|
||||
or ""
|
||||
)
|
||||
else:
|
||||
link_url = str(link)
|
||||
if link_url:
|
||||
|
||||
@@ -11,10 +11,13 @@ import os
|
||||
import asyncio
|
||||
import logging
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
from typing import Optional, Dict, List, Any, Union
|
||||
from datetime import datetime, timedelta
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks, File, UploadFile, Form
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
@@ -56,16 +59,34 @@ def _csv_to_markdown(content: bytes) -> str:
|
||||
text = _decode_text_bytes(content)
|
||||
reader = csv.reader(text.splitlines())
|
||||
rows = list(reader)
|
||||
return _rows_to_markdown(rows)
|
||||
|
||||
|
||||
def _tsv_to_markdown(content: bytes) -> str:
|
||||
text = _decode_text_bytes(content)
|
||||
reader = csv.reader(text.splitlines(), delimiter="\t")
|
||||
rows = list(reader)
|
||||
return _rows_to_markdown(rows)
|
||||
|
||||
|
||||
def _rows_to_markdown(rows: List[List[Any]]) -> str:
|
||||
if not rows:
|
||||
return ""
|
||||
header = rows[0]
|
||||
body = rows[1:]
|
||||
width = max(len(r) for r in rows)
|
||||
norm_rows = []
|
||||
for r in rows:
|
||||
rr = [str(c) if c is not None else "" for c in r]
|
||||
if len(rr) < width:
|
||||
rr.extend([""] * (width - len(rr)))
|
||||
norm_rows.append(rr)
|
||||
header = norm_rows[0]
|
||||
body = norm_rows[1:]
|
||||
lines = [
|
||||
"| " + " | ".join(header) + " |",
|
||||
"| " + " | ".join(["---"] * len(header)) + " |",
|
||||
]
|
||||
for row in body:
|
||||
lines.append("| " + " | ".join(row) + " |")
|
||||
lines.append("| " + " | ".join([str(c) if c is not None else "" for c in row]) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -91,6 +112,69 @@ def _xlsx_to_markdown(content: bytes) -> str:
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _xls_to_markdown(content: bytes) -> str:
|
||||
try:
|
||||
import xlrd
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"xlrd not available: {e}")
|
||||
wb = xlrd.open_workbook(file_contents=content)
|
||||
parts = []
|
||||
for s in wb.sheets():
|
||||
parts.append(f"## Sheet: {s.name}")
|
||||
rows = []
|
||||
for r in range(s.nrows):
|
||||
rows.append([s.cell_value(r, c) for c in range(s.ncols)])
|
||||
if not rows:
|
||||
parts.append("_Empty sheet_")
|
||||
continue
|
||||
parts.append(_rows_to_markdown(rows))
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _ods_to_markdown(content: bytes) -> str:
|
||||
try:
|
||||
from odf.opendocument import load
|
||||
from odf.table import Table, TableRow, TableCell
|
||||
from odf.text import P
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"odfpy not available: {e}")
|
||||
|
||||
try:
|
||||
doc = load(BytesIO(content))
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid ODS file: {e}")
|
||||
|
||||
parts = []
|
||||
for table in doc.spreadsheet.getElementsByType(Table):
|
||||
table_name = str(table.getAttribute("name") or "Sheet")
|
||||
parts.append(f"## Sheet: {table_name}")
|
||||
rows: List[List[str]] = []
|
||||
for row in table.getElementsByType(TableRow):
|
||||
cells_out: List[str] = []
|
||||
for cell in row.getElementsByType(TableCell):
|
||||
txt_parts = []
|
||||
for p in cell.getElementsByType(P):
|
||||
txt_parts.extend(
|
||||
[str(getattr(node, "data", "")).strip() for node in p.childNodes if getattr(node, "data", None)]
|
||||
)
|
||||
cell_text = " ".join([t for t in txt_parts if t]).strip()
|
||||
repeat_raw = cell.getAttribute("numbercolumnsrepeated")
|
||||
try:
|
||||
repeat = int(repeat_raw) if repeat_raw else 1
|
||||
except Exception:
|
||||
repeat = 1
|
||||
repeat = max(1, min(repeat, 100))
|
||||
for _ in range(repeat):
|
||||
cells_out.append(cell_text)
|
||||
if cells_out:
|
||||
rows.append(cells_out)
|
||||
if not rows:
|
||||
parts.append("_Empty sheet_")
|
||||
continue
|
||||
parts.append(_rows_to_markdown(rows))
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _docx_to_text(content: bytes) -> str:
|
||||
try:
|
||||
from docx import Document
|
||||
@@ -115,18 +199,111 @@ def _pdf_to_text(content: bytes) -> str:
|
||||
return "\n\n".join(text_content)
|
||||
|
||||
|
||||
def _pptx_to_text(content: bytes) -> str:
|
||||
try:
|
||||
from pptx import Presentation
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"python-pptx not available: {e}")
|
||||
prs = Presentation(BytesIO(content))
|
||||
parts = []
|
||||
for idx, slide in enumerate(prs.slides, start=1):
|
||||
parts.append(f"## Slide {idx}")
|
||||
slide_lines = []
|
||||
for shape in slide.shapes:
|
||||
text = getattr(shape, "text", None)
|
||||
if text and str(text).strip():
|
||||
slide_lines.append(str(text).strip())
|
||||
parts.extend(slide_lines if slide_lines else ["_No text on this slide_"])
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
def _json_to_text(content: bytes) -> str:
|
||||
raw = _decode_text_bytes(content)
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
return json.dumps(parsed, ensure_ascii=False, indent=2)
|
||||
except Exception:
|
||||
return raw
|
||||
|
||||
|
||||
def _yaml_to_text(content: bytes) -> str:
|
||||
raw = _decode_text_bytes(content)
|
||||
try:
|
||||
parsed = yaml.safe_load(raw)
|
||||
return yaml.safe_dump(parsed, allow_unicode=True, sort_keys=False)
|
||||
except Exception:
|
||||
return raw
|
||||
|
||||
|
||||
def _xml_to_text(content: bytes) -> str:
|
||||
raw = _decode_text_bytes(content)
|
||||
try:
|
||||
root = ET.fromstring(raw)
|
||||
text = " ".join([t.strip() for t in root.itertext() if t and t.strip()])
|
||||
return text or raw
|
||||
except Exception:
|
||||
return raw
|
||||
|
||||
|
||||
def _html_to_text(content: bytes) -> str:
|
||||
raw = _decode_text_bytes(content)
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(raw, "html.parser")
|
||||
text = soup.get_text(separator="\n")
|
||||
text = re.sub(r"\n{3,}", "\n\n", text)
|
||||
return text.strip() or raw
|
||||
except Exception:
|
||||
# Minimal fallback if bs4 is unavailable
|
||||
text = re.sub(r"<[^>]+>", " ", raw)
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def _rtf_to_text(content: bytes) -> str:
|
||||
raw = _decode_text_bytes(content)
|
||||
try:
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
return rtf_to_text(raw)
|
||||
except Exception:
|
||||
# Basic fallback: strip common RTF control tokens
|
||||
text = re.sub(r"\\'[0-9a-fA-F]{2}", " ", raw)
|
||||
text = re.sub(r"\\[a-zA-Z]+-?\d* ?", " ", text)
|
||||
text = text.replace("{", " ").replace("}", " ")
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def _extract_text_by_ext(filename: str, content: bytes) -> str:
|
||||
ext = filename.split(".")[-1].lower() if "." in filename else ""
|
||||
if ext in ["txt", "md"]:
|
||||
if ext in ["txt", "md", "markdown"]:
|
||||
return _decode_text_bytes(content)
|
||||
if ext == "csv":
|
||||
return _csv_to_markdown(content)
|
||||
if ext == "xlsx":
|
||||
if ext == "tsv":
|
||||
return _tsv_to_markdown(content)
|
||||
if ext in {"xlsx", "xlsm"}:
|
||||
return _xlsx_to_markdown(content)
|
||||
if ext == "xls":
|
||||
return _xls_to_markdown(content)
|
||||
if ext == "ods":
|
||||
return _ods_to_markdown(content)
|
||||
if ext == "docx":
|
||||
return _docx_to_text(content)
|
||||
if ext == "pdf":
|
||||
return _pdf_to_text(content)
|
||||
if ext == "pptx":
|
||||
return _pptx_to_text(content)
|
||||
if ext == "json":
|
||||
return _json_to_text(content)
|
||||
if ext in {"yaml", "yml"}:
|
||||
return _yaml_to_text(content)
|
||||
if ext == "xml":
|
||||
return _xml_to_text(content)
|
||||
if ext in {"html", "htm"}:
|
||||
return _html_to_text(content)
|
||||
if ext == "rtf":
|
||||
return _rtf_to_text(content)
|
||||
raise HTTPException(status_code=400, detail=f"Unsupported file type: .{ext}")
|
||||
|
||||
|
||||
@@ -139,7 +316,12 @@ def _zip_to_markdown(content: bytes, max_files: int = 50, max_total_mb: int = 10
|
||||
if total_size > max_total_mb * 1024 * 1024:
|
||||
raise HTTPException(status_code=400, detail=f"ZIP слишком большой: {total_size / 1024 / 1024:.1f} MB")
|
||||
parts = []
|
||||
allowed_exts = {"txt", "md", "csv", "xlsx", "docx", "pdf"}
|
||||
allowed_exts = {
|
||||
"txt", "md", "markdown", "csv", "tsv",
|
||||
"xls", "xlsx", "xlsm", "ods",
|
||||
"docx", "pdf", "pptx",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
}
|
||||
processed = []
|
||||
skipped = []
|
||||
for member in members:
|
||||
@@ -1655,7 +1837,8 @@ async def document_endpoint(
|
||||
- json: Structured JSON with document elements
|
||||
- text: Plain text extraction
|
||||
|
||||
Supported files: PDF, DOCX, PPTX, images (PNG, JPG)
|
||||
Supported files:
|
||||
PDF, DOCX, XLS/XLSX/XLSM/ODS, PPTX, TXT/MD/CSV/TSV, JSON/YAML/XML/HTML, RTF, ZIP, images.
|
||||
"""
|
||||
try:
|
||||
import time
|
||||
@@ -1672,15 +1855,28 @@ async def document_endpoint(
|
||||
filename = file.filename if file else "document"
|
||||
file_ext = filename.split(".")[-1].lower() if "." in filename else "pdf"
|
||||
|
||||
# Handle text-based formats without Docling
|
||||
if file_ext in ["txt", "md", "csv", "xlsx", "zip"]:
|
||||
# Handle deterministic extraction for standard office/text formats
|
||||
if file_ext in [
|
||||
"txt", "md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
"pptx", "zip",
|
||||
]:
|
||||
try:
|
||||
if file_ext == "zip":
|
||||
content = _zip_to_markdown(doc_data)
|
||||
output_format = "markdown"
|
||||
else:
|
||||
content = _extract_text_by_ext(filename, doc_data)
|
||||
output_format = "markdown" if file_ext in ["md", "csv", "xlsx"] else "text"
|
||||
output_format = (
|
||||
"markdown"
|
||||
if file_ext in {
|
||||
"md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"json", "yaml", "yml", "xml", "html", "htm", "pptx",
|
||||
}
|
||||
else "text"
|
||||
)
|
||||
processing_time_ms = (time.time() - start_time) * 1000
|
||||
return {
|
||||
"success": True,
|
||||
@@ -1764,22 +1960,27 @@ async def document_endpoint(
|
||||
"device": swapper.device
|
||||
}
|
||||
|
||||
# For DOCX, try python-docx
|
||||
if file_ext == "docx":
|
||||
# For common office/text formats, try deterministic extractors.
|
||||
if file_ext in {
|
||||
"docx", "txt", "md", "markdown", "csv", "tsv",
|
||||
"xlsx", "xls", "xlsm", "ods",
|
||||
"pptx", "json", "yaml", "yml", "xml", "html", "htm", "rtf",
|
||||
}:
|
||||
try:
|
||||
content = _docx_to_text(doc_data)
|
||||
content = _extract_text_by_ext(filename, doc_data)
|
||||
out_fmt = "markdown" if file_ext not in {"txt", "rtf"} else "text"
|
||||
return {
|
||||
"success": True,
|
||||
"model": "python-docx (fallback)",
|
||||
"output_format": "text",
|
||||
"model": "text-extract (fallback)",
|
||||
"output_format": out_fmt,
|
||||
"result": content,
|
||||
"filename": filename,
|
||||
"processing_time_ms": (time.time() - start_time) * 1000,
|
||||
"device": swapper.device
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"DOCX fallback failed: {e}")
|
||||
raise HTTPException(status_code=500, detail="DOCX extraction failed")
|
||||
logger.error(f"Text fallback failed for .{file_ext}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Extraction failed for .{file_ext}")
|
||||
|
||||
# For PDFs, try pdfplumber
|
||||
if file_ext == "pdf":
|
||||
@@ -1807,7 +2008,7 @@ async def document_endpoint(
|
||||
# For other documents, return error
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Document processing not available. Supported: PDF (with pdfplumber), images (with OCR)"
|
||||
detail="Document processing unavailable for this type. Supported: office/text/image/zip standard formats."
|
||||
)
|
||||
|
||||
finally:
|
||||
@@ -2312,4 +2513,3 @@ async def get_multimodal_stack():
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8890)
|
||||
|
||||
|
||||
@@ -4,6 +4,15 @@ httpx==0.25.2
|
||||
pydantic==2.5.0
|
||||
pyyaml==6.0.1
|
||||
python-multipart==0.0.6
|
||||
chardet>=5.2.0
|
||||
openpyxl>=3.1.2
|
||||
python-docx>=1.1.2
|
||||
pdfplumber>=0.11.0
|
||||
python-pptx>=0.6.23
|
||||
xlrd>=2.0.1
|
||||
odfpy>=1.4.1
|
||||
beautifulsoup4>=4.12.0
|
||||
striprtf>=0.0.26
|
||||
|
||||
# HuggingFace dependencies for OCR models
|
||||
torch>=2.0.0
|
||||
@@ -25,4 +34,4 @@ safetensors>=0.4.0
|
||||
|
||||
# Web Scraping & Search
|
||||
trafilatura>=1.6.0
|
||||
duckduckgo-search>=4.0.0
|
||||
duckduckgo-search>=4.0.0
|
||||
|
||||
@@ -43,3 +43,8 @@ pdfplumber>=0.10.0
|
||||
python-docx>=1.1.0
|
||||
openpyxl>=3.1.2
|
||||
chardet>=5.2.0
|
||||
python-pptx>=0.6.23
|
||||
xlrd>=2.0.1
|
||||
odfpy>=1.4.1
|
||||
beautifulsoup4>=4.12.0
|
||||
striprtf>=0.0.26
|
||||
|
||||
Reference in New Issue
Block a user