ops: add plant-vision node1 service and update monitor/prober scripts

2026-02-20 17:57:40 +01:00
parent 90eff85662
commit 987ece5bac
5 changed files with 388 additions and 53 deletions
--- a/services/agent-e2e-prober/main.py
+++ b/services/agent-e2e-prober/main.py
@@ -16,9 +16,16 @@ logger = logging.getLogger(__name__)

 # Configuration
 GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
+ROUTER_URL = os.getenv("ROUTER_URL", "http://router:8000")
 PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60"))  # seconds
 PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30"))  # seconds
+SEMANTIC_TIMEOUT = int(os.getenv("SEMANTIC_TIMEOUT", "45"))  # seconds
 METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
+SEMANTIC_PROBE_ENABLED = os.getenv("SEMANTIC_PROBE_ENABLED", "true").lower() == "true"
+SEMANTIC_AGENTS = [a.strip() for a in os.getenv("SEMANTIC_AGENTS", "clan,sofiia,monitor").split(",") if a.strip()]
+SEMANTIC_PROMPT = os.getenv("SEMANTIC_PROMPT", "Коротко: хто такий DAARWIZZ?")
+SEMANTIC_EXPECT_KEYWORD = os.getenv("SEMANTIC_EXPECT_KEYWORD", "daarwizz").lower()
+MONITOR_EXPECT_LOCAL = os.getenv("MONITOR_EXPECT_LOCAL", "true").lower() == "true"

 # Prometheus metrics
 agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
@@ -42,7 +49,7 @@ async def probe_gateway_health() -> tuple[bool, float, str]:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.get(f"{GATEWAY_URL}/health")
            latency = time.time() - start
-            
+
            if resp.status_code == 200:
                data = resp.json()
                if data.get("status") == "healthy":
@@ -67,7 +74,7 @@ async def probe_agent_ping() -> tuple[bool, float, str]:
                json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
            )
            latency = time.time() - start
-            
+
            if resp.status_code == 200:
                data = resp.json()
                if data.get("success"):
@@ -100,7 +107,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
                "text": "/health"  # Simple health check command
            }
        }
-        
+
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            # Use helion webhook as it's the most tested
            resp = await client.post(
@@ -108,7 +115,7 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
                json=test_update
            )
            latency = time.time() - start
-            
+
            if resp.status_code == 200:
                return True, latency, ""
            else:
@@ -119,53 +126,102 @@ async def probe_webhook_echo() -> tuple[bool, float, str]:
        return False, time.time() - start, f"error: {str(e)[:50]}"


+async def probe_agent_semantic(agent_id: str) -> tuple[bool, float, str]:
+    """Probe semantic response via router infer and assert DAARWIZZ awareness."""
+    start = time.time()
+    try:
+        payload = {
+            "prompt": SEMANTIC_PROMPT,
+            "max_tokens": 180,
+            "temperature": 0.1,
+            "metadata": {
+                "agent_id": agent_id,
+                "user_id": "tg:0",
+                "chat_id": "0",
+                "username": "e2e-prober",
+                "raw_user_text": SEMANTIC_PROMPT,
+            },
+        }
+        async with httpx.AsyncClient(timeout=SEMANTIC_TIMEOUT) as client:
+            resp = await client.post(f"{ROUTER_URL}/v1/agents/{agent_id}/infer", json=payload)
+            latency = time.time() - start
+            if resp.status_code != 200:
+                return False, latency, f"http_{resp.status_code}"
+
+            data = resp.json()
+            answer = str(data.get("response") or "")
+            backend = str(data.get("backend") or "")
+            model = str(data.get("model") or "")
+
+            answer_lc = answer.lower()
+            if SEMANTIC_EXPECT_KEYWORD not in answer_lc and "даар" not in answer_lc:
+                return False, latency, "no_daarwizz_in_answer"
+
+            if MONITOR_EXPECT_LOCAL and agent_id == "monitor":
+                local_ok = ("ollama" in backend.lower()) or model.lower().startswith("qwen")
+                if not local_ok:
+                    return False, latency, f"monitor_nonlocal_backend:{backend}:{model}"
+
+            return True, latency, ""
+    except httpx.TimeoutException:
+        return False, time.time() - start, "timeout"
+    except Exception as e:
+        return False, time.time() - start, f"error: {str(e)[:50]}"
+
+
+def record_probe(target: str, success: bool, latency: float, reason: str):
+    """Record probe metrics and log line."""
+    agent_e2e_runs_total.labels(target=target).inc()
+    agent_e2e_success.labels(target=target).set(1 if success else 0)
+    agent_e2e_latency.labels(target=target).set(latency)
+    agent_e2e_latency_histogram.labels(target=target).observe(latency)
+    if not success:
+        agent_e2e_failures_total.labels(target=target, reason=reason).inc()
+    logger.info(f"{target}: success={success}, latency={latency:.3f}s, reason={reason}")
+
+
 async def run_probes():
    """Run all probes and update metrics"""
    # Probe 1: Gateway health
    success, latency, reason = await probe_gateway_health()
-    agent_e2e_runs_total.labels(target="gateway_health").inc()
-    agent_e2e_success.labels(target="gateway_health").set(1 if success else 0)
-    agent_e2e_latency.labels(target="gateway_health").set(latency)
-    agent_e2e_latency_histogram.labels(target="gateway_health").observe(latency)
-    if not success:
-        agent_e2e_failures_total.labels(target="gateway_health", reason=reason).inc()
-    logger.info(f"gateway_health: success={success}, latency={latency:.3f}s, reason={reason}")
-    
+    record_probe("gateway_health", success, latency, reason)
+
    # Probe 2: Agent ping (if endpoint exists)
    success, latency, reason = await probe_agent_ping()
-    agent_e2e_runs_total.labels(target="agent_ping").inc()
-    agent_e2e_success.labels(target="agent_ping").set(1 if success else 0)
-    agent_e2e_latency.labels(target="agent_ping").set(latency)
-    agent_e2e_latency_histogram.labels(target="agent_ping").observe(latency)
-    if not success:
-        agent_e2e_failures_total.labels(target="agent_ping", reason=reason).inc()
-    logger.info(f"agent_ping: success={success}, latency={latency:.3f}s, reason={reason}")
-    
+    record_probe("agent_ping", success, latency, reason)
+
    # Probe 3: Webhook E2E (full path test)
    success, latency, reason = await probe_webhook_echo()
-    agent_e2e_runs_total.labels(target="webhook_e2e").inc()
-    agent_e2e_success.labels(target="webhook_e2e").set(1 if success else 0)
-    agent_e2e_latency.labels(target="webhook_e2e").set(latency)
-    agent_e2e_latency_histogram.labels(target="webhook_e2e").observe(latency)
-    if not success:
-        agent_e2e_failures_total.labels(target="webhook_e2e", reason=reason).inc()
-    logger.info(f"webhook_e2e: success={success}, latency={latency:.3f}s, reason={reason}")
+    record_probe("webhook_e2e", success, latency, reason)
+
+    # Probe 4+: semantic checks for selected agents (parallel)
+    if SEMANTIC_PROBE_ENABLED and SEMANTIC_AGENTS:
+        results = await asyncio.gather(*(probe_agent_semantic(agent_id) for agent_id in SEMANTIC_AGENTS))
+        matrix = []
+        for agent_id, (success, latency, reason) in zip(SEMANTIC_AGENTS, results):
+            record_probe(f"semantic_{agent_id}", success, latency, reason)
+            matrix.append(f"{agent_id}:{'PASS' if success else 'FAIL'}")
+        logger.info("semantic_matrix: " + " | ".join(matrix))


 async def main():
-    logger.info(f"Starting E2E Agent Prober")
+    logger.info("Starting E2E Agent Prober")
    logger.info(f"  GATEWAY_URL: {GATEWAY_URL}")
+    logger.info(f"  ROUTER_URL: {ROUTER_URL}")
    logger.info(f"  PROBE_INTERVAL: {PROBE_INTERVAL}s")
    logger.info(f"  PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
    logger.info(f"  METRICS_PORT: {METRICS_PORT}")
-    
+    logger.info(f"  SEMANTIC_TIMEOUT: {SEMANTIC_TIMEOUT}s")
+    logger.info(f"  SEMANTIC_PROBE_ENABLED: {SEMANTIC_PROBE_ENABLED}")
+    logger.info(f"  SEMANTIC_AGENTS: {','.join(SEMANTIC_AGENTS)}")
+
    # Start Prometheus metrics server
    start_http_server(METRICS_PORT)
    logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
-    
+
    # Initial probe
    await run_probes()
-    
+
    # Continuous probing
    while True:
        await asyncio.sleep(PROBE_INTERVAL)