ops: add plant-vision node1 service and update monitor/prober scripts
This commit is contained in:
62
ops/monitor_notify_sofiia.sh
Normal file → Executable file
62
ops/monitor_notify_sofiia.sh
Normal file → Executable file
@@ -7,6 +7,7 @@ ROUTER_URL="${ROUTER_URL:-http://127.0.0.1:9102}"
|
||||
REPORT_ENABLED="${SOFIIA_REPORTS_ENABLED:-true}"
|
||||
REPORT_MODE="${SOFIIA_REPORT_MODE:-fail_only}" # fail_only | always
|
||||
REPORT_TIMEOUT="${SOFIIA_REPORT_TIMEOUT:-180}"
|
||||
REPORT_MAX_TOKENS="${SOFIIA_REPORT_MAX_TOKENS:-900}"
|
||||
REPORT_CHAT_ID="${SOFIIA_REPORT_CHAT_ID:-ops-monitor-sofiia}"
|
||||
REPORT_USER_ID="${SOFIIA_REPORT_USER_ID:-ops-monitor-agent}"
|
||||
REPORT_USERNAME="${SOFIIA_REPORT_USERNAME:-monitor-agent}"
|
||||
@@ -23,7 +24,7 @@ if [[ ! -f "$STATUS_JSON" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY'
|
||||
python3 - "$STATUS_JSON" "$ROOT" "$ROUTER_URL" "$REPORT_MODE" "$REPORT_TIMEOUT" "$REPORT_MAX_TOKENS" "$REPORT_CHAT_ID" "$REPORT_USER_ID" "$REPORT_USERNAME" "$REPORT_TELEGRAM_CHAT_ID" "$SOFIIA_BOT_TOKEN" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -35,11 +36,12 @@ root = Path(sys.argv[2])
|
||||
router_url = sys.argv[3].rstrip('/')
|
||||
report_mode = sys.argv[4]
|
||||
timeout_s = int(sys.argv[5])
|
||||
chat_id = sys.argv[6]
|
||||
user_id = sys.argv[7]
|
||||
username = sys.argv[8]
|
||||
tg_chat_id = sys.argv[9].strip()
|
||||
tg_token = sys.argv[10].strip()
|
||||
max_tokens = int(sys.argv[6])
|
||||
chat_id = sys.argv[7]
|
||||
user_id = sys.argv[8]
|
||||
username = sys.argv[9]
|
||||
tg_chat_id = sys.argv[10].strip()
|
||||
tg_token = sys.argv[11].strip()
|
||||
|
||||
payload = json.loads(status_json.read_text(encoding='utf-8'))
|
||||
status = str(payload.get('status', 'unknown')).lower()
|
||||
@@ -70,7 +72,7 @@ prompt = (
|
||||
|
||||
body = {
|
||||
'prompt': prompt,
|
||||
'max_tokens': 400,
|
||||
'max_tokens': max_tokens,
|
||||
'temperature': 0.1,
|
||||
'metadata': {
|
||||
'source': 'ops-monitor-canary',
|
||||
@@ -99,26 +101,42 @@ try:
|
||||
print(f"[OK] sofiia report sent: backend={data.get('backend')} model={data.get('model')} preview={short!r}")
|
||||
|
||||
if tg_chat_id and tg_token and text:
|
||||
msg = (
|
||||
def chunk_text(value: str, limit: int = 3500) -> list:
    """Split *value* into consecutive chunks of at most *limit* characters.

    A chunk boundary is placed at the last newline found before *limit*
    when that newline lies in the second half of the window; otherwise the
    text is cut hard at *limit*. Whitespace around each cut is stripped, so
    the chunks are not guaranteed to re-join into the exact original text.

    Args:
        value: Text to split.
        limit: Maximum length of a single chunk; must be at least 1.

    Returns:
        A list of non-empty chunks, each no longer than *limit*. If no
        non-empty chunk can be produced (empty or whitespace-only input),
        a single-element list holding ``value[:limit]`` is returned so the
        result is never an empty list.

    Raises:
        ValueError: If *limit* is smaller than 1.
    """
    # A non-positive limit would make the split index 0, so `remaining`
    # would never shrink and the loop below would never terminate.
    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit!r}")

    chunks = []
    remaining = value
    while remaining:
        if len(remaining) <= limit:
            chunks.append(remaining)
            break
        split_at = remaining.rfind('\n', 0, limit)
        # Only honour a newline that keeps the chunk reasonably full;
        # a very early newline (or none at all) falls back to a hard cut.
        if split_at < max(1, limit // 2):
            split_at = limit
        piece = remaining[:split_at].rstrip()
        # A whitespace-only window strips down to '', which must not be
        # emitted as a chunk of its own.
        if piece:
            chunks.append(piece)
        remaining = remaining[split_at:].lstrip()
    # Keep the "never return an empty list" contract while still
    # respecting the size bound.
    return chunks or [value[:limit]]
|
||||
|
||||
header = (
|
||||
"[NODE1 Monitor]\n"
|
||||
f"status={payload.get('status')} exit_code={payload.get('exit_code')}\n\n"
|
||||
f"{text[:3500]}"
|
||||
)
|
||||
tg_req = urlreq.Request(
|
||||
url=f"https://api.telegram.org/bot{tg_token}/sendMessage",
|
||||
data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'),
|
||||
headers={'Content-Type': 'application/json'},
|
||||
method='POST',
|
||||
)
|
||||
try:
|
||||
parts = chunk_text(text, 3500 - len("(99/99)\n"))
|
||||
total = len(parts)
|
||||
delivered = 0
|
||||
for idx, part in enumerate(parts, start=1):
|
||||
prefix = f"({idx}/{total})\n" if total > 1 else ""
|
||||
msg = f"{header}{prefix}{part}" if idx == 1 else f"{prefix}{part}"
|
||||
tg_req = urlreq.Request(
|
||||
url=f"https://api.telegram.org/bot{tg_token}/sendMessage",
|
||||
data=json.dumps({"chat_id": tg_chat_id, "text": msg}).encode('utf-8'),
|
||||
headers={'Content-Type': 'application/json'},
|
||||
method='POST',
|
||||
)
|
||||
with urlreq.urlopen(tg_req, timeout=20) as tg_resp:
|
||||
tg_data = json.loads(tg_resp.read().decode('utf-8', errors='ignore'))
|
||||
if tg_data.get('ok'):
|
||||
print(f"[OK] telegram report delivered: chat_id={tg_chat_id}")
|
||||
else:
|
||||
print(f"[WARN] telegram send not ok: {tg_data}")
|
||||
except Exception as tg_e:
|
||||
print(f"[WARN] telegram send failed: {tg_e}")
|
||||
if not tg_data.get('ok'):
|
||||
raise RuntimeError(f"telegram send not ok: {tg_data}")
|
||||
delivered += 1
|
||||
print(f"[OK] telegram report delivered: chat_id={tg_chat_id} parts={delivered}")
|
||||
else:
|
||||
print('[INFO] telegram delivery skipped (missing SOFIIA_REPORT_TELEGRAM_CHAT_ID or token or empty text)')
|
||||
except HTTPError as e:
|
||||
|
||||
Reference in New Issue
Block a user