Files
microdao-daarion/tests/test_stepan_v3_session_proactivity_stability.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

351 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Tests для Humanized Stepan v3 — Session Context + Soft Proactivity + Intent Stability Guard.
Покриває:
1. Session TTL expiry
2. Session last_messages max 3
3. Session isolation per chat_id
4. Session update/load cycle
5. Stability guard: last_depth=light + short → light reason=stability_guard
6. Stability guard: action verb → deep (guard не спрацьовує)
7. Stability guard: urgent → deep (guard не спрацьовує)
8. Stability guard: >6 слів → guard не спрацьовує
9. Proactivity: умови виконані → added=True, фраза ≤120, без "!"
10. Proactivity: depth=light → not added
11. Proactivity: not 10th interaction → not added
12. Proactivity: confidence < 0.7 → not added
13. Proactivity: brief + "?" → not added
14. Proactivity: intent_freq < 3 → not added
15. Telemetry: AGX_STEPAN_METRIC session_updated / session_expired / stability_guard_triggered / proactivity_added
"""
import logging
import sys
import time
from copy import deepcopy
from pathlib import Path
from unittest.mock import patch
root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(root))
sys.path.insert(0, str(root / 'packages' / 'agromatrix-tools'))
from crews.agromatrix_crew.session_context import (
load_session, update_session, clear_session, _STORE, SESSION_TTL, _default_session,
)
from crews.agromatrix_crew.proactivity import (
maybe_add_proactivity, _top_intent,
)
from crews.agromatrix_crew.depth_classifier import classify_depth
from crews.agromatrix_crew.memory_manager import _default_user_profile
from crews.agromatrix_crew.telemetry import TELEMETRY_TAG
# ─── Helper ───────────────────────────────────────────────────────────────────
class _CaptureHandler(logging.Handler):
    """Logging handler that keeps every emitted record in an in-memory list."""

    def __init__(self):
        super().__init__(level=logging.NOTSET)
        # Captured records, in the order they were emitted.
        self.records: list[logging.LogRecord] = []

    def emit(self, record):
        """Store *record* instead of writing it to any output."""
        self.records.append(record)

    @property
    def messages(self):
        """Return the formatted message of each captured record, in order."""
        rendered = []
        for entry in self.records:
            rendered.append(entry.getMessage())
        return rendered
def _attach(module_path: str) -> tuple[logging.Logger, _CaptureHandler]:
    """Attach a fresh capture handler to the logger named *module_path*.

    The logger level is set to DEBUG. The handler is not removed here, so
    the caller receives both objects and can detach the handler itself.

    Returns:
        A ``(logger, handler)`` tuple.
    """
    capture = _CaptureHandler()
    target = logging.getLogger(module_path)
    target.setLevel(logging.DEBUG)
    target.addHandler(capture)
    return target, capture
def _profile(interaction_count=0, known_intents=None, style="conversational"):
    """Build a default profile for "test_user" with the given fields overridden.

    A falsy *known_intents* (including ``None``) is stored as an empty list.
    """
    profile = _default_user_profile("test_user")
    profile["interaction_count"] = interaction_count
    profile["known_intents"] = known_intents if known_intents else []
    profile["style"] = style
    return profile
# ─── 1. Session TTL expiry ────────────────────────────────────────────────────
def test_session_ttl_expiry_returns_default():
    """Loading a session after its TTL yields no depth and no messages."""
    expired_chat = "ttl_test_chat"
    clear_session(expired_chat)
    update_session(expired_chat, "повідомлення", "deep")
    # Move the clock seen by session_context past the TTL so the entry expires.
    with patch("crews.agromatrix_crew.session_context.time") as mock_time:
        future_now = time.time() + SESSION_TTL + 1
        mock_time.time.return_value = future_now
        result = load_session(expired_chat)
    assert result["last_depth"] is None
    assert result["last_messages"] == []
def test_session_ttl_expiry_logs_expired(caplog):
    """Loading an expired session logs a tagged ``session_expired`` message."""
    chat_id = "ttl_log_chat"
    clear_session(chat_id)
    update_session(chat_id, "msg", "deep")
    clock_patch = patch("crews.agromatrix_crew.session_context.time")
    log_capture = caplog.at_level(
        logging.INFO, logger="crews.agromatrix_crew.session_context"
    )
    with clock_patch as mock_time, log_capture:
        # Pretend the current time is well past the session TTL.
        mock_time.time.return_value = time.time() + SESSION_TTL + 10
        load_session(chat_id)
    # Keep only the log lines that carry the telemetry tag.
    tagged = []
    for rec in caplog.records:
        text = rec.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    assert any("session_expired" in m for m in tagged), f"No session_expired. Got: {tagged}"
# ─── 2. Session last_messages max 3 ──────────────────────────────────────────
def test_session_last_messages_max_3():
    """After four updates the session holds exactly three messages."""
    chat_id = "msg_max_chat"
    clear_session(chat_id)
    for text in [f"msg_{i}" for i in range(4)]:
        update_session(chat_id, text, "light")
    history = load_session(chat_id)["last_messages"]
    assert len(history) == 3
def test_session_last_messages_keeps_newest():
    """The oldest of four messages is dropped while the newest is retained."""
    chat_id = "msg_newest_chat"
    clear_session(chat_id)
    for text in [f"msg_{i}" for i in range(4)]:
        update_session(chat_id, text, "light")
    history = load_session(chat_id)["last_messages"]
    # Expected survivors are the newest three: msg_1, msg_2, msg_3.
    assert "msg_0" not in history
    assert "msg_3" in history
# ─── 3. Session isolation per chat_id ────────────────────────────────────────
def test_session_isolation():
    """Two chat ids keep separate depth and message history."""
    chat_a, chat_b = "iso_chat_A", "iso_chat_B"
    for cid in (chat_a, chat_b):
        clear_session(cid)
    update_session(chat_a, "msg_a", "deep")
    update_session(chat_b, "msg_b", "light")
    session_a = load_session(chat_a)
    session_b = load_session(chat_b)
    # Each session reflects only its own update.
    assert session_a["last_depth"] == "deep"
    assert session_b["last_depth"] == "light"
    # No message leaks across chats in either direction.
    assert "msg_a" not in session_b["last_messages"]
    assert "msg_b" not in session_a["last_messages"]
# ─── 4. Session update/load cycle ────────────────────────────────────────────
def test_session_update_load_roundtrip():
    """Values written by update_session are returned by load_session."""
    chat_id = "roundtrip_chat"
    clear_session(chat_id)
    update_session(
        chat_id,
        "план на тиждень",
        "deep",
        agents=["ops", "iot"],
        last_question="Уточни поле?",
    )
    loaded = load_session(chat_id)
    assert loaded["last_depth"] == "deep"
    assert "план на тиждень" in loaded["last_messages"]
    assert loaded["last_agents"] == ["ops", "iot"]
    assert loaded["last_question"] == "Уточни поле?"
def test_session_agents_max_5():
    """Passing seven agents leaves five in the stored session."""
    chat_id = "agents_max_chat"
    clear_session(chat_id)
    seven_agents = ["a", "b", "c", "d", "e", "f", "g"]
    update_session(chat_id, "msg", "deep", agents=seven_agents)
    stored_agents = load_session(chat_id)["last_agents"]
    assert len(stored_agents) == 5
def test_session_telemetry_updated(caplog):
    """update_session logs a tagged message containing ``session_updated``."""
    chat_id = "tlog_update_chat"
    clear_session(chat_id)
    with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.session_context"):
        update_session(chat_id, "тест", "light")
    # Keep only the log lines that carry the telemetry tag.
    tagged = []
    for rec in caplog.records:
        text = rec.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    assert any("session_updated" in m for m in tagged), f"No session_updated. Got: {tagged}"
def test_session_no_crash_empty_chat_id():
    """load_session/update_session with an empty chat_id do not raise."""
    empty_id = ""
    loaded = load_session(empty_id)
    assert loaded is not None
    # The call itself is the check: it must complete without an exception.
    update_session(empty_id, "msg", "deep")
# ─── 5–8. Stability Guard ────────────────────────────────────────────────────
def test_stability_guard_short_after_light():
    """last_depth=light + <=6 words + no action verbs -> light.

    The intended reason is ``stability_guard``; only the returned depth is
    asserted here.
    """
    session = dict(
        last_depth="light",
        last_messages=[],
        last_agents=[],
        last_question=None,
        updated_at=time.time(),
    )
    depth = classify_depth("а на завтра?", session=session)
    assert depth == "light"
def test_stability_guard_action_verb_overrides():
    """An action verb overrides the guard: the result is deep."""
    session = dict(
        last_depth="light",
        last_messages=[],
        last_agents=[],
        last_question=None,
        updated_at=time.time(),
    )
    depth = classify_depth("зроби план на завтра", session=session)
    assert depth == "deep"
def test_stability_guard_urgent_overrides():
    """An urgency word overrides the guard: the result is deep."""
    session = dict(
        last_depth="light",
        last_messages=[],
        last_agents=[],
        last_question=None,
        updated_at=time.time(),
    )
    depth = classify_depth("терміново на завтра?", session=session)
    assert depth == "deep"
def test_stability_guard_long_text_no_guard():
    """7+ words -> the guard does not trigger (normal logic applies)."""
    session = dict(
        last_depth="light",
        last_messages=[],
        last_agents=[],
        last_question=None,
        updated_at=time.time(),
    )
    # The message below is eight words long and begins with "перевір", which
    # the original author notes is an action verb that yields deep regardless.
    # NOTE(review): because of that verb, this case does not isolate the
    # word-count limit on its own — confirm whether that is intended.
    long_message = "перевір статус поля один два три чотири п'ять"
    depth = classify_depth(long_message, session=session)
    assert depth == "deep"
def test_stability_guard_no_session_works_normally():
    """Without a session the classification logic is unchanged."""
    depth = classify_depth("а на завтра?", session=None, last_topic="plan_day")
    # Per the original note, light is expected via short_followup_last_topic.
    assert depth == "light"
def test_stability_guard_last_depth_deep_no_guard():
    """last_depth=deep -> the guard does not trigger."""
    session = dict(
        last_depth="deep",
        last_messages=[],
        last_agents=[],
        last_question=None,
        updated_at=time.time(),
    )
    # A short message after a deep turn goes through normal classification;
    # per the original note, short_followup_last_topic then yields light.
    depth = classify_depth("а завтра?", session=session, last_topic="plan_day")
    assert depth == "light"
def test_stability_guard_telemetry(caplog):
    """Stability guard -> a tagged ``stability_guard_triggered`` log message."""
    session = dict(
        last_depth="light",
        last_messages=[],
        last_agents=[],
        last_question=None,
        updated_at=time.time(),
    )
    with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.depth_classifier"):
        classify_depth("а завтра?", session=session)
    # Keep only the log lines that carry the telemetry tag.
    tagged = []
    for rec in caplog.records:
        text = rec.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    assert any("stability_guard_triggered" in m for m in tagged), (
        f"No stability_guard_triggered. Got: {tagged}"
    )
# ─── 9–14. Proactivity ──────────────────────────────────────────────────────
def _profile_with_intent(intent: str, count: int, interaction_count: int = 10) -> dict:
    """Build a default profile for "u_proact" whose history repeats one intent.

    Args:
        intent: Intent name placed in ``known_intents``.
        count: Number of times *intent* is repeated.
        interaction_count: Value stored under ``interaction_count``.
    """
    profile = _default_user_profile("u_proact")
    profile["interaction_count"] = interaction_count
    repeated = [intent] * count
    profile["known_intents"] = repeated
    return profile
def test_proactivity_all_conditions_met():
    """All conditions met -> added=True, phrase <=120 characters, no '!'."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    response = "Ось твій план на день."
    augmented, added = maybe_add_proactivity(
        response, profile, depth="deep", reflection=None
    )
    assert added is True
    # Whatever follows the first len(response) characters is treated as the
    # appended proactive phrase.
    added_part = augmented[len(response):].strip()
    assert len(added_part) <= 120, f"Added phrase too long: {len(added_part)}"
    assert "!" not in added_part, f"Exclamation found: {added_part!r}"
def test_proactivity_light_depth_not_added():
    """depth=light -> no proactive phrase is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    outcome = maybe_add_proactivity("Відповідь.", profile, depth="light")
    _, added = outcome
    assert added is False
def test_proactivity_not_tenth_interaction():
    """interaction_count=7 (not a 10th interaction) -> nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=7)
    outcome = maybe_add_proactivity("Відповідь.", profile, depth="deep")
    _, added = outcome
    assert added is False
def test_proactivity_zero_interaction_not_added():
    """interaction_count=0 -> nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=0)
    outcome = maybe_add_proactivity("Відповідь.", profile, depth="deep")
    _, added = outcome
    assert added is False
def test_proactivity_low_confidence_not_added():
    """Reflection confidence 0.5 (below 0.7) -> nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    low_confidence = {
        "confidence": 0.5,
        "new_facts": {},
        "clarifying_question": None,
    }
    _, added = maybe_add_proactivity(
        "Відповідь.", profile, depth="deep", reflection=low_confidence
    )
    assert added is False
def test_proactivity_brief_with_question_not_added():
    """Concise style plus a response containing '?' -> nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    profile["style"] = "concise"
    reply_with_question = "Ось план. Чи уточнити?"
    _, added = maybe_add_proactivity(reply_with_question, profile, depth="deep")
    assert added is False
def test_proactivity_intent_freq_low_not_added():
    """Top intent seen only twice (fewer than 3 times) -> nothing is added."""
    profile = _profile_with_intent("plan_day", count=2, interaction_count=10)
    outcome = maybe_add_proactivity("Відповідь.", profile, depth="deep")
    _, added = outcome
    assert added is False
def test_proactivity_confidence_ok_added():
    """confidence >= 0.7 -> added=True."""
    profile = _profile_with_intent("iot_sensors", count=4, interaction_count=10)
    confident = {
        "confidence": 0.8,
        "new_facts": {},
        "clarifying_question": None,
    }
    _, added = maybe_add_proactivity(
        "Статус датчиків перевірено.",
        profile,
        depth="deep",
        reflection=confident,
    )
    assert added is True
def test_proactivity_telemetry_added(caplog):
    """Adding a proactive phrase logs a tagged ``proactivity_added`` message."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.proactivity"):
        maybe_add_proactivity("Відповідь.", profile, depth="deep")
    # Keep only the log lines that carry the telemetry tag.
    tagged = []
    for rec in caplog.records:
        text = rec.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    assert any("proactivity_added" in m for m in tagged), (
        f"No proactivity_added telemetry. Got: {tagged}"
    )
def test_proactivity_telemetry_skipped_light(caplog):
    """With depth=light a tagged ``proactivity_skipped`` message is logged.

    Capture is enabled from DEBUG level upward for the proactivity logger.
    """
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    with caplog.at_level(logging.DEBUG, logger="crews.agromatrix_crew.proactivity"):
        maybe_add_proactivity("Відповідь.", profile, depth="light")
    # Keep only the log lines that carry the telemetry tag.
    tagged = []
    for rec in caplog.records:
        text = rec.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    assert any("proactivity_skipped" in m for m in tagged), (
        f"No proactivity_skipped telemetry. Got: {tagged}"
    )
# ─── _top_intent helper ───────────────────────────────────────────────────────
def test_top_intent_returns_most_frequent():
    """_top_intent returns the most frequent intent and its count."""
    history = ["plan_day"] * 5 + ["iot_sensors"] * 2
    top_name, top_count = _top_intent(history)
    assert top_name == "plan_day"
    assert top_count == 5
def test_top_intent_empty_returns_none():
    """An empty intent list yields ``(None, 0)``."""
    no_history = []
    top_name, top_count = _top_intent(no_history)
    assert top_name is None
    assert top_count == 0
def test_top_intent_none_returns_none():
intent, count = _top_intent(None)
assert intent is None