# microdao-daarion/tests/test_stepan_v3_session_proactivity_stability.py

"""
Tests для Humanized Stepan v3 — Session Context + Soft Proactivity + Intent Stability Guard.

Покриває:
1. Session TTL expiry
2. Session last_messages max 3
3. Session isolation per chat_id
4. Session update/load cycle
5. Stability guard: last_depth=light + short → light reason=stability_guard
6. Stability guard: action verb → deep (guard не спрацьовує)
7. Stability guard: urgent → deep (guard не спрацьовує)
8. Stability guard: >6 слів → guard не спрацьовує
9. Proactivity: умови виконані → added=True, фраза ≤120, без "!"
10. Proactivity: depth=light → not added
11. Proactivity: not 10th interaction → not added
12. Proactivity: confidence < 0.7 → not added
13. Proactivity: brief + "?" → not added
14. Proactivity: intent_freq < 3 → not added
15. Telemetry: AGX_STEPAN_METRIC session_updated / session_expired / stability_guard_triggered / proactivity_added
"""

import logging
import sys
import time
from copy import deepcopy
from pathlib import Path
from unittest.mock import patch

root = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(root))
sys.path.insert(0, str(root / 'packages' / 'agromatrix-tools'))

from crews.agromatrix_crew.session_context import (
    load_session, update_session, clear_session, _STORE, SESSION_TTL, _default_session,
)
from crews.agromatrix_crew.proactivity import (
    maybe_add_proactivity, _top_intent,
)
from crews.agromatrix_crew.depth_classifier import classify_depth
from crews.agromatrix_crew.memory_manager import _default_user_profile
from crews.agromatrix_crew.telemetry import TELEMETRY_TAG


# ─── Helper ───────────────────────────────────────────────────────────────────

class _CaptureHandler(logging.Handler):
    """Logging handler that keeps every emitted record in an in-memory list."""

    def __init__(self):
        logging.Handler.__init__(self)
        # Records are kept in the order they were emitted.
        self.records: list[logging.LogRecord] = []

    def emit(self, record):
        """Store *record* on ``self.records`` instead of writing it anywhere."""
        self.records.append(record)

    @property
    def messages(self):
        """Return the formatted message of each captured record, oldest first."""
        rendered = []
        for captured in self.records:
            rendered.append(captured.getMessage())
        return rendered


def _attach(module_path: str) -> tuple[logging.Logger, _CaptureHandler]:
    """Hook a fresh ``_CaptureHandler`` onto the logger named *module_path*.

    The logger's level is set to DEBUG. The handler is not removed here;
    the logger and the handler are both returned to the caller.
    """
    capture = _CaptureHandler()
    target = logging.getLogger(module_path)
    target.setLevel(logging.DEBUG)
    target.addHandler(capture)
    return target, capture


def _profile(interaction_count=0, known_intents=None, style="conversational"):
    """Build a default profile for ``"test_user"`` and override three fields.

    A falsy *known_intents* (``None`` or an empty list) is stored as a new
    empty list.
    """
    profile = _default_user_profile("test_user")
    intents = known_intents if known_intents else []
    profile["interaction_count"] = interaction_count
    profile["known_intents"] = intents
    profile["style"] = style
    return profile


# ─── 1. Session TTL expiry ────────────────────────────────────────────────────

def test_session_ttl_expiry_returns_default():
    """A session loaded after SESSION_TTL has passed has no depth and no messages."""
    chat_id = "ttl_test_chat"
    clear_session(chat_id)
    update_session(chat_id, "повідомлення", "deep")

    # Replace the ``time`` name inside session_context with a mock whose
    # clock reads one second past the TTL, so the stored entry looks stale.
    clock_patch = patch("crews.agromatrix_crew.session_context.time")
    with clock_patch as fake_time:
        fake_time.time.return_value = time.time() + SESSION_TTL + 1
        expired = load_session(chat_id)

    assert expired["last_depth"] is None
    assert expired["last_messages"] == []


def test_session_ttl_expiry_logs_expired(caplog):
    """Loading a stale session logs a telemetry-tagged ``session_expired`` message."""
    chat_id = "ttl_log_chat"
    clear_session(chat_id)
    update_session(chat_id, "msg", "deep")

    # Mocked clock sits ten seconds past the TTL while the load happens.
    with patch("crews.agromatrix_crew.session_context.time") as fake_time:
        with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.session_context"):
            fake_time.time.return_value = time.time() + SESSION_TTL + 10
            load_session(chat_id)

    # Keep only the messages that carry the telemetry tag.
    tagged = []
    for record in caplog.records:
        text = record.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    matching = [m for m in tagged if "session_expired" in m]
    assert matching, f"No session_expired. Got: {tagged}"


# ─── 2. Session last_messages max 3 ──────────────────────────────────────────

def test_session_last_messages_max_3():
    """After four updates the session holds exactly three messages."""
    chat_id = "msg_max_chat"
    clear_session(chat_id)
    for n in range(4):
        message = f"msg_{n}"
        update_session(chat_id, message, "light")
    stored = load_session(chat_id)["last_messages"]
    assert len(stored) == 3


def test_session_last_messages_keeps_newest():
    """After four updates the oldest message is gone and the newest is kept."""
    chat_id = "msg_newest_chat"
    clear_session(chat_id)
    for n in range(4):
        message = f"msg_{n}"
        update_session(chat_id, message, "light")
    kept = load_session(chat_id)["last_messages"]
    # Expected survivors are msg_1, msg_2, msg_3; only the two ends are checked.
    assert "msg_0" not in kept
    assert "msg_3" in kept


# ─── 3. Session isolation per chat_id ────────────────────────────────────────

def test_session_isolation():
    """Two chat ids keep separate depth and message history."""
    chat_a, chat_b = "iso_chat_A", "iso_chat_B"
    for cid in (chat_a, chat_b):
        clear_session(cid)
    update_session(chat_a, "msg_a", "deep")
    update_session(chat_b, "msg_b", "light")

    session_a = load_session(chat_a)
    session_b = load_session(chat_b)

    # Each chat sees its own depth...
    assert session_a["last_depth"] == "deep"
    assert session_b["last_depth"] == "light"
    # ...and never the other chat's message.
    assert "msg_a" not in session_b["last_messages"]
    assert "msg_b" not in session_a["last_messages"]


# ─── 4. Session update/load cycle ────────────────────────────────────────────

def test_session_update_load_roundtrip():
    """Values written by update_session are read back by load_session."""
    chat_id = "roundtrip_chat"
    clear_session(chat_id)
    update_session(
        chat_id,
        "план на тиждень",
        "deep",
        agents=["ops", "iot"],
        last_question="Уточни поле?",
    )
    loaded = load_session(chat_id)
    assert loaded["last_depth"] == "deep"
    assert "план на тиждень" in loaded["last_messages"]
    assert loaded["last_agents"] == ["ops", "iot"]
    assert loaded["last_question"] == "Уточни поле?"


def test_session_agents_max_5():
    """Seven agents passed to update_session are stored as five."""
    chat_id = "agents_max_chat"
    clear_session(chat_id)
    seven_agents = ["a", "b", "c", "d", "e", "f", "g"]
    update_session(chat_id, "msg", "deep", agents=seven_agents)
    stored_agents = load_session(chat_id)["last_agents"]
    assert len(stored_agents) == 5


def test_session_telemetry_updated(caplog):
    """update_session logs a telemetry-tagged ``session_updated`` message."""
    chat_id = "tlog_update_chat"
    clear_session(chat_id)
    with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.session_context"):
        update_session(chat_id, "тест", "light")

    # Keep only the messages that carry the telemetry tag.
    tagged = []
    for record in caplog.records:
        text = record.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    matching = [m for m in tagged if "session_updated" in m]
    assert matching, f"No session_updated. Got: {tagged}"


def test_session_no_crash_empty_chat_id():
    """load/update with an empty chat_id must not raise."""
    loaded = load_session("")
    assert loaded is not None
    # No assertion follows: the call returning without an exception is the check.
    update_session("", "msg", "deep")


# ─── 5–8. Stability Guard ─────────────────────────────────────────────────────

def test_stability_guard_short_after_light():
    """last_depth=light + short text without action verbs → light.

    Only the returned depth is asserted here, not the reason.
    """
    session = {
        "last_depth": "light",
        "last_messages": [],
        "last_agents": [],
        "last_question": None,
        "updated_at": time.time(),
    }
    depth = classify_depth("а на завтра?", session=session)
    assert depth == "light"


def test_stability_guard_action_verb_overrides():
    """An action verb wins over the guard: the result is deep."""
    session = {
        "last_depth": "light",
        "last_messages": [],
        "last_agents": [],
        "last_question": None,
        "updated_at": time.time(),
    }
    depth = classify_depth("зроби план на завтра", session=session)
    assert depth == "deep"


def test_stability_guard_urgent_overrides():
    """An urgency word wins over the guard: the result is deep."""
    session = {
        "last_depth": "light",
        "last_messages": [],
        "last_agents": [],
        "last_question": None,
        "updated_at": time.time(),
    }
    depth = classify_depth("терміново на завтра?", session=session)
    assert depth == "deep"


def test_stability_guard_long_text_no_guard():
    """A long message after a light turn is not held at light by the guard."""
    session = {
        "last_depth": "light",
        "last_messages": [],
        "last_agents": [],
        "last_question": None,
        "updated_at": time.time(),
    }
    # NOTE(review): the input is eight words long and starts with the action
    # verb "перевір", so "deep" is expected regardless of the word count;
    # this input cannot tell the word-count rule apart from the verb rule.
    long_text = "перевір статус поля один два три чотири п'ять"
    depth = classify_depth(long_text, session=session)
    assert depth == "deep"


def test_stability_guard_no_session_works_normally():
    """With session=None the classification is unchanged."""
    depth = classify_depth(
        "а на завтра?",
        session=None,
        last_topic="plan_day",
    )
    # Expected via the short follow-up on a known last topic.
    assert depth == "light"


def test_stability_guard_last_depth_deep_no_guard():
    """last_depth=deep: the guard is not expected to fire."""
    session = {
        "last_depth": "deep",
        "last_messages": [],
        "last_agents": [],
        "last_question": None,
        "updated_at": time.time(),
    }
    # The message is short but the previous turn was deep, so the ordinary
    # classification applies; a short follow-up on a known topic gives light.
    depth = classify_depth("а завтра?", session=session, last_topic="plan_day")
    assert depth == "light"


def test_stability_guard_telemetry(caplog):
    """A guard hit logs a telemetry-tagged ``stability_guard_triggered`` message."""
    session = {
        "last_depth": "light",
        "last_messages": [],
        "last_agents": [],
        "last_question": None,
        "updated_at": time.time(),
    }
    with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.depth_classifier"):
        classify_depth("а завтра?", session=session)

    # Keep only the messages that carry the telemetry tag.
    tagged = []
    for record in caplog.records:
        text = record.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    matching = [m for m in tagged if "stability_guard_triggered" in m]
    assert matching, f"No stability_guard_triggered. Got: {tagged}"


# ─── 9–14. Proactivity ───────────────────────────────────────────────────────

def _profile_with_intent(intent: str, count: int, interaction_count: int = 10) -> dict:
    """Build a default profile for ``"u_proact"`` whose history is *intent* repeated *count* times."""
    profile = _default_user_profile("u_proact")
    repeated_intents = [intent] * count
    profile["interaction_count"] = interaction_count
    profile["known_intents"] = repeated_intents
    return profile


def test_proactivity_all_conditions_met():
    """All conditions hold → added=True, phrase of at most 120 chars, no '!'."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    response = "Ось твій план на день."
    new_resp, added = maybe_add_proactivity(
        response, profile, depth="deep", reflection=None
    )
    assert added is True

    # Whatever follows the first len(response) characters is treated as the
    # added phrase.
    suffix = new_resp[len(response):].strip()
    assert len(suffix) <= 120, f"Added phrase too long: {len(suffix)}"
    assert "!" not in suffix, f"Exclamation found: {suffix!r}"


def test_proactivity_light_depth_not_added():
    """depth="light" → nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    _text, was_added = maybe_add_proactivity("Відповідь.", profile, depth="light")
    assert was_added is False


def test_proactivity_not_tenth_interaction():
    """interaction_count=7 → nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=7)
    _text, was_added = maybe_add_proactivity("Відповідь.", profile, depth="deep")
    assert was_added is False


def test_proactivity_zero_interaction_not_added():
    """interaction_count=0 → nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=0)
    _text, was_added = maybe_add_proactivity("Відповідь.", profile, depth="deep")
    assert was_added is False


def test_proactivity_low_confidence_not_added():
    """Reflection confidence of 0.5 → nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    reflection = {
        "confidence": 0.5,
        "new_facts": {},
        "clarifying_question": None,
    }
    _text, was_added = maybe_add_proactivity(
        "Відповідь.", profile, depth="deep", reflection=reflection
    )
    assert was_added is False


def test_proactivity_brief_with_question_not_added():
    """style="concise" with a response that already contains "?" → nothing is added."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    profile["style"] = "concise"
    _text, was_added = maybe_add_proactivity(
        "Ось план. Чи уточнити?", profile, depth="deep"
    )
    assert was_added is False


def test_proactivity_intent_freq_low_not_added():
    """An intent seen only twice (fewer than 3 times) → nothing is added."""
    profile = _profile_with_intent("plan_day", count=2, interaction_count=10)
    _text, was_added = maybe_add_proactivity("Відповідь.", profile, depth="deep")
    assert was_added is False


def test_proactivity_confidence_ok_added():
    """confidence >= 0.7 → added=True."""
    profile = _profile_with_intent("iot_sensors", count=4, interaction_count=10)
    reflection = {
        "confidence": 0.8,
        "new_facts": {},
        "clarifying_question": None,
    }
    _text, was_added = maybe_add_proactivity(
        "Статус датчиків перевірено.",
        profile,
        depth="deep",
        reflection=reflection,
    )
    assert was_added is True


def test_proactivity_telemetry_added(caplog):
    """A successful addition logs a telemetry-tagged ``proactivity_added`` message."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    with caplog.at_level(logging.INFO, logger="crews.agromatrix_crew.proactivity"):
        maybe_add_proactivity("Відповідь.", profile, depth="deep")

    # Keep only the messages that carry the telemetry tag.
    tagged = []
    for record in caplog.records:
        text = record.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    matching = [m for m in tagged if "proactivity_added" in m]
    assert matching, f"No proactivity_added telemetry. Got: {tagged}"


def test_proactivity_telemetry_skipped_light(caplog):
    """A skip at depth="light" logs a telemetry-tagged ``proactivity_skipped`` message."""
    profile = _profile_with_intent("plan_day", count=5, interaction_count=10)
    # Captured at DEBUG level, unlike the other telemetry tests in this file.
    with caplog.at_level(logging.DEBUG, logger="crews.agromatrix_crew.proactivity"):
        maybe_add_proactivity("Відповідь.", profile, depth="light")

    # Keep only the messages that carry the telemetry tag.
    tagged = []
    for record in caplog.records:
        text = record.getMessage()
        if TELEMETRY_TAG in text:
            tagged.append(text)
    matching = [m for m in tagged if "proactivity_skipped" in m]
    assert matching, f"No proactivity_skipped telemetry. Got: {tagged}"


# ─── _top_intent helper ───────────────────────────────────────────────────────

def test_top_intent_returns_most_frequent():
    """_top_intent returns the most common intent together with its count."""
    history = ["plan_day"] * 5 + ["iot_sensors"] * 2
    top, frequency = _top_intent(history)
    assert top == "plan_day"
    assert frequency == 5


def test_top_intent_empty_returns_none():
    """An empty history gives ``(None, 0)``."""
    top, frequency = _top_intent([])
    assert top is None
    assert frequency == 0


def test_top_intent_none_returns_none():
    intent, count = _top_intent(None)
    assert intent is None