Files
microdao-daarion/tests/test_data_governance.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

554 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Tests for Data Governance & Privacy Tool
Covers:
1. test_scan_repo_detects_pii_logging — logger + email → warning
2. test_scan_repo_detects_secret — API_KEY=sk-... → error, masked
3. test_scan_repo_detects_credit_card — credit-card pattern → error
4. test_scan_repo_no_findings_clean — clean code → 0 findings
5. test_scan_audit_detects_pii_in_meta — email in audit meta → warning
6. test_scan_audit_detects_large_output — out_size anomaly → warning
7. test_retention_check_missing_cleanup — no cleanup task → warning
8. test_retention_check_with_cleanup — runbook mentions cleanup → info
9. test_scan_repo_raw_payload_audit — raw payload near logger → error
10. test_release_check_privacy_watch_integration — gate always pass
11. test_rbac_deny — wrong agent → denied
12. test_rbac_allow — sofiia → allowed
13. test_policy_action — returns policy structure
14. test_path_traversal_protection — ../../etc/passwd blocked
15. test_scan_repo_excludes_lock_files — *.lock not scanned
"""
from __future__ import annotations
import asyncio
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict
from unittest.mock import AsyncMock, MagicMock
import pytest
# ─── Path setup ──────────────────────────────────────────────────────────────
# Make the router modules importable as top-level names (data_governance,
# audit_store, ...) and pin REPO_ROOT before those modules are imported below.
ROUTER_DIR = Path(__file__).parent.parent / "services" / "router"
REPO_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(ROUTER_DIR))
sys.path.insert(0, str(REPO_ROOT))
os.environ.setdefault("REPO_ROOT", str(REPO_ROOT))
# Force the in-memory audit backend so tests never touch persistent storage.
os.environ["AUDIT_BACKEND"] = "memory"
from data_governance import (
scan_repo,
scan_audit,
retention_check,
get_policy,
scan_data_governance_dict,
reload_policy,
_mask_evidence,
)
from audit_store import MemoryAuditStore, set_audit_store
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _write(tmp: Path, name: str, content: str) -> Path:
p = tmp / name
p.write_text(content, encoding="utf-8")
return p
def _repo_scan(tmp: Path, **kwargs) -> Dict:
    """Run scan_repo rooted at *tmp*, ignoring __pycache__, with extra kwargs passed through."""
    scan_args: Dict[str, Any] = {
        "repo_root": str(tmp),
        "paths_include": [""],  # empty string == scan from the repo root
        "paths_exclude": ["**/__pycache__/**"],
    }
    scan_args.update(kwargs)
    return scan_repo(**scan_args)
# ─── 1. PII in logging ────────────────────────────────────────────────────────
def test_scan_repo_detects_pii_logging():
    """A logger call embedding a literal email must raise a PII/logging finding."""
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _write(root, "service.py", """\
import logging
logger = logging.getLogger(__name__)
def process_user(data):
    # BAD: logging real email address
    logger.info("Processing user john.doe@example.com request: %s", data)
    return True
""")
        report = _repo_scan(root, focus=["pii", "logging"])
        # Warnings never flip the overall pass flag.
        assert report["pass"] is True
        finding_ids = [item["id"] for item in report["findings"]]
        # The email in the log call must surface as DG-PII-* or DG-LOG-*.
        matched = [fid for fid in finding_ids
                   if fid.startswith("DG-PII") or fid.startswith("DG-LOG")]
        assert matched, f"Expected PII/logging finding, got: {finding_ids}"
def test_scan_repo_detects_logging_forbidden_field():
    """Logging a 'token' value must trigger DG-LOG-001."""
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        _write(root, "auth.py", """\
import logging
logger = logging.getLogger(__name__)
def verify(token, user_id):
    logger.debug(f"Verifying token={token} for user={user_id}")
    return True
""")
        report = _repo_scan(root, focus=["logging"])
        hits = [item for item in report["findings"] if item["id"] == "DG-LOG-001"]
        assert len(hits) >= 1
        # The finding must also be reflected in the aggregate stats.
        stats = report["stats"]
        assert stats["warnings"] + stats["errors"] >= 1
# ─── 2. Secret detection ──────────────────────────────────────────────────────
def test_scan_repo_detects_secret():
    """A hardcoded API key yields a secrets finding whose evidence is masked."""
    with tempfile.TemporaryDirectory() as tmp:
        _write(Path(tmp), "config.py", """\
# Configuration
API_KEY = "sk-abc123xyz9012345678901234567890"
DATABASE_URL = "postgresql://user:mysecretpassword@localhost/db"
""")
        report = _repo_scan(Path(tmp), focus=["secrets"])
        secret_hits = [item for item in report["findings"]
                       if item["category"] == "secrets"]
        assert len(secret_hits) >= 1
        for finding in secret_hits:
            details = finding["evidence"].get("details", "")
            # Masking must hide the literal key value from the output.
            assert "sk-abc123xyz9012345678901234567890" not in details
def test_scan_repo_detects_private_key():
    """An embedded RSA private key block must produce a DG-SEC error."""
    with tempfile.TemporaryDirectory() as tmp:
        _write(Path(tmp), "keys.py", """\
PRIVATE_KEY = '''
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEA...base64data...
-----END RSA PRIVATE KEY-----
'''
""")
        report = _repo_scan(Path(tmp), focus=["secrets"])
        sec_hits = [item for item in report["findings"] if "DG-SEC" in item["id"]]
        assert len(sec_hits) >= 1
        # The private-key match itself must be error severity.
        severities = {item["severity"] for item in sec_hits}
        assert "error" in severities
# ─── 3. Credit card pattern ───────────────────────────────────────────────────
def test_scan_repo_detects_credit_card():
    """A card-number literal must yield DG-PII-003 at error severity."""
    with tempfile.TemporaryDirectory() as tmp:
        _write(Path(tmp), "payment.py", """\
# Test data (NEVER use real card numbers!)
TEST_CARD = "4111111111111111" # Visa test number
""")
        report = _repo_scan(Path(tmp), focus=["pii"])
        card_hits = [item for item in report["findings"]
                     if item["id"] == "DG-PII-003"]
        assert len(card_hits) >= 1
        assert card_hits[0]["severity"] == "error"
# ─── 4. Clean code — no findings ─────────────────────────────────────────────
def test_scan_repo_no_findings_clean():
    """Well-behaved code (truncated ids, no secrets) → zero error findings."""
    with tempfile.TemporaryDirectory() as tmp:
        _write(Path(tmp), "service.py", """\
import logging
from governance import redact
logger = logging.getLogger(__name__)
def process_request(req_id: str, workspace_id: str):
    # Log only safe identifiers
    logger.info("Processing request=%s ws=%s", req_id[:8], workspace_id[:8])
    return {"status": "ok"}
""")
        report = _repo_scan(Path(tmp), focus=["pii", "logging", "secrets"])
        # Warnings may appear, but nothing at error severity.
        errors = [item for item in report["findings"]
                  if item["severity"] == "error"]
        assert errors == []
# ─── 5. scan_audit PII in meta ───────────────────────────────────────────────
def test_scan_audit_detects_pii_in_meta():
    """An email-shaped user_id in an audit event → DG-AUD-101 finding."""
    store = MemoryAuditStore()
    set_audit_store(store)
    # Inject a single event whose user_id carries PII.
    event = {
        "ts": "2026-02-23T12:00:00+00:00",
        "req_id": "req-001",
        "workspace_id": "ws1",
        "user_id": "test.user@example.com",  # PII smuggled into user_id
        "agent_id": "sofiia",
        "tool": "observability_tool",
        "action": "logs_query",
        "status": "pass",
        "duration_ms": 100,
        "in_size": 50,
        "out_size": 200,
        "input_hash": "sha256:abc",
    }
    store.write(event)
    report = scan_audit(time_window_hours=24)
    hits = [item for item in report["findings"] if item["id"] == "DG-AUD-101"]
    assert len(hits) >= 1
    assert hits[0]["severity"] in ("warning", "error")
    # Evidence is masked/truncated — never longer than the safe cap.
    details = hits[0]["evidence"].get("details", "")
    assert len(details) <= 250
def test_scan_audit_detects_large_output():
    """out_size far above the 65536-byte threshold → DG-AUD-102 finding."""
    store = MemoryAuditStore()
    set_audit_store(store)
    store.write({
        "ts": "2026-02-23T12:00:00+00:00",
        "req_id": "req-002",
        "workspace_id": "ws1",
        "user_id": "user_x",
        "agent_id": "sofiia",
        "tool": "observability_tool",
        "action": "logs_query",
        "status": "pass",
        "duration_ms": 500,
        "in_size": 100,
        "out_size": 200000,  # 200KB — above 65536 threshold
        "input_hash": "sha256:def",
    })
    report = scan_audit(time_window_hours=24)
    big_hits = [item for item in report["findings"] if item["id"] == "DG-AUD-102"]
    assert len(big_hits) >= 1
    # The offending size must be cited in the evidence details.
    assert "200000" in big_hits[0]["evidence"].get("details", "")
# ─── 6. scan_audit — no store → graceful ─────────────────────────────────────
def test_scan_audit_no_findings_for_clean_events():
    """Five ordinary audit events → neither DG-AUD-101 nor DG-AUD-102."""
    store = MemoryAuditStore()
    set_audit_store(store)
    # Opaque ids, small payloads: nothing here should trip the scanner.
    for idx in range(5):
        store.write({
            "ts": "2026-02-23T12:00:00+00:00",
            "req_id": f"req-{idx:03d}",
            "workspace_id": "ws_opaque_hash",
            "user_id": f"usr_{idx:04d}",
            "agent_id": "sofiia",
            "tool": "cost_analyzer_tool",
            "action": "top",
            "status": "pass",
            "duration_ms": 50,
            "in_size": 40,
            "out_size": 300,
            "input_hash": "sha256:aaa",
        })
    report = scan_audit(time_window_hours=24)
    seen_ids = {item["id"] for item in report["findings"]}
    assert "DG-AUD-101" not in seen_ids  # no PII detected
    assert "DG-AUD-102" not in seen_ids  # no oversized outputs
# ─── 7. retention_check — missing cleanup ─────────────────────────────────────
def test_retention_check_missing_cleanup():
    """A repo whose task registry defines no cleanup task → DG-RET-201 warning."""
    with tempfile.TemporaryDirectory() as tmp:
        ops_dir = Path(tmp) / "ops"
        ops_dir.mkdir()
        # Registry exists but lists no tasks at all (so no audit_cleanup).
        (ops_dir / "task_registry.yml").write_text("tasks: []\n")
        report = retention_check(
            repo_root=str(tmp),
            check_audit_cleanup_task=True,
            check_jsonl_rotation=True,
            check_memory_retention_docs=False,
            check_logs_retention_docs=False,
        )
        # Warnings never fail the check outright.
        assert report["pass"] is True
        assert "DG-RET-201" in [item["id"] for item in report["findings"]]
# ─── 8. retention_check — with cleanup documented ────────────────────────────
def test_retention_check_with_cleanup():
    """A runbook documenting audit cleanup → informational DG-RET-202."""
    with tempfile.TemporaryDirectory() as tmp:
        ops_dir = Path(tmp) / "ops"
        ops_dir.mkdir()
        # The runbook text mentions both the cleanup task and rotation policy.
        runbook = (
            "# Audit Runbook\n\nRun audit_cleanup task to rotate jsonl files older than 30 days.\n"
        )
        (ops_dir / "runbook-audit.md").write_text(runbook)
        report = retention_check(
            repo_root=str(tmp),
            check_audit_cleanup_task=True,
            check_jsonl_rotation=False,
            check_memory_retention_docs=False,
            check_logs_retention_docs=False,
        )
        assert report["pass"] is True
        assert "DG-RET-202" in [item["id"] for item in report["findings"]]
# ─── 9. Raw payload near audit write ──────────────────────────────────────────
def test_scan_repo_raw_payload_audit_write():
    """A raw payload field stored next to a logger call → DG-AUD-001 error."""
    with tempfile.TemporaryDirectory() as tmp:
        _write(Path(tmp), "audit_writer.py", """\
import logging
logger = logging.getLogger(__name__)
def emit_event(req_id, payload, tool):
    # Storing full payload in audit log
    record = {"req_id": req_id, "payload": payload, "tool": tool}
    logger.info("AUDIT_EVENT %s", record)
""")
        report = _repo_scan(Path(tmp), focus=["logging"])
        aud_hits = [item for item in report["findings"]
                    if item["id"] == "DG-AUD-001"]
        assert len(aud_hits) >= 1
        assert aud_hits[0]["severity"] == "error"
# ─── 10. Release check privacy_watch integration ─────────────────────────────
def test_release_check_privacy_watch_integration():
    """privacy_watch gate always pass=True; adds recommendations.

    Runs the full release-check pipeline against a mocked tool manager whose
    data_governance_tool reports two warnings (no errors); verifies the
    privacy_watch gate still passes and its recommendations reach the report.
    """
    async def _run():
        from release_check_runner import run_release_check

        # Minimal stand-in for the tool-execution result object the runner expects.
        class FakeResult:
            def __init__(self, data, success=True, error=None):
                self.success = success
                self.result = data
                self.error = error

        # Per-tool fake dispatcher: every gate's tool succeeds cleanly except
        # data_governance_tool, which simulates a scan_repo with 2 warnings.
        async def fake_exec(tool_name, args, agent_id=None):
            if tool_name == "pr_reviewer_tool":
                return FakeResult({"approved": True, "verdict": "LGTM", "issues": []})
            if tool_name == "config_linter_tool":
                return FakeResult({"pass": True, "errors": [], "warnings": []})
            if tool_name == "dependency_scanner_tool":
                return FakeResult({"pass": True, "summary": "No vulns", "vulnerabilities": []})
            if tool_name == "contract_tool":
                return FakeResult({"pass": True, "breaking_changes": [], "warnings": []})
            if tool_name == "threatmodel_tool":
                return FakeResult({"risk_level": "low", "threats": []})
            if tool_name == "data_governance_tool":
                # Simulate findings with warning
                action = args.get("action", "")
                if action == "scan_repo":
                    return FakeResult({
                        "pass": True,
                        "summary": "2 warnings",
                        "stats": {"errors": 0, "warnings": 2, "infos": 0},
                        "findings": [
                            {"id": "DG-LOG-001", "severity": "warning",
                             "title": "Potential sensitive field logged",
                             "category": "logging", "evidence": {}, "recommended_fix": "Use redact()"},
                        ],
                        "recommendations": ["Review logger calls for sensitive fields."],
                    })
                # Other data-governance actions (e.g. scan_audit) report nothing.
                return FakeResult({"pass": True, "findings": [], "recommendations": [], "stats": {}})
            if tool_name == "cost_analyzer_tool":
                return FakeResult({"anomalies": [], "anomaly_count": 0})
            # Any tool not modelled above returns an empty successful result.
            return FakeResult({})

        tm = MagicMock()
        tm.execute_tool = AsyncMock(side_effect=fake_exec)
        inputs = {
            "diff_text": "small fix",
            "run_smoke": False,
            "run_drift": False,
            "run_deps": True,
            "run_privacy_watch": True,
            "run_cost_watch": True,
            "fail_fast": False,
        }
        return await run_release_check(tm, inputs, agent_id="sofiia")

    report = asyncio.run(_run())
    gate_names = [g["name"] for g in report["gates"]]
    assert "privacy_watch" in gate_names
    pw_gate = next(g for g in report["gates"] if g["name"] == "privacy_watch")
    assert pw_gate["status"] == "pass"
    assert pw_gate.get("warnings", 0) >= 0  # warnings don't block release
    assert report["pass"] is True
    # Recommendations from privacy_watch should be in the final report
    all_recs = report.get("recommendations", [])
    assert any("logger" in r.lower() or "redact" in r.lower() or "sensitiv" in r.lower()
               for r in all_recs), f"Expected privacy rec in {all_recs}"
# ─── 11. privacy_watch skipped on error ──────────────────────────────────────
def test_privacy_watch_skipped_on_tool_error():
    """A hard exception inside the tool → gate is skipped yet still passing."""
    async def _drive():
        from release_check_runner import _run_privacy_watch
        manager = MagicMock()
        # A genuine exception (not merely a failed result) must be caught
        # by the runner's outer try/except.
        manager.execute_tool = AsyncMock(side_effect=RuntimeError("connection refused"))
        return await _run_privacy_watch(manager, "sofiia")

    passed, gate = asyncio.run(_drive())
    assert passed is True
    assert gate["status"] == "pass"
    # The outer exception handler marks the gate as skipped.
    assert gate.get("skipped") is True
# ─── 12–13. RBAC ─────────────────────────────────────────────────────────────
def test_rbac_deny():
    """An agent lacking tools.data_gov.read must be refused."""
    from tool_governance import ToolGovernance
    governance = ToolGovernance(enable_rbac=True, enable_limits=False, enable_allowlist=False)
    decision = governance.pre_call(
        tool="data_governance_tool",
        action="scan_repo",
        agent_id="alateya",  # agent_media — no data_gov entitlement
    )
    assert not decision.allowed
    # The denial reason must mention the missing entitlement.
    lowered = decision.reason.lower()
    assert "entitlement" in lowered or "denied" in lowered
def test_rbac_allow():
    """'sofiia' (agent_cto) holds tools.data_gov.read → call permitted."""
    from tool_governance import ToolGovernance
    governance = ToolGovernance(enable_rbac=True, enable_limits=False, enable_allowlist=False)
    decision = governance.pre_call(
        tool="data_governance_tool",
        action="scan_repo",
        agent_id="sofiia",
    )
    assert decision.allowed
# ─── 14. policy action ────────────────────────────────────────────────────────
def test_policy_action():
    """The 'policy' action returns the structured governance policy."""
    reload_policy()
    policy = scan_data_governance_dict("policy")
    # All four top-level policy sections must be present.
    for section in ("retention", "pii_patterns", "severity_behavior", "logging_rules"):
        assert section in policy
    retention = policy["retention"]
    assert "audit_jsonl_days" in retention
    # Retention horizon must be a positive number of days.
    assert int(retention["audit_jsonl_days"]) > 0
# ─── 15. Path traversal protection ───────────────────────────────────────────
def test_path_traversal_protection():
    """_safe_path must refuse any path escaping repo_root (returns None)."""
    from data_governance import _safe_path
    with tempfile.TemporaryDirectory() as tmp:
        assert _safe_path(tmp, "../../etc/passwd") is None
# ─── 16. Lock files excluded ─────────────────────────────────────────────────
def test_scan_repo_excludes_lock_files():
    """Lock files are excluded from scanning to avoid false-positive secrets."""
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        # Lock files routinely contain long opaque tokens that look like secrets.
        _write(root, "poetry.lock",
               "token = \"ghp_faketoken12345678901234567890123456\"\n")
        _write(root, "service.py", "def hello(): return 'world'\n")
        report = scan_repo(
            repo_root=str(tmp),
            paths_include=[""],
            paths_exclude=["**/*.lock"],  # lock files excluded
            focus=["secrets"],
        )
        # Nothing in the results may originate from poetry.lock.
        from_lock = [
            item for item in report["findings"]
            if "poetry.lock" in item["evidence"].get("path", "")
        ]
        assert from_lock == []
# ─── 17. mask_evidence ────────────────────────────────────────────────────────
def test_mask_evidence_redacts_secrets():
    """_mask_evidence hides the value side of key=value secret patterns."""
    masked = _mask_evidence("api_key = sk-supersecretvalue12345")
    assert "sk-supersecretvalue12345" not in masked
    # Some redaction marker must replace the hidden value.
    assert ("***" in masked) or ("REDACTED" in masked)
def test_mask_evidence_truncates():
    """_mask_evidence caps output at max_chars plus a short truncation suffix."""
    masked = _mask_evidence("x" * 500, max_chars=100)
    assert len(masked) <= 120  # 100 chars + "…[truncated]" suffix
# ─── 18. scan_data_governance_dict unknown action ───────────────────────────
def test_unknown_action_returns_error():
    """An unrecognized action yields an error dict rather than raising."""
    response = scan_data_governance_dict("explode_everything")
    assert "error" in response
    assert "Unknown action" in response["error"]