New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
1025 lines
39 KiB
Python
1025 lines
39 KiB
Python
"""
|
||
Data Governance & Privacy Tool — DAARION.city
|
||
|
||
Deterministic, read-only scanner for:
|
||
A) PII patterns in code/docs/configs (email, phone, credit card, passport)
|
||
B) Secret exposure (inherits tool_governance._SECRET_PATTERNS + extras)
|
||
C) Unredacted payload risk in audit/log code
|
||
D) Storage without retention/TTL
|
||
E) Audit stream anomalies (PII in meta, large outputs)
|
||
F) Retention policy presence (cleanup tasks, runbooks)
|
||
|
||
Actions:
|
||
scan_repo — static analysis of repository files
|
||
scan_audit — analysis of JSONL/Postgres audit events
|
||
retention_check — verify cleanup mechanisms exist
|
||
policy — return current governance policy
|
||
|
||
Security / Privacy:
|
||
- All evidence snippets are masked/truncated before returning
|
||
- Tool is read-only; never writes or modifies files
|
||
- Path traversal protection: all paths confined to repo_root
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import fnmatch
|
||
import json
|
||
import logging
|
||
import os
|
||
import re
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ─── Config loader ────────────────────────────────────────────────────────────
|
||
|
||
# Cached parsed policy document; populated lazily by _load_policy() and
# cleared by reload_policy().
_policy_cache: Optional[Dict] = None

# Policy file location: <REPO_ROOT>/config/data_governance_policy.yml.
# REPO_ROOT defaults to three directories above this module.
_POLICY_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "data_governance_policy.yml",
)
|
||
|
||
|
||
def _load_policy() -> Dict:
    """Return the governance policy dict, loading and caching it on first use.

    Falls back to an empty dict (with a logged warning) when the YAML file
    is missing or unreadable, so callers never need to handle load errors.
    """
    global _policy_cache
    if _policy_cache is None:
        loaded: Dict = {}
        try:
            import yaml
            with open(_POLICY_PATH, "r") as fh:
                loaded = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("data_governance_policy.yml not loaded: %s", exc)
        _policy_cache = loaded
    return _policy_cache
|
||
|
||
|
||
def reload_policy() -> None:
    """Drop the cached policy so the next _load_policy() re-reads the file."""
    global _policy_cache
    _policy_cache = None
|
||
|
||
|
||
# ─── Compiled patterns (lazy) ─────────────────────────────────────────────────
|
||
|
||
# Lazily compiled regex caches; each is populated on first use by its
# corresponding _get_*() accessor below and only reset on process restart.
_compiled_pii: Optional[List[Dict]] = None
_compiled_secret: Optional[List[Dict]] = None
# The single-pattern caches use one-element lists as "compiled once" holders.
_compiled_log_forbidden: Optional[List[re.Pattern]] = None
_compiled_raw_payload: Optional[List[re.Pattern]] = None
_compiled_storage_write: Optional[List[re.Pattern]] = None
|
||
|
||
|
||
def _get_pii_patterns() -> List[Dict]:
    """Compile and cache the PII detection patterns declared in the policy.

    Each entry carries: name, compiled regex, severity (default 'warning'),
    finding id (default 'DG-PII-<name>') and a human-readable description.
    Malformed entries are skipped with a warning rather than failing the scan.
    """
    global _compiled_pii
    if _compiled_pii is None:
        compiled: List[Dict] = []
        policy = _load_policy()
        for name, cfg in (policy.get("pii_patterns") or {}).items():
            try:
                entry = {
                    "name": name,
                    "regex": re.compile(cfg["regex"], re.MULTILINE),
                    "severity": cfg.get("severity", "warning"),
                    "id": cfg.get("id", f"DG-PII-{name}"),
                    "description": cfg.get("description", name),
                }
            except Exception as exc:
                logger.warning("Bad pii_pattern '%s': %s", name, exc)
            else:
                compiled.append(entry)
        _compiled_pii = compiled
    return _compiled_pii
|
||
|
||
|
||
def _get_secret_patterns() -> List[Dict]:
    """Compile and cache secret-detection patterns.

    Combines (a) pre-compiled patterns inherited from tool_governance's
    _SECRET_PATTERNS and (b) extra patterns declared under
    secret_patterns.extra in the policy file. Either source is optional:
    a missing tool_governance module or a malformed extra entry degrades
    gracefully instead of breaking the scanner.
    """
    global _compiled_secret
    if _compiled_secret is not None:
        return _compiled_secret

    patterns: List[Dict] = []

    # (a) Inherited, already-compiled patterns from the governance layer.
    try:
        from tool_governance import _SECRET_PATTERNS
        for idx, pat in enumerate(_SECRET_PATTERNS):
            patterns.append({
                "name": f"inherited_{idx}",
                "regex": pat,
                "severity": "error",
                "id": "DG-SEC-000",
                "description": "Secret-like value (inherited from governance)",
            })
    except Exception:
        pass

    # (b) Extra patterns declared in the policy file.
    extras = _load_policy().get("secret_patterns", {}).get("extra") or []
    for extra in extras:
        try:
            patterns.append({
                "name": extra["name"],
                "regex": re.compile(extra["regex"], re.MULTILINE),
                "severity": extra.get("severity", "error"),
                "id": extra.get("id", "DG-SEC-EXT"),
                "description": extra.get("name", "extra secret pattern"),
            })
        except Exception as exc:
            logger.warning("Bad extra secret pattern '%s': %s", extra.get("name"), exc)

    _compiled_secret = patterns
    return patterns
|
||
|
||
|
||
def _get_log_forbidden_pattern() -> re.Pattern:
    """Return (and cache) the regex matching log calls naming forbidden fields.

    Field names come from logging_rules.forbid_logging_fields in the policy;
    a hard-coded default set is used when none are configured.
    """
    global _compiled_log_forbidden
    if not _compiled_log_forbidden:
        rules = _load_policy().get("logging_rules") or {}
        fields = rules.get("forbid_logging_fields") or ["password", "token", "secret", "api_key"]
        alternation = "|".join(re.escape(field) for field in fields)
        # A logger/print call, up to 200 non-')' chars, then a forbidden field.
        source = (
            r'(?i)(?:logger|log|logging|print|console\.log)\s*[.(]'
            r'[^)]{0,200}'
            r'(?:' + alternation + r')'
        )
        _compiled_log_forbidden = [re.compile(source, re.MULTILINE)]
    return _compiled_log_forbidden[0]
|
||
|
||
|
||
def _get_raw_payload_pattern() -> re.Pattern:
    """Return (and cache) the regex matching raw-payload indicator words.

    Indicator words come from logging_rules.raw_payload_indicators in the
    policy; a default list is used when none are configured.
    """
    global _compiled_raw_payload
    if not _compiled_raw_payload:
        rules = _load_policy().get("logging_rules") or {}
        words = rules.get("raw_payload_indicators") or ["payload", "prompt", "messages", "transcript"]
        compiled = re.compile(
            r'(?i)(?:' + "|".join(re.escape(word) for word in words) + r')',
            re.MULTILINE,
        )
        _compiled_raw_payload = [compiled]
    return _compiled_raw_payload[0]
|
||
|
||
|
||
def _get_storage_write_pattern() -> re.Pattern:
    """Return (and cache) the regex matching storage-write call names.

    Call names come from storage_keywords.write_patterns in the policy;
    a default list is used when none are configured.
    """
    global _compiled_storage_write
    if not _compiled_storage_write:
        keywords = _load_policy().get("storage_keywords") or {}
        calls = keywords.get("write_patterns") or [
            "save_message", "store_event", "insert_record", "append_event",
        ]
        compiled = re.compile(
            r'(?i)(?:' + "|".join(re.escape(call) for call in calls) + r')',
            re.MULTILINE,
        )
        _compiled_storage_write = [compiled]
    return _compiled_storage_write[0]
|
||
|
||
|
||
# ─── Evidence masking ─────────────────────────────────────────────────────────
|
||
|
||
def _mask_evidence(text: str, max_chars: int = 200) -> str:
|
||
"""Mask secrets and truncate snippet for safe reporting."""
|
||
try:
|
||
from tool_governance import redact
|
||
text = redact(text)
|
||
except Exception:
|
||
# Fallback: mask common patterns
|
||
text = re.sub(
|
||
r'(?i)(token|secret|password|key|bearer)\s*[=:]\s*\S+',
|
||
r'\1=***',
|
||
text,
|
||
)
|
||
# Truncate
|
||
if len(text) > max_chars:
|
||
text = text[:max_chars] + "…[truncated]"
|
||
return text.strip()
|
||
|
||
|
||
def _line_range(lineno: int, window: int = 2) -> str:
|
||
start = max(1, lineno - window)
|
||
end = lineno + window
|
||
return f"L{start}-L{end}"
|
||
|
||
|
||
# ─── Path utilities ───────────────────────────────────────────────────────────
|
||
|
||
def _is_excluded(rel_path: str, excludes: List[str]) -> bool:
|
||
for pat in excludes:
|
||
if fnmatch.fnmatch(rel_path, pat):
|
||
return True
|
||
# Also match against basename
|
||
if fnmatch.fnmatch(Path(rel_path).name, pat):
|
||
return True
|
||
# Forward-slash wildcard matching
|
||
if fnmatch.fnmatch("/" + rel_path.replace("\\", "/"), pat.replace("**", "*")):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _is_included(rel_path: str, includes: List[str]) -> bool:
|
||
if not includes:
|
||
return True
|
||
for inc in includes:
|
||
if rel_path.startswith(inc.rstrip("/")):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _never_scan(rel_path: str) -> bool:
    """Return True when the file's basename matches a paths.never_scan glob.

    Patterns are applied as-is with fnmatch, so globs like '*.pem' behave
    as intended. (The previous implementation stripped leading '*'
    characters, turning '*.pem' into the exact name '.pem', which almost
    never fired.) The legacy stripped form is still tried as a fallback so
    existing policy entries keep working — a backward-compatible superset.
    """
    pol = _load_policy()
    never = (pol.get("paths") or {}).get("never_scan") or []
    name = Path(rel_path).name
    for pat in never:
        # Proper glob semantics first; then the legacy stripped form.
        if fnmatch.fnmatch(name, pat) or fnmatch.fnmatch(name, pat.lstrip("*")):
            return True
    return False
|
||
|
||
|
||
def _safe_path(repo_root: str, rel: str) -> Optional[Path]:
|
||
"""Resolve path safely, preventing traversal outside repo_root."""
|
||
root = Path(repo_root).resolve()
|
||
try:
|
||
p = (root / rel).resolve()
|
||
if not str(p).startswith(str(root)):
|
||
return None
|
||
return p
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
# ─── Finding builder ─────────────────────────────────────────────────────────
|
||
|
||
def _finding(
    fid: str,
    category: str,
    severity: str,
    title: str,
    path: str = "",
    lines: str = "",
    details: str = "",
    fix: str = "",
) -> Dict:
    """Assemble one finding record; evidence details are masked before storage."""
    evidence = {
        "path": path,
        "lines": lines,
        "details": _mask_evidence(details),
    }
    return {
        "id": fid,
        "category": category,
        "severity": severity,
        "title": title,
        "evidence": evidence,
        "recommended_fix": fix,
    }
|
||
|
||
|
||
# ─── A) PII scan ──────────────────────────────────────────────────────────────
|
||
|
||
def _scan_pii(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Append a finding for every PII-pattern match in *content* (in place)."""
    filename = Path(rel_path).name
    for pat_info in _get_pii_patterns():
        for match in pat_info["regex"].finditer(content):
            line_no = content.count("\n", 0, match.start()) + 1
            findings.append(_finding(
                fid=pat_info["id"],
                category="pii",
                severity=pat_info["severity"],
                title=f"{pat_info['description']} in {filename}",
                path=rel_path,
                lines=_line_range(line_no),
                details=_mask_evidence(match.group(0)),
                fix="Replace with hash, mask, or remove this value. Ensure it is not stored in plaintext.",
            ))
|
||
|
||
|
||
# ─── B) Secret scan ───────────────────────────────────────────────────────────
|
||
|
||
def _scan_secrets(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Append a finding for every secret-pattern match in *content* (in place)."""
    filename = Path(rel_path).name
    for pat_info in _get_secret_patterns():
        for match in pat_info["regex"].finditer(content):
            line_no = content.count("\n", 0, match.start()) + 1
            findings.append(_finding(
                fid=pat_info["id"],
                category="secrets",
                severity=pat_info["severity"],
                title=f"Secret-like value in {filename}",
                path=rel_path,
                lines=_line_range(line_no),
                details=_mask_evidence(match.group(0), max_chars=60),
                fix="Move to environment variable or secrets manager. Never hardcode secrets.",
            ))
|
||
|
||
|
||
# ─── C) Logging risk scan ────────────────────────────────────────────────────
|
||
|
||
def _scan_logging_risk(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Flag (a) log calls naming forbidden fields and (b) raw-payload words
    near audit/log writes, appending findings to *findings* in place.

    A match is suppressed when a configured redaction call (default
    'redact'/'mask') appears within a few lines of it, so already-redacted
    code is not flagged.
    """
    # Skip non-code files where logging patterns won't appear
    ext = Path(rel_path).suffix.lower()
    if ext not in (".py", ".ts", ".js"):
        return

    log_pat = _get_log_forbidden_pattern()
    payload_pat = _get_raw_payload_pattern()

    pol = _load_policy()
    redaction_calls = (pol.get("logging_rules") or {}).get("redaction_calls") or ["redact", "mask"]

    lines = content.splitlines()
    n = len(lines)
    context_window = 5  # lines around match to check for redaction

    # Pass 1: forbidden field names inside logger/print calls → DG-LOG-001.
    for m in log_pat.finditer(content):
        lineno = content[:m.start()].count("\n") + 1
        # Check if there's a redaction call nearby
        lo = max(0, lineno - 1 - context_window)
        hi = min(n, lineno + context_window)
        context_lines = "\n".join(lines[lo:hi])
        if any(rc in context_lines for rc in redaction_calls):
            continue  # Redaction present — skip
        findings.append(_finding(
            fid="DG-LOG-001",
            category="logging",
            severity="warning",
            title=f"Potential sensitive field logged in {Path(rel_path).name}",
            path=rel_path,
            lines=_line_range(lineno),
            details=_mask_evidence(m.group(0)),
            fix="Apply redact() or mask() before logging. Log hash+last4 for identifiers.",
        ))

    # Pass 2: raw payload indicators near an audit/log write → DG-AUD-001.
    # Audit/log payload risk: look for raw payload storage
    for m in payload_pat.finditer(content):
        lineno = content[:m.start()].count("\n") + 1
        # Only flag if in a logger/write context
        # (tighter ±3-line window than pass 1).
        lo = max(0, lineno - 1 - 3)
        hi = min(n, lineno + 3)
        context = "\n".join(lines[lo:hi])
        if not re.search(r'(?i)(log|audit|event|record|store|write|insert|append|emit)', context):
            continue
        if any(rc in context for rc in redaction_calls):
            continue
        findings.append(_finding(
            fid="DG-AUD-001",
            category="logging",
            severity="error",
            title=f"Raw payload field near audit/log write in {Path(rel_path).name}",
            path=rel_path,
            lines=_line_range(lineno),
            details=_mask_evidence(m.group(0)),
            fix="Ensure payload fields are NOT stored in audit events. "
                "Log hash+size only (as in ToolGovernance post_call).",
        ))
|
||
|
||
|
||
# ─── D) Storage without retention ────────────────────────────────────────────
|
||
|
||
def _scan_retention_risk(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Flag storage-write calls with no retention/TTL indicator nearby.

    Only code files (.py/.ts/.js) are scanned. For each write-pattern match,
    a window of surrounding lines (storage_keywords.context_window, default
    20) is searched for retention indicator words (default: ttl, expire,
    retention, cleanup); a DG-RET-001 warning is appended when none appears.
    """
    if Path(rel_path).suffix.lower() not in (".py", ".ts", ".js"):
        return

    cfg = _load_policy().get("storage_keywords") or {}
    indicators = cfg.get("retention_indicators") or ["ttl", "expire", "retention", "cleanup"]
    window = int(cfg.get("context_window", 20))

    retention_pat = re.compile(
        r'(?i)(?:' + "|".join(re.escape(word) for word in indicators) + r')',
        re.MULTILINE,
    )

    source_lines = content.splitlines()
    total = len(source_lines)
    filename = Path(rel_path).name

    for match in _get_storage_write_pattern().finditer(content):
        line_no = content.count("\n", 0, match.start()) + 1
        start = max(0, line_no - 1 - window)
        stop = min(total, line_no + window)
        nearby = "\n".join(source_lines[start:stop])
        if retention_pat.search(nearby):
            continue  # a TTL/retention hint is visible near the write
        findings.append(_finding(
            fid="DG-RET-001",
            category="retention",
            severity="warning",
            title=f"Storage write without visible TTL/retention in {filename}",
            path=rel_path,
            lines=_line_range(line_no),
            details=_mask_evidence(match.group(0)),
            fix="Add TTL/expiry to stored data or document retention policy in runbook. "
                "Reference ops/runbook-* for cleanup procedures.",
        ))
|
||
|
||
|
||
# ─── File collector ───────────────────────────────────────────────────────────
|
||
|
||
def _collect_files(
    repo_root: str,
    paths_include: List[str],
    paths_exclude: List[str],
    max_files: int,
    mode: str = "fast",
) -> List[Tuple[str, str]]:
    """
    Returns list of (rel_path, full_path) tuples.
    In 'fast' mode: only .py, .yml, .yaml, .json, .env.example, .sh.
    In 'full' mode: all configured extensions.

    Candidates are filtered through paths_exclude globs and the policy's
    paths.never_scan list; collection stops at max_files entries.
    """
    pol = _load_policy()
    if mode == "fast":
        scan_exts = {".py", ".yml", ".yaml", ".json", ".env.example", ".sh"}
    else:
        scan_exts = set((pol.get("paths") or {}).get("scan_extensions") or [
            ".py", ".ts", ".js", ".yml", ".yaml", ".json", ".md", ".txt", ".sh",
        ])

    # Path.suffix only yields the LAST extension, so multi-dot entries like
    # '.env.example' must be matched against the whole filename instead.
    # (The previous suffix-only check silently never matched them.)
    single_exts = {ext for ext in scan_exts if ext.count(".") <= 1}
    multi_exts = tuple(scan_exts - single_exts)

    root = Path(repo_root).resolve()
    results: List[Tuple[str, str]] = []

    for start_dir in paths_include:
        start = root / start_dir.rstrip("/")
        if not start.exists():
            continue
        for fpath in start.rglob("*"):
            if not fpath.is_file():
                continue
            name = fpath.name.lower()
            if fpath.suffix.lower() not in single_exts and not name.endswith(multi_exts):
                continue
            try:
                rel = str(fpath.relative_to(root))
            except ValueError:
                continue
            if _is_excluded(rel, paths_exclude):
                continue
            if _never_scan(rel):
                continue
            results.append((rel, str(fpath)))
            if len(results) >= max_files:
                return results

    return results
|
||
|
||
|
||
# ─── scan_repo ────────────────────────────────────────────────────────────────
|
||
|
||
def scan_repo(
    repo_root: str = ".",
    mode: str = "fast",
    max_files: int = 200,
    max_bytes_per_file: int = 262144,
    paths_include: Optional[List[str]] = None,
    paths_exclude: Optional[List[str]] = None,
    focus: Optional[List[str]] = None,
) -> Dict:
    """
    Static scan of repository files for privacy/security risks.

    Args:
        repo_root: Repository root to scan.
        mode: 'fast' (small extension set) or 'full'; forwarded to _collect_files().
        max_files: Cap on files collected.
        max_bytes_per_file: Files larger than this are skipped and counted.
        paths_include / paths_exclude: Override the policy's paths.include /
            paths.exclude lists when provided.
        focus: Which scanners to run. Defaults include 'storage', but note
            no scanner branch below consumes 'storage' — only pii/secrets/
            logging/retention are dispatched.

    Returns structured findings dict (pass always True in warning_only mode).
    """
    pol = _load_policy()
    paths_include = paths_include or (pol.get("paths") or {}).get("include") or ["services/", "config/", "ops/"]
    paths_exclude = paths_exclude or (pol.get("paths") or {}).get("exclude") or []
    focus = focus or ["logging", "storage", "pii", "secrets", "retention"]
    max_findings = int((pol.get("limits") or {}).get("max_findings", 200))
    gate_mode = (pol.get("severity_behavior") or {}).get("gate_mode", "warning_only")

    files = _collect_files(repo_root, paths_include, paths_exclude, max_files, mode)
    all_findings: List[Dict] = []
    files_scanned = 0
    skipped = 0

    for rel_path, full_path in files:
        try:
            size = os.path.getsize(full_path)
            if size > max_bytes_per_file:
                skipped += 1
                continue
            # errors="replace" keeps the scan alive on non-UTF-8 bytes.
            with open(full_path, "r", encoding="utf-8", errors="replace") as f:
                content = f.read()
        except Exception as e:
            logger.warning("Cannot read %s: %s", full_path, e)
            skipped += 1
            continue

        files_scanned += 1

        if "pii" in focus:
            _scan_pii(content, rel_path, all_findings)
        if "secrets" in focus:
            _scan_secrets(content, rel_path, all_findings)
        if "logging" in focus:
            _scan_logging_risk(content, rel_path, all_findings)
        if "retention" in focus:
            _scan_retention_risk(content, rel_path, all_findings)

        # Stop scanning further files once the findings budget is exhausted.
        if len(all_findings) >= max_findings:
            break

    # Deduplicate: same id+path+lines
    seen = set()
    unique_findings = []
    for f in all_findings:
        key = (f["id"], f["evidence"].get("path"), f["evidence"].get("lines"))
        if key not in seen:
            unique_findings.append(f)
            seen.add(key)

    unique_findings = unique_findings[:max_findings]

    errors = sum(1 for f in unique_findings if f["severity"] == "error")
    warnings = sum(1 for f in unique_findings if f["severity"] == "warning")
    infos = sum(1 for f in unique_findings if f["severity"] == "info")

    # Only 'strict' gate_mode can fail the scan; the default is advisory.
    pass_val = True  # warning_only mode
    if gate_mode == "strict" and errors > 0:
        pass_val = False

    recommendations = _build_recommendations(unique_findings)

    return {
        "pass": pass_val,
        "summary": (
            f"Scanned {files_scanned} files ({mode} mode). "
            f"Found {errors} errors, {warnings} warnings, {infos} infos."
            + (f" ({skipped} files skipped: too large)" if skipped else "")
        ),
        "stats": {
            "errors": errors,
            "warnings": warnings,
            "infos": infos,
            "files_scanned": files_scanned,
            "files_skipped": skipped,
            "events_scanned": 0,
        },
        "findings": unique_findings,
        "recommendations": recommendations,
    }
|
||
|
||
|
||
# ─── scan_audit ───────────────────────────────────────────────────────────────
|
||
|
||
def scan_audit(
    backend: str = "auto",
    time_window_hours: int = 24,
    max_events: int = 50000,
    jsonl_glob: Optional[str] = None,
    repo_root: str = ".",
) -> Dict:
    """
    Scan audit event stream for PII leaks and large-output anomalies.
    backend='auto' uses the globally configured store (Postgres or JSONL).

    NOTE(review): jsonl_glob and repo_root are accepted but never referenced
    in this body — presumably reserved for future use; confirm with callers.
    Any store/read failure downgrades to a "skipped" result rather than
    raising, so the tool stays read-only and non-blocking.
    """
    pol = _load_policy()
    large_threshold = int((pol.get("retention") or {}).get("large_output_bytes", 65536))

    pii_patterns = _get_pii_patterns()
    findings: List[Dict] = []
    events_scanned = 0

    try:
        store = _resolve_audit_store(backend)

        import datetime
        now = datetime.datetime.now(datetime.timezone.utc)
        from_ts = (now - datetime.timedelta(hours=time_window_hours)).isoformat()

        events = store.read(from_ts=from_ts, limit=max_events)
        events_scanned = len(events)

        for ev in events:
            # Check meta fields for PII (graph_run_id, job_id should be safe; check input_hash)
            meta_str = json.dumps({
                k: ev.get(k) for k in ("agent_id", "user_id", "workspace_id", "input_hash", "graph_run_id", "job_id")
                if ev.get(k)
            })

            for pat_info in pii_patterns:
                m = pat_info["regex"].search(meta_str)
                if m:
                    findings.append(_finding(
                        fid="DG-AUD-101",
                        category="audit",
                        severity=pat_info["severity"],
                        title=f"PII-like pattern in audit event metadata ({pat_info['description']})",
                        path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}",
                        lines="",
                        details=_mask_evidence(meta_str, max_chars=80),
                        fix="Ensure user_id/workspace_id are opaque identifiers, not real PII. "
                            "Check how identifiers are generated.",
                    ))
                    break  # One finding per event

            # Large output anomaly
            out_size = int(ev.get("out_size", 0))
            if out_size >= large_threshold:
                findings.append(_finding(
                    fid="DG-AUD-102",
                    category="audit",
                    severity="warning",
                    title=f"Unusually large tool output: {ev.get('tool','?')} ({out_size} bytes)",
                    path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}",
                    lines="",
                    details=f"out_size={out_size}, agent={ev.get('agent_id','?')}, status={ev.get('status','?')}",
                    fix="Verify output does not include raw user content. "
                        "Enforce max_bytes_out in tool_limits.yml.",
                ))

    except Exception as e:
        # Best-effort: an unreadable backend yields an empty, passing report.
        logger.warning("scan_audit error: %s", e)
        return {
            "pass": True,
            "summary": f"Audit scan skipped: {e}",
            "stats": {"errors": 0, "warnings": 0, "infos": 0, "events_scanned": 0, "files_scanned": 0},
            "findings": [],
            "recommendations": [],
        }

    # Deduplicate
    # (keyed on id+path only — the per-event ts suffix in 'path' keeps
    # distinct days distinct).
    seen = set()
    unique = []
    for f in findings:
        key = (f["id"], f["evidence"].get("path"))
        if key not in seen:
            unique.append(f)
            seen.add(key)

    errors = sum(1 for f in unique if f["severity"] == "error")
    warnings = sum(1 for f in unique if f["severity"] == "warning")
    infos = sum(1 for f in unique if f["severity"] == "info")

    return {
        "pass": True,
        "summary": f"Scanned {events_scanned} audit events. {errors} errors, {warnings} warnings.",
        "stats": {
            "errors": errors, "warnings": warnings, "infos": infos,
            "events_scanned": events_scanned, "files_scanned": 0,
        },
        "findings": unique,
        "recommendations": _build_recommendations(unique),
    }
|
||
|
||
|
||
# ─── retention_check ─────────────────────────────────────────────────────────
|
||
|
||
def retention_check(
    repo_root: str = ".",
    check_audit_cleanup_task: bool = True,
    check_jsonl_rotation: bool = True,
    check_memory_retention_docs: bool = True,
    check_logs_retention_docs: bool = True,
) -> Dict:
    """
    Verify that cleanup/retention mechanisms exist for audit logs and memory.

    Runs up to four independent keyword-presence checks under *repo_root*
    (each toggled by its flag) and emits one info/warning finding per check.
    Always returns pass=True — findings are advisory only.
    """
    findings: List[Dict] = []

    root = Path(repo_root).resolve()

    def _file_contains(path: Path, keywords: List[str]) -> bool:
        # Case-insensitive substring search; unreadable files count as "no".
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
            return any(kw.lower() in text.lower() for kw in keywords)
        except Exception:
            return False

    def _find_files(pattern: str) -> List[Path]:
        # Recursive glob from the repo root.
        return list(root.rglob(pattern))

    # ── 1. Audit cleanup task ──────────────────────────────────────────────
    if check_audit_cleanup_task:
        has_cleanup = False

        # Check task_registry.yml for audit_cleanup task
        registry_files = _find_files("task_registry.yml")
        for rf in registry_files:
            if _file_contains(rf, ["audit_cleanup", "audit_rotation"]):
                has_cleanup = True
                break

        # Check runbooks/ops docs
        if not has_cleanup:
            runbook_files = list(root.glob("ops/runbook*.md")) + list(root.rglob("*runbook*.md"))
            for rb in runbook_files:
                if _file_contains(rb, ["audit", "cleanup", "rotation", "jsonl"]):
                    has_cleanup = True
                    break

        if has_cleanup:
            findings.append(_finding(
                fid="DG-RET-202",
                category="retention",
                severity="info",
                title="Audit cleanup/rotation mechanism documented",
                path="ops/",
                fix="",
            ))
        else:
            findings.append(_finding(
                fid="DG-RET-201",
                category="retention",
                severity="warning",
                title="No audit cleanup task or runbook found",
                path="ops/task_registry.yml",
                fix="Add 'audit_cleanup' task to ops/task_registry.yml or document retention "
                    "procedure in ops/runbook-*.md. Default retention: 30 days.",
            ))

    # ── 2. JSONL rotation (audit_store.py check) ──────────────────────────
    if check_jsonl_rotation:
        store_file = root / "services" / "router" / "audit_store.py"
        if store_file.exists() and _file_contains(store_file, ["rotation", "daily", "tool_audit_"]):
            findings.append(_finding(
                fid="DG-RET-203",
                category="retention",
                severity="info",
                title="JSONL audit rotation implemented in audit_store.py",
                path="services/router/audit_store.py",
                fix="",
            ))
        else:
            findings.append(_finding(
                fid="DG-RET-204",
                category="retention",
                severity="warning",
                title="JSONL audit rotation not confirmed in audit_store.py",
                path="services/router/audit_store.py",
                fix="Ensure JsonlAuditStore uses daily rotation (tool_audit_YYYY-MM-DD.jsonl) "
                    "and implement a cleanup job for files older than 30 days.",
            ))

    # ── 3. Memory retention docs ─────────────────────────────────────────
    if check_memory_retention_docs:
        has_mem_retention = False
        doc_files = list(root.rglob("*.md")) + list(root.rglob("*.yml"))
        for df in doc_files[:200]:  # limit scan
            # NOTE(review): "memory.*expire" is matched as a literal
            # substring by _file_contains, not as a regex — it will
            # effectively never match; confirm intent.
            if _file_contains(df, ["memory_events_days", "memory retention", "memory_ttl", "memory.*expire"]):
                has_mem_retention = True
                break
        if not has_mem_retention:
            findings.append(_finding(
                fid="DG-RET-205",
                category="retention",
                severity="info",
                title="Memory event retention policy not found in docs/config",
                path="config/",
                fix="Document memory event TTL/retention in config/data_governance_policy.yml "
                    "(memory_events_days) and implement cleanup.",
            ))

    # ── 4. Logs retention docs ───────────────────────────────────────────
    if check_logs_retention_docs:
        has_log_retention = False
        for df in (list(root.glob("ops/*.md")) + list(root.rglob("*runbook*.md")))[:50]:
            if _file_contains(df, ["logs_days", "log retention", "log rotation", "loki retention"]):
                has_log_retention = True
                break
        if not has_log_retention:
            findings.append(_finding(
                fid="DG-RET-206",
                category="retention",
                severity="info",
                title="Log retention period not documented in runbooks",
                path="ops/",
                fix="Document log retention in ops/runbook-*.md or config/data_governance_policy.yml "
                    "(logs_days: 14).",
            ))

    errors = sum(1 for f in findings if f["severity"] == "error")
    warnings = sum(1 for f in findings if f["severity"] == "warning")
    infos = sum(1 for f in findings if f["severity"] == "info")

    return {
        "pass": True,
        "summary": f"Retention check: {errors} errors, {warnings} warnings, {infos} infos.",
        "stats": {"errors": errors, "warnings": warnings, "infos": infos, "files_scanned": 0, "events_scanned": 0},
        "findings": findings,
        "recommendations": _build_recommendations(findings),
    }
|
||
|
||
|
||
# ─── policy ───────────────────────────────────────────────────────────────────
|
||
|
||
def get_policy() -> Dict:
    """Return a summary view of the current governance policy.

    The cache is cleared first, guaranteeing a fresh read of the policy
    file. Per-pattern regexes are omitted from the PII section — only each
    pattern's severity and finding id are exposed.
    """
    reload_policy()
    pol = _load_policy()
    pii_summary = {
        name: {"severity": cfg.get("severity"), "id": cfg.get("id")}
        for name, cfg in (pol.get("pii_patterns") or {}).items()
    }
    return {
        "policy_path": _POLICY_PATH,
        "retention": pol.get("retention", {}),
        "pii_patterns": pii_summary,
        "secret_patterns_count": len(_get_secret_patterns()),
        "logging_rules": pol.get("logging_rules", {}),
        "severity_behavior": pol.get("severity_behavior", {}),
        "limits": pol.get("limits", {}),
    }
|
||
|
||
|
||
# ─── Recommendations ──────────────────────────────────────────────────────────
|
||
|
||
# Finding-id → one-line remediation advice; consumed by
# _build_recommendations() for error/warning findings only.
_REC_MAP = {
    "DG-LOG-001": "Review logger calls for sensitive fields. Apply redact() before logging.",
    "DG-AUD-001": "Audit/log stores may contain raw payload. Enforce hash+size-only pattern.",
    "DG-RET-001": "Add TTL or cleanup policy for stored data. Reference data_governance_policy.yml.",
    "DG-RET-201": "Create an 'audit_cleanup' task in task_registry.yml or document retention in runbook.",
    "DG-AUD-101": "Verify audit event identifiers are opaque (not real PII).",
    "DG-AUD-102": "Large tool outputs may contain user content. Enforce max_bytes_out limits.",
    "DG-PII-001": "Mask or hash email addresses before storage/logging.",
    "DG-PII-002": "Mask phone numbers in logs and stored data.",
    "DG-PII-003": "Credit card-like patterns detected. Remove immediately and audit access.",
    "DG-SEC-000": "Rotate or remove secret-like values. Use environment variables.",
    "DG-SEC-001": "Remove private key from code. Use secrets manager.",
}
|
||
|
||
|
||
def _build_recommendations(findings: List[Dict]) -> List[str]:
    """Map error/warning findings to de-duplicated remediation strings.

    Only the first occurrence of each finding id contributes; info-level
    findings and ids missing from _REC_MAP are ignored. Output order
    follows the findings list.
    """
    emitted: set = set()
    recommendations: List[str] = []
    for finding in findings:
        if finding["severity"] not in ("error", "warning"):
            continue
        fid = finding.get("id", "")
        if fid in emitted:
            continue
        advice = _REC_MAP.get(fid)
        if advice:
            recommendations.append(advice)
            emitted.add(fid)
    return recommendations
|
||
|
||
|
||
# ─── backend=auto resolver ───────────────────────────────────────────────────
|
||
|
||
def _resolve_audit_store(backend: str = "auto"):
    """Resolve an AuditStore implementation by backend name.

    backend:
        'auto' / '' / None — the globally configured store (get_audit_store)
        'jsonl'            — JSONL store rooted at $AUDIT_JSONL_DIR
                             (default: $REPO_ROOT/ops/audit)
        'memory'           — in-process store (useful for tests)
    Any unrecognized value falls back to the global store.
    """
    from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
    if backend in ("auto", None, ""):
        return get_audit_store()
    if backend == "jsonl":
        # os and Path are module-level imports; the previous redundant
        # function-local re-imports have been removed.
        audit_dir = os.getenv(
            "AUDIT_JSONL_DIR",
            str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
        )
        return JsonlAuditStore(audit_dir)
    if backend == "memory":
        return MemoryAuditStore()
    return get_audit_store()
|
||
|
||
|
||
# ─── digest_audit ─────────────────────────────────────────────────────────────
|
||
|
||
def digest_audit(
|
||
backend: str = "auto",
|
||
time_window_hours: int = 24,
|
||
max_findings: int = 20,
|
||
max_markdown_chars: int = 3800,
|
||
) -> Dict:
|
||
"""
|
||
Privacy/audit digest: scans audit stream, summarises findings.
|
||
|
||
Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.
|
||
"""
|
||
store = _resolve_audit_store(backend)
|
||
|
||
# Run underlying scan
|
||
raw = scan_audit(
|
||
backend=backend,
|
||
time_window_hours=time_window_hours,
|
||
max_events=50_000,
|
||
)
|
||
|
||
findings = raw.get("findings") or []
|
||
stats = raw.get("stats") or {}
|
||
events_scanned = stats.get("events_scanned", 0)
|
||
errors = stats.get("errors", 0)
|
||
warnings = stats.get("warnings", 0)
|
||
infos = stats.get("infos", 0)
|
||
total = errors + warnings + infos
|
||
|
||
# Group findings by category
|
||
by_category: dict = {}
|
||
for f in findings[:max_findings]:
|
||
cat = f.get("category", "unknown")
|
||
by_category.setdefault(cat, []).append(f)
|
||
|
||
# Recommendations from findings
|
||
recs = _build_recommendations(findings[:max_findings])
|
||
|
||
# Determine source backend
|
||
source = "unknown"
|
||
try:
|
||
if hasattr(store, "active_backend"):
|
||
source = store.active_backend()
|
||
elif type(store).__name__ == "PostgresAuditStore":
|
||
source = "postgres"
|
||
elif type(store).__name__ == "JsonlAuditStore":
|
||
source = "jsonl"
|
||
elif type(store).__name__ == "MemoryAuditStore":
|
||
source = "memory"
|
||
except Exception:
|
||
pass
|
||
|
||
# ── Markdown ─────────────────────────────────────────────────────────────
|
||
period = f"Last {time_window_hours}h"
|
||
status_icon = "🔴" if errors > 0 else ("🟡" if warnings > 0 else "🟢")
|
||
lines = [
|
||
f"{status_icon} **Privacy Audit Digest** ({period})",
|
||
f"Events scanned: {events_scanned} | Findings: {total} ({errors}E / {warnings}W / {infos}I)",
|
||
f"Backend: `{source}`",
|
||
"",
|
||
]
|
||
if total == 0:
|
||
lines.append("✅ No privacy issues detected in audit stream.")
|
||
else:
|
||
for cat, cat_findings in by_category.items():
|
||
lines.append(f"**[{cat.upper()}]** {len(cat_findings)} finding(s):")
|
||
for f in cat_findings[:3]:
|
||
sev = f.get("severity", "?")
|
||
icon = "🔴" if sev == "error" else ("🟡" if sev == "warning" else "ℹ️")
|
||
lines.append(f" {icon} `{f.get('id','?')}` — {f.get('title','')[:100]}")
|
||
lines.append("")
|
||
|
||
if recs:
|
||
lines.append("💡 **Recommendations:**")
|
||
for r in recs[:5]:
|
||
lines.append(f" {r[:200]}")
|
||
|
||
markdown = "\n".join(lines)
|
||
if len(markdown) > max_markdown_chars:
|
||
markdown = markdown[:max_markdown_chars] + "\n…[truncated]"
|
||
|
||
return {
|
||
"period": period,
|
||
"window_hours": time_window_hours,
|
||
"source_backend": source,
|
||
"stats": {
|
||
"events_scanned": events_scanned,
|
||
"errors": errors,
|
||
"warnings": warnings,
|
||
"infos": infos,
|
||
"total": total,
|
||
},
|
||
"by_category": {cat: len(fs) for cat, fs in by_category.items()},
|
||
"top_findings": findings[:max_findings],
|
||
"recommendations": recs,
|
||
"markdown": markdown,
|
||
"pass": raw.get("pass", True),
|
||
}
|
||
|
||
|
||
# ─── Main entrypoint ─────────────────────────────────────────────────────────
|
||
|
||
def scan_data_governance_dict(action: str, params: Optional[Dict] = None, repo_root: Optional[str] = None) -> Dict:
|
||
"""
|
||
Dispatcher called by tool_manager handler.
|
||
Returns plain dict suitable for ToolResult.
|
||
"""
|
||
params = params or {}
|
||
if repo_root is None:
|
||
repo_root = os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent))
|
||
|
||
if action == "scan_repo":
|
||
return scan_repo(
|
||
repo_root=repo_root,
|
||
mode=params.get("mode", "fast"),
|
||
max_files=int(params.get("max_files", 200)),
|
||
max_bytes_per_file=int(params.get("max_bytes_per_file", 262144)),
|
||
paths_include=params.get("paths_include"),
|
||
paths_exclude=params.get("paths_exclude"),
|
||
focus=params.get("focus"),
|
||
)
|
||
|
||
if action == "digest_audit":
|
||
return digest_audit(
|
||
backend=params.get("backend", "auto"),
|
||
time_window_hours=int(params.get("time_window_hours", 24)),
|
||
max_findings=int(params.get("max_findings", 20)),
|
||
max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
|
||
)
|
||
|
||
if action == "scan_audit":
|
||
return scan_audit(
|
||
backend=params.get("backend", "auto"),
|
||
time_window_hours=int(params.get("time_window_hours", 24)),
|
||
max_events=int(params.get("max_events", 50000)),
|
||
jsonl_glob=params.get("jsonl_glob"),
|
||
repo_root=repo_root,
|
||
)
|
||
|
||
if action == "retention_check":
|
||
return retention_check(
|
||
repo_root=repo_root,
|
||
check_audit_cleanup_task=bool(params.get("check_audit_cleanup_task", True)),
|
||
check_jsonl_rotation=bool(params.get("check_jsonl_rotation", True)),
|
||
check_memory_retention_docs=bool(params.get("check_memory_retention_docs", True)),
|
||
check_logs_retention_docs=bool(params.get("check_logs_retention_docs", True)),
|
||
)
|
||
|
||
if action == "policy":
|
||
return get_policy()
|
||
|
||
return {"error": f"Unknown action '{action}'. Valid: scan_repo, digest_audit, scan_audit, retention_check, policy"}
|