""" Data Governance & Privacy Tool — DAARION.city Deterministic, read-only scanner for: A) PII patterns in code/docs/configs (email, phone, credit card, passport) B) Secret exposure (inherits tool_governance._SECRET_PATTERNS + extras) C) Unredacted payload risk in audit/log code D) Storage without retention/TTL E) Audit stream anomalies (PII in meta, large outputs) F) Retention policy presence (cleanup tasks, runbooks) Actions: scan_repo — static analysis of repository files scan_audit — analysis of JSONL/Postgres audit events retention_check — verify cleanup mechanisms exist policy — return current governance policy Security / Privacy: - All evidence snippets are masked/truncated before returning - Tool is read-only; never writes or modifies files - Path traversal protection: all paths confined to repo_root """ from __future__ import annotations import fnmatch import json import logging import os import re from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) # ─── Config loader ──────────────────────────────────────────────────────────── _policy_cache: Optional[Dict] = None _POLICY_PATH = os.path.join( os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)), "config", "data_governance_policy.yml", ) def _load_policy() -> Dict: global _policy_cache if _policy_cache is not None: return _policy_cache try: import yaml with open(_POLICY_PATH, "r") as f: _policy_cache = yaml.safe_load(f) or {} except Exception as e: logger.warning("data_governance_policy.yml not loaded: %s", e) _policy_cache = {} return _policy_cache def reload_policy() -> None: global _policy_cache _policy_cache = None # ─── Compiled patterns (lazy) ───────────────────────────────────────────────── _compiled_pii: Optional[List[Dict]] = None _compiled_secret: Optional[List[Dict]] = None _compiled_log_forbidden: Optional[List[re.Pattern]] = None _compiled_raw_payload: Optional[List[re.Pattern]] = None _compiled_storage_write: Optional[List[re.Pattern]] = None def _get_pii_patterns() -> List[Dict]: global _compiled_pii if _compiled_pii is not None: return _compiled_pii pol = _load_policy() result = [] for name, cfg in (pol.get("pii_patterns") or {}).items(): try: result.append({ "name": name, "regex": re.compile(cfg["regex"], re.MULTILINE), "severity": cfg.get("severity", "warning"), "id": cfg.get("id", f"DG-PII-{name}"), "description": cfg.get("description", name), }) except Exception as e: logger.warning("Bad pii_pattern '%s': %s", name, e) _compiled_pii = result return result def _get_secret_patterns() -> List[Dict]: global _compiled_secret if _compiled_secret is not None: return _compiled_secret # Inherit from tool_governance inherited = [] try: from tool_governance import _SECRET_PATTERNS for idx, pat in enumerate(_SECRET_PATTERNS): inherited.append({ "name": f"inherited_{idx}", "regex": pat, "severity": "error", "id": "DG-SEC-000", "description": "Secret-like value (inherited from governance)", }) except Exception: pass # Extra from policy pol = _load_policy() for extra in (pol.get("secret_patterns", {}).get("extra") or []): try: inherited.append({ "name": extra["name"], "regex": re.compile(extra["regex"], re.MULTILINE), "severity": extra.get("severity", "error"), "id": extra.get("id", "DG-SEC-EXT"), "description": extra.get("name", "extra secret pattern"), }) except Exception as e: logger.warning("Bad extra secret pattern '%s': %s", extra.get("name"), e) _compiled_secret = inherited return inherited def _get_log_forbidden_pattern() -> re.Pattern: global _compiled_log_forbidden if _compiled_log_forbidden: return _compiled_log_forbidden[0] pol = _load_policy() fields = (pol.get("logging_rules") or {}).get("forbid_logging_fields") or [] if not fields: fields = ["password", "token", "secret", "api_key"] pat = re.compile( r'(?i)(?:logger|log|logging|print|console\.log)\s*[.(]' r'[^)]{0,200}' r'(?:' + "|".join(re.escape(f) for f in fields) + r')', re.MULTILINE, ) _compiled_log_forbidden = [pat] return pat def _get_raw_payload_pattern() -> re.Pattern: global _compiled_raw_payload if _compiled_raw_payload: return _compiled_raw_payload[0] pol = _load_policy() indicators = (pol.get("logging_rules") or {}).get("raw_payload_indicators") or [] if not indicators: indicators = ["payload", "prompt", "messages", "transcript"] pat = re.compile( r'(?i)(?:' + "|".join(re.escape(f) for f in indicators) + r')', re.MULTILINE, ) _compiled_raw_payload = [pat] return pat def _get_storage_write_pattern() -> re.Pattern: global _compiled_storage_write if _compiled_storage_write: return _compiled_storage_write[0] pol = _load_policy() writes = (pol.get("storage_keywords") or {}).get("write_patterns") or [] if not writes: writes = ["save_message", "store_event", "insert_record", "append_event"] pat = re.compile( r'(?i)(?:' + "|".join(re.escape(w) for w in writes) + r')', re.MULTILINE, ) _compiled_storage_write = [pat] return pat # ─── Evidence masking ───────────────────────────────────────────────────────── def _mask_evidence(text: str, max_chars: int = 200) -> str: """Mask secrets and truncate snippet for safe reporting.""" try: from tool_governance import redact text = redact(text) except Exception: # Fallback: mask common patterns text = re.sub( r'(?i)(token|secret|password|key|bearer)\s*[=:]\s*\S+', r'\1=***', text, ) # Truncate if len(text) > max_chars: text = text[:max_chars] + "…[truncated]" return text.strip() def _line_range(lineno: int, window: int = 2) -> str: start = max(1, lineno - window) end = lineno + window return f"L{start}-L{end}" # ─── Path utilities ─────────────────────────────────────────────────────────── def _is_excluded(rel_path: str, excludes: List[str]) -> bool: for pat in excludes: if fnmatch.fnmatch(rel_path, pat): return True # Also match against basename if fnmatch.fnmatch(Path(rel_path).name, pat): return True # Forward-slash wildcard matching if fnmatch.fnmatch("/" + rel_path.replace("\\", "/"), pat.replace("**", "*")): return True return False def _is_included(rel_path: str, includes: List[str]) -> bool: if not includes: return True for inc in includes: if rel_path.startswith(inc.rstrip("/")): return True return False def _never_scan(rel_path: str) -> bool: pol = _load_policy() never = (pol.get("paths") or {}).get("never_scan") or [] name = Path(rel_path).name for pat in never: if fnmatch.fnmatch(name, pat.lstrip("*")): return True return False def _safe_path(repo_root: str, rel: str) -> Optional[Path]: """Resolve path safely, preventing traversal outside repo_root.""" root = Path(repo_root).resolve() try: p = (root / rel).resolve() if not str(p).startswith(str(root)): return None return p except Exception: return None # ─── Finding builder ───────────────────────────────────────────────────────── def _finding( fid: str, category: str, severity: str, title: str, path: str = "", lines: str = "", details: str = "", fix: str = "", ) -> Dict: return { "id": fid, "category": category, "severity": severity, "title": title, "evidence": { "path": path, "lines": lines, "details": _mask_evidence(details), }, "recommended_fix": fix, } # ─── A) PII scan ────────────────────────────────────────────────────────────── def _scan_pii(content: str, rel_path: str, findings: List[Dict]) -> None: for pat_info in _get_pii_patterns(): for m in pat_info["regex"].finditer(content): lineno = content[:m.start()].count("\n") + 1 snippet = _mask_evidence(m.group(0)) findings.append(_finding( fid=pat_info["id"], category="pii", severity=pat_info["severity"], title=f"{pat_info['description']} in {Path(rel_path).name}", path=rel_path, lines=_line_range(lineno), details=snippet, fix="Replace with hash, mask, or remove this value. Ensure it is not stored in plaintext.", )) # ─── B) Secret scan ─────────────────────────────────────────────────────────── def _scan_secrets(content: str, rel_path: str, findings: List[Dict]) -> None: for pat_info in _get_secret_patterns(): for m in pat_info["regex"].finditer(content): lineno = content[:m.start()].count("\n") + 1 findings.append(_finding( fid=pat_info["id"], category="secrets", severity=pat_info["severity"], title=f"Secret-like value in {Path(rel_path).name}", path=rel_path, lines=_line_range(lineno), details=_mask_evidence(m.group(0), max_chars=60), fix="Move to environment variable or secrets manager. Never hardcode secrets.", )) # ─── C) Logging risk scan ──────────────────────────────────────────────────── def _scan_logging_risk(content: str, rel_path: str, findings: List[Dict]) -> None: # Skip non-code files where logging patterns won't appear ext = Path(rel_path).suffix.lower() if ext not in (".py", ".ts", ".js"): return log_pat = _get_log_forbidden_pattern() payload_pat = _get_raw_payload_pattern() pol = _load_policy() redaction_calls = (pol.get("logging_rules") or {}).get("redaction_calls") or ["redact", "mask"] lines = content.splitlines() n = len(lines) context_window = 5 # lines around match to check for redaction for m in log_pat.finditer(content): lineno = content[:m.start()].count("\n") + 1 # Check if there's a redaction call nearby lo = max(0, lineno - 1 - context_window) hi = min(n, lineno + context_window) context_lines = "\n".join(lines[lo:hi]) if any(rc in context_lines for rc in redaction_calls): continue # Redaction present — skip findings.append(_finding( fid="DG-LOG-001", category="logging", severity="warning", title=f"Potential sensitive field logged in {Path(rel_path).name}", path=rel_path, lines=_line_range(lineno), details=_mask_evidence(m.group(0)), fix="Apply redact() or mask() before logging. Log hash+last4 for identifiers.", )) # Audit/log payload risk: look for raw payload storage for m in payload_pat.finditer(content): lineno = content[:m.start()].count("\n") + 1 # Only flag if in a logger/write context lo = max(0, lineno - 1 - 3) hi = min(n, lineno + 3) context = "\n".join(lines[lo:hi]) if not re.search(r'(?i)(log|audit|event|record|store|write|insert|append|emit)', context): continue if any(rc in context for rc in redaction_calls): continue findings.append(_finding( fid="DG-AUD-001", category="logging", severity="error", title=f"Raw payload field near audit/log write in {Path(rel_path).name}", path=rel_path, lines=_line_range(lineno), details=_mask_evidence(m.group(0)), fix="Ensure payload fields are NOT stored in audit events. " "Log hash+size only (as in ToolGovernance post_call).", )) # ─── D) Storage without retention ──────────────────────────────────────────── def _scan_retention_risk(content: str, rel_path: str, findings: List[Dict]) -> None: ext = Path(rel_path).suffix.lower() if ext not in (".py", ".ts", ".js"): return pol = _load_policy() storage_cfg = pol.get("storage_keywords") or {} retention_indicators = storage_cfg.get("retention_indicators") or ["ttl", "expire", "retention", "cleanup"] context_window = int(storage_cfg.get("context_window", 20)) write_pat = _get_storage_write_pattern() retention_pat = re.compile( r'(?i)(?:' + "|".join(re.escape(r) for r in retention_indicators) + r')', re.MULTILINE, ) lines = content.splitlines() n = len(lines) for m in write_pat.finditer(content): lineno = content[:m.start()].count("\n") + 1 lo = max(0, lineno - 1 - context_window) hi = min(n, lineno + context_window) context = "\n".join(lines[lo:hi]) if retention_pat.search(context): continue # Retention indicator found — OK findings.append(_finding( fid="DG-RET-001", category="retention", severity="warning", title=f"Storage write without visible TTL/retention in {Path(rel_path).name}", path=rel_path, lines=_line_range(lineno), details=_mask_evidence(m.group(0)), fix="Add TTL/expiry to stored data or document retention policy in runbook. " "Reference ops/runbook-* for cleanup procedures.", )) # ─── File collector ─────────────────────────────────────────────────────────── def _collect_files( repo_root: str, paths_include: List[str], paths_exclude: List[str], max_files: int, mode: str = "fast", ) -> List[Tuple[str, str]]: """ Returns list of (rel_path, full_path) tuples. In 'fast' mode: only .py, .yml, .yaml, .json, .env.example. In 'full' mode: all configured extensions. """ pol = _load_policy() if mode == "fast": scan_exts = {".py", ".yml", ".yaml", ".json", ".env.example", ".sh"} else: scan_exts = set((pol.get("paths") or {}).get("scan_extensions") or [ ".py", ".ts", ".js", ".yml", ".yaml", ".json", ".md", ".txt", ".sh", ]) root = Path(repo_root).resolve() results = [] for start_dir in paths_include: start = root / start_dir.rstrip("/") if not start.exists(): continue for fpath in start.rglob("*"): if not fpath.is_file(): continue if fpath.suffix.lower() not in scan_exts: continue try: rel = str(fpath.relative_to(root)) except ValueError: continue if _is_excluded(rel, paths_exclude): continue if _never_scan(rel): continue results.append((rel, str(fpath))) if len(results) >= max_files: return results return results # ─── scan_repo ──────────────────────────────────────────────────────────────── def scan_repo( repo_root: str = ".", mode: str = "fast", max_files: int = 200, max_bytes_per_file: int = 262144, paths_include: Optional[List[str]] = None, paths_exclude: Optional[List[str]] = None, focus: Optional[List[str]] = None, ) -> Dict: """ Static scan of repository files for privacy/security risks. Returns structured findings dict (pass always True in warning_only mode). """ pol = _load_policy() paths_include = paths_include or (pol.get("paths") or {}).get("include") or ["services/", "config/", "ops/"] paths_exclude = paths_exclude or (pol.get("paths") or {}).get("exclude") or [] focus = focus or ["logging", "storage", "pii", "secrets", "retention"] max_findings = int((pol.get("limits") or {}).get("max_findings", 200)) gate_mode = (pol.get("severity_behavior") or {}).get("gate_mode", "warning_only") files = _collect_files(repo_root, paths_include, paths_exclude, max_files, mode) all_findings: List[Dict] = [] files_scanned = 0 skipped = 0 for rel_path, full_path in files: try: size = os.path.getsize(full_path) if size > max_bytes_per_file: skipped += 1 continue with open(full_path, "r", encoding="utf-8", errors="replace") as f: content = f.read() except Exception as e: logger.warning("Cannot read %s: %s", full_path, e) skipped += 1 continue files_scanned += 1 if "pii" in focus: _scan_pii(content, rel_path, all_findings) if "secrets" in focus: _scan_secrets(content, rel_path, all_findings) if "logging" in focus: _scan_logging_risk(content, rel_path, all_findings) if "retention" in focus: _scan_retention_risk(content, rel_path, all_findings) if len(all_findings) >= max_findings: break # Deduplicate: same id+path+lines seen = set() unique_findings = [] for f in all_findings: key = (f["id"], f["evidence"].get("path"), f["evidence"].get("lines")) if key not in seen: unique_findings.append(f) seen.add(key) unique_findings = unique_findings[:max_findings] errors = sum(1 for f in unique_findings if f["severity"] == "error") warnings = sum(1 for f in unique_findings if f["severity"] == "warning") infos = sum(1 for f in unique_findings if f["severity"] == "info") pass_val = True # warning_only mode if gate_mode == "strict" and errors > 0: pass_val = False recommendations = _build_recommendations(unique_findings) return { "pass": pass_val, "summary": ( f"Scanned {files_scanned} files ({mode} mode). " f"Found {errors} errors, {warnings} warnings, {infos} infos." + (f" ({skipped} files skipped: too large)" if skipped else "") ), "stats": { "errors": errors, "warnings": warnings, "infos": infos, "files_scanned": files_scanned, "files_skipped": skipped, "events_scanned": 0, }, "findings": unique_findings, "recommendations": recommendations, } # ─── scan_audit ─────────────────────────────────────────────────────────────── def scan_audit( backend: str = "auto", time_window_hours: int = 24, max_events: int = 50000, jsonl_glob: Optional[str] = None, repo_root: str = ".", ) -> Dict: """ Scan audit event stream for PII leaks and large-output anomalies. backend='auto' uses the globally configured store (Postgres or JSONL). """ pol = _load_policy() large_threshold = int((pol.get("retention") or {}).get("large_output_bytes", 65536)) pii_patterns = _get_pii_patterns() findings: List[Dict] = [] events_scanned = 0 try: store = _resolve_audit_store(backend) import datetime now = datetime.datetime.now(datetime.timezone.utc) from_ts = (now - datetime.timedelta(hours=time_window_hours)).isoformat() events = store.read(from_ts=from_ts, limit=max_events) events_scanned = len(events) for ev in events: # Check meta fields for PII (graph_run_id, job_id should be safe; check input_hash) meta_str = json.dumps({ k: ev.get(k) for k in ("agent_id", "user_id", "workspace_id", "input_hash", "graph_run_id", "job_id") if ev.get(k) }) for pat_info in pii_patterns: m = pat_info["regex"].search(meta_str) if m: findings.append(_finding( fid="DG-AUD-101", category="audit", severity=pat_info["severity"], title=f"PII-like pattern in audit event metadata ({pat_info['description']})", path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}", lines="", details=_mask_evidence(meta_str, max_chars=80), fix="Ensure user_id/workspace_id are opaque identifiers, not real PII. " "Check how identifiers are generated.", )) break # One finding per event # Large output anomaly out_size = int(ev.get("out_size", 0)) if out_size >= large_threshold: findings.append(_finding( fid="DG-AUD-102", category="audit", severity="warning", title=f"Unusually large tool output: {ev.get('tool','?')} ({out_size} bytes)", path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}", lines="", details=f"out_size={out_size}, agent={ev.get('agent_id','?')}, status={ev.get('status','?')}", fix="Verify output does not include raw user content. " "Enforce max_bytes_out in tool_limits.yml.", )) except Exception as e: logger.warning("scan_audit error: %s", e) return { "pass": True, "summary": f"Audit scan skipped: {e}", "stats": {"errors": 0, "warnings": 0, "infos": 0, "events_scanned": 0, "files_scanned": 0}, "findings": [], "recommendations": [], } # Deduplicate seen = set() unique = [] for f in findings: key = (f["id"], f["evidence"].get("path")) if key not in seen: unique.append(f) seen.add(key) errors = sum(1 for f in unique if f["severity"] == "error") warnings = sum(1 for f in unique if f["severity"] == "warning") infos = sum(1 for f in unique if f["severity"] == "info") return { "pass": True, "summary": f"Scanned {events_scanned} audit events. {errors} errors, {warnings} warnings.", "stats": { "errors": errors, "warnings": warnings, "infos": infos, "events_scanned": events_scanned, "files_scanned": 0, }, "findings": unique, "recommendations": _build_recommendations(unique), } # ─── retention_check ───────────────────────────────────────────────────────── def retention_check( repo_root: str = ".", check_audit_cleanup_task: bool = True, check_jsonl_rotation: bool = True, check_memory_retention_docs: bool = True, check_logs_retention_docs: bool = True, ) -> Dict: """ Verify that cleanup/retention mechanisms exist for audit logs and memory. """ findings: List[Dict] = [] root = Path(repo_root).resolve() def _file_contains(path: Path, keywords: List[str]) -> bool: try: text = path.read_text(encoding="utf-8", errors="replace") return any(kw.lower() in text.lower() for kw in keywords) except Exception: return False def _find_files(pattern: str) -> List[Path]: return list(root.rglob(pattern)) # ── 1. Audit cleanup task ────────────────────────────────────────────── if check_audit_cleanup_task: has_cleanup = False # Check task_registry.yml for audit_cleanup task registry_files = _find_files("task_registry.yml") for rf in registry_files: if _file_contains(rf, ["audit_cleanup", "audit_rotation"]): has_cleanup = True break # Check runbooks/ops docs if not has_cleanup: runbook_files = list(root.glob("ops/runbook*.md")) + list(root.rglob("*runbook*.md")) for rb in runbook_files: if _file_contains(rb, ["audit", "cleanup", "rotation", "jsonl"]): has_cleanup = True break if has_cleanup: findings.append(_finding( fid="DG-RET-202", category="retention", severity="info", title="Audit cleanup/rotation mechanism documented", path="ops/", fix="", )) else: findings.append(_finding( fid="DG-RET-201", category="retention", severity="warning", title="No audit cleanup task or runbook found", path="ops/task_registry.yml", fix="Add 'audit_cleanup' task to ops/task_registry.yml or document retention " "procedure in ops/runbook-*.md. Default retention: 30 days.", )) # ── 2. JSONL rotation (audit_store.py check) ────────────────────────── if check_jsonl_rotation: store_file = root / "services" / "router" / "audit_store.py" if store_file.exists() and _file_contains(store_file, ["rotation", "daily", "tool_audit_"]): findings.append(_finding( fid="DG-RET-203", category="retention", severity="info", title="JSONL audit rotation implemented in audit_store.py", path="services/router/audit_store.py", fix="", )) else: findings.append(_finding( fid="DG-RET-204", category="retention", severity="warning", title="JSONL audit rotation not confirmed in audit_store.py", path="services/router/audit_store.py", fix="Ensure JsonlAuditStore uses daily rotation (tool_audit_YYYY-MM-DD.jsonl) " "and implement a cleanup job for files older than 30 days.", )) # ── 3. Memory retention docs ───────────────────────────────────────── if check_memory_retention_docs: has_mem_retention = False doc_files = list(root.rglob("*.md")) + list(root.rglob("*.yml")) for df in doc_files[:200]: # limit scan if _file_contains(df, ["memory_events_days", "memory retention", "memory_ttl", "memory.*expire"]): has_mem_retention = True break if not has_mem_retention: findings.append(_finding( fid="DG-RET-205", category="retention", severity="info", title="Memory event retention policy not found in docs/config", path="config/", fix="Document memory event TTL/retention in config/data_governance_policy.yml " "(memory_events_days) and implement cleanup.", )) # ── 4. Logs retention docs ─────────────────────────────────────────── if check_logs_retention_docs: has_log_retention = False for df in (list(root.glob("ops/*.md")) + list(root.rglob("*runbook*.md")))[:50]: if _file_contains(df, ["logs_days", "log retention", "log rotation", "loki retention"]): has_log_retention = True break if not has_log_retention: findings.append(_finding( fid="DG-RET-206", category="retention", severity="info", title="Log retention period not documented in runbooks", path="ops/", fix="Document log retention in ops/runbook-*.md or config/data_governance_policy.yml " "(logs_days: 14).", )) errors = sum(1 for f in findings if f["severity"] == "error") warnings = sum(1 for f in findings if f["severity"] == "warning") infos = sum(1 for f in findings if f["severity"] == "info") return { "pass": True, "summary": f"Retention check: {errors} errors, {warnings} warnings, {infos} infos.", "stats": {"errors": errors, "warnings": warnings, "infos": infos, "files_scanned": 0, "events_scanned": 0}, "findings": findings, "recommendations": _build_recommendations(findings), } # ─── policy ─────────────────────────────────────────────────────────────────── def get_policy() -> Dict: reload_policy() pol = _load_policy() return { "policy_path": _POLICY_PATH, "retention": pol.get("retention", {}), "pii_patterns": {k: {"severity": v.get("severity"), "id": v.get("id")} for k, v in (pol.get("pii_patterns") or {}).items()}, "secret_patterns_count": len(_get_secret_patterns()), "logging_rules": pol.get("logging_rules", {}), "severity_behavior": pol.get("severity_behavior", {}), "limits": pol.get("limits", {}), } # ─── Recommendations ────────────────────────────────────────────────────────── _REC_MAP = { "DG-LOG-001": "Review logger calls for sensitive fields. Apply redact() before logging.", "DG-AUD-001": "Audit/log stores may contain raw payload. Enforce hash+size-only pattern.", "DG-RET-001": "Add TTL or cleanup policy for stored data. Reference data_governance_policy.yml.", "DG-RET-201": "Create an 'audit_cleanup' task in task_registry.yml or document retention in runbook.", "DG-AUD-101": "Verify audit event identifiers are opaque (not real PII).", "DG-AUD-102": "Large tool outputs may contain user content. Enforce max_bytes_out limits.", "DG-PII-001": "Mask or hash email addresses before storage/logging.", "DG-PII-002": "Mask phone numbers in logs and stored data.", "DG-PII-003": "Credit card-like patterns detected. Remove immediately and audit access.", "DG-SEC-000": "Rotate or remove secret-like values. Use environment variables.", "DG-SEC-001": "Remove private key from code. Use secrets manager.", } def _build_recommendations(findings: List[Dict]) -> List[str]: seen_ids = set() recs = [] for f in findings: fid = f.get("id", "") rec = _REC_MAP.get(fid) if rec and fid not in seen_ids and f["severity"] in ("error", "warning"): recs.append(rec) seen_ids.add(fid) return recs # ─── backend=auto resolver ─────────────────────────────────────────────────── def _resolve_audit_store(backend: str = "auto"): """Resolve AuditStore by backend param (auto/jsonl/memory).""" from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore if backend in ("auto", None, ""): return get_audit_store() if backend == "jsonl": import os from pathlib import Path audit_dir = os.getenv( "AUDIT_JSONL_DIR", str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"), ) return JsonlAuditStore(audit_dir) if backend == "memory": return MemoryAuditStore() return get_audit_store() # ─── digest_audit ───────────────────────────────────────────────────────────── def digest_audit( backend: str = "auto", time_window_hours: int = 24, max_findings: int = 20, max_markdown_chars: int = 3800, ) -> Dict: """ Privacy/audit digest: scans audit stream, summarises findings. Returns both structured JSON and a Telegram/markdown-friendly `markdown` field. """ store = _resolve_audit_store(backend) # Run underlying scan raw = scan_audit( backend=backend, time_window_hours=time_window_hours, max_events=50_000, ) findings = raw.get("findings") or [] stats = raw.get("stats") or {} events_scanned = stats.get("events_scanned", 0) errors = stats.get("errors", 0) warnings = stats.get("warnings", 0) infos = stats.get("infos", 0) total = errors + warnings + infos # Group findings by category by_category: dict = {} for f in findings[:max_findings]: cat = f.get("category", "unknown") by_category.setdefault(cat, []).append(f) # Recommendations from findings recs = _build_recommendations(findings[:max_findings]) # Determine source backend source = "unknown" try: if hasattr(store, "active_backend"): source = store.active_backend() elif type(store).__name__ == "PostgresAuditStore": source = "postgres" elif type(store).__name__ == "JsonlAuditStore": source = "jsonl" elif type(store).__name__ == "MemoryAuditStore": source = "memory" except Exception: pass # ── Markdown ───────────────────────────────────────────────────────────── period = f"Last {time_window_hours}h" status_icon = "🔴" if errors > 0 else ("🟡" if warnings > 0 else "🟢") lines = [ f"{status_icon} **Privacy Audit Digest** ({period})", f"Events scanned: {events_scanned} | Findings: {total} ({errors}E / {warnings}W / {infos}I)", f"Backend: `{source}`", "", ] if total == 0: lines.append("✅ No privacy issues detected in audit stream.") else: for cat, cat_findings in by_category.items(): lines.append(f"**[{cat.upper()}]** {len(cat_findings)} finding(s):") for f in cat_findings[:3]: sev = f.get("severity", "?") icon = "🔴" if sev == "error" else ("🟡" if sev == "warning" else "ℹ️") lines.append(f" {icon} `{f.get('id','?')}` — {f.get('title','')[:100]}") lines.append("") if recs: lines.append("💡 **Recommendations:**") for r in recs[:5]: lines.append(f" {r[:200]}") markdown = "\n".join(lines) if len(markdown) > max_markdown_chars: markdown = markdown[:max_markdown_chars] + "\n…[truncated]" return { "period": period, "window_hours": time_window_hours, "source_backend": source, "stats": { "events_scanned": events_scanned, "errors": errors, "warnings": warnings, "infos": infos, "total": total, }, "by_category": {cat: len(fs) for cat, fs in by_category.items()}, "top_findings": findings[:max_findings], "recommendations": recs, "markdown": markdown, "pass": raw.get("pass", True), } # ─── Main entrypoint ───────────────────────────────────────────────────────── def scan_data_governance_dict(action: str, params: Optional[Dict] = None, repo_root: Optional[str] = None) -> Dict: """ Dispatcher called by tool_manager handler. Returns plain dict suitable for ToolResult. """ params = params or {} if repo_root is None: repo_root = os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)) if action == "scan_repo": return scan_repo( repo_root=repo_root, mode=params.get("mode", "fast"), max_files=int(params.get("max_files", 200)), max_bytes_per_file=int(params.get("max_bytes_per_file", 262144)), paths_include=params.get("paths_include"), paths_exclude=params.get("paths_exclude"), focus=params.get("focus"), ) if action == "digest_audit": return digest_audit( backend=params.get("backend", "auto"), time_window_hours=int(params.get("time_window_hours", 24)), max_findings=int(params.get("max_findings", 20)), max_markdown_chars=int(params.get("max_markdown_chars", 3800)), ) if action == "scan_audit": return scan_audit( backend=params.get("backend", "auto"), time_window_hours=int(params.get("time_window_hours", 24)), max_events=int(params.get("max_events", 50000)), jsonl_glob=params.get("jsonl_glob"), repo_root=repo_root, ) if action == "retention_check": return retention_check( repo_root=repo_root, check_audit_cleanup_task=bool(params.get("check_audit_cleanup_task", True)), check_jsonl_rotation=bool(params.get("check_jsonl_rotation", True)), check_memory_retention_docs=bool(params.get("check_memory_retention_docs", True)), check_logs_retention_docs=bool(params.get("check_logs_retention_docs", True)), ) if action == "policy": return get_policy() return {"error": f"Unknown action '{action}'. Valid: scan_repo, digest_audit, scan_audit, retention_check, policy"}