Files
microdao-daarion/services/router/data_governance.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

1025 lines
39 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Data Governance & Privacy Tool — DAARION.city
Deterministic, read-only scanner for:
A) PII patterns in code/docs/configs (email, phone, credit card, passport)
B) Secret exposure (inherits tool_governance._SECRET_PATTERNS + extras)
C) Unredacted payload risk in audit/log code
D) Storage without retention/TTL
E) Audit stream anomalies (PII in meta, large outputs)
F) Retention policy presence (cleanup tasks, runbooks)
Actions:
scan_repo — static analysis of repository files
scan_audit — analysis of JSONL/Postgres audit events
retention_check — verify cleanup mechanisms exist
policy — return current governance policy
Security / Privacy:
- All evidence snippets are masked/truncated before returning
- Tool is read-only; never writes or modifies files
- Path traversal protection: all paths confined to repo_root
"""
from __future__ import annotations
import fnmatch
import json
import logging
import os
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Config loader ────────────────────────────────────────────────────────────
# Module-level cache for the parsed governance policy; None means "not loaded
# yet" (populated lazily by _load_policy, invalidated by reload_policy).
_policy_cache: Optional[Dict] = None
# Absolute path to the policy YAML. The REPO_ROOT env var overrides the
# default, which is three directory levels above this file.
_POLICY_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "data_governance_policy.yml",
)
def _load_policy() -> Dict:
    """Return the governance policy dict, loading and caching it on first use.

    Degrades to an empty dict (with a warning) when the YAML file is missing,
    unreadable, or PyYAML is unavailable — scanners then fall back to their
    built-in defaults.
    """
    global _policy_cache
    if _policy_cache is None:
        try:
            import yaml
            with open(_POLICY_PATH, "r") as fh:
                loaded = yaml.safe_load(fh)
            _policy_cache = loaded or {}
        except Exception as exc:
            logger.warning("data_governance_policy.yml not loaded: %s", exc)
            _policy_cache = {}
    return _policy_cache
def reload_policy() -> None:
    """Invalidate the cached policy and every regex compiled from it.

    The next _load_policy() / _get_*_patterns() call re-reads the YAML and
    recompiles. Previously only the policy dict was cleared, which left the
    pattern getters serving regexes compiled from the stale policy after a
    reload (e.g. via get_policy()).
    """
    global _policy_cache, _compiled_pii, _compiled_secret
    global _compiled_log_forbidden, _compiled_raw_payload, _compiled_storage_write
    _policy_cache = None
    _compiled_pii = None
    _compiled_secret = None
    _compiled_log_forbidden = None
    _compiled_raw_payload = None
    _compiled_storage_write = None
# ─── Compiled patterns (lazy) ─────────────────────────────────────────────────
# Lazily-built regex caches, populated on first use from the loaded policy.
# The single-pattern caches are stored as one-element lists so the getters can
# use one uniform truthiness check and assignment shape.
_compiled_pii: Optional[List[Dict]] = None
_compiled_secret: Optional[List[Dict]] = None
_compiled_log_forbidden: Optional[List[re.Pattern]] = None
_compiled_raw_payload: Optional[List[re.Pattern]] = None
_compiled_storage_write: Optional[List[re.Pattern]] = None
def _get_pii_patterns() -> List[Dict]:
    """Return compiled PII detector specs from policy (cached after first build).

    Each entry has keys: name, regex (compiled), severity, id, description.
    Entries whose regex fails to compile are skipped with a warning instead of
    aborting the scan.
    """
    global _compiled_pii
    if _compiled_pii is None:
        compiled: List[Dict] = []
        policy = _load_policy()
        for pat_name, cfg in (policy.get("pii_patterns") or {}).items():
            try:
                compiled.append({
                    "name": pat_name,
                    "regex": re.compile(cfg["regex"], re.MULTILINE),
                    "severity": cfg.get("severity", "warning"),
                    "id": cfg.get("id", f"DG-PII-{pat_name}"),
                    "description": cfg.get("description", pat_name),
                })
            except Exception as exc:
                logger.warning("Bad pii_pattern '%s': %s", pat_name, exc)
        _compiled_pii = compiled
    return _compiled_pii
def _get_secret_patterns() -> List[Dict]:
    """Return compiled secret-detector specs (cached after first build).

    Combines two sources:
      1. tool_governance._SECRET_PATTERNS — already-compiled regexes, imported
         best-effort; silently skipped if that module is unavailable.
      2. Policy `secret_patterns.extra` entries — compiled here; bad patterns
         are logged and skipped.
    """
    global _compiled_secret
    if _compiled_secret is not None:
        return _compiled_secret
    # Inherit from tool_governance
    inherited = []
    try:
        from tool_governance import _SECRET_PATTERNS
        for idx, pat in enumerate(_SECRET_PATTERNS):
            inherited.append({
                "name": f"inherited_{idx}",
                # pat is presumably an already-compiled re.Pattern — TODO confirm
                # against tool_governance.
                "regex": pat,
                "severity": "error",
                "id": "DG-SEC-000",
                "description": "Secret-like value (inherited from governance)",
            })
    except Exception:
        # tool_governance not importable — proceed with policy extras only.
        pass
    # Extra from policy
    pol = _load_policy()
    for extra in (pol.get("secret_patterns", {}).get("extra") or []):
        try:
            inherited.append({
                "name": extra["name"],
                "regex": re.compile(extra["regex"], re.MULTILINE),
                "severity": extra.get("severity", "error"),
                "id": extra.get("id", "DG-SEC-EXT"),
                "description": extra.get("name", "extra secret pattern"),
            })
        except Exception as e:
            logger.warning("Bad extra secret pattern '%s': %s", extra.get("name"), e)
    _compiled_secret = inherited
    return inherited
def _get_log_forbidden_pattern() -> re.Pattern:
    """Return (cached) regex matching log/print calls that mention a forbidden field.

    Field names come from policy logging_rules.forbid_logging_fields; when
    unset, a conservative default set is used.
    """
    global _compiled_log_forbidden
    if not _compiled_log_forbidden:
        rules = _load_policy().get("logging_rules") or {}
        fields = rules.get("forbid_logging_fields") or ["password", "token", "secret", "api_key"]
        alternation = "|".join(re.escape(field) for field in fields)
        pattern = re.compile(
            r'(?i)(?:logger|log|logging|print|console\.log)\s*[.(]'
            r'[^)]{0,200}'
            r'(?:' + alternation + r')',
            re.MULTILINE,
        )
        _compiled_log_forbidden = [pattern]
    return _compiled_log_forbidden[0]
def _get_raw_payload_pattern() -> re.Pattern:
    """Return (cached) regex matching raw-payload field names.

    Indicator words come from policy logging_rules.raw_payload_indicators,
    defaulting to payload/prompt/messages/transcript when unset.
    """
    global _compiled_raw_payload
    if not _compiled_raw_payload:
        rules = _load_policy().get("logging_rules") or {}
        indicators = rules.get("raw_payload_indicators") or ["payload", "prompt", "messages", "transcript"]
        alternation = "|".join(re.escape(word) for word in indicators)
        _compiled_raw_payload = [re.compile(
            r'(?i)(?:' + alternation + r')',
            re.MULTILINE,
        )]
    return _compiled_raw_payload[0]
def _get_storage_write_pattern() -> re.Pattern:
    """Return (cached) regex matching storage-write call names.

    Keywords come from policy storage_keywords.write_patterns, with built-in
    defaults when the policy does not define any.
    """
    global _compiled_storage_write
    if not _compiled_storage_write:
        keywords = (_load_policy().get("storage_keywords") or {}).get("write_patterns") \
            or ["save_message", "store_event", "insert_record", "append_event"]
        alternation = "|".join(re.escape(word) for word in keywords)
        _compiled_storage_write = [re.compile(
            r'(?i)(?:' + alternation + r')',
            re.MULTILINE,
        )]
    return _compiled_storage_write[0]
# ─── Evidence masking ─────────────────────────────────────────────────────────
def _mask_evidence(text: str, max_chars: int = 200) -> str:
"""Mask secrets and truncate snippet for safe reporting."""
try:
from tool_governance import redact
text = redact(text)
except Exception:
# Fallback: mask common patterns
text = re.sub(
r'(?i)(token|secret|password|key|bearer)\s*[=:]\s*\S+',
r'\1=***',
text,
)
# Truncate
if len(text) > max_chars:
text = text[:max_chars] + "…[truncated]"
return text.strip()
def _line_range(lineno: int, window: int = 2) -> str:
start = max(1, lineno - window)
end = lineno + window
return f"L{start}-L{end}"
# ─── Path utilities ───────────────────────────────────────────────────────────
def _is_excluded(rel_path: str, excludes: List[str]) -> bool:
for pat in excludes:
if fnmatch.fnmatch(rel_path, pat):
return True
# Also match against basename
if fnmatch.fnmatch(Path(rel_path).name, pat):
return True
# Forward-slash wildcard matching
if fnmatch.fnmatch("/" + rel_path.replace("\\", "/"), pat.replace("**", "*")):
return True
return False
def _is_included(rel_path: str, includes: List[str]) -> bool:
if not includes:
return True
for inc in includes:
if rel_path.startswith(inc.rstrip("/")):
return True
return False
def _never_scan(rel_path: str) -> bool:
    """True when the file's basename matches a policy `paths.never_scan` glob.

    Fix: patterns are applied as-is. Previously `pat.lstrip("*")` stripped the
    leading wildcard, so a pattern such as "*.pem" became ".pem" and matched
    only a file literally named ".pem" — real key/cert files slipped through
    the never-scan denylist.
    """
    pol = _load_policy()
    never = (pol.get("paths") or {}).get("never_scan") or []
    name = Path(rel_path).name
    return any(fnmatch.fnmatch(name, pat) for pat in never)
def _safe_path(repo_root: str, rel: str) -> Optional[Path]:
"""Resolve path safely, preventing traversal outside repo_root."""
root = Path(repo_root).resolve()
try:
p = (root / rel).resolve()
if not str(p).startswith(str(root)):
return None
return p
except Exception:
return None
# ─── Finding builder ─────────────────────────────────────────────────────────
def _finding(
fid: str,
category: str,
severity: str,
title: str,
path: str = "",
lines: str = "",
details: str = "",
fix: str = "",
) -> Dict:
return {
"id": fid,
"category": category,
"severity": severity,
"title": title,
"evidence": {
"path": path,
"lines": lines,
"details": _mask_evidence(details),
},
"recommended_fix": fix,
}
# ─── A) PII scan ──────────────────────────────────────────────────────────────
def _scan_pii(content: str, rel_path: str, findings: List[Dict]) -> None:
for pat_info in _get_pii_patterns():
for m in pat_info["regex"].finditer(content):
lineno = content[:m.start()].count("\n") + 1
snippet = _mask_evidence(m.group(0))
findings.append(_finding(
fid=pat_info["id"],
category="pii",
severity=pat_info["severity"],
title=f"{pat_info['description']} in {Path(rel_path).name}",
path=rel_path,
lines=_line_range(lineno),
details=snippet,
fix="Replace with hash, mask, or remove this value. Ensure it is not stored in plaintext.",
))
# ─── B) Secret scan ───────────────────────────────────────────────────────────
def _scan_secrets(content: str, rel_path: str, findings: List[Dict]) -> None:
for pat_info in _get_secret_patterns():
for m in pat_info["regex"].finditer(content):
lineno = content[:m.start()].count("\n") + 1
findings.append(_finding(
fid=pat_info["id"],
category="secrets",
severity=pat_info["severity"],
title=f"Secret-like value in {Path(rel_path).name}",
path=rel_path,
lines=_line_range(lineno),
details=_mask_evidence(m.group(0), max_chars=60),
fix="Move to environment variable or secrets manager. Never hardcode secrets.",
))
# ─── C) Logging risk scan ────────────────────────────────────────────────────
def _scan_logging_risk(content: str, rel_path: str, findings: List[Dict]) -> None:
# Skip non-code files where logging patterns won't appear
ext = Path(rel_path).suffix.lower()
if ext not in (".py", ".ts", ".js"):
return
log_pat = _get_log_forbidden_pattern()
payload_pat = _get_raw_payload_pattern()
pol = _load_policy()
redaction_calls = (pol.get("logging_rules") or {}).get("redaction_calls") or ["redact", "mask"]
lines = content.splitlines()
n = len(lines)
context_window = 5 # lines around match to check for redaction
for m in log_pat.finditer(content):
lineno = content[:m.start()].count("\n") + 1
# Check if there's a redaction call nearby
lo = max(0, lineno - 1 - context_window)
hi = min(n, lineno + context_window)
context_lines = "\n".join(lines[lo:hi])
if any(rc in context_lines for rc in redaction_calls):
continue # Redaction present — skip
findings.append(_finding(
fid="DG-LOG-001",
category="logging",
severity="warning",
title=f"Potential sensitive field logged in {Path(rel_path).name}",
path=rel_path,
lines=_line_range(lineno),
details=_mask_evidence(m.group(0)),
fix="Apply redact() or mask() before logging. Log hash+last4 for identifiers.",
))
# Audit/log payload risk: look for raw payload storage
for m in payload_pat.finditer(content):
lineno = content[:m.start()].count("\n") + 1
# Only flag if in a logger/write context
lo = max(0, lineno - 1 - 3)
hi = min(n, lineno + 3)
context = "\n".join(lines[lo:hi])
if not re.search(r'(?i)(log|audit|event|record|store|write|insert|append|emit)', context):
continue
if any(rc in context for rc in redaction_calls):
continue
findings.append(_finding(
fid="DG-AUD-001",
category="logging",
severity="error",
title=f"Raw payload field near audit/log write in {Path(rel_path).name}",
path=rel_path,
lines=_line_range(lineno),
details=_mask_evidence(m.group(0)),
fix="Ensure payload fields are NOT stored in audit events. "
"Log hash+size only (as in ToolGovernance post_call).",
))
# ─── D) Storage without retention ────────────────────────────────────────────
def _scan_retention_risk(content: str, rel_path: str, findings: List[Dict]) -> None:
ext = Path(rel_path).suffix.lower()
if ext not in (".py", ".ts", ".js"):
return
pol = _load_policy()
storage_cfg = pol.get("storage_keywords") or {}
retention_indicators = storage_cfg.get("retention_indicators") or ["ttl", "expire", "retention", "cleanup"]
context_window = int(storage_cfg.get("context_window", 20))
write_pat = _get_storage_write_pattern()
retention_pat = re.compile(
r'(?i)(?:' + "|".join(re.escape(r) for r in retention_indicators) + r')',
re.MULTILINE,
)
lines = content.splitlines()
n = len(lines)
for m in write_pat.finditer(content):
lineno = content[:m.start()].count("\n") + 1
lo = max(0, lineno - 1 - context_window)
hi = min(n, lineno + context_window)
context = "\n".join(lines[lo:hi])
if retention_pat.search(context):
continue # Retention indicator found — OK
findings.append(_finding(
fid="DG-RET-001",
category="retention",
severity="warning",
title=f"Storage write without visible TTL/retention in {Path(rel_path).name}",
path=rel_path,
lines=_line_range(lineno),
details=_mask_evidence(m.group(0)),
fix="Add TTL/expiry to stored data or document retention policy in runbook. "
"Reference ops/runbook-* for cleanup procedures.",
))
# ─── File collector ───────────────────────────────────────────────────────────
def _collect_files(
repo_root: str,
paths_include: List[str],
paths_exclude: List[str],
max_files: int,
mode: str = "fast",
) -> List[Tuple[str, str]]:
"""
Returns list of (rel_path, full_path) tuples.
In 'fast' mode: only .py, .yml, .yaml, .json, .env.example.
In 'full' mode: all configured extensions.
"""
pol = _load_policy()
if mode == "fast":
scan_exts = {".py", ".yml", ".yaml", ".json", ".env.example", ".sh"}
else:
scan_exts = set((pol.get("paths") or {}).get("scan_extensions") or [
".py", ".ts", ".js", ".yml", ".yaml", ".json", ".md", ".txt", ".sh",
])
root = Path(repo_root).resolve()
results = []
for start_dir in paths_include:
start = root / start_dir.rstrip("/")
if not start.exists():
continue
for fpath in start.rglob("*"):
if not fpath.is_file():
continue
if fpath.suffix.lower() not in scan_exts:
continue
try:
rel = str(fpath.relative_to(root))
except ValueError:
continue
if _is_excluded(rel, paths_exclude):
continue
if _never_scan(rel):
continue
results.append((rel, str(fpath)))
if len(results) >= max_files:
return results
return results
# ─── scan_repo ────────────────────────────────────────────────────────────────
def scan_repo(
repo_root: str = ".",
mode: str = "fast",
max_files: int = 200,
max_bytes_per_file: int = 262144,
paths_include: Optional[List[str]] = None,
paths_exclude: Optional[List[str]] = None,
focus: Optional[List[str]] = None,
) -> Dict:
"""
Static scan of repository files for privacy/security risks.
Returns structured findings dict (pass always True in warning_only mode).
"""
pol = _load_policy()
paths_include = paths_include or (pol.get("paths") or {}).get("include") or ["services/", "config/", "ops/"]
paths_exclude = paths_exclude or (pol.get("paths") or {}).get("exclude") or []
focus = focus or ["logging", "storage", "pii", "secrets", "retention"]
max_findings = int((pol.get("limits") or {}).get("max_findings", 200))
gate_mode = (pol.get("severity_behavior") or {}).get("gate_mode", "warning_only")
files = _collect_files(repo_root, paths_include, paths_exclude, max_files, mode)
all_findings: List[Dict] = []
files_scanned = 0
skipped = 0
for rel_path, full_path in files:
try:
size = os.path.getsize(full_path)
if size > max_bytes_per_file:
skipped += 1
continue
with open(full_path, "r", encoding="utf-8", errors="replace") as f:
content = f.read()
except Exception as e:
logger.warning("Cannot read %s: %s", full_path, e)
skipped += 1
continue
files_scanned += 1
if "pii" in focus:
_scan_pii(content, rel_path, all_findings)
if "secrets" in focus:
_scan_secrets(content, rel_path, all_findings)
if "logging" in focus:
_scan_logging_risk(content, rel_path, all_findings)
if "retention" in focus:
_scan_retention_risk(content, rel_path, all_findings)
if len(all_findings) >= max_findings:
break
# Deduplicate: same id+path+lines
seen = set()
unique_findings = []
for f in all_findings:
key = (f["id"], f["evidence"].get("path"), f["evidence"].get("lines"))
if key not in seen:
unique_findings.append(f)
seen.add(key)
unique_findings = unique_findings[:max_findings]
errors = sum(1 for f in unique_findings if f["severity"] == "error")
warnings = sum(1 for f in unique_findings if f["severity"] == "warning")
infos = sum(1 for f in unique_findings if f["severity"] == "info")
pass_val = True # warning_only mode
if gate_mode == "strict" and errors > 0:
pass_val = False
recommendations = _build_recommendations(unique_findings)
return {
"pass": pass_val,
"summary": (
f"Scanned {files_scanned} files ({mode} mode). "
f"Found {errors} errors, {warnings} warnings, {infos} infos."
+ (f" ({skipped} files skipped: too large)" if skipped else "")
),
"stats": {
"errors": errors,
"warnings": warnings,
"infos": infos,
"files_scanned": files_scanned,
"files_skipped": skipped,
"events_scanned": 0,
},
"findings": unique_findings,
"recommendations": recommendations,
}
# ─── scan_audit ───────────────────────────────────────────────────────────────
def scan_audit(
backend: str = "auto",
time_window_hours: int = 24,
max_events: int = 50000,
jsonl_glob: Optional[str] = None,
repo_root: str = ".",
) -> Dict:
"""
Scan audit event stream for PII leaks and large-output anomalies.
backend='auto' uses the globally configured store (Postgres or JSONL).
"""
pol = _load_policy()
large_threshold = int((pol.get("retention") or {}).get("large_output_bytes", 65536))
pii_patterns = _get_pii_patterns()
findings: List[Dict] = []
events_scanned = 0
try:
store = _resolve_audit_store(backend)
import datetime
now = datetime.datetime.now(datetime.timezone.utc)
from_ts = (now - datetime.timedelta(hours=time_window_hours)).isoformat()
events = store.read(from_ts=from_ts, limit=max_events)
events_scanned = len(events)
for ev in events:
# Check meta fields for PII (graph_run_id, job_id should be safe; check input_hash)
meta_str = json.dumps({
k: ev.get(k) for k in ("agent_id", "user_id", "workspace_id", "input_hash", "graph_run_id", "job_id")
if ev.get(k)
})
for pat_info in pii_patterns:
m = pat_info["regex"].search(meta_str)
if m:
findings.append(_finding(
fid="DG-AUD-101",
category="audit",
severity=pat_info["severity"],
title=f"PII-like pattern in audit event metadata ({pat_info['description']})",
path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}",
lines="",
details=_mask_evidence(meta_str, max_chars=80),
fix="Ensure user_id/workspace_id are opaque identifiers, not real PII. "
"Check how identifiers are generated.",
))
break # One finding per event
# Large output anomaly
out_size = int(ev.get("out_size", 0))
if out_size >= large_threshold:
findings.append(_finding(
fid="DG-AUD-102",
category="audit",
severity="warning",
title=f"Unusually large tool output: {ev.get('tool','?')} ({out_size} bytes)",
path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}",
lines="",
details=f"out_size={out_size}, agent={ev.get('agent_id','?')}, status={ev.get('status','?')}",
fix="Verify output does not include raw user content. "
"Enforce max_bytes_out in tool_limits.yml.",
))
except Exception as e:
logger.warning("scan_audit error: %s", e)
return {
"pass": True,
"summary": f"Audit scan skipped: {e}",
"stats": {"errors": 0, "warnings": 0, "infos": 0, "events_scanned": 0, "files_scanned": 0},
"findings": [],
"recommendations": [],
}
# Deduplicate
seen = set()
unique = []
for f in findings:
key = (f["id"], f["evidence"].get("path"))
if key not in seen:
unique.append(f)
seen.add(key)
errors = sum(1 for f in unique if f["severity"] == "error")
warnings = sum(1 for f in unique if f["severity"] == "warning")
infos = sum(1 for f in unique if f["severity"] == "info")
return {
"pass": True,
"summary": f"Scanned {events_scanned} audit events. {errors} errors, {warnings} warnings.",
"stats": {
"errors": errors, "warnings": warnings, "infos": infos,
"events_scanned": events_scanned, "files_scanned": 0,
},
"findings": unique,
"recommendations": _build_recommendations(unique),
}
# ─── retention_check ─────────────────────────────────────────────────────────
def retention_check(
repo_root: str = ".",
check_audit_cleanup_task: bool = True,
check_jsonl_rotation: bool = True,
check_memory_retention_docs: bool = True,
check_logs_retention_docs: bool = True,
) -> Dict:
"""
Verify that cleanup/retention mechanisms exist for audit logs and memory.
"""
findings: List[Dict] = []
root = Path(repo_root).resolve()
def _file_contains(path: Path, keywords: List[str]) -> bool:
try:
text = path.read_text(encoding="utf-8", errors="replace")
return any(kw.lower() in text.lower() for kw in keywords)
except Exception:
return False
def _find_files(pattern: str) -> List[Path]:
return list(root.rglob(pattern))
# ── 1. Audit cleanup task ──────────────────────────────────────────────
if check_audit_cleanup_task:
has_cleanup = False
# Check task_registry.yml for audit_cleanup task
registry_files = _find_files("task_registry.yml")
for rf in registry_files:
if _file_contains(rf, ["audit_cleanup", "audit_rotation"]):
has_cleanup = True
break
# Check runbooks/ops docs
if not has_cleanup:
runbook_files = list(root.glob("ops/runbook*.md")) + list(root.rglob("*runbook*.md"))
for rb in runbook_files:
if _file_contains(rb, ["audit", "cleanup", "rotation", "jsonl"]):
has_cleanup = True
break
if has_cleanup:
findings.append(_finding(
fid="DG-RET-202",
category="retention",
severity="info",
title="Audit cleanup/rotation mechanism documented",
path="ops/",
fix="",
))
else:
findings.append(_finding(
fid="DG-RET-201",
category="retention",
severity="warning",
title="No audit cleanup task or runbook found",
path="ops/task_registry.yml",
fix="Add 'audit_cleanup' task to ops/task_registry.yml or document retention "
"procedure in ops/runbook-*.md. Default retention: 30 days.",
))
# ── 2. JSONL rotation (audit_store.py check) ──────────────────────────
if check_jsonl_rotation:
store_file = root / "services" / "router" / "audit_store.py"
if store_file.exists() and _file_contains(store_file, ["rotation", "daily", "tool_audit_"]):
findings.append(_finding(
fid="DG-RET-203",
category="retention",
severity="info",
title="JSONL audit rotation implemented in audit_store.py",
path="services/router/audit_store.py",
fix="",
))
else:
findings.append(_finding(
fid="DG-RET-204",
category="retention",
severity="warning",
title="JSONL audit rotation not confirmed in audit_store.py",
path="services/router/audit_store.py",
fix="Ensure JsonlAuditStore uses daily rotation (tool_audit_YYYY-MM-DD.jsonl) "
"and implement a cleanup job for files older than 30 days.",
))
# ── 3. Memory retention docs ─────────────────────────────────────────
if check_memory_retention_docs:
has_mem_retention = False
doc_files = list(root.rglob("*.md")) + list(root.rglob("*.yml"))
for df in doc_files[:200]: # limit scan
if _file_contains(df, ["memory_events_days", "memory retention", "memory_ttl", "memory.*expire"]):
has_mem_retention = True
break
if not has_mem_retention:
findings.append(_finding(
fid="DG-RET-205",
category="retention",
severity="info",
title="Memory event retention policy not found in docs/config",
path="config/",
fix="Document memory event TTL/retention in config/data_governance_policy.yml "
"(memory_events_days) and implement cleanup.",
))
# ── 4. Logs retention docs ───────────────────────────────────────────
if check_logs_retention_docs:
has_log_retention = False
for df in (list(root.glob("ops/*.md")) + list(root.rglob("*runbook*.md")))[:50]:
if _file_contains(df, ["logs_days", "log retention", "log rotation", "loki retention"]):
has_log_retention = True
break
if not has_log_retention:
findings.append(_finding(
fid="DG-RET-206",
category="retention",
severity="info",
title="Log retention period not documented in runbooks",
path="ops/",
fix="Document log retention in ops/runbook-*.md or config/data_governance_policy.yml "
"(logs_days: 14).",
))
errors = sum(1 for f in findings if f["severity"] == "error")
warnings = sum(1 for f in findings if f["severity"] == "warning")
infos = sum(1 for f in findings if f["severity"] == "info")
return {
"pass": True,
"summary": f"Retention check: {errors} errors, {warnings} warnings, {infos} infos.",
"stats": {"errors": errors, "warnings": warnings, "infos": infos, "files_scanned": 0, "events_scanned": 0},
"findings": findings,
"recommendations": _build_recommendations(findings),
}
# ─── policy ───────────────────────────────────────────────────────────────────
def get_policy() -> Dict:
reload_policy()
pol = _load_policy()
return {
"policy_path": _POLICY_PATH,
"retention": pol.get("retention", {}),
"pii_patterns": {k: {"severity": v.get("severity"), "id": v.get("id")}
for k, v in (pol.get("pii_patterns") or {}).items()},
"secret_patterns_count": len(_get_secret_patterns()),
"logging_rules": pol.get("logging_rules", {}),
"severity_behavior": pol.get("severity_behavior", {}),
"limits": pol.get("limits", {}),
}
# ─── Recommendations ──────────────────────────────────────────────────────────
_REC_MAP = {
"DG-LOG-001": "Review logger calls for sensitive fields. Apply redact() before logging.",
"DG-AUD-001": "Audit/log stores may contain raw payload. Enforce hash+size-only pattern.",
"DG-RET-001": "Add TTL or cleanup policy for stored data. Reference data_governance_policy.yml.",
"DG-RET-201": "Create an 'audit_cleanup' task in task_registry.yml or document retention in runbook.",
"DG-AUD-101": "Verify audit event identifiers are opaque (not real PII).",
"DG-AUD-102": "Large tool outputs may contain user content. Enforce max_bytes_out limits.",
"DG-PII-001": "Mask or hash email addresses before storage/logging.",
"DG-PII-002": "Mask phone numbers in logs and stored data.",
"DG-PII-003": "Credit card-like patterns detected. Remove immediately and audit access.",
"DG-SEC-000": "Rotate or remove secret-like values. Use environment variables.",
"DG-SEC-001": "Remove private key from code. Use secrets manager.",
}
def _build_recommendations(findings: List[Dict]) -> List[str]:
    """Map error/warning findings to unique remediation strings, in first-hit order.

    Only ids present in _REC_MAP produce advice; info-level findings and
    repeated ids are ignored.
    """
    recs: List[str] = []
    emitted = set()
    for finding in findings:
        fid = finding.get("id", "")
        if fid in emitted or finding["severity"] not in ("error", "warning"):
            continue
        advice = _REC_MAP.get(fid)
        if advice:
            recs.append(advice)
            emitted.add(fid)
    return recs
# ─── backend=auto resolver ───────────────────────────────────────────────────
def _resolve_audit_store(backend: str = "auto"):
"""Resolve AuditStore by backend param (auto/jsonl/memory)."""
from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
if backend in ("auto", None, ""):
return get_audit_store()
if backend == "jsonl":
import os
from pathlib import Path
audit_dir = os.getenv(
"AUDIT_JSONL_DIR",
str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
)
return JsonlAuditStore(audit_dir)
if backend == "memory":
return MemoryAuditStore()
return get_audit_store()
# ─── digest_audit ─────────────────────────────────────────────────────────────
def digest_audit(
backend: str = "auto",
time_window_hours: int = 24,
max_findings: int = 20,
max_markdown_chars: int = 3800,
) -> Dict:
"""
Privacy/audit digest: scans audit stream, summarises findings.
Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.
"""
store = _resolve_audit_store(backend)
# Run underlying scan
raw = scan_audit(
backend=backend,
time_window_hours=time_window_hours,
max_events=50_000,
)
findings = raw.get("findings") or []
stats = raw.get("stats") or {}
events_scanned = stats.get("events_scanned", 0)
errors = stats.get("errors", 0)
warnings = stats.get("warnings", 0)
infos = stats.get("infos", 0)
total = errors + warnings + infos
# Group findings by category
by_category: dict = {}
for f in findings[:max_findings]:
cat = f.get("category", "unknown")
by_category.setdefault(cat, []).append(f)
# Recommendations from findings
recs = _build_recommendations(findings[:max_findings])
# Determine source backend
source = "unknown"
try:
if hasattr(store, "active_backend"):
source = store.active_backend()
elif type(store).__name__ == "PostgresAuditStore":
source = "postgres"
elif type(store).__name__ == "JsonlAuditStore":
source = "jsonl"
elif type(store).__name__ == "MemoryAuditStore":
source = "memory"
except Exception:
pass
# ── Markdown ─────────────────────────────────────────────────────────────
period = f"Last {time_window_hours}h"
status_icon = "🔴" if errors > 0 else ("🟡" if warnings > 0 else "🟢")
lines = [
f"{status_icon} **Privacy Audit Digest** ({period})",
f"Events scanned: {events_scanned} | Findings: {total} ({errors}E / {warnings}W / {infos}I)",
f"Backend: `{source}`",
"",
]
if total == 0:
lines.append("✅ No privacy issues detected in audit stream.")
else:
for cat, cat_findings in by_category.items():
lines.append(f"**[{cat.upper()}]** {len(cat_findings)} finding(s):")
for f in cat_findings[:3]:
sev = f.get("severity", "?")
icon = "🔴" if sev == "error" else ("🟡" if sev == "warning" else "")
lines.append(f" {icon} `{f.get('id','?')}` — {f.get('title','')[:100]}")
lines.append("")
if recs:
lines.append("💡 **Recommendations:**")
for r in recs[:5]:
lines.append(f" {r[:200]}")
markdown = "\n".join(lines)
if len(markdown) > max_markdown_chars:
markdown = markdown[:max_markdown_chars] + "\n…[truncated]"
return {
"period": period,
"window_hours": time_window_hours,
"source_backend": source,
"stats": {
"events_scanned": events_scanned,
"errors": errors,
"warnings": warnings,
"infos": infos,
"total": total,
},
"by_category": {cat: len(fs) for cat, fs in by_category.items()},
"top_findings": findings[:max_findings],
"recommendations": recs,
"markdown": markdown,
"pass": raw.get("pass", True),
}
# ─── Main entrypoint ─────────────────────────────────────────────────────────
def scan_data_governance_dict(action: str, params: Optional[Dict] = None, repo_root: Optional[str] = None) -> Dict:
"""
Dispatcher called by tool_manager handler.
Returns plain dict suitable for ToolResult.
"""
params = params or {}
if repo_root is None:
repo_root = os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent))
if action == "scan_repo":
return scan_repo(
repo_root=repo_root,
mode=params.get("mode", "fast"),
max_files=int(params.get("max_files", 200)),
max_bytes_per_file=int(params.get("max_bytes_per_file", 262144)),
paths_include=params.get("paths_include"),
paths_exclude=params.get("paths_exclude"),
focus=params.get("focus"),
)
if action == "digest_audit":
return digest_audit(
backend=params.get("backend", "auto"),
time_window_hours=int(params.get("time_window_hours", 24)),
max_findings=int(params.get("max_findings", 20)),
max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
)
if action == "scan_audit":
return scan_audit(
backend=params.get("backend", "auto"),
time_window_hours=int(params.get("time_window_hours", 24)),
max_events=int(params.get("max_events", 50000)),
jsonl_glob=params.get("jsonl_glob"),
repo_root=repo_root,
)
if action == "retention_check":
return retention_check(
repo_root=repo_root,
check_audit_cleanup_task=bool(params.get("check_audit_cleanup_task", True)),
check_jsonl_rotation=bool(params.get("check_jsonl_rotation", True)),
check_memory_retention_docs=bool(params.get("check_memory_retention_docs", True)),
check_logs_retention_docs=bool(params.get("check_logs_retention_docs", True)),
)
if action == "policy":
return get_policy()
return {"error": f"Unknown action '{action}'. Valid: scan_repo, digest_audit, scan_audit, retention_check, policy"}