New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
1025 lines
39 KiB
Python
1025 lines
39 KiB
Python
"""
|
||
Data Governance & Privacy Tool — DAARION.city
|
||
|
||
Deterministic, read-only scanner for:
|
||
A) PII patterns in code/docs/configs (email, phone, credit card, passport)
|
||
B) Secret exposure (inherits tool_governance._SECRET_PATTERNS + extras)
|
||
C) Unredacted payload risk in audit/log code
|
||
D) Storage without retention/TTL
|
||
E) Audit stream anomalies (PII in meta, large outputs)
|
||
F) Retention policy presence (cleanup tasks, runbooks)
|
||
|
||
Actions:
|
||
scan_repo — static analysis of repository files
|
||
scan_audit — analysis of JSONL/Postgres audit events
|
||
retention_check — verify cleanup mechanisms exist
|
||
policy — return current governance policy
|
||
|
||
Security / Privacy:
|
||
- All evidence snippets are masked/truncated before returning
|
||
- Tool is read-only; never writes or modifies files
|
||
- Path traversal protection: all paths confined to repo_root
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import fnmatch
|
||
import json
|
||
import logging
|
||
import os
|
||
import re
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ─── Config loader ────────────────────────────────────────────────────────────
|
||
|
||
# Cached parsed policy document; populated lazily by _load_policy() and
# cleared by reload_policy().
_policy_cache: Optional[Dict] = None

# Policy file location: <REPO_ROOT>/config/data_governance_policy.yml.
# REPO_ROOT defaults to three directories above this module.
_POLICY_PATH = os.path.join(
    os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
    "config", "data_governance_policy.yml",
)
|
||
|
||
|
||
def _load_policy() -> Dict:
    """Return the governance policy dict, loading and caching it on first use.

    Falls back to an empty dict (with a logged warning) when the YAML file
    is missing or unreadable, so callers never need to handle load errors.
    """
    global _policy_cache
    if _policy_cache is None:
        loaded: Dict = {}
        try:
            import yaml
            with open(_POLICY_PATH, "r") as fh:
                loaded = yaml.safe_load(fh) or {}
        except Exception as exc:
            logger.warning("data_governance_policy.yml not loaded: %s", exc)
        _policy_cache = loaded
    return _policy_cache
|
||
|
||
|
||
def reload_policy() -> None:
    """Drop the cached policy so the next _load_policy() re-reads the file."""
    global _policy_cache
    _policy_cache = None
|
||
|
||
|
||
# ─── Compiled patterns (lazy) ─────────────────────────────────────────────────
|
||
|
||
# Lazily compiled regex caches; each is populated on first use by its
# corresponding _get_*() accessor below and only reset on process restart.
_compiled_pii: Optional[List[Dict]] = None
_compiled_secret: Optional[List[Dict]] = None
# The single-pattern caches use one-element lists as "compiled once" holders.
_compiled_log_forbidden: Optional[List[re.Pattern]] = None
_compiled_raw_payload: Optional[List[re.Pattern]] = None
_compiled_storage_write: Optional[List[re.Pattern]] = None
|
||
|
||
|
||
def _get_pii_patterns() -> List[Dict]:
    """Compile and cache the PII detection patterns declared in the policy.

    Each entry carries: name, compiled regex, severity (default 'warning'),
    finding id (default 'DG-PII-<name>') and a human-readable description.
    Malformed entries are skipped with a warning rather than failing the scan.
    """
    global _compiled_pii
    if _compiled_pii is None:
        compiled: List[Dict] = []
        policy = _load_policy()
        for name, cfg in (policy.get("pii_patterns") or {}).items():
            try:
                entry = {
                    "name": name,
                    "regex": re.compile(cfg["regex"], re.MULTILINE),
                    "severity": cfg.get("severity", "warning"),
                    "id": cfg.get("id", f"DG-PII-{name}"),
                    "description": cfg.get("description", name),
                }
            except Exception as exc:
                logger.warning("Bad pii_pattern '%s': %s", name, exc)
            else:
                compiled.append(entry)
        _compiled_pii = compiled
    return _compiled_pii
|
||
|
||
|
||
def _get_secret_patterns() -> List[Dict]:
    """Compile and cache secret-detection patterns.

    Combines (a) pre-compiled patterns inherited from tool_governance's
    _SECRET_PATTERNS and (b) extra patterns declared under
    secret_patterns.extra in the policy file. Either source is optional:
    a missing tool_governance module or a malformed extra entry degrades
    gracefully instead of breaking the scanner.
    """
    global _compiled_secret
    if _compiled_secret is not None:
        return _compiled_secret

    patterns: List[Dict] = []

    # (a) Inherited, already-compiled patterns from the governance layer.
    try:
        from tool_governance import _SECRET_PATTERNS
        for idx, pat in enumerate(_SECRET_PATTERNS):
            patterns.append({
                "name": f"inherited_{idx}",
                "regex": pat,
                "severity": "error",
                "id": "DG-SEC-000",
                "description": "Secret-like value (inherited from governance)",
            })
    except Exception:
        pass

    # (b) Extra patterns declared in the policy file.
    extras = _load_policy().get("secret_patterns", {}).get("extra") or []
    for extra in extras:
        try:
            patterns.append({
                "name": extra["name"],
                "regex": re.compile(extra["regex"], re.MULTILINE),
                "severity": extra.get("severity", "error"),
                "id": extra.get("id", "DG-SEC-EXT"),
                "description": extra.get("name", "extra secret pattern"),
            })
        except Exception as exc:
            logger.warning("Bad extra secret pattern '%s': %s", extra.get("name"), exc)

    _compiled_secret = patterns
    return patterns
|
||
|
||
|
||
def _get_log_forbidden_pattern() -> re.Pattern:
    """Return (and cache) the regex matching log calls naming forbidden fields.

    Field names come from logging_rules.forbid_logging_fields in the policy;
    a hard-coded default set is used when none are configured.
    """
    global _compiled_log_forbidden
    if not _compiled_log_forbidden:
        rules = _load_policy().get("logging_rules") or {}
        fields = rules.get("forbid_logging_fields") or ["password", "token", "secret", "api_key"]
        alternation = "|".join(re.escape(field) for field in fields)
        # A logger/print call, up to 200 non-')' chars, then a forbidden field.
        source = (
            r'(?i)(?:logger|log|logging|print|console\.log)\s*[.(]'
            r'[^)]{0,200}'
            r'(?:' + alternation + r')'
        )
        _compiled_log_forbidden = [re.compile(source, re.MULTILINE)]
    return _compiled_log_forbidden[0]
|
||
|
||
|
||
def _get_raw_payload_pattern() -> re.Pattern:
    """Return (and cache) the regex matching raw-payload indicator words.

    Indicator words come from logging_rules.raw_payload_indicators in the
    policy; a default list is used when none are configured.
    """
    global _compiled_raw_payload
    if not _compiled_raw_payload:
        rules = _load_policy().get("logging_rules") or {}
        words = rules.get("raw_payload_indicators") or ["payload", "prompt", "messages", "transcript"]
        compiled = re.compile(
            r'(?i)(?:' + "|".join(re.escape(word) for word in words) + r')',
            re.MULTILINE,
        )
        _compiled_raw_payload = [compiled]
    return _compiled_raw_payload[0]
|
||
|
||
|
||
def _get_storage_write_pattern() -> re.Pattern:
    """Return (and cache) the regex matching storage-write call names.

    Call names come from storage_keywords.write_patterns in the policy;
    a default list is used when none are configured.
    """
    global _compiled_storage_write
    if not _compiled_storage_write:
        keywords = _load_policy().get("storage_keywords") or {}
        calls = keywords.get("write_patterns") or [
            "save_message", "store_event", "insert_record", "append_event",
        ]
        compiled = re.compile(
            r'(?i)(?:' + "|".join(re.escape(call) for call in calls) + r')',
            re.MULTILINE,
        )
        _compiled_storage_write = [compiled]
    return _compiled_storage_write[0]
|
||
|
||
|
||
# ─── Evidence masking ─────────────────────────────────────────────────────────
|
||
|
||
def _mask_evidence(text: str, max_chars: int = 200) -> str:
|
||
"""Mask secrets and truncate snippet for safe reporting."""
|
||
try:
|
||
from tool_governance import redact
|
||
text = redact(text)
|
||
except Exception:
|
||
# Fallback: mask common patterns
|
||
text = re.sub(
|
||
r'(?i)(token|secret|password|key|bearer)\s*[=:]\s*\S+',
|
||
r'\1=***',
|
||
text,
|
||
)
|
||
# Truncate
|
||
if len(text) > max_chars:
|
||
text = text[:max_chars] + "…[truncated]"
|
||
return text.strip()
|
||
|
||
|
||
def _line_range(lineno: int, window: int = 2) -> str:
|
||
start = max(1, lineno - window)
|
||
end = lineno + window
|
||
return f"L{start}-L{end}"
|
||
|
||
|
||
# ─── Path utilities ───────────────────────────────────────────────────────────
|
||
|
||
def _is_excluded(rel_path: str, excludes: List[str]) -> bool:
|
||
for pat in excludes:
|
||
if fnmatch.fnmatch(rel_path, pat):
|
||
return True
|
||
# Also match against basename
|
||
if fnmatch.fnmatch(Path(rel_path).name, pat):
|
||
return True
|
||
# Forward-slash wildcard matching
|
||
if fnmatch.fnmatch("/" + rel_path.replace("\\", "/"), pat.replace("**", "*")):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _is_included(rel_path: str, includes: List[str]) -> bool:
|
||
if not includes:
|
||
return True
|
||
for inc in includes:
|
||
if rel_path.startswith(inc.rstrip("/")):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _never_scan(rel_path: str) -> bool:
    """Return True when the file's basename matches a paths.never_scan glob.

    Patterns are applied as-is with fnmatch, so globs like '*.pem' behave
    as intended. (The previous implementation stripped leading '*'
    characters, turning '*.pem' into the exact name '.pem', which almost
    never fired.) The legacy stripped form is still tried as a fallback so
    existing policy entries keep working — a backward-compatible superset.
    """
    pol = _load_policy()
    never = (pol.get("paths") or {}).get("never_scan") or []
    name = Path(rel_path).name
    for pat in never:
        # Proper glob semantics first; then the legacy stripped form.
        if fnmatch.fnmatch(name, pat) or fnmatch.fnmatch(name, pat.lstrip("*")):
            return True
    return False
|
||
|
||
|
||
def _safe_path(repo_root: str, rel: str) -> Optional[Path]:
|
||
"""Resolve path safely, preventing traversal outside repo_root."""
|
||
root = Path(repo_root).resolve()
|
||
try:
|
||
p = (root / rel).resolve()
|
||
if not str(p).startswith(str(root)):
|
||
return None
|
||
return p
|
||
except Exception:
|
||
return None
|
||
|
||
|
||
# ─── Finding builder ─────────────────────────────────────────────────────────
|
||
|
||
def _finding(
    fid: str,
    category: str,
    severity: str,
    title: str,
    path: str = "",
    lines: str = "",
    details: str = "",
    fix: str = "",
) -> Dict:
    """Assemble one finding record; evidence details are masked before storage."""
    evidence = {
        "path": path,
        "lines": lines,
        "details": _mask_evidence(details),
    }
    return {
        "id": fid,
        "category": category,
        "severity": severity,
        "title": title,
        "evidence": evidence,
        "recommended_fix": fix,
    }
|
||
|
||
|
||
# ─── A) PII scan ──────────────────────────────────────────────────────────────
|
||
|
||
def _scan_pii(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Append a finding for every PII-pattern match in *content* (in place)."""
    filename = Path(rel_path).name
    for pat_info in _get_pii_patterns():
        for match in pat_info["regex"].finditer(content):
            line_no = content.count("\n", 0, match.start()) + 1
            findings.append(_finding(
                fid=pat_info["id"],
                category="pii",
                severity=pat_info["severity"],
                title=f"{pat_info['description']} in {filename}",
                path=rel_path,
                lines=_line_range(line_no),
                details=_mask_evidence(match.group(0)),
                fix="Replace with hash, mask, or remove this value. Ensure it is not stored in plaintext.",
            ))
|
||
|
||
|
||
# ─── B) Secret scan ───────────────────────────────────────────────────────────
|
||
|
||
def _scan_secrets(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Append a finding for every secret-pattern match in *content* (in place)."""
    filename = Path(rel_path).name
    for pat_info in _get_secret_patterns():
        for match in pat_info["regex"].finditer(content):
            line_no = content.count("\n", 0, match.start()) + 1
            findings.append(_finding(
                fid=pat_info["id"],
                category="secrets",
                severity=pat_info["severity"],
                title=f"Secret-like value in {filename}",
                path=rel_path,
                lines=_line_range(line_no),
                details=_mask_evidence(match.group(0), max_chars=60),
                fix="Move to environment variable or secrets manager. Never hardcode secrets.",
            ))
|
||
|
||
|
||
# ─── C) Logging risk scan ────────────────────────────────────────────────────
|
||
|
||
def _scan_logging_risk(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Flag (a) log calls naming forbidden fields and (b) raw-payload words
    near audit/log writes, appending findings to *findings* in place.

    A match is suppressed when a configured redaction call (default
    'redact'/'mask') appears within a few lines of it, so already-redacted
    code is not flagged.
    """
    # Skip non-code files where logging patterns won't appear
    ext = Path(rel_path).suffix.lower()
    if ext not in (".py", ".ts", ".js"):
        return

    log_pat = _get_log_forbidden_pattern()
    payload_pat = _get_raw_payload_pattern()

    pol = _load_policy()
    redaction_calls = (pol.get("logging_rules") or {}).get("redaction_calls") or ["redact", "mask"]

    lines = content.splitlines()
    n = len(lines)
    context_window = 5  # lines around match to check for redaction

    # Pass 1: forbidden field names inside logger/print calls → DG-LOG-001.
    for m in log_pat.finditer(content):
        lineno = content[:m.start()].count("\n") + 1
        # Check if there's a redaction call nearby
        lo = max(0, lineno - 1 - context_window)
        hi = min(n, lineno + context_window)
        context_lines = "\n".join(lines[lo:hi])
        if any(rc in context_lines for rc in redaction_calls):
            continue  # Redaction present — skip
        findings.append(_finding(
            fid="DG-LOG-001",
            category="logging",
            severity="warning",
            title=f"Potential sensitive field logged in {Path(rel_path).name}",
            path=rel_path,
            lines=_line_range(lineno),
            details=_mask_evidence(m.group(0)),
            fix="Apply redact() or mask() before logging. Log hash+last4 for identifiers.",
        ))

    # Pass 2: raw payload indicators near an audit/log write → DG-AUD-001.
    # Audit/log payload risk: look for raw payload storage
    for m in payload_pat.finditer(content):
        lineno = content[:m.start()].count("\n") + 1
        # Only flag if in a logger/write context
        # (tighter ±3-line window than pass 1).
        lo = max(0, lineno - 1 - 3)
        hi = min(n, lineno + 3)
        context = "\n".join(lines[lo:hi])
        if not re.search(r'(?i)(log|audit|event|record|store|write|insert|append|emit)', context):
            continue
        if any(rc in context for rc in redaction_calls):
            continue
        findings.append(_finding(
            fid="DG-AUD-001",
            category="logging",
            severity="error",
            title=f"Raw payload field near audit/log write in {Path(rel_path).name}",
            path=rel_path,
            lines=_line_range(lineno),
            details=_mask_evidence(m.group(0)),
            fix="Ensure payload fields are NOT stored in audit events. "
                "Log hash+size only (as in ToolGovernance post_call).",
        ))
|
||
|
||
|
||
# ─── D) Storage without retention ────────────────────────────────────────────
|
||
|
||
def _scan_retention_risk(content: str, rel_path: str, findings: List[Dict]) -> None:
    """Flag storage-write calls with no retention/TTL indicator nearby.

    Only code files (.py/.ts/.js) are scanned. For each write-pattern match,
    a window of surrounding lines (storage_keywords.context_window, default
    20) is searched for retention indicator words (default: ttl, expire,
    retention, cleanup); a DG-RET-001 warning is appended when none appears.
    """
    if Path(rel_path).suffix.lower() not in (".py", ".ts", ".js"):
        return

    cfg = _load_policy().get("storage_keywords") or {}
    indicators = cfg.get("retention_indicators") or ["ttl", "expire", "retention", "cleanup"]
    window = int(cfg.get("context_window", 20))

    retention_pat = re.compile(
        r'(?i)(?:' + "|".join(re.escape(word) for word in indicators) + r')',
        re.MULTILINE,
    )

    source_lines = content.splitlines()
    total = len(source_lines)
    filename = Path(rel_path).name

    for match in _get_storage_write_pattern().finditer(content):
        line_no = content.count("\n", 0, match.start()) + 1
        start = max(0, line_no - 1 - window)
        stop = min(total, line_no + window)
        nearby = "\n".join(source_lines[start:stop])
        if retention_pat.search(nearby):
            continue  # a TTL/retention hint is visible near the write
        findings.append(_finding(
            fid="DG-RET-001",
            category="retention",
            severity="warning",
            title=f"Storage write without visible TTL/retention in {filename}",
            path=rel_path,
            lines=_line_range(line_no),
            details=_mask_evidence(match.group(0)),
            fix="Add TTL/expiry to stored data or document retention policy in runbook. "
                "Reference ops/runbook-* for cleanup procedures.",
        ))
|
||
|
||
|
||
# ─── File collector ───────────────────────────────────────────────────────────
|
||
|
||
def _collect_files(
    repo_root: str,
    paths_include: List[str],
    paths_exclude: List[str],
    max_files: int,
    mode: str = "fast",
) -> List[Tuple[str, str]]:
    """
    Returns list of (rel_path, full_path) tuples.
    In 'fast' mode: only .py, .yml, .yaml, .json, .env.example, .sh.
    In 'full' mode: all configured extensions.

    Candidates are filtered through paths_exclude globs and the policy's
    paths.never_scan list; collection stops at max_files entries.
    """
    pol = _load_policy()
    if mode == "fast":
        scan_exts = {".py", ".yml", ".yaml", ".json", ".env.example", ".sh"}
    else:
        scan_exts = set((pol.get("paths") or {}).get("scan_extensions") or [
            ".py", ".ts", ".js", ".yml", ".yaml", ".json", ".md", ".txt", ".sh",
        ])

    # Path.suffix only yields the LAST extension, so multi-dot entries like
    # '.env.example' must be matched against the whole filename instead.
    # (The previous suffix-only check silently never matched them.)
    single_exts = {ext for ext in scan_exts if ext.count(".") <= 1}
    multi_exts = tuple(scan_exts - single_exts)

    root = Path(repo_root).resolve()
    results: List[Tuple[str, str]] = []

    for start_dir in paths_include:
        start = root / start_dir.rstrip("/")
        if not start.exists():
            continue
        for fpath in start.rglob("*"):
            if not fpath.is_file():
                continue
            name = fpath.name.lower()
            if fpath.suffix.lower() not in single_exts and not name.endswith(multi_exts):
                continue
            try:
                rel = str(fpath.relative_to(root))
            except ValueError:
                continue
            if _is_excluded(rel, paths_exclude):
                continue
            if _never_scan(rel):
                continue
            results.append((rel, str(fpath)))
            if len(results) >= max_files:
                return results

    return results
|
||
|
||
|
||
# ─── scan_repo ────────────────────────────────────────────────────────────────
|
||
|
||
def scan_repo(
    repo_root: str = ".",
    mode: str = "fast",
    max_files: int = 200,
    max_bytes_per_file: int = 262144,
    paths_include: Optional[List[str]] = None,
    paths_exclude: Optional[List[str]] = None,
    focus: Optional[List[str]] = None,
) -> Dict:
    """
    Static scan of repository files for privacy/security risks.

    Args:
        repo_root: Repository root to scan.
        mode: 'fast' (small extension set) or 'full'; forwarded to _collect_files().
        max_files: Cap on files collected.
        max_bytes_per_file: Files larger than this are skipped and counted.
        paths_include / paths_exclude: Override the policy's paths.include /
            paths.exclude lists when provided.
        focus: Which scanners to run. Defaults include 'storage', but note
            no scanner branch below consumes 'storage' — only pii/secrets/
            logging/retention are dispatched.

    Returns structured findings dict (pass always True in warning_only mode).
    """
    pol = _load_policy()
    paths_include = paths_include or (pol.get("paths") or {}).get("include") or ["services/", "config/", "ops/"]
    paths_exclude = paths_exclude or (pol.get("paths") or {}).get("exclude") or []
    focus = focus or ["logging", "storage", "pii", "secrets", "retention"]
    max_findings = int((pol.get("limits") or {}).get("max_findings", 200))
    gate_mode = (pol.get("severity_behavior") or {}).get("gate_mode", "warning_only")

    files = _collect_files(repo_root, paths_include, paths_exclude, max_files, mode)
    all_findings: List[Dict] = []
    files_scanned = 0
    skipped = 0

    for rel_path, full_path in files:
        try:
            size = os.path.getsize(full_path)
            if size > max_bytes_per_file:
                skipped += 1
                continue
            # errors="replace" keeps the scan alive on non-UTF-8 bytes.
            with open(full_path, "r", encoding="utf-8", errors="replace") as f:
                content = f.read()
        except Exception as e:
            logger.warning("Cannot read %s: %s", full_path, e)
            skipped += 1
            continue

        files_scanned += 1

        if "pii" in focus:
            _scan_pii(content, rel_path, all_findings)
        if "secrets" in focus:
            _scan_secrets(content, rel_path, all_findings)
        if "logging" in focus:
            _scan_logging_risk(content, rel_path, all_findings)
        if "retention" in focus:
            _scan_retention_risk(content, rel_path, all_findings)

        # Stop scanning further files once the findings budget is exhausted.
        if len(all_findings) >= max_findings:
            break

    # Deduplicate: same id+path+lines
    seen = set()
    unique_findings = []
    for f in all_findings:
        key = (f["id"], f["evidence"].get("path"), f["evidence"].get("lines"))
        if key not in seen:
            unique_findings.append(f)
            seen.add(key)

    unique_findings = unique_findings[:max_findings]

    errors = sum(1 for f in unique_findings if f["severity"] == "error")
    warnings = sum(1 for f in unique_findings if f["severity"] == "warning")
    infos = sum(1 for f in unique_findings if f["severity"] == "info")

    # Only 'strict' gate_mode can fail the scan; the default is advisory.
    pass_val = True  # warning_only mode
    if gate_mode == "strict" and errors > 0:
        pass_val = False

    recommendations = _build_recommendations(unique_findings)

    return {
        "pass": pass_val,
        "summary": (
            f"Scanned {files_scanned} files ({mode} mode). "
            f"Found {errors} errors, {warnings} warnings, {infos} infos."
            + (f" ({skipped} files skipped: too large)" if skipped else "")
        ),
        "stats": {
            "errors": errors,
            "warnings": warnings,
            "infos": infos,
            "files_scanned": files_scanned,
            "files_skipped": skipped,
            "events_scanned": 0,
        },
        "findings": unique_findings,
        "recommendations": recommendations,
    }
|
||
|
||
|
||
# ─── scan_audit ───────────────────────────────────────────────────────────────
|
||
|
||
def scan_audit(
    backend: str = "auto",
    time_window_hours: int = 24,
    max_events: int = 50000,
    jsonl_glob: Optional[str] = None,
    repo_root: str = ".",
) -> Dict:
    """
    Scan audit event stream for PII leaks and large-output anomalies.
    backend='auto' uses the globally configured store (Postgres or JSONL).

    NOTE(review): jsonl_glob and repo_root are accepted but never referenced
    in this body — presumably reserved for future use; confirm with callers.
    Any store/read failure downgrades to a "skipped" result rather than
    raising, so the tool stays read-only and non-blocking.
    """
    pol = _load_policy()
    large_threshold = int((pol.get("retention") or {}).get("large_output_bytes", 65536))

    pii_patterns = _get_pii_patterns()
    findings: List[Dict] = []
    events_scanned = 0

    try:
        store = _resolve_audit_store(backend)

        import datetime
        now = datetime.datetime.now(datetime.timezone.utc)
        from_ts = (now - datetime.timedelta(hours=time_window_hours)).isoformat()

        events = store.read(from_ts=from_ts, limit=max_events)
        events_scanned = len(events)

        for ev in events:
            # Check meta fields for PII (graph_run_id, job_id should be safe; check input_hash)
            meta_str = json.dumps({
                k: ev.get(k) for k in ("agent_id", "user_id", "workspace_id", "input_hash", "graph_run_id", "job_id")
                if ev.get(k)
            })

            for pat_info in pii_patterns:
                m = pat_info["regex"].search(meta_str)
                if m:
                    findings.append(_finding(
                        fid="DG-AUD-101",
                        category="audit",
                        severity=pat_info["severity"],
                        title=f"PII-like pattern in audit event metadata ({pat_info['description']})",
                        path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}",
                        lines="",
                        details=_mask_evidence(meta_str, max_chars=80),
                        fix="Ensure user_id/workspace_id are opaque identifiers, not real PII. "
                            "Check how identifiers are generated.",
                    ))
                    break  # One finding per event

            # Large output anomaly
            out_size = int(ev.get("out_size", 0))
            if out_size >= large_threshold:
                findings.append(_finding(
                    fid="DG-AUD-102",
                    category="audit",
                    severity="warning",
                    title=f"Unusually large tool output: {ev.get('tool','?')} ({out_size} bytes)",
                    path=f"audit:{ev.get('tool','?')}@{ev.get('ts','')[:10]}",
                    lines="",
                    details=f"out_size={out_size}, agent={ev.get('agent_id','?')}, status={ev.get('status','?')}",
                    fix="Verify output does not include raw user content. "
                        "Enforce max_bytes_out in tool_limits.yml.",
                ))

    except Exception as e:
        # Best-effort: an unreadable backend yields an empty, passing report.
        logger.warning("scan_audit error: %s", e)
        return {
            "pass": True,
            "summary": f"Audit scan skipped: {e}",
            "stats": {"errors": 0, "warnings": 0, "infos": 0, "events_scanned": 0, "files_scanned": 0},
            "findings": [],
            "recommendations": [],
        }

    # Deduplicate
    # (keyed on id+path only — the per-event ts suffix in 'path' keeps
    # distinct days distinct).
    seen = set()
    unique = []
    for f in findings:
        key = (f["id"], f["evidence"].get("path"))
        if key not in seen:
            unique.append(f)
            seen.add(key)

    errors = sum(1 for f in unique if f["severity"] == "error")
    warnings = sum(1 for f in unique if f["severity"] == "warning")
    infos = sum(1 for f in unique if f["severity"] == "info")

    return {
        "pass": True,
        "summary": f"Scanned {events_scanned} audit events. {errors} errors, {warnings} warnings.",
        "stats": {
            "errors": errors, "warnings": warnings, "infos": infos,
            "events_scanned": events_scanned, "files_scanned": 0,
        },
        "findings": unique,
        "recommendations": _build_recommendations(unique),
    }
|
||
|
||
|
||
# ─── retention_check ─────────────────────────────────────────────────────────
|
||
|
||
def retention_check(
    repo_root: str = ".",
    check_audit_cleanup_task: bool = True,
    check_jsonl_rotation: bool = True,
    check_memory_retention_docs: bool = True,
    check_logs_retention_docs: bool = True,
) -> Dict:
    """
    Verify that cleanup/retention mechanisms exist for audit logs and memory.

    Runs up to four independent keyword-presence checks under *repo_root*
    (each toggled by its flag) and emits one info/warning finding per check.
    Always returns pass=True — findings are advisory only.
    """
    findings: List[Dict] = []

    root = Path(repo_root).resolve()

    def _file_contains(path: Path, keywords: List[str]) -> bool:
        # Case-insensitive substring search; unreadable files count as "no".
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
            return any(kw.lower() in text.lower() for kw in keywords)
        except Exception:
            return False

    def _find_files(pattern: str) -> List[Path]:
        # Recursive glob from the repo root.
        return list(root.rglob(pattern))

    # ── 1. Audit cleanup task ──────────────────────────────────────────────
    if check_audit_cleanup_task:
        has_cleanup = False

        # Check task_registry.yml for audit_cleanup task
        registry_files = _find_files("task_registry.yml")
        for rf in registry_files:
            if _file_contains(rf, ["audit_cleanup", "audit_rotation"]):
                has_cleanup = True
                break

        # Check runbooks/ops docs
        if not has_cleanup:
            runbook_files = list(root.glob("ops/runbook*.md")) + list(root.rglob("*runbook*.md"))
            for rb in runbook_files:
                if _file_contains(rb, ["audit", "cleanup", "rotation", "jsonl"]):
                    has_cleanup = True
                    break

        if has_cleanup:
            findings.append(_finding(
                fid="DG-RET-202",
                category="retention",
                severity="info",
                title="Audit cleanup/rotation mechanism documented",
                path="ops/",
                fix="",
            ))
        else:
            findings.append(_finding(
                fid="DG-RET-201",
                category="retention",
                severity="warning",
                title="No audit cleanup task or runbook found",
                path="ops/task_registry.yml",
                fix="Add 'audit_cleanup' task to ops/task_registry.yml or document retention "
                    "procedure in ops/runbook-*.md. Default retention: 30 days.",
            ))

    # ── 2. JSONL rotation (audit_store.py check) ──────────────────────────
    if check_jsonl_rotation:
        store_file = root / "services" / "router" / "audit_store.py"
        if store_file.exists() and _file_contains(store_file, ["rotation", "daily", "tool_audit_"]):
            findings.append(_finding(
                fid="DG-RET-203",
                category="retention",
                severity="info",
                title="JSONL audit rotation implemented in audit_store.py",
                path="services/router/audit_store.py",
                fix="",
            ))
        else:
            findings.append(_finding(
                fid="DG-RET-204",
                category="retention",
                severity="warning",
                title="JSONL audit rotation not confirmed in audit_store.py",
                path="services/router/audit_store.py",
                fix="Ensure JsonlAuditStore uses daily rotation (tool_audit_YYYY-MM-DD.jsonl) "
                    "and implement a cleanup job for files older than 30 days.",
            ))

    # ── 3. Memory retention docs ─────────────────────────────────────────
    if check_memory_retention_docs:
        has_mem_retention = False
        doc_files = list(root.rglob("*.md")) + list(root.rglob("*.yml"))
        for df in doc_files[:200]:  # limit scan
            # NOTE(review): "memory.*expire" is matched as a literal
            # substring by _file_contains, not as a regex — it will
            # effectively never match; confirm intent.
            if _file_contains(df, ["memory_events_days", "memory retention", "memory_ttl", "memory.*expire"]):
                has_mem_retention = True
                break
        if not has_mem_retention:
            findings.append(_finding(
                fid="DG-RET-205",
                category="retention",
                severity="info",
                title="Memory event retention policy not found in docs/config",
                path="config/",
                fix="Document memory event TTL/retention in config/data_governance_policy.yml "
                    "(memory_events_days) and implement cleanup.",
            ))

    # ── 4. Logs retention docs ───────────────────────────────────────────
    if check_logs_retention_docs:
        has_log_retention = False
        for df in (list(root.glob("ops/*.md")) + list(root.rglob("*runbook*.md")))[:50]:
            if _file_contains(df, ["logs_days", "log retention", "log rotation", "loki retention"]):
                has_log_retention = True
                break
        if not has_log_retention:
            findings.append(_finding(
                fid="DG-RET-206",
                category="retention",
                severity="info",
                title="Log retention period not documented in runbooks",
                path="ops/",
                fix="Document log retention in ops/runbook-*.md or config/data_governance_policy.yml "
                    "(logs_days: 14).",
            ))

    errors = sum(1 for f in findings if f["severity"] == "error")
    warnings = sum(1 for f in findings if f["severity"] == "warning")
    infos = sum(1 for f in findings if f["severity"] == "info")

    return {
        "pass": True,
        "summary": f"Retention check: {errors} errors, {warnings} warnings, {infos} infos.",
        "stats": {"errors": errors, "warnings": warnings, "infos": infos, "files_scanned": 0, "events_scanned": 0},
        "findings": findings,
        "recommendations": _build_recommendations(findings),
    }
|
||
|
||
|
||
# ─── policy ───────────────────────────────────────────────────────────────────
|
||
|
||
def get_policy() -> Dict:
    """Return a summary view of the current governance policy.

    The cache is cleared first, guaranteeing a fresh read of the policy
    file. Per-pattern regexes are omitted from the PII section — only each
    pattern's severity and finding id are exposed.
    """
    reload_policy()
    pol = _load_policy()
    pii_summary = {
        name: {"severity": cfg.get("severity"), "id": cfg.get("id")}
        for name, cfg in (pol.get("pii_patterns") or {}).items()
    }
    return {
        "policy_path": _POLICY_PATH,
        "retention": pol.get("retention", {}),
        "pii_patterns": pii_summary,
        "secret_patterns_count": len(_get_secret_patterns()),
        "logging_rules": pol.get("logging_rules", {}),
        "severity_behavior": pol.get("severity_behavior", {}),
        "limits": pol.get("limits", {}),
    }
|
||
|
||
|
||
# ─── Recommendations ──────────────────────────────────────────────────────────
|
||
|
||
# Finding-id → one-line remediation advice; consumed by
# _build_recommendations() for error/warning findings only.
_REC_MAP = {
    "DG-LOG-001": "Review logger calls for sensitive fields. Apply redact() before logging.",
    "DG-AUD-001": "Audit/log stores may contain raw payload. Enforce hash+size-only pattern.",
    "DG-RET-001": "Add TTL or cleanup policy for stored data. Reference data_governance_policy.yml.",
    "DG-RET-201": "Create an 'audit_cleanup' task in task_registry.yml or document retention in runbook.",
    "DG-AUD-101": "Verify audit event identifiers are opaque (not real PII).",
    "DG-AUD-102": "Large tool outputs may contain user content. Enforce max_bytes_out limits.",
    "DG-PII-001": "Mask or hash email addresses before storage/logging.",
    "DG-PII-002": "Mask phone numbers in logs and stored data.",
    "DG-PII-003": "Credit card-like patterns detected. Remove immediately and audit access.",
    "DG-SEC-000": "Rotate or remove secret-like values. Use environment variables.",
    "DG-SEC-001": "Remove private key from code. Use secrets manager.",
}
|
||
|
||
|
||
def _build_recommendations(findings: List[Dict]) -> List[str]:
    """Map error/warning findings to de-duplicated remediation strings.

    Only the first occurrence of each finding id contributes; info-level
    findings and ids missing from _REC_MAP are ignored. Output order
    follows the findings list.
    """
    emitted: set = set()
    recommendations: List[str] = []
    for finding in findings:
        if finding["severity"] not in ("error", "warning"):
            continue
        fid = finding.get("id", "")
        if fid in emitted:
            continue
        advice = _REC_MAP.get(fid)
        if advice:
            recommendations.append(advice)
            emitted.add(fid)
    return recommendations
|
||
|
||
|
||
# ─── backend=auto resolver ───────────────────────────────────────────────────
|
||
|
||
def _resolve_audit_store(backend: str = "auto"):
    """Resolve an AuditStore implementation by backend name.

    backend:
        'auto' / '' / None — the globally configured store (get_audit_store)
        'jsonl'            — JSONL store rooted at $AUDIT_JSONL_DIR
                             (default: $REPO_ROOT/ops/audit)
        'memory'           — in-process store (useful for tests)
    Any unrecognized value falls back to the global store.
    """
    from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
    if backend in ("auto", None, ""):
        return get_audit_store()
    if backend == "jsonl":
        # os and Path are module-level imports; the previous redundant
        # function-local re-imports have been removed.
        audit_dir = os.getenv(
            "AUDIT_JSONL_DIR",
            str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
        )
        return JsonlAuditStore(audit_dir)
    if backend == "memory":
        return MemoryAuditStore()
    return get_audit_store()
|
||
|
||
|
||
# ─── digest_audit ─────────────────────────────────────────────────────────────
|
||
|
||
def digest_audit(
|
||
backend: str = "auto",
|
||
time_window_hours: int = 24,
|
||
max_findings: int = 20,
|
||
max_markdown_chars: int = 3800,
|
||
) -> Dict:
|
||
"""
|
||
Privacy/audit digest: scans audit stream, summarises findings.
|
||
|
||
Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.
|
||
"""
|
||
store = _resolve_audit_store(backend)
|
||
|
||
# Run underlying scan
|
||
raw = scan_audit(
|
||
backend=backend,
|
||
time_window_hours=time_window_hours,
|
||
max_events=50_000,
|
||
)
|
||
|
||
findings = raw.get("findings") or []
|
||
stats = raw.get("stats") or {}
|
||
events_scanned = stats.get("events_scanned", 0)
|
||
errors = stats.get("errors", 0)
|
||
warnings = stats.get("warnings", 0)
|
||
infos = stats.get("infos", 0)
|
||
total = errors + warnings + infos
|
||
|
||
# Group findings by category
|
||
by_category: dict = {}
|
||
for f in findings[:max_findings]:
|
||
cat = f.get("category", "unknown")
|
||
by_category.setdefault(cat, []).append(f)
|
||
|
||
# Recommendations from findings
|
||
recs = _build_recommendations(findings[:max_findings])
|
||
|
||
# Determine source backend
|
||
source = "unknown"
|
||
try:
|
||
if hasattr(store, "active_backend"):
|
||
source = store.active_backend()
|
||
elif type(store).__name__ == "PostgresAuditStore":
|
||
source = "postgres"
|
||
elif type(store).__name__ == "JsonlAuditStore":
|
||
source = "jsonl"
|
||
elif type(store).__name__ == "MemoryAuditStore":
|
||
source = "memory"
|
||
except Exception:
|
||
pass
|
||
|
||
# ── Markdown ─────────────────────────────────────────────────────────────
|
||
period = f"Last {time_window_hours}h"
|
||
status_icon = "🔴" if errors > 0 else ("🟡" if warnings > 0 else "🟢")
|
||
lines = [
|
||
f"{status_icon} **Privacy Audit Digest** ({period})",
|
||
f"Events scanned: {events_scanned} | Findings: {total} ({errors}E / {warnings}W / {infos}I)",
|
||
f"Backend: `{source}`",
|
||
"",
|
||
]
|
||
if total == 0:
|
||
lines.append("✅ No privacy issues detected in audit stream.")
|
||
else:
|
||
for cat, cat_findings in by_category.items():
|
||
lines.append(f"**[{cat.upper()}]** {len(cat_findings)} finding(s):")
|
||
for f in cat_findings[:3]:
|
||
sev = f.get("severity", "?")
|
||
icon = "🔴" if sev == "error" else ("🟡" if sev == "warning" else "ℹ️")
|
||
lines.append(f" {icon} `{f.get('id','?')}` — {f.get('title','')[:100]}")
|
||
lines.append("")
|
||
|
||
if recs:
|
||
lines.append("💡 **Recommendations:**")
|
||
for r in recs[:5]:
|
||
lines.append(f" {r[:200]}")
|
||
|
||
markdown = "\n".join(lines)
|
||
if len(markdown) > max_markdown_chars:
|
||
markdown = markdown[:max_markdown_chars] + "\n…[truncated]"
|
||
|
||
return {
|
||
"period": period,
|
||
"window_hours": time_window_hours,
|
||
"source_backend": source,
|
||
"stats": {
|
||
"events_scanned": events_scanned,
|
||
"errors": errors,
|
||
"warnings": warnings,
|
||
"infos": infos,
|
||
"total": total,
|
||
},
|
||
"by_category": {cat: len(fs) for cat, fs in by_category.items()},
|
||
"top_findings": findings[:max_findings],
|
||
"recommendations": recs,
|
||
"markdown": markdown,
|
||
"pass": raw.get("pass", True),
|
||
}
|
||
|
||
|
||
# ─── Main entrypoint ─────────────────────────────────────────────────────────
|
||
|
||
def scan_data_governance_dict(action: str, params: Optional[Dict] = None, repo_root: Optional[str] = None) -> Dict:
|
||
"""
|
||
Dispatcher called by tool_manager handler.
|
||
Returns plain dict suitable for ToolResult.
|
||
"""
|
||
params = params or {}
|
||
if repo_root is None:
|
||
repo_root = os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent))
|
||
|
||
if action == "scan_repo":
|
||
return scan_repo(
|
||
repo_root=repo_root,
|
||
mode=params.get("mode", "fast"),
|
||
max_files=int(params.get("max_files", 200)),
|
||
max_bytes_per_file=int(params.get("max_bytes_per_file", 262144)),
|
||
paths_include=params.get("paths_include"),
|
||
paths_exclude=params.get("paths_exclude"),
|
||
focus=params.get("focus"),
|
||
)
|
||
|
||
if action == "digest_audit":
|
||
return digest_audit(
|
||
backend=params.get("backend", "auto"),
|
||
time_window_hours=int(params.get("time_window_hours", 24)),
|
||
max_findings=int(params.get("max_findings", 20)),
|
||
max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
|
||
)
|
||
|
||
if action == "scan_audit":
|
||
return scan_audit(
|
||
backend=params.get("backend", "auto"),
|
||
time_window_hours=int(params.get("time_window_hours", 24)),
|
||
max_events=int(params.get("max_events", 50000)),
|
||
jsonl_glob=params.get("jsonl_glob"),
|
||
repo_root=repo_root,
|
||
)
|
||
|
||
if action == "retention_check":
|
||
return retention_check(
|
||
repo_root=repo_root,
|
||
check_audit_cleanup_task=bool(params.get("check_audit_cleanup_task", True)),
|
||
check_jsonl_rotation=bool(params.get("check_jsonl_rotation", True)),
|
||
check_memory_retention_docs=bool(params.get("check_memory_retention_docs", True)),
|
||
check_logs_retention_docs=bool(params.get("check_logs_retention_docs", True)),
|
||
)
|
||
|
||
if action == "policy":
|
||
return get_policy()
|
||
|
||
return {"error": f"Unknown action '{action}'. Valid: scan_repo, digest_audit, scan_audit, retention_check, policy"}
|