#!/usr/bin/env python3
"""
audit_compact.py — Audit JSONL Compaction

Merges individual daily JSONL files from the last `window_days` into a single
compressed artifact:
  ops/audit/compact/tool_audit_last_{window_days}d.jsonl.gz

Useful for:
  - Faster forensic analysis (single file to read)
  - Archival before cleanup
  - Offline cost_analyzer runs

Usage:
  python3 ops/scripts/audit_compact.py \
    --window-days 7 \
    [--output-path ops/audit/compact] \
    [--dry-run] [--verbose]

Callable programmatically via run_compact().
"""

from __future__ import annotations

import argparse
import datetime
import gzip
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

# Daily audit files are named tool_audit_YYYY-MM-DD.jsonl; group 1 is the date.
_DATE_PAT = re.compile(r"tool_audit_(\d{4}-\d{2}-\d{2})\.jsonl$")


def _resolve_within(root: Path, relative: str, error_message: str) -> Path:
    """Resolve `relative` against `root` and require the result to stay inside it.

    The containment test is done on path components (`Path.relative_to`), not
    on string prefixes: a plain `startswith` comparison would accept a sibling
    such as `/srv/repo_evil` when the root is `/srv/repo`.

    Raises:
        ValueError: with `error_message` if the resolved path is not `root`
            itself or a descendant of it.
    """
    resolved = (root / relative).resolve()
    try:
        resolved.relative_to(root)
    except ValueError:
        raise ValueError(error_message) from None
    return resolved


def _find_source_files(dir_path: Path, cutoff: datetime.date) -> List[Path]:
    """Return the daily audit files in `dir_path` dated on or after `cutoff`.

    Files are returned in sorted path order. Names that do not match
    `_DATE_PAT`, or whose date part is not a valid ISO date, are skipped.
    """
    selected: List[Path] = []
    for fpath in sorted(dir_path.glob("tool_audit_*.jsonl")):
        m = _DATE_PAT.search(fpath.name)
        if not m:
            continue
        try:
            file_date = datetime.date.fromisoformat(m.group(1))
        except ValueError:
            # Matches the digit pattern but is not a real date (e.g. 2024-13-40).
            continue
        if file_date >= cutoff:
            selected.append(fpath)
    return selected


def _count_nonblank_lines(source_files: List[Path], errors: List[str]) -> int:
    """Count non-blank lines across `source_files` without writing anything.

    A file that cannot be read contributes nothing to the total; the failure
    is appended to `errors` and counting continues with the next file.
    """
    total = 0
    for fpath in source_files:
        try:
            with open(fpath, "r", encoding="utf-8", errors="replace") as f:
                total += sum(1 for line in f if line.strip())
        except Exception as e:  # best effort: report and keep going
            errors.append(f"{fpath.name}: {e}")
    return total


def run_compact(
    window_days: int = 7,
    audit_dir: str = "ops/audit",
    output_path: Optional[str] = None,
    dry_run: bool = True,
    repo_root: Optional[str] = None,
    verbose: bool = False,
) -> Dict:
    """
    Compact last `window_days` JSONL audit files into one .jsonl.gz.

    Files are selected by the date in their name; the cutoff
    (`today - window_days`) is inclusive, so the window spans up to
    `window_days + 1` calendar dates. Blank lines are dropped and every
    remaining line is written stripped, followed by a single newline.

    Args:
        window_days: Size of the window in days; must be between 1 and 30.
        audit_dir: Audit directory, relative to the repo root.
        output_path: Output directory, relative to the repo root. Defaults to
            `{audit_dir}/compact`.
        dry_run: When true, only count the lines that would be written; no
            directory or file is created.
        repo_root: Repository root. Falls back to the `REPO_ROOT` environment
            variable, then to the current directory.
        verbose: When true, log a summary line at INFO level.

    Returns:
        Dict with keys `source_files` (number of files selected),
        `window_days`, `lines_written`, `output_file` (suffixed with
        " [not created]" in dry-run mode), `bytes_written` (compressed size,
        0 in dry-run mode), `dry_run`, and `errors` (list of messages).

    Raises:
        ValueError: if `window_days` is out of range, or if `audit_dir` or
            `output_path` resolves outside the repo root.
    """
    if window_days < 1 or window_days > 30:
        raise ValueError(f"window_days must be 1–30, got {window_days}")

    root = Path(repo_root or os.getenv("REPO_ROOT", ".")).resolve()
    dir_path = _resolve_within(
        root, audit_dir, "audit_dir resolves outside repo root"
    )

    # Find files within window
    cutoff = datetime.date.today() - datetime.timedelta(days=window_days)
    source_files = _find_source_files(dir_path, cutoff)

    out_dir = _resolve_within(
        root,
        output_path or f"{audit_dir}/compact",
        "output_path resolves outside repo root",
    )
    out_file = out_dir / f"tool_audit_last_{window_days}d.jsonl.gz"

    lines_written = 0
    bytes_written = 0
    errors: List[str] = []

    if dry_run:
        # Count lines without writing
        lines_written = _count_nonblank_lines(source_files, errors)
        if verbose:
            logger.info(
                "[dry_run] Would compact %d files → %s (%d lines)",
                len(source_files), out_file, lines_written,
            )
    else:
        out_dir.mkdir(parents=True, exist_ok=True)
        try:
            with gzip.open(out_file, "wt", encoding="utf-8") as gz:
                for fpath in source_files:
                    try:
                        with open(fpath, "r", encoding="utf-8", errors="replace") as f:
                            for line in f:
                                line = line.strip()
                                if line:
                                    gz.write(line + "\n")
                                    lines_written += 1
                    except Exception as e:
                        # Best effort: one bad source file must not abort the
                        # whole archive; it is reported through `errors`.
                        msg = f"Error reading {fpath.name}: {e}"
                        logger.warning(msg)
                        errors.append(msg)
            # Stat only after the gzip stream is closed so the size is final.
            bytes_written = out_file.stat().st_size
            if verbose:
                logger.info(
                    "Compacted %d files → %s (%d lines, %d bytes compressed)",
                    len(source_files), out_file.name, lines_written, bytes_written,
                )
        except Exception as e:
            errors.append(f"Write error: {e}")
            logger.error("audit_compact failed: %s", e)

    return {
        "source_files": len(source_files),
        "window_days": window_days,
        "lines_written": lines_written,
        "output_file": str(out_file) if not dry_run else str(out_file) + " [not created]",
        "bytes_written": bytes_written,
        "dry_run": dry_run,
        "errors": errors,
    }


def _parse_args(argv=None) -> argparse.Namespace:
    """Parse command-line arguments (`argv` defaults to `sys.argv[1:]`)."""
    p = argparse.ArgumentParser(
        description="Compact audit JSONL files into a single .gz archive",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("--window-days", type=int, default=7,
                   help="Compact files from last N days")
    p.add_argument("--audit-dir", default="ops/audit",
                   help="Relative path to audit directory")
    p.add_argument("--output-path", default=None,
                   help="Output directory (default: ops/audit/compact)")
    p.add_argument("--repo-root", default=None)
    p.add_argument("--dry-run", action="store_true")
    p.add_argument("--verbose", action="store_true")
    p.add_argument("--output-json", action="store_true")
    return p.parse_args(argv)


def main(argv=None):
    """CLI entry point: run the compaction and print a summary to stdout.

    Logging goes to stderr. With `--output-json` the result dict is printed
    as JSON; otherwise a one-line summary is printed. Exits with status 1 if
    the run reported any errors.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s audit_compact %(message)s",
        stream=sys.stderr,
    )
    args = _parse_args(argv)
    result = run_compact(
        window_days=args.window_days,
        audit_dir=args.audit_dir,
        output_path=args.output_path,
        dry_run=args.dry_run,
        repo_root=args.repo_root,
        verbose=args.verbose,
    )

    if args.output_json:
        print(json.dumps(result, indent=2))
    else:
        status = "DRY RUN" if result["dry_run"] else "DONE"
        print(
            f"[{status}] sources={result['source_files']} "
            f"lines={result['lines_written']} bytes={result['bytes_written']} "
            f"→ {result['output_file']}"
        )

    if result["errors"]:
        sys.exit(1)


if __name__ == "__main__":
    main()