New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
574 lines
21 KiB
Python
574 lines
21 KiB
Python
"""
|
|
Audit Store — persistence layer for ToolGovernance audit events.
|
|
|
|
Backends:
|
|
memory — in-process list (testing; not persistent)
|
|
jsonl — append-only JSONL file with daily rotation (default, zero-config)
|
|
postgres — asyncpg INSERT into tool_audit_events table
|
|
|
|
Selection: env var AUDIT_BACKEND=jsonl|postgres|memory (default: jsonl)
|
|
|
|
Security / Privacy:
|
|
- Payload is NEVER written (only hash + sizes)
|
|
- Each write is fire-and-forget: errors → log warning, do NOT raise
|
|
- Postgres writes are non-blocking (asyncio task)
|
|
|
|
JSONL schema per line (matches AuditEvent fields):
|
|
{ts, req_id, workspace_id, user_id, agent_id, tool, action,
|
|
status, duration_ms, in_size, out_size, input_hash,
|
|
graph_run_id?, graph_node?, job_id?}
|
|
|
|
Postgres DDL (run once — or apply via migration):
|
|
See _POSTGRES_DDL constant below.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import datetime
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── DDL ──────────────────────────────────────────────────────────────────────
|
|
|
|
_POSTGRES_DDL = """
|
|
CREATE TABLE IF NOT EXISTS tool_audit_events (
|
|
id BIGSERIAL PRIMARY KEY,
|
|
ts TIMESTAMPTZ NOT NULL,
|
|
req_id TEXT NOT NULL,
|
|
workspace_id TEXT NOT NULL,
|
|
user_id TEXT NOT NULL,
|
|
agent_id TEXT NOT NULL,
|
|
tool TEXT NOT NULL,
|
|
action TEXT NOT NULL,
|
|
status TEXT NOT NULL,
|
|
duration_ms INT NOT NULL,
|
|
in_size INT NOT NULL,
|
|
out_size INT NOT NULL,
|
|
input_hash TEXT NOT NULL,
|
|
graph_run_id TEXT,
|
|
graph_node TEXT,
|
|
job_id TEXT
|
|
);
|
|
CREATE INDEX IF NOT EXISTS idx_tool_audit_ts ON tool_audit_events(ts);
|
|
CREATE INDEX IF NOT EXISTS idx_tool_audit_tool_ts ON tool_audit_events(tool, ts);
|
|
CREATE INDEX IF NOT EXISTS idx_tool_audit_agent_ts ON tool_audit_events(agent_id, ts);
|
|
CREATE INDEX IF NOT EXISTS idx_tool_audit_ws_ts ON tool_audit_events(workspace_id, ts);
|
|
"""
|
|
|
|
|
|
# ─── Canonical event dict ─────────────────────────────────────────────────────
|
|
|
|
def _event_to_dict(event: "AuditEventLike") -> Dict[str, Any]:
|
|
"""Convert an AuditEvent (dataclass) or dict to canonical storage dict."""
|
|
if isinstance(event, dict):
|
|
return event
|
|
return {
|
|
"ts": getattr(event, "ts", ""),
|
|
"req_id": getattr(event, "req_id", ""),
|
|
"workspace_id": getattr(event, "workspace_id", ""),
|
|
"user_id": getattr(event, "user_id", ""),
|
|
"agent_id": getattr(event, "agent_id", ""),
|
|
"tool": getattr(event, "tool", ""),
|
|
"action": getattr(event, "action", ""),
|
|
"status": getattr(event, "status", ""),
|
|
"duration_ms": round(float(getattr(event, "duration_ms", 0))),
|
|
"in_size": int(getattr(event, "input_chars", 0)),
|
|
"out_size": int(getattr(event, "output_size_bytes", 0)),
|
|
"input_hash": getattr(event, "input_hash", ""),
|
|
"graph_run_id": getattr(event, "graph_run_id", None),
|
|
"graph_node": getattr(event, "graph_node", None),
|
|
"job_id": getattr(event, "job_id", None),
|
|
}
|
|
|
|
|
|
# Type alias (avoid circular imports): anything dict-like or AuditEvent-shaped.
# Kept as Any so this module never has to import the ToolGovernance AuditEvent
# dataclass at module level.
AuditEventLike = Any
|
|
|
|
|
|
# ─── Interface ────────────────────────────────────────────────────────────────
|
|
|
|
class AuditStore(ABC):
    """Abstract persistence backend for tool-audit events.

    Implementations: MemoryAuditStore (tests), JsonlAuditStore (default),
    PostgresAuditStore, AutoAuditStore (postgres with jsonl fallback),
    NullAuditStore (disabled).
    """

    @abstractmethod
    def write(self, event: AuditEventLike) -> None:
        """Persist one event without blocking. MUST NOT raise on error."""

    @abstractmethod
    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict[str, Any]]:
        """Return events matching all given filters, as plain dicts."""

    def close(self) -> None:
        """Release any held resources. No-op by default."""
|
|
|
|
|
|
# ─── Memory store ─────────────────────────────────────────────────────────────
|
|
|
|
class MemoryAuditStore(AuditStore):
    """In-process audit store for testing. Thread-safe; not persistent.

    Holds at most ``max_events`` entries; the oldest are evicted first.
    """

    def __init__(self, max_events: int = 100_000):
        self._events: List[Dict] = []
        self._lock = threading.Lock()  # guards every access to _events
        self._max = max_events

    def write(self, event: AuditEventLike) -> None:
        """Append one event. Errors are logged, never raised (store contract)."""
        try:
            d = _event_to_dict(event)
            with self._lock:
                self._events.append(d)
                if len(self._events) > self._max:
                    # Evict the oldest events in place (no full-list copy).
                    del self._events[: len(self._events) - self._max]
        except Exception as e:
            logger.warning("MemoryAuditStore.write error: %s", e)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Return the newest ``limit`` events matching every given filter.

        Timestamps are compared lexically; this is correct for the ISO-8601
        strings that _event_to_dict stores.
        """
        if limit <= 0:
            # Bug fix: the previous ``rows[-limit:]`` returned ALL rows when
            # limit == 0 (slice [-0:] is the whole list).
            return []
        with self._lock:
            rows = list(self._events)  # snapshot so filtering runs unlocked

        def _matches(r: Dict) -> bool:
            # One pass, all filters; absent fields compare as "".
            ts = r.get("ts", "")
            if from_ts and ts < from_ts:
                return False
            if to_ts and ts > to_ts:
                return False
            if tool and r.get("tool") != tool:
                return False
            if agent_id and r.get("agent_id") != agent_id:
                return False
            if workspace_id and r.get("workspace_id") != workspace_id:
                return False
            return True

        rows = [r for r in rows if _matches(r)]
        return rows[-limit:]

    def clear(self) -> None:
        """Drop all stored events (test helper)."""
        with self._lock:
            self._events.clear()
|
|
|
|
|
|
# ─── JSONL store ──────────────────────────────────────────────────────────────
|
|
|
|
class JsonlAuditStore(AuditStore):
    """
    Append-only JSONL backend with one file per calendar day.

    File pattern: ops/audit/tool_audit_YYYY-MM-DD.jsonl
    A threading.Lock serialises writes (safe for multi-thread, not multi-process).
    """

    def __init__(self, directory: str = "ops/audit"):
        self._dir = Path(directory)
        self._dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        # Rotation state: the day currently being written and its open handle.
        self._current_file: Optional[Path] = None
        self._current_date: Optional[str] = None
        self._fh = None

    def _get_fh(self, date_str: str):
        """Return a line-buffered handle for ``date_str``, rotating on day change."""
        if date_str != self._current_date:
            stale = self._fh
            if stale:
                try:
                    stale.close()
                except Exception:
                    pass
            target = self._dir / f"tool_audit_{date_str}.jsonl"
            self._fh = open(target, "a", encoding="utf-8", buffering=1)  # line-buffered
            self._current_date = date_str
            self._current_file = target
        return self._fh

    def write(self, event: AuditEventLike) -> None:
        """Append one event as a JSON line. Errors are logged, never raised."""
        try:
            record = _event_to_dict(event)
            # Pick the file by the event's own date; fall back to today when
            # the timestamp is missing/empty.
            day = (record.get("ts") or "")[:10] or datetime.date.today().isoformat()
            payload = json.dumps(record, ensure_ascii=False)
            with self._lock:
                self._get_fh(day).write(payload + "\n")
        except Exception as e:
            logger.warning("JsonlAuditStore.write error: %s", e)

    @staticmethod
    def _matches(rec: Dict, from_ts, to_ts, tool, agent_id, workspace_id) -> bool:
        """True when ``rec`` passes every supplied filter."""
        ts = rec.get("ts", "")
        if from_ts and ts < from_ts:
            return False
        if to_ts and ts > to_ts:
            return False
        if tool and rec.get("tool") != tool:
            return False
        if agent_id and rec.get("agent_id") != agent_id:
            return False
        if workspace_id and rec.get("workspace_id") != workspace_id:
            return False
        return True

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Stream-read JSONL files in date range."""
        # Prune by filename first — the trailing 10 chars of the stem are the
        # YYYY-MM-DD suffix, so string comparison selects the date window.
        candidates = sorted(self._dir.glob("tool_audit_*.jsonl"))
        if from_ts:
            lo = from_ts[:10]
            candidates = [p for p in candidates if p.stem[-10:] >= lo]
        if to_ts:
            hi = to_ts[:10]
            candidates = [p for p in candidates if p.stem[-10:] <= hi]

        results: List[Dict] = []
        for fpath in candidates:
            try:
                with open(fpath, "r", encoding="utf-8") as fh:
                    for raw in fh:
                        raw = raw.strip()
                        if not raw:
                            continue
                        try:
                            rec = json.loads(raw)
                        except Exception:
                            # Skip corrupt lines rather than failing the read.
                            continue
                        if not self._matches(rec, from_ts, to_ts, tool, agent_id, workspace_id):
                            continue
                        results.append(rec)
                        if len(results) >= limit:
                            break
            except Exception as e:
                logger.warning("JsonlAuditStore.read error %s: %s", fpath, e)
            if len(results) >= limit:
                break

        return results

    def close(self) -> None:
        """Close the current day's file handle, if any."""
        with self._lock:
            if self._fh:
                try:
                    self._fh.close()
                except Exception:
                    pass
            self._fh = None
|
|
|
|
|
|
# ─── Postgres store ───────────────────────────────────────────────────────────
|
|
|
|
class PostgresAuditStore(AuditStore):
    """
    Async Postgres store using asyncpg.

    Writes are enqueued to a bounded asyncio queue and flushed in batches by
    a background task; events are silently dropped when the queue is full.
    Falls back gracefully (warning logged, nothing raised) if Postgres is
    unavailable.
    """

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._pool = None  # asyncpg pool, created lazily on first flush/read
        # Bounded queue caps memory if Postgres is down for a long time.
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=10_000)
        self._task: Optional[asyncio.Task] = None
        self._started = False

    def _ensure_started(self):
        """Start the background flush task once, if an event loop is running.

        Called from sync context (write). When no loop is running, events
        simply stay queued until a later write happens inside a loop.
        """
        if self._started:
            return
        try:
            # Fix: use get_running_loop() instead of the deprecated
            # get_event_loop()+is_running() pattern — it raises RuntimeError
            # in a plain sync context instead of creating a loop that would
            # never run (and emitting DeprecationWarning on Python >= 3.12).
            loop = asyncio.get_running_loop()
        except RuntimeError:
            return
        self._task = loop.create_task(self._flush_loop())
        self._started = True

    async def _get_pool(self):
        """Create the asyncpg pool on first use and apply the idempotent DDL."""
        if self._pool is None:
            import asyncpg  # local import: optional dependency

            self._pool = await asyncpg.create_pool(self._dsn, min_size=1, max_size=3)
            async with self._pool.acquire() as conn:
                await conn.execute(_POSTGRES_DDL)
        return self._pool

    async def _flush_loop(self):
        """Forever: collect up to 50 queued events (or wait 2s) and INSERT them."""
        while True:
            events = []
            try:
                evt = await asyncio.wait_for(self._queue.get(), timeout=2.0)
                events.append(evt)
                while not self._queue.empty() and len(events) < 50:
                    events.append(self._queue.get_nowait())
            except asyncio.TimeoutError:
                pass
            except Exception:
                # Never let the flush task die; retry on the next cycle.
                pass

            if not events:
                continue

            try:
                pool = await self._get_pool()
                async with pool.acquire() as conn:
                    await conn.executemany(
                        """
                        INSERT INTO tool_audit_events
                        (ts, req_id, workspace_id, user_id, agent_id, tool, action,
                         status, duration_ms, in_size, out_size, input_hash,
                         graph_run_id, graph_node, job_id)
                        VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15)
                        """,
                        [
                            (
                                e["ts"], e["req_id"], e["workspace_id"], e["user_id"],
                                e["agent_id"], e["tool"], e["action"], e["status"],
                                e["duration_ms"], e["in_size"], e["out_size"],
                                e["input_hash"], e.get("graph_run_id"),
                                e.get("graph_node"), e.get("job_id"),
                            )
                            for e in events
                        ],
                    )
            except Exception as ex:
                # Fire-and-forget contract: the batch is lost on failure.
                logger.warning("PostgresAuditStore flush error: %s", ex)

    def write(self, event: AuditEventLike) -> None:
        """Enqueue one event; drops it if the queue is full. Never raises."""
        try:
            d = _event_to_dict(event)
            self._ensure_started()
            if self._started and not self._queue.full():
                self._queue.put_nowait(d)
        except Exception as e:
            logger.warning("PostgresAuditStore.write error: %s", e)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Synchronous read via asyncio.run() — for analyzer queries.

        Returns [] on any failure, including being called from inside an
        already-running event loop (where asyncio.run() raises).
        """
        try:
            return asyncio.run(self._async_read(from_ts, to_ts, tool, agent_id, workspace_id, limit))
        except Exception as e:
            logger.warning("PostgresAuditStore.read error: %s", e)
            return []

    async def _async_read(self, from_ts, to_ts, tool, agent_id, workspace_id, limit):
        """Build a parameterised WHERE clause and fetch matching rows as dicts."""
        pool = await self._get_pool()
        conditions = ["TRUE"]
        params: List[Any] = []
        # Every user-supplied value goes through a $n placeholder.
        for prefix, value in (
            ("ts >=", from_ts),
            ("ts <=", to_ts),
            ("tool =", tool),
            ("agent_id =", agent_id),
            ("workspace_id =", workspace_id),
        ):
            if value:
                params.append(value)
                conditions.append(f"{prefix} ${len(params)}")

        # limit comes from our own int-typed signature; int() hardens the
        # f-string interpolation against a non-int slipping through.
        sql = (
            f"SELECT * FROM tool_audit_events WHERE {' AND '.join(conditions)} "
            f"ORDER BY ts LIMIT {int(limit)}"
        )
        async with pool.acquire() as conn:
            rows = await conn.fetch(sql, *params)
        return [dict(r) for r in rows]
|
|
|
|
|
|
# ─── Null store ───────────────────────────────────────────────────────────────
|
|
|
|
class NullAuditStore(AuditStore):
    """Discards every event and always reads empty (auditing disabled)."""

    def write(self, event: AuditEventLike) -> None:
        """Intentionally a no-op."""
        return None

    def read(self, **kwargs) -> List[Dict]:
        """No events are ever stored, so reads are always empty."""
        return []
|
|
|
|
|
|
# ─── Global singleton ─────────────────────────────────────────────────────────
|
|
|
|
# Process-wide singleton store, created lazily by get_audit_store().
_store: Optional[AuditStore] = None
# Guards creation/replacement of the singleton (double-checked locking).
_store_lock = threading.Lock()
|
|
|
|
|
|
def get_audit_store() -> AuditStore:
    """Return the process-wide audit store, building it on first use.

    Double-checked locking: the common path (store already built) takes no
    lock; only the first caller pays for _create_store().
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_store()
    return _store
|
|
|
|
|
|
def set_audit_store(store: AuditStore) -> None:
    """Install ``store`` as the global singleton (primarily for tests)."""
    global _store
    with _store_lock:
        _store = store
|
|
|
|
|
|
class AutoAuditStore(AuditStore):
    """
    Smart backend: tries Postgres first, falls back to JSONL on failure.

    Used when AUDIT_BACKEND=auto (or unset with DATABASE_URL present).
    - Writes go to whichever backend is currently healthy.
    - On Postgres failure, transparently falls back to JsonlAuditStore.
    - Recovers to Postgres on next health check (every ~5 min).

    Non-fatal: write errors are logged as warnings.

    NOTE(review): PostgresAuditStore.write swallows its own exceptions, so
    the except-branch in write() below can only fire on setup/enqueue errors;
    actual INSERT failures happen later in the background flush task and do
    NOT trigger the JSONL fallback — confirm whether that is intended.
    """

    _RECOVERY_INTERVAL_S = 300  # retry Postgres after 5 minutes

    def __init__(self, pg_dsn: str, jsonl_dir: str):
        self._pg_dsn = pg_dsn
        self._jsonl_dir = jsonl_dir
        # Both backends are built lazily on first use, guarded by _init_lock.
        self._primary: Optional[PostgresAuditStore] = None
        self._fallback: Optional[JsonlAuditStore] = None
        # True while Postgres is considered unhealthy.
        self._using_fallback = False
        # time.monotonic() at the moment of the switch (0.0 = not in fallback).
        self._fallback_since: float = 0.0
        self._init_lock = threading.Lock()

    def _get_primary(self) -> Optional[PostgresAuditStore]:
        # Lazily construct the Postgres store (double-checked locking).
        if self._primary is None:
            with self._init_lock:
                if self._primary is None:
                    self._primary = PostgresAuditStore(self._pg_dsn)
        return self._primary

    def _get_fallback(self) -> JsonlAuditStore:
        # Lazily construct the JSONL store (double-checked locking).
        if self._fallback is None:
            with self._init_lock:
                if self._fallback is None:
                    self._fallback = JsonlAuditStore(self._jsonl_dir)
        return self._fallback

    def _maybe_recover(self) -> None:
        """Try to switch back to Postgres if enough time has passed since fallback."""
        if self._using_fallback and self._fallback_since > 0:
            if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S:
                logger.info("AutoAuditStore: attempting Postgres recovery")
                # Optimistic recovery: just clear the flags; the next write
                # will probe Postgres again.
                self._using_fallback = False
                self._fallback_since = 0.0

    def write(self, event: AuditEventLike) -> None:
        # Prefer Postgres; on any error flip to JSONL and remember when, so
        # _maybe_recover() can retry Postgres later.
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    primary.write(event)
                    return
            except Exception as pg_err:
                logger.warning(
                    "AutoAuditStore: Postgres write failed (%s), switching to JSONL fallback", pg_err
                )
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        # Write to JSONL fallback
        try:
            self._get_fallback().write(event)
        except Exception as jl_err:
            logger.warning("AutoAuditStore: JSONL fallback write failed: %s", jl_err)

    def read(
        self,
        from_ts: Optional[str] = None,
        to_ts: Optional[str] = None,
        tool: Optional[str] = None,
        agent_id: Optional[str] = None,
        workspace_id: Optional[str] = None,
        limit: int = 50000,
    ) -> List[Dict]:
        """Read from Postgres if available, else JSONL."""
        self._maybe_recover()
        if not self._using_fallback:
            try:
                primary = self._get_primary()
                if primary:
                    return primary.read(from_ts=from_ts, to_ts=to_ts, tool=tool,
                                        agent_id=agent_id, workspace_id=workspace_id, limit=limit)
            except Exception as pg_err:
                logger.warning("AutoAuditStore: Postgres read failed (%s), using JSONL", pg_err)
                self._using_fallback = True
                self._fallback_since = time.monotonic()
        return self._get_fallback().read(
            from_ts=from_ts, to_ts=to_ts, tool=tool,
            agent_id=agent_id, workspace_id=workspace_id, limit=limit,
        )

    def active_backend(self) -> str:
        """Return the name of the currently active backend."""
        return "jsonl_fallback" if self._using_fallback else "postgres"

    def close(self) -> None:
        # Best-effort close of whichever backends were actually created.
        if self._primary:
            try:
                self._primary.close()
            except Exception:
                pass
        if self._fallback:
            try:
                self._fallback.close()
            except Exception:
                pass
|
|
|
|
|
|
def _create_store() -> AuditStore:
    """Build the audit store selected by AUDIT_BACKEND.

    Backends: memory | postgres | auto | null | jsonl (default).
    Postgres requires DATABASE_URL (or POSTGRES_DSN); "auto" uses Postgres
    with transparent JSONL fallback when a DSN is present.
    """
    backend = os.getenv("AUDIT_BACKEND", "jsonl").lower()
    dsn = os.getenv("DATABASE_URL") or os.getenv("POSTGRES_DSN", "")
    audit_dir = os.getenv(
        "AUDIT_JSONL_DIR",
        str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
    )
    # Security fix: never log a DSN prefix — the first 30 chars of a typical
    # postgres://user:password@host URL contain the password. Log only the
    # part after the credentials separator.
    dsn_safe = dsn.rsplit("@", 1)[-1] if "@" in dsn else "<local>"

    if backend == "memory":
        logger.info("AuditStore: in-memory (testing only)")
        return MemoryAuditStore()

    if backend == "postgres":
        if not dsn:
            logger.warning("AUDIT_BACKEND=postgres but DATABASE_URL not set; falling back to jsonl")
        else:
            logger.info("AuditStore: postgres host=%s", dsn_safe)
            return PostgresAuditStore(dsn)

    if backend == "auto":
        if dsn:
            logger.info("AuditStore: auto (postgres→jsonl fallback) host=%s", dsn_safe)
            return AutoAuditStore(pg_dsn=dsn, jsonl_dir=audit_dir)
        logger.info("AuditStore: auto — no DATABASE_URL, using jsonl")

    if backend == "null":
        return NullAuditStore()

    # Default / jsonl (also the fallthrough for misconfigured backends).
    logger.info("AuditStore: jsonl dir=%s", audit_dir)
    return JsonlAuditStore(audit_dir)
|