# microdao-daarion/config/data_governance_policy.yml

# Data Governance & Privacy Policy — DAARION.city
#
# Used by data_governance_tool to scan for PII/secrets/logging/retention risks.
# Severity: "error" = high risk (never fails the gate while gate_mode is "warning_only").
#           "warning" = medium risk.
#           "info" = low risk / informational.

# ─── Retention policies ───────────────────────────────────────────────────────
retention:
  # Retention windows; the `_days` suffix gives the unit.
  # NOTE(review): which store each key governs (JSONL audit files, Postgres
  # audit rows, memory events, logs) is inferred from the key names — confirm
  # against data_governance_tool.
  audit_jsonl_days: 30
  audit_postgres_days: 90
  memory_events_days: 90
  logs_days: 14
  # Large output threshold: if audit out_size >= this, flag as anomaly
  large_output_bytes: 65536      # 64 KiB (64 * 1024)

# ─── PII patterns ─────────────────────────────────────────────────────────────
pii_patterns:
  # Each entry carries a `regex`, a `severity` (levels are listed in the file
  # header), an `id` and a human-readable `description`.
  # The regexes are YAML double-quoted strings, so every `\\` below reaches the
  # regex engine as a single backslash.
  email:
    # Case-insensitive local@domain.tld with a TLD of at least two letters.
    regex: "(?i)\\b[A-Z0-9._%+\\-]+@[A-Z0-9.\\-]+\\.[A-Z]{2,}\\b"
    severity: "warning"
    id: "DG-PII-001"
    description: "Email address detected"

  phone_ua_intl:
    # At least 9 digits in total, optionally mixed with spaces, hyphens and
    # parentheses. The leading `+` is an explicit alternative to `\b`: a word
    # boundary cannot occur between whitespace (or start of line) and `+`, so
    # `\b\+?` would always drop the `+` from the match.
    regex: "(?:\\+|\\b)[0-9][0-9\\-\\s()]{7,}[0-9]\\b"
    severity: "warning"
    id: "DG-PII-002"
    description: "Phone-like number detected"

  credit_card:
    # 13-19 digits, optionally separated by spaces or hyphens.
    regex: "\\b(?:\\d[ \\-]*?){13,19}\\b"
    severity: "error"
    id: "DG-PII-003"
    description: "Credit card-like number detected"

  passport_like:
    # Two A-Z letters immediately followed by 6 or 7 digits.
    regex: "\\b[A-Z]{2}\\d{6,7}\\b"
    severity: "warning"
    id: "DG-PII-004"
    description: "Passport-like identifier detected"

  tax_id_ua:
    # Exactly 10 consecutive digits delimited by word boundaries.
    regex: "\\b\\d{10}\\b"
    severity: "info"
    id: "DG-PII-005"
    description: "Possible Ukrainian tax ID (10 digits)"

# ─── Extra secret patterns (supplement tool_governance._SECRET_PATTERNS) ──────
secret_patterns:
  # NOTE(review): presumably makes the tool apply
  # tool_governance._SECRET_PATTERNS in addition to `extra` — confirm in
  # data_governance_tool.
  inherit_from_tool_governance: true
  extra:
    # PEM-style private key headers ("BEGIN PRIVATE KEY",
    # "BEGIN RSA PRIVATE KEY", "BEGIN OPENSSH PRIVATE KEY", ...) and the
    # OpenPGP armor header "BEGIN PGP PRIVATE KEY BLOCK" (optional " BLOCK").
    - name: "private_key_block"
      regex: "-----BEGIN [A-Z ]*PRIVATE KEY(?: BLOCK)?-----"
      severity: "error"
      id: "DG-SEC-001"
    # Case-insensitive `mfa_token` / `mfa-token` / `mfatoken`, followed by
    # whitespace, `=` or `:` and a 6-8 character alphanumeric value that may be
    # wrapped in ', " or ` quotes.
    - name: "aws_mfa_token"
      regex: "(?i)mfa[_\\-]?token[\\s=:]+['\"`]?[\\dA-Z]{6,8}['\"`]?"
      severity: "warning"
      id: "DG-SEC-002"
    # PEM certificate header; reported at "info" severity.
    - name: "pem_certificate"
      regex: "-----BEGIN CERTIFICATE-----"
      severity: "info"
      id: "DG-SEC-003"

# ─── Logging safety rules ─────────────────────────────────────────────────────
logging_rules:
  # Four keyword lists used by the logging-safety checks.
  # NOTE(review): how entries are matched (exact name vs. substring, case
  # sensitivity) is not visible in this file — short entries such as `token`
  # or `content` could be broad if matched as substrings; confirm in the tool.

  # Field names that must NOT appear unmasked in logger calls
  forbid_logging_fields:
    - password
    - passwd
    - token
    - secret
    - private_key
    - api_key
    - access_key
    - credential
    - auth_header
    - bearer

  # Fields that should appear as hash-only (warn if logged raw)
  sensitive_fields_warn:
    - user_id
    - chat_id
    - telegram_id
    - session_id
    - workspace_id

  # Calls that indicate redaction is applied (good)
  redaction_calls:
    - redact
    - mask
    - sanitize
    - anonymize
    - _hash
    - sha256

  # Payload field names that indicate raw content is being logged/stored
  raw_payload_indicators:
    - payload
    - diff_text
    - openapi_text
    - request_body
    - response_body
    - prompt
    - messages
    - content
    - transcript
    - conversation
    - full_text

# ─── Storage / retention keywords ─────────────────────────────────────────────
storage_keywords:
  # Keywords for the storage/retention check.
  # NOTE(review): presumably a `write_patterns` hit is expected to have a
  # `retention_indicators` keyword within `context_window` lines of it —
  # confirm against data_governance_tool.

  # Names that indicate data is being written to storage.
  write_patterns:
    - save_message
    - store_event
    - insert_record
    - append_event
    - write_event
    - write_record
    - persist
    - bulk_insert
    - executemany
  # Keywords that indicate retention/cleanup handling.
  retention_indicators:
    - ttl
    - expire
    - retention
    - cleanup
    - delete_old
    - purge
    - rotate
    - max_age
    - expiry
  context_window: 20    # lines before/after to search for retention indicator

# ─── Scan paths ───────────────────────────────────────────────────────────────
paths:
  # Directory prefixes to scan.
  # NOTE(review): "config/" is included and ".yml" is a scanned extension, so
  # this policy file may itself be scanned if it lives under config/ — confirm
  # whether self-matches (e.g. the certificate header pattern) are expected.
  include:
    - "services/"
    - "docs/"
    - "ops/"
    - "config/"
  # Glob patterns excluded from scanning (dependencies, VCS data, build output,
  # caches, generated/minified files).
  exclude:
    - "**/node_modules/**"
    - "**/.git/**"
    - "**/dist/**"
    - "**/build/**"
    - "**/.venv/**"
    - "**/__pycache__/**"
    - "**/*.pyc"
    - "**/*.lock"        # dependency lock files (high false-positive risk)
    - "**/*.min.js"

  # File extensions to scan
  scan_extensions:
    - ".py"
    - ".ts"
    - ".js"
    - ".yml"
    - ".yaml"
    - ".json"
    - ".env.example"
    - ".md"
    - ".txt"
    - ".sh"

  # Never scan these (sensitive or binary)
  # NOTE(review): as a glob, ".env.*" also matches ".env.example", which
  # scan_extensions lists above — confirm which rule the tool applies first.
  never_scan:
    - "*.env"
    - ".env.*"
    - "*.pem"
    - "*.key"
    - "*.pfx"
    - "*.p12"
    - "*.crt"

# ─── Gate behaviour ───────────────────────────────────────────────────────────
severity_behavior:
  # gate_mode values:
  #   warning_only: gate always pass=True (adds recommendations only)
  #   strict: gate pass=False on any error finding
  gate_mode: "warning_only"
  # Severities listed here; "info" is not included.
  # NOTE(review): presumably findings at these severities produce
  # recommendations — inferred from the key name, confirm in the tool.
  recommend_on:
    - "warning"
    - "error"

# ─── Limits ───────────────────────────────────────────────────────────────────
limits:
  # Caps on scan size and output volume.
  # NOTE(review): "fast" / "full" presumably name two scan modes of the tool —
  # confirm against data_governance_tool.
  max_files_fast: 200
  max_files_full: 500
  max_bytes_per_file: 262144   # 256 KiB (256 * 1024)
  max_findings: 200            # cap before truncating
  max_evidence_chars: 200      # mask and truncate evidence snippets