feat(sofiia-console): add docs index and runbook search API (FTS5)

adds SQLite docs index (files/chunks + FTS5) and CLI rebuild

exposes authenticated runbook search/preview/raw endpoints

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 04:26:34 -08:00
parent bddb6cd75a
commit ef3ff80645
6 changed files with 484 additions and 0 deletions

View File

@@ -350,6 +350,31 @@ CREATE INDEX IF NOT EXISTS idx_audit_operator_ts ON audit_events(operator_id, ts
CREATE INDEX IF NOT EXISTS idx_audit_chat_ts ON audit_events(chat_id, ts DESC);
CREATE INDEX IF NOT EXISTS idx_audit_event_ts ON audit_events(event, ts DESC);
-- ── Docs index (runbooks/release FTS, PR1.1) ─────────────────────────────────
CREATE TABLE IF NOT EXISTS docs_files (
    path TEXT PRIMARY KEY,
    mtime REAL NOT NULL,
    sha TEXT NOT NULL,
    title TEXT DEFAULT '',
    doc_type TEXT NOT NULL DEFAULT 'misc'
);
CREATE TABLE IF NOT EXISTS docs_chunks (
    id TEXT PRIMARY KEY,
    path TEXT NOT NULL,
    heading TEXT NOT NULL DEFAULT '',
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    FOREIGN KEY (path) REFERENCES docs_files(path) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_docs_chunks_path ON docs_chunks(path);
-- NOTE: deliberately a regular (content-storing) FTS5 table. The previous
-- contentless form (content='') returns NULL for path/heading/content on
-- SELECT — breaking the search JOIN on the path column — and does not
-- support DELETE, breaking clear_docs_index() / per-file re-index.
CREATE VIRTUAL TABLE IF NOT EXISTS docs_chunks_fts USING fts5(
    chunk_id UNINDEXED,
    path,
    heading,
    content
);
-- ── Graph Intelligence (Hygiene + Reflection) ──────────────────────────────
-- These ADD COLUMN statements are idempotent (IF NOT EXISTS requires SQLite 3.37+).
-- On older SQLite they fail silently — init_db() wraps them in a separate try block.

View File

@@ -0,0 +1,76 @@
"""
Docs index builder — scan docs/**/*.md, chunk, and populate docs_files / docs_chunks / docs_chunks_fts.
Run manually: python -m app.docs_index --rebuild
Or from repo root: SOFIIA_DOCS_ROOT=./docs python -m app.docs_index --rebuild
"""
from __future__ import annotations
import asyncio
import logging
import os
import sys
from pathlib import Path
from typing import Optional
from . import db as _db
from .docs_store import clear_docs_index, insert_docs_file
logger = logging.getLogger(__name__)
def get_docs_root() -> Path:
    """Resolve the docs root: SOFIIA_DOCS_ROOT env var if set, else <repo>/docs.

    sofiia-console lives at repo/services/sofiia-console, so the repo root is
    four parents up from this file (app/docs_index.py -> app -> sofiia-console
    -> services -> repo).
    """
    override = os.getenv("SOFIIA_DOCS_ROOT", "").strip()
    if override:
        return Path(override).resolve()
    repo_root = Path(__file__).resolve().parent.parent.parent.parent
    return (repo_root / "docs").resolve()
async def rebuild_index(docs_root: Optional[Path] = None) -> int:
    """Clear the docs index and re-ingest every **/*.md under docs_root.

    Returns the number of files indexed; 0 when the root directory is missing.
    Unreadable files are skipped with a warning rather than aborting the run.
    """
    root = docs_root or get_docs_root()
    if not root.is_dir():
        logger.warning("Docs root not found: %s", root)
        return 0
    await _db.init_db()
    await clear_docs_index()
    indexed = 0
    for md_file in sorted(root.rglob("*.md")):
        try:
            text = md_file.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            logger.warning("Skip %s: %s", md_file, e)
            continue
        # Stable IDs: keep paths relative to the docs root, '/'-separated.
        try:
            rel = md_file.relative_to(root)
        except ValueError:
            rel = md_file.name
        key = str(rel).replace("\\", "/")
        await insert_docs_file(key, md_file.stat().st_mtime, text)
        indexed += 1
    logger.info("Docs index rebuilt: %s files from %s", indexed, root)
    return indexed
def main() -> int:
    """CLI entry point: `python -m app.docs_index --rebuild [--docs-root DIR]`.

    Returns a process exit code (0 on success, 1 when --rebuild is missing).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Rebuild docs FTS index")
    parser.add_argument("--rebuild", action="store_true", help="Clear and rebuild index")
    parser.add_argument("--docs-root", type=str, default=None, help="Override docs directory")
    opts = parser.parse_args()
    if not opts.rebuild:
        print("Use --rebuild to rebuild index.", file=sys.stderr)
        return 1
    logging.basicConfig(level=logging.INFO)
    override = Path(opts.docs_root).resolve() if opts.docs_root else None
    total = asyncio.run(rebuild_index(override))
    print(f"Indexed {total} files.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,221 @@
"""
Docs index store — SQLite tables docs_files, docs_chunks, docs_chunks_fts (FTS5).
Read-only API: search, preview, raw. Index build in docs_index.py.
"""
from __future__ import annotations
import hashlib
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from . import db as _db
logger = logging.getLogger(__name__)
# Max snippet length per fragment
_SNIPPET_LEN = 120
_CHUNK_SIZE = 3500 # ~24KB target
def _doc_type_from_path(path: str) -> str:
p = path.replace("\\", "/")
if "/runbook/" in p or p.startswith("runbook/"):
return "runbook"
if "/release/" in p or p.startswith("release/"):
return "release"
return "spec" if "/docs/" in p or p.startswith("docs/") else "misc"
def _extract_title(content: str, path: str) -> str:
"""First # heading or filename."""
for line in content.splitlines():
line = line.strip()
if line.startswith("# "):
return line[2:].strip()[:200]
return Path(path).stem.replace("-", " ").replace("_", " ").title()[:200]
async def clear_docs_index() -> None:
    """Delete every row from the FTS mirror, chunks, and files tables (in that order)."""
    conn = await _db.get_db()
    for table in ("docs_chunks_fts", "docs_chunks", "docs_files"):
        await conn.execute(f"DELETE FROM {table}")
    await conn.commit()
    logger.info("Docs index cleared.")
async def insert_docs_file(path: str, mtime: float, content: str) -> None:
    """Register one file and its chunks. Caller ensures path is normalized.

    Upserts the docs_files row, drops any stale chunks/FTS rows for the path,
    then inserts one docs_chunks + docs_chunks_fts row per content chunk.
    """
    conn = await _db.get_db()
    digest = hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
    await conn.execute(
        "INSERT OR REPLACE INTO docs_files(path, mtime, sha, title, doc_type) VALUES (?,?,?,?,?)",
        (path, mtime, digest, _extract_title(content, path), _doc_type_from_path(path)),
    )
    # Remove stale rows for this path before re-inserting fresh chunks.
    await conn.execute("DELETE FROM docs_chunks WHERE path = ?", (path,))
    await conn.execute("DELETE FROM docs_chunks_fts WHERE path = ?", (path,))
    await conn.commit()
    for idx, (heading, text) in enumerate(_chunk_content(content, path)):
        cid = f"{path}:{idx}"
        await conn.execute(
            "INSERT INTO docs_chunks(id, path, heading, chunk_index, content) VALUES (?,?,?,?,?)",
            (cid, path, heading, idx, text),
        )
        await conn.execute(
            "INSERT INTO docs_chunks_fts(chunk_id, path, heading, content) VALUES (?,?,?,?)",
            (cid, path, heading, text),
        )
    await conn.commit()
def _chunk_content(content: str, path: str) -> List[tuple]:
"""Split by headers, then by size. Returns [(heading, text), ...]."""
sections: List[tuple] = []
current_heading = ""
current_lines: List[str] = []
current_size = 0
for line in content.splitlines():
stripped = line.strip()
if stripped.startswith("# "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[2:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
elif stripped.startswith("## "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[3:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
elif stripped.startswith("### "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[4:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
else:
current_lines.append(line)
current_size += len(line) + 1
if current_size >= _CHUNK_SIZE:
sections.append((current_heading, "\n".join(current_lines)))
current_lines = []
current_size = 0
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
return sections
async def search_docs(
    q: str,
    doc_type: Optional[str] = None,
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """FTS5 search over docs_chunks_fts with a LIKE fallback.

    Args:
        q: User query; double quotes are stripped to keep FTS5 syntax intact.
        doc_type: Optional filter joined against docs_files.doc_type.
        limit: Max rows fetched from SQL (results are then de-duped by path).

    Returns:
        [{path, title, snippet, score}] best-first. Falls back to
        _search_docs_like when the FTS query errors (FTS5 unavailable) or
        yields nothing.
    """
    conn = await _db.get_db()
    # Sanitize: drop double quotes so input cannot break FTS5 phrase syntax.
    q_clean = q.strip().replace("\"", " ")[:200]
    if not q_clean:
        return []
    type_filter = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [q_clean]
    if doc_type:
        params.append(doc_type)
    params.append(limit)
    try:
        # Bug fix: the table was previously aliased (AS fts) while MATCH and
        # snippet() still referenced the original name — SQLite rejects that,
        # so every call fell through to the LIKE fallback. Use the real table
        # name throughout and ORDER BY rank for relevance-first results.
        # snippet(table, col_idx, left, right, ellipsis, max_tokens); col 3 = content.
        async with conn.execute(
            f"""
            SELECT f.path, f.title, snippet(docs_chunks_fts, 3, '**', '**', '...', {_SNIPPET_LEN // 5}) AS snippet
            FROM docs_chunks_fts
            JOIN docs_files f ON f.path = docs_chunks_fts.path
            WHERE docs_chunks_fts MATCH ? {type_filter}
            ORDER BY rank
            LIMIT ?
            """,
            params,
        ) as cur:
            rows = await cur.fetchall()
    except Exception as e:
        logger.warning("FTS search failed, fallback to LIKE: %s", e)
        return await _search_docs_like(q_clean, doc_type, limit)
    # De-duplicate by path (one hit per file), keeping the best-ranked snippet;
    # mirrors the DISTINCT behavior of the LIKE fallback.
    seen: set = set()
    result: List[Dict[str, Any]] = []
    for r in rows or []:
        if r[0] in seen:
            continue
        seen.add(r[0])
        result.append({"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 1.0})
    if not result:
        return await _search_docs_like(q_clean, doc_type, limit)
    return result
async def _search_docs_like(
    q: str,
    doc_type: Optional[str],
    limit: int,
) -> List[Dict[str, Any]]:
    """LIKE-based fallback used when FTS5 is unavailable or errors out."""
    conn = await _db.get_db()
    type_sql = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [f"%{q}%"]
    if doc_type:
        params.append(doc_type)
    params.append(limit)
    sql = f"""
        SELECT DISTINCT f.path, f.title, substr(c.content, 1, {_SNIPPET_LEN}) AS snippet
        FROM docs_chunks c
        JOIN docs_files f ON f.path = c.path
        WHERE c.content LIKE ? {type_sql}
        LIMIT ?
        """
    async with conn.execute(sql, params) as cur:
        rows = await cur.fetchall()
    hits: List[Dict[str, Any]] = []
    for path_val, title, snippet in (rows or []):
        hits.append({
            "path": path_val,
            "title": title or "",
            "snippet": (snippet or "").strip(),
            "score": 0.5,
        })
    return hits
async def get_preview(path: str) -> Optional[Dict[str, Any]]:
    """Return {path, title, sections:[{heading, excerpt}]}, or None if the
    path is not in the index. Excerpts are capped at 400 chars per chunk."""
    conn = await _db.get_db()
    async with conn.execute("SELECT path, title FROM docs_files WHERE path = ?", (path,)) as cur:
        file_row = await cur.fetchone()
    if not file_row:
        return None
    async with conn.execute(
        "SELECT heading, content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        chunk_rows = await cur.fetchall()
    sections = [
        {"heading": head or "", "excerpt": (body or "")[:400].strip()}
        for head, body in (chunk_rows or [])
    ]
    return {"path": file_row[0], "title": file_row[1] or "", "sections": sections}
async def get_raw(path: str) -> Optional[str]:
    """Return the document content reconstructed from its chunks, or None if
    the path is not indexed (best-effort: this rebuilds from the chunk table,
    not the original file on disk).
    """
    conn = await _db.get_db()
    async with conn.execute(
        "SELECT content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        rows = await cur.fetchall()
    if not rows:
        return None
    # Bug fix: chunks are split on line boundaries, so a single "\n" restores
    # the original line structure; the previous "\n\n" injected a blank line
    # between every pair of chunks that the source document never had.
    return "\n".join(r[0] or "" for r in rows)

View File

@@ -56,6 +56,7 @@ from .nodes import get_nodes_dashboard
from .monitor import collect_all_nodes
from .ops import run_ops_action, OPS_ACTIONS
from .docs_router import docs_router
from .runbooks_router import runbooks_router
from . import db as _app_db
from .metrics import (
SOFIIA_SEND_REQUESTS_TOTAL,
@@ -462,6 +463,8 @@ app.add_middleware(
# Projects + Documents + Sessions + Dialog Map API
app.include_router(docs_router)
# Runbooks / docs index (read-only search & preview, PR1.1)
app.include_router(runbooks_router)
# ── WebSocket event bus ───────────────────────────────────────────────────────
_ws_clients: Set[WebSocket] = set()

View File

@@ -0,0 +1,64 @@
"""
Runbooks / docs index API — read-only search and preview (PR1.1).
GET /api/runbooks/search, /api/runbooks/preview, /api/runbooks/raw.
"""
from __future__ import annotations
import re
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from .auth import require_auth
from . import docs_store as store
runbooks_router = APIRouter(prefix="/api/runbooks", tags=["runbooks-docs"])
def _safe_path(path: str) -> bool:
"""Reject path traversal and non-docs paths."""
if not path or ".." in path or path.startswith("/"):
return False
norm = path.replace("\\", "/").strip()
return bool(re.match(r"^(docs/|runbook/|release/)?[\w/\-.]+\.md$", norm, re.I))
@runbooks_router.get("/search")
async def runbooks_search(
    q: str = Query(..., min_length=1, max_length=200),
    doc_type: Optional[str] = Query(None, description="runbook | release | spec"),
    limit: int = Query(10, ge=1, le=50),
    _auth: str = Depends(require_auth),
):
    """Full-text search over indexed docs; each hit carries path, title, snippet."""
    hits = await store.search_docs(q=q, doc_type=doc_type, limit=limit)
    return {"items": hits}
@runbooks_router.get("/preview")
async def runbooks_preview(
    path: str = Query(..., description="Relative path, e.g. runbook/rehearsal-v1-30min-checklist.md"),
    _auth: str = Depends(require_auth),
):
    """Return a doc's path, title, and per-section heading + excerpt."""
    # Guard first: reject traversal / non-.md paths before touching the store.
    if not _safe_path(path):
        raise HTTPException(status_code=400, detail="Invalid path")
    preview = await store.get_preview(path)
    if preview is None:
        raise HTTPException(status_code=404, detail="Not found")
    return preview
@runbooks_router.get("/raw")
async def runbooks_raw(
    path: str = Query(..., description="Relative path to markdown file"),
    _auth: str = Depends(require_auth),
):
    """Serve the raw markdown for an indexed doc (read-only)."""
    if not _safe_path(path):
        raise HTTPException(status_code=400, detail="Invalid path")
    body = await store.get_raw(path)
    if body is None:
        raise HTTPException(status_code=404, detail="Not found")
    # Imported lazily, mirroring the original module's style.
    from fastapi.responses import PlainTextResponse
    return PlainTextResponse(body, media_type="text/markdown; charset=utf-8")

View File

@@ -0,0 +1,95 @@
"""
Tests for runbooks/docs search API (PR1.1): search and preview.
Uses tmp docs dir and rebuild_index; no network.
"""
from __future__ import annotations
import asyncio
from pathlib import Path
import httpx
import pytest
from httpx import ASGITransport
@pytest.fixture
def tmp_docs_with_rehearsal(tmp_path):
    """Lay out tmp_path/docs/runbook/<rehearsal checklist>.md; return the docs root."""
    docs_root = tmp_path / "docs"
    checklist = docs_root / "runbook" / "rehearsal-v1-30min-checklist.md"
    checklist.parent.mkdir(parents=True)
    checklist.write_text(
        "# Rehearsal v1 — 30-minute execution plan\n\n"
        "## Preflight\n\n"
        "Run STRICT=1 bash ops/preflight_sofiia_console.sh\n\n"
        "## Smoke\n\n"
        "Idempotency and audit auth checks.",
        encoding="utf-8",
    )
    return docs_root
def test_runbooks_search_finds_rehearsal(sofiia_module, tmp_path, tmp_docs_with_rehearsal, monkeypatch):
    """Search for 'rehearsal' returns the checklist path and snippet."""
    import app.docs_index as docs_index_mod
    import app.docs_store as docs_store_mod
    # Isolate the index under tmp_path so the test never touches real data.
    monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data"))

    async def run():
        # Rebuild and query inside one coroutine so both share the same loop
        # (and therefore the same cached DB connection).
        await docs_index_mod.rebuild_index(tmp_docs_with_rehearsal)
        return await docs_store_mod.search_docs("rehearsal", limit=5)

    # Fix: asyncio.get_event_loop() outside a running loop is deprecated
    # (implicit loop creation is removed in modern Python); asyncio.run()
    # gives the same single-loop semantics for this self-contained test.
    items = asyncio.run(run())
    assert len(items) >= 1, "search_docs should return at least one hit for 'rehearsal'"
    paths = [x["path"] for x in items]
    assert any("rehearsal" in p for p in paths), f"Expected path containing 'rehearsal', got {paths}"
    first = items[0]
    assert "path" in first and "title" in first and "snippet" in first
def test_runbooks_preview_returns_headings(sofiia_module, sofiia_client, tmp_path, tmp_docs_with_rehearsal, monkeypatch):
    """Preview returns path, title, sections with heading and excerpt."""
    import app.docs_index as docs_index_mod
    # Point the app at an isolated data dir so the index lives under tmp_path.
    monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data"))
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated (implicit creation removed in newer Pythons). Kept as-is
    # because sofiia_client may rely on a DB connection bound to this loop —
    # confirm before migrating to asyncio.run().
    loop = asyncio.get_event_loop()
    loop.run_until_complete(docs_index_mod.rebuild_index(tmp_docs_with_rehearsal))
    # Preview the file indexed by the fixture; path is relative to the docs root.
    r = sofiia_client.get("/api/runbooks/preview?path=runbook/rehearsal-v1-30min-checklist.md")
    assert r.status_code == 200, r.text
    data = r.json()
    assert data["path"] == "runbook/rehearsal-v1-30min-checklist.md"
    assert "Rehearsal" in (data.get("title") or "")
    assert "sections" in data
    assert len(data["sections"]) >= 1
    # "Preflight" is an H2 in the fixture file, so it must appear as a section heading.
    assert any("Preflight" in (s.get("heading") or "") for s in data["sections"])
def test_runbooks_search_filter_doc_type(sofiia_module, sofiia_client, tmp_path, tmp_docs_with_rehearsal, monkeypatch):
    """Search with doc_type=runbook returns only runbook paths."""
    import app.docs_index as docs_index_mod
    # Isolate the index under tmp_path for this test run.
    monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data"))
    # NOTE(review): deprecated asyncio.get_event_loop() pattern — see the
    # preview test; kept unchanged pending confirmation of loop/connection
    # coupling with sofiia_client.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(docs_index_mod.rebuild_index(tmp_docs_with_rehearsal))
    r = sofiia_client.get("/api/runbooks/search?q=rehearsal&doc_type=runbook&limit=5")
    assert r.status_code == 200, r.text
    # Every returned item must be classified as a runbook (empty result also passes).
    for item in r.json().get("items", []):
        assert "runbook" in item["path"] or item["path"].startswith("runbook/")
def test_runbooks_preview_404_for_unknown_path(sofiia_client):
    """A well-formed path that was never indexed yields 404 from preview."""
    resp = sofiia_client.get("/api/runbooks/preview?path=runbook/nonexistent-file.md")
    assert resp.status_code == 404
def test_runbooks_raw_400_for_invalid_path(sofiia_client):
    """A traversal attempt ('../') is rejected with 400 by the raw endpoint."""
    resp = sofiia_client.get("/api/runbooks/raw?path=../../../etc/passwd")
    assert resp.status_code == 400