feat(sofiia-console): add docs index and runbook search API (FTS5)

adds SQLite docs index (files/chunks + FTS5) and CLI rebuild

exposes authenticated runbook search/preview/raw endpoints

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 04:26:34 -08:00
parent bddb6cd75a
commit ef3ff80645
6 changed files with 484 additions and 0 deletions

View File

@@ -350,6 +350,31 @@ CREATE INDEX IF NOT EXISTS idx_audit_operator_ts ON audit_events(operator_id, ts
CREATE INDEX IF NOT EXISTS idx_audit_chat_ts ON audit_events(chat_id, ts DESC);
CREATE INDEX IF NOT EXISTS idx_audit_event_ts ON audit_events(event, ts DESC);
-- ── Docs index (runbooks/release FTS, PR1.1) ─────────────────────────────────
CREATE TABLE IF NOT EXISTS docs_files (
    path TEXT PRIMARY KEY,
    mtime REAL NOT NULL,
    sha TEXT NOT NULL,
    title TEXT DEFAULT '',
    doc_type TEXT NOT NULL DEFAULT 'misc'
);
CREATE TABLE IF NOT EXISTS docs_chunks (
    id TEXT PRIMARY KEY,
    path TEXT NOT NULL,
    heading TEXT NOT NULL DEFAULT '',
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    FOREIGN KEY (path) REFERENCES docs_files(path) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_docs_chunks_path ON docs_chunks(path);
-- NOTE: deliberately a regular (content-storing) FTS5 table. The previous
-- contentless form (content='') returns NULL for path/heading/content on
-- SELECT — breaking the search JOIN on the path column — and does not
-- support DELETE, breaking clear_docs_index() / per-file re-index.
CREATE VIRTUAL TABLE IF NOT EXISTS docs_chunks_fts USING fts5(
    chunk_id UNINDEXED,
    path,
    heading,
    content
);
-- ── Graph Intelligence (Hygiene + Reflection) ──────────────────────────────
-- These ADD COLUMN statements are idempotent (IF NOT EXISTS requires SQLite 3.37+).
-- On older SQLite they fail silently — init_db() wraps them in a separate try block.

View File

@@ -0,0 +1,76 @@
"""
Docs index builder — scan docs/**/*.md, chunk, and populate docs_files / docs_chunks / docs_chunks_fts.
Run manually: python -m app.docs_index --rebuild
Or from repo root: SOFIIA_DOCS_ROOT=./docs python -m app.docs_index --rebuild
"""
from __future__ import annotations
import asyncio
import logging
import os
import sys
from pathlib import Path
from typing import Optional
from . import db as _db
from .docs_store import clear_docs_index, insert_docs_file
logger = logging.getLogger(__name__)
def get_docs_root() -> Path:
    """Resolve the docs root: SOFIIA_DOCS_ROOT env var if set, else <repo>/docs.

    sofiia-console lives at repo/services/sofiia-console, so the repo root is
    four parents up from this file (app/docs_index.py -> app -> sofiia-console
    -> services -> repo).
    """
    override = os.getenv("SOFIIA_DOCS_ROOT", "").strip()
    if override:
        return Path(override).resolve()
    repo_root = Path(__file__).resolve().parent.parent.parent.parent
    return (repo_root / "docs").resolve()
async def rebuild_index(docs_root: Optional[Path] = None) -> int:
    """Clear the docs index and re-ingest every **/*.md under docs_root.

    Returns the number of files indexed; 0 when the root directory is missing.
    Unreadable files are skipped with a warning rather than aborting the run.
    """
    root = docs_root or get_docs_root()
    if not root.is_dir():
        logger.warning("Docs root not found: %s", root)
        return 0
    await _db.init_db()
    await clear_docs_index()
    indexed = 0
    for md_file in sorted(root.rglob("*.md")):
        try:
            text = md_file.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            logger.warning("Skip %s: %s", md_file, e)
            continue
        # Stable IDs: keep paths relative to the docs root, '/'-separated.
        try:
            rel = md_file.relative_to(root)
        except ValueError:
            rel = md_file.name
        key = str(rel).replace("\\", "/")
        await insert_docs_file(key, md_file.stat().st_mtime, text)
        indexed += 1
    logger.info("Docs index rebuilt: %s files from %s", indexed, root)
    return indexed
def main() -> int:
    """CLI entry point: `python -m app.docs_index --rebuild [--docs-root DIR]`.

    Returns a process exit code (0 on success, 1 when --rebuild is missing).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Rebuild docs FTS index")
    parser.add_argument("--rebuild", action="store_true", help="Clear and rebuild index")
    parser.add_argument("--docs-root", type=str, default=None, help="Override docs directory")
    opts = parser.parse_args()
    if not opts.rebuild:
        print("Use --rebuild to rebuild index.", file=sys.stderr)
        return 1
    logging.basicConfig(level=logging.INFO)
    override = Path(opts.docs_root).resolve() if opts.docs_root else None
    total = asyncio.run(rebuild_index(override))
    print(f"Indexed {total} files.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,221 @@
"""
Docs index store — SQLite tables docs_files, docs_chunks, docs_chunks_fts (FTS5).
Read-only API: search, preview, raw. Index build in docs_index.py.
"""
from __future__ import annotations
import hashlib
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from . import db as _db
logger = logging.getLogger(__name__)
# Max snippet length per fragment
_SNIPPET_LEN = 120
_CHUNK_SIZE = 3500 # ~24KB target
def _doc_type_from_path(path: str) -> str:
p = path.replace("\\", "/")
if "/runbook/" in p or p.startswith("runbook/"):
return "runbook"
if "/release/" in p or p.startswith("release/"):
return "release"
return "spec" if "/docs/" in p or p.startswith("docs/") else "misc"
def _extract_title(content: str, path: str) -> str:
"""First # heading or filename."""
for line in content.splitlines():
line = line.strip()
if line.startswith("# "):
return line[2:].strip()[:200]
return Path(path).stem.replace("-", " ").replace("_", " ").title()[:200]
async def clear_docs_index() -> None:
    """Delete every row from the FTS mirror, chunks, and files tables (in that order)."""
    conn = await _db.get_db()
    for table in ("docs_chunks_fts", "docs_chunks", "docs_files"):
        await conn.execute(f"DELETE FROM {table}")
    await conn.commit()
    logger.info("Docs index cleared.")
async def insert_docs_file(path: str, mtime: float, content: str) -> None:
    """Register one file and its chunks. Caller ensures path is normalized.

    Upserts the docs_files row, drops any stale chunks/FTS rows for the path,
    then inserts one docs_chunks + docs_chunks_fts row per content chunk.
    """
    conn = await _db.get_db()
    digest = hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
    await conn.execute(
        "INSERT OR REPLACE INTO docs_files(path, mtime, sha, title, doc_type) VALUES (?,?,?,?,?)",
        (path, mtime, digest, _extract_title(content, path), _doc_type_from_path(path)),
    )
    # Remove stale rows for this path before re-inserting fresh chunks.
    await conn.execute("DELETE FROM docs_chunks WHERE path = ?", (path,))
    await conn.execute("DELETE FROM docs_chunks_fts WHERE path = ?", (path,))
    await conn.commit()
    for idx, (heading, text) in enumerate(_chunk_content(content, path)):
        cid = f"{path}:{idx}"
        await conn.execute(
            "INSERT INTO docs_chunks(id, path, heading, chunk_index, content) VALUES (?,?,?,?,?)",
            (cid, path, heading, idx, text),
        )
        await conn.execute(
            "INSERT INTO docs_chunks_fts(chunk_id, path, heading, content) VALUES (?,?,?,?)",
            (cid, path, heading, text),
        )
    await conn.commit()
def _chunk_content(content: str, path: str) -> List[tuple]:
"""Split by headers, then by size. Returns [(heading, text), ...]."""
sections: List[tuple] = []
current_heading = ""
current_lines: List[str] = []
current_size = 0
for line in content.splitlines():
stripped = line.strip()
if stripped.startswith("# "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[2:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
elif stripped.startswith("## "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[3:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
elif stripped.startswith("### "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[4:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
else:
current_lines.append(line)
current_size += len(line) + 1
if current_size >= _CHUNK_SIZE:
sections.append((current_heading, "\n".join(current_lines)))
current_lines = []
current_size = 0
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
return sections
async def search_docs(
    q: str,
    doc_type: Optional[str] = None,
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """FTS5 search over docs_chunks_fts with a LIKE fallback.

    Args:
        q: User query; double quotes are stripped to keep FTS5 syntax intact.
        doc_type: Optional filter joined against docs_files.doc_type.
        limit: Max rows fetched from SQL (results are then de-duped by path).

    Returns:
        [{path, title, snippet, score}] best-first. Falls back to
        _search_docs_like when the FTS query errors (FTS5 unavailable) or
        yields nothing.
    """
    conn = await _db.get_db()
    # Sanitize: drop double quotes so input cannot break FTS5 phrase syntax.
    q_clean = q.strip().replace("\"", " ")[:200]
    if not q_clean:
        return []
    type_filter = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [q_clean]
    if doc_type:
        params.append(doc_type)
    params.append(limit)
    try:
        # Bug fix: the table was previously aliased (AS fts) while MATCH and
        # snippet() still referenced the original name — SQLite rejects that,
        # so every call fell through to the LIKE fallback. Use the real table
        # name throughout and ORDER BY rank for relevance-first results.
        # snippet(table, col_idx, left, right, ellipsis, max_tokens); col 3 = content.
        async with conn.execute(
            f"""
            SELECT f.path, f.title, snippet(docs_chunks_fts, 3, '**', '**', '...', {_SNIPPET_LEN // 5}) AS snippet
            FROM docs_chunks_fts
            JOIN docs_files f ON f.path = docs_chunks_fts.path
            WHERE docs_chunks_fts MATCH ? {type_filter}
            ORDER BY rank
            LIMIT ?
            """,
            params,
        ) as cur:
            rows = await cur.fetchall()
    except Exception as e:
        logger.warning("FTS search failed, fallback to LIKE: %s", e)
        return await _search_docs_like(q_clean, doc_type, limit)
    # De-duplicate by path (one hit per file), keeping the best-ranked snippet;
    # mirrors the DISTINCT behavior of the LIKE fallback.
    seen: set = set()
    result: List[Dict[str, Any]] = []
    for r in rows or []:
        if r[0] in seen:
            continue
        seen.add(r[0])
        result.append({"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 1.0})
    if not result:
        return await _search_docs_like(q_clean, doc_type, limit)
    return result
async def _search_docs_like(
    q: str,
    doc_type: Optional[str],
    limit: int,
) -> List[Dict[str, Any]]:
    """LIKE-based fallback used when FTS5 is unavailable or errors out."""
    conn = await _db.get_db()
    type_sql = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [f"%{q}%"]
    if doc_type:
        params.append(doc_type)
    params.append(limit)
    sql = f"""
        SELECT DISTINCT f.path, f.title, substr(c.content, 1, {_SNIPPET_LEN}) AS snippet
        FROM docs_chunks c
        JOIN docs_files f ON f.path = c.path
        WHERE c.content LIKE ? {type_sql}
        LIMIT ?
        """
    async with conn.execute(sql, params) as cur:
        rows = await cur.fetchall()
    hits: List[Dict[str, Any]] = []
    for path_val, title, snippet in (rows or []):
        hits.append({
            "path": path_val,
            "title": title or "",
            "snippet": (snippet or "").strip(),
            "score": 0.5,
        })
    return hits
async def get_preview(path: str) -> Optional[Dict[str, Any]]:
    """Return {path, title, sections:[{heading, excerpt}]}, or None if the
    path is not in the index. Excerpts are capped at 400 chars per chunk."""
    conn = await _db.get_db()
    async with conn.execute("SELECT path, title FROM docs_files WHERE path = ?", (path,)) as cur:
        file_row = await cur.fetchone()
    if not file_row:
        return None
    async with conn.execute(
        "SELECT heading, content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        chunk_rows = await cur.fetchall()
    sections = [
        {"heading": head or "", "excerpt": (body or "")[:400].strip()}
        for head, body in (chunk_rows or [])
    ]
    return {"path": file_row[0], "title": file_row[1] or "", "sections": sections}
async def get_raw(path: str) -> Optional[str]:
    """Return the document content reconstructed from its chunks, or None if
    the path is not indexed (best-effort: this rebuilds from the chunk table,
    not the original file on disk).
    """
    conn = await _db.get_db()
    async with conn.execute(
        "SELECT content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        rows = await cur.fetchall()
    if not rows:
        return None
    # Bug fix: chunks are split on line boundaries, so a single "\n" restores
    # the original line structure; the previous "\n\n" injected a blank line
    # between every pair of chunks that the source document never had.
    return "\n".join(r[0] or "" for r in rows)

View File

@@ -56,6 +56,7 @@ from .nodes import get_nodes_dashboard
from .monitor import collect_all_nodes
from .ops import run_ops_action, OPS_ACTIONS
from .docs_router import docs_router
from .runbooks_router import runbooks_router
from . import db as _app_db
from .metrics import (
SOFIIA_SEND_REQUESTS_TOTAL,
@@ -462,6 +463,8 @@ app.add_middleware(
# Projects + Documents + Sessions + Dialog Map API
app.include_router(docs_router)
# Runbooks / docs index (read-only search & preview, PR1.1)
app.include_router(runbooks_router)
# ── WebSocket event bus ───────────────────────────────────────────────────────
_ws_clients: Set[WebSocket] = set()

View File

@@ -0,0 +1,64 @@
"""
Runbooks / docs index API — read-only search and preview (PR1.1).
GET /api/runbooks/search, /api/runbooks/preview, /api/runbooks/raw.
"""
from __future__ import annotations
import re
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from .auth import require_auth
from . import docs_store as store
runbooks_router = APIRouter(prefix="/api/runbooks", tags=["runbooks-docs"])
def _safe_path(path: str) -> bool:
"""Reject path traversal and non-docs paths."""
if not path or ".." in path or path.startswith("/"):
return False
norm = path.replace("\\", "/").strip()
return bool(re.match(r"^(docs/|runbook/|release/)?[\w/\-.]+\.md$", norm, re.I))
@runbooks_router.get("/search")
async def runbooks_search(
    q: str = Query(..., min_length=1, max_length=200),
    doc_type: Optional[str] = Query(None, description="runbook | release | spec"),
    limit: int = Query(10, ge=1, le=50),
    _auth: str = Depends(require_auth),
):
    """Full-text search over indexed docs; each hit carries path, title, snippet."""
    hits = await store.search_docs(q=q, doc_type=doc_type, limit=limit)
    return {"items": hits}
@runbooks_router.get("/preview")
async def runbooks_preview(
    path: str = Query(..., description="Relative path, e.g. runbook/rehearsal-v1-30min-checklist.md"),
    _auth: str = Depends(require_auth),
):
    """Return a doc's path, title, and per-section heading + excerpt."""
    # Guard first: reject traversal / non-.md paths before touching the store.
    if not _safe_path(path):
        raise HTTPException(status_code=400, detail="Invalid path")
    preview = await store.get_preview(path)
    if preview is None:
        raise HTTPException(status_code=404, detail="Not found")
    return preview
@runbooks_router.get("/raw")
async def runbooks_raw(
    path: str = Query(..., description="Relative path to markdown file"),
    _auth: str = Depends(require_auth),
):
    """Serve the raw markdown for an indexed doc (read-only)."""
    if not _safe_path(path):
        raise HTTPException(status_code=400, detail="Invalid path")
    body = await store.get_raw(path)
    if body is None:
        raise HTTPException(status_code=404, detail="Not found")
    # Imported lazily, mirroring the original module's style.
    from fastapi.responses import PlainTextResponse
    return PlainTextResponse(body, media_type="text/markdown; charset=utf-8")

View File

@@ -0,0 +1,95 @@
"""
Tests for runbooks/docs search API (PR1.1): search and preview.
Uses tmp docs dir and rebuild_index; no network.
"""
from __future__ import annotations
import asyncio
from pathlib import Path
import httpx
import pytest
from httpx import ASGITransport
@pytest.fixture
def tmp_docs_with_rehearsal(tmp_path):
    """Lay out tmp_path/docs/runbook/<rehearsal checklist>.md; return the docs root."""
    docs_root = tmp_path / "docs"
    checklist = docs_root / "runbook" / "rehearsal-v1-30min-checklist.md"
    checklist.parent.mkdir(parents=True)
    checklist.write_text(
        "# Rehearsal v1 — 30-minute execution plan\n\n"
        "## Preflight\n\n"
        "Run STRICT=1 bash ops/preflight_sofiia_console.sh\n\n"
        "## Smoke\n\n"
        "Idempotency and audit auth checks.",
        encoding="utf-8",
    )
    return docs_root
def test_runbooks_search_finds_rehearsal(sofiia_module, tmp_path, tmp_docs_with_rehearsal, monkeypatch):
    """Search for 'rehearsal' returns the checklist path and snippet."""
    import app.docs_index as docs_index_mod
    import app.docs_store as docs_store_mod
    # Isolate the index under tmp_path so the test never touches real data.
    monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data"))

    async def run():
        # Rebuild and query inside one coroutine so both share the same loop
        # (and therefore the same cached DB connection).
        await docs_index_mod.rebuild_index(tmp_docs_with_rehearsal)
        return await docs_store_mod.search_docs("rehearsal", limit=5)

    # Fix: asyncio.get_event_loop() outside a running loop is deprecated
    # (implicit loop creation is removed in modern Python); asyncio.run()
    # gives the same single-loop semantics for this self-contained test.
    items = asyncio.run(run())
    assert len(items) >= 1, "search_docs should return at least one hit for 'rehearsal'"
    paths = [x["path"] for x in items]
    assert any("rehearsal" in p for p in paths), f"Expected path containing 'rehearsal', got {paths}"
    first = items[0]
    assert "path" in first and "title" in first and "snippet" in first
def test_runbooks_preview_returns_headings(sofiia_module, sofiia_client, tmp_path, tmp_docs_with_rehearsal, monkeypatch):
    """Preview returns path, title, sections with heading and excerpt."""
    import app.docs_index as docs_index_mod
    # Point the app at an isolated data dir so the index lives under tmp_path.
    monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data"))
    # NOTE(review): asyncio.get_event_loop() outside a running loop is
    # deprecated (implicit creation removed in newer Pythons). Kept as-is
    # because sofiia_client may rely on a DB connection bound to this loop —
    # confirm before migrating to asyncio.run().
    loop = asyncio.get_event_loop()
    loop.run_until_complete(docs_index_mod.rebuild_index(tmp_docs_with_rehearsal))
    # Preview the file indexed by the fixture; path is relative to the docs root.
    r = sofiia_client.get("/api/runbooks/preview?path=runbook/rehearsal-v1-30min-checklist.md")
    assert r.status_code == 200, r.text
    data = r.json()
    assert data["path"] == "runbook/rehearsal-v1-30min-checklist.md"
    assert "Rehearsal" in (data.get("title") or "")
    assert "sections" in data
    assert len(data["sections"]) >= 1
    # "Preflight" is an H2 in the fixture file, so it must appear as a section heading.
    assert any("Preflight" in (s.get("heading") or "") for s in data["sections"])
def test_runbooks_search_filter_doc_type(sofiia_module, sofiia_client, tmp_path, tmp_docs_with_rehearsal, monkeypatch):
    """Search with doc_type=runbook returns only runbook paths."""
    import app.docs_index as docs_index_mod
    # Isolate the index under tmp_path for this test run.
    monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data"))
    # NOTE(review): deprecated asyncio.get_event_loop() pattern — see the
    # preview test; kept unchanged pending confirmation of loop/connection
    # coupling with sofiia_client.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(docs_index_mod.rebuild_index(tmp_docs_with_rehearsal))
    r = sofiia_client.get("/api/runbooks/search?q=rehearsal&doc_type=runbook&limit=5")
    assert r.status_code == 200, r.text
    # Every returned item must be classified as a runbook (empty result also passes).
    for item in r.json().get("items", []):
        assert "runbook" in item["path"] or item["path"].startswith("runbook/")
def test_runbooks_preview_404_for_unknown_path(sofiia_client):
    """A well-formed path that was never indexed yields 404 from preview."""
    resp = sofiia_client.get("/api/runbooks/preview?path=runbook/nonexistent-file.md")
    assert resp.status_code == 404
def test_runbooks_raw_400_for_invalid_path(sofiia_client):
    """A traversal attempt ('../') is rejected with 400 by the raw endpoint."""
    resp = sofiia_client.get("/api/runbooks/raw?path=../../../etc/passwd")
    assert resp.status_code == 400