-- ── Docs index (runbooks/release FTS, PR1.1) ─────────────────────────────────
-- docs_files: one row per indexed markdown file (path is repo-relative,
-- e.g. "runbook/foo.md"). sha is a truncated sha256 of the content.
CREATE TABLE IF NOT EXISTS docs_files (
    path TEXT PRIMARY KEY,
    mtime REAL NOT NULL,
    sha TEXT NOT NULL,
    title TEXT DEFAULT '',
    doc_type TEXT NOT NULL DEFAULT 'misc'
);
-- docs_chunks: heading-bounded chunks of each file; id is "<path>:<index>".
CREATE TABLE IF NOT EXISTS docs_chunks (
    id TEXT PRIMARY KEY,
    path TEXT NOT NULL,
    heading TEXT NOT NULL DEFAULT '',
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    FOREIGN KEY (path) REFERENCES docs_files(path) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_docs_chunks_path ON docs_chunks(path);
-- FTS5 mirror of docs_chunks.
-- BUGFIX: the original declared content='' (a "contentless" FTS5 table).
-- Contentless FTS5 tables do not support plain DELETE/UPDATE statements and
-- return NULL for every non-rowid column, which broke:
--   * DELETE FROM docs_chunks_fts            (clear_docs_index)
--   * DELETE FROM docs_chunks_fts WHERE path (insert_docs_file re-index)
--   * JOIN docs_files f ON f.path = fts.path (search_docs — never matched,
--     forcing the LIKE fallback on every query)
-- Storing the (small) chunk text in the FTS table fixes all three.
CREATE VIRTUAL TABLE IF NOT EXISTS docs_chunks_fts USING fts5(
    chunk_id UNINDEXED,
    path,
    heading,
    content
);
"""
Docs index builder — scan docs/**/*.md, chunk, and populate docs_files / docs_chunks / docs_chunks_fts.

Run manually: python -m app.docs_index --rebuild
Or from repo root: SOFIIA_DOCS_ROOT=./docs python -m app.docs_index --rebuild
"""
from __future__ import annotations

import asyncio
import logging
import os
import sys
from pathlib import Path
from typing import Optional

from . import db as _db
from .docs_store import clear_docs_index, insert_docs_file

logger = logging.getLogger(__name__)


def get_docs_root() -> Path:
    """Resolve the docs directory: $SOFIIA_DOCS_ROOT if set, else <repo>/docs.

    The repo root is derived from this file's own location, so the default
    only works when the package layout is services/sofiia-console/app/.
    """
    env = os.getenv("SOFIIA_DOCS_ROOT", "").strip()
    if env:
        return Path(env).resolve()
    # app/docs_index.py -> app -> sofiia-console -> services -> repo
    repo = Path(__file__).resolve().parent.parent.parent.parent
    return (repo / "docs").resolve()


async def rebuild_index(docs_root: Optional[Path] = None) -> int:
    """Scan docs_root for **/*.md, clear the index, and insert every file.

    Returns the count of files indexed (0 when the root does not exist).
    This is a full rebuild — no incremental/mtime-based skipping.
    """
    root = docs_root or get_docs_root()
    if not root.is_dir():
        logger.warning("Docs root not found: %s", root)
        return 0
    await _db.init_db()  # ensure the docs tables exist before touching them
    await clear_docs_index()
    count = 0
    # sorted() => deterministic indexing order and stable chunk IDs
    for path in sorted(root.rglob("*.md")):
        try:
            # errors="replace": never abort the whole rebuild on one
            # mis-encoded document
            content = path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            logger.warning("Skip %s: %s", path, e)
            continue
        # Store path relative to repo (docs/runbook/...) for stable IDs
        try:
            rel = path.relative_to(root)
        except ValueError:
            # Path outside root (e.g. via symlink) — fall back to the bare
            # filename; str(rel) below accepts both Path and str.
            rel = path.name
        path_key = str(rel).replace("\\", "/")  # normalize Windows separators
        mtime = path.stat().st_mtime
        await insert_docs_file(path_key, mtime, content)
        count += 1
    logger.info("Docs index rebuilt: %s files from %s", count, root)
    return count


def main() -> int:
    """CLI entry point for `python -m app.docs_index`; returns an exit code."""
    import argparse
    p = argparse.ArgumentParser(description="Rebuild docs FTS index")
    p.add_argument("--rebuild", action="store_true", help="Clear and rebuild index")
    p.add_argument("--docs-root", type=str, default=None, help="Override docs directory")
    args = p.parse_args()
    if not args.rebuild:
        # --rebuild is required so an accidental bare invocation never wipes
        # the existing index
        print("Use --rebuild to rebuild index.", file=sys.stderr)
        return 1
    logging.basicConfig(level=logging.INFO)
    root = Path(args.docs_root).resolve() if args.docs_root else None
    n = asyncio.run(rebuild_index(root))
    print(f"Indexed {n} files.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
logger = logging.getLogger(__name__)

# Max characters of chunk content surfaced in a search snippet.
_SNIPPET_LEN = 120
# Soft cap on chunk size in characters (~2–4KB target per FTS row).
_CHUNK_SIZE = 3500


def _doc_type_from_path(path: str) -> str:
    """Classify a repo-relative path as runbook / release / spec / misc."""
    p = path.replace("\\", "/")
    if "/runbook/" in p or p.startswith("runbook/"):
        return "runbook"
    if "/release/" in p or p.startswith("release/"):
        return "release"
    return "spec" if "/docs/" in p or p.startswith("docs/") else "misc"


def _extract_title(content: str, path: str) -> str:
    """First level-1 markdown heading, else a title-cased filename stem."""
    for line in content.splitlines():
        line = line.strip()
        if line.startswith("# "):
            return line[2:].strip()[:200]
    return Path(path).stem.replace("-", " ").replace("_", " ").title()[:200]


async def clear_docs_index() -> None:
    """Remove all docs_files, docs_chunks, and FTS rows (full re-index prep)."""
    conn = await _db.get_db()
    await conn.execute("DELETE FROM docs_chunks_fts")
    await conn.execute("DELETE FROM docs_chunks")
    await conn.execute("DELETE FROM docs_files")
    await conn.commit()
    logger.info("Docs index cleared.")


async def insert_docs_file(path: str, mtime: float, content: str) -> None:
    """Register one file and its chunks. Caller ensures path is normalized.

    Upserts the docs_files row, removes any stale chunk/FTS rows for the
    path, then inserts one docs_chunks + docs_chunks_fts row per chunk.
    """
    conn = await _db.get_db()
    # Truncated sha256 is only a change marker, not a security hash.
    sha = hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
    title = _extract_title(content, path)
    doc_type = _doc_type_from_path(path)
    await conn.execute(
        "INSERT OR REPLACE INTO docs_files(path, mtime, sha, title, doc_type) VALUES (?,?,?,?,?)",
        (path, mtime, sha, title, doc_type),
    )
    # Drop stale rows so a re-index of this path cannot leave orphan chunks.
    await conn.execute("DELETE FROM docs_chunks WHERE path = ?", (path,))
    await conn.execute("DELETE FROM docs_chunks_fts WHERE path = ?", (path,))
    await conn.commit()

    chunks = _chunk_content(content, path)
    for i, (heading, text) in enumerate(chunks):
        chunk_id = f"{path}:{i}"  # stable ID: "<path>:<chunk_index>"
        await conn.execute(
            "INSERT INTO docs_chunks(id, path, heading, chunk_index, content) VALUES (?,?,?,?,?)",
            (chunk_id, path, heading, i, text),
        )
        await conn.execute(
            "INSERT INTO docs_chunks_fts(chunk_id, path, heading, content) VALUES (?,?,?,?)",
            (chunk_id, path, heading, text),
        )
    await conn.commit()


def _heading_text(stripped: str) -> Optional[str]:
    """Heading text for a level-1..3 markdown heading line, else None.

    Consolidates the previously triplicated '# ' / '## ' / '### ' branches.
    Four or more hashes deliberately do not match ('#### x'.startswith('### ')
    is False), matching the original per-level checks.
    """
    for level in (1, 2, 3):
        marker = "#" * level + " "
        if stripped.startswith(marker):
            return stripped[len(marker):].strip()[:300]
    return None


def _chunk_content(content: str, path: str) -> List[tuple]:
    """Split content into (heading, text) sections.

    A new section starts at every level-1..3 markdown heading; oversized
    sections are additionally split at the _CHUNK_SIZE boundary (the current
    heading carries over to the continuation chunk). Returns
    [(heading, text), ...].
    """
    sections: List[tuple] = []
    current_heading = ""
    current_lines: List[str] = []
    current_size = 0

    for line in content.splitlines():
        heading = _heading_text(line.strip())
        if heading is not None:
            if current_lines:
                sections.append((current_heading, "\n".join(current_lines)))
            current_heading = heading
            current_lines = [line]
            current_size = len(line) + 1
        else:
            current_lines.append(line)
            current_size += len(line) + 1  # +1 for the newline splitlines ate
            if current_size >= _CHUNK_SIZE:
                sections.append((current_heading, "\n".join(current_lines)))
                current_lines = []
                current_size = 0
    if current_lines:
        sections.append((current_heading, "\n".join(current_lines)))
    return sections
async def search_docs(
    q: str,
    doc_type: Optional[str] = None,
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """Full-text search over indexed docs chunks.

    Returns [{path, title, snippet, score}] best-match-first. Falls back to a
    LIKE scan when FTS5 is unavailable (or the FTS query yields nothing).
    """
    conn = await _db.get_db()
    # Strip embedded double quotes, then quote every remaining term so FTS5
    # query operators in user input (OR, NOT, '-', '*', parentheses) cannot
    # raise a MATCH syntax error. Quoted terms AND together, same as the bare
    # implicit-AND behavior for plain words.
    q_clean = q.strip().replace('"', " ")[:200]
    if not q_clean:
        return []
    match_expr = " ".join(f'"{term}"' for term in q_clean.split())

    # Filter by doc_type via join with docs_files
    type_filter = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [match_expr]
    if doc_type:
        params.append(doc_type)
    params.append(limit)

    try:
        # FTS5: snippet(table, col_idx, left, right, ellipsis, max_tokens)
        # columns: 0=chunk_id, 1=path, 2=heading, 3=content → snippet column 3.
        # ORDER BY rank = bm25 relevance; without it LIMIT could drop the
        # best hits (FTS row order is otherwise unspecified).
        async with conn.execute(
            f"""
            SELECT f.path, f.title, snippet(docs_chunks_fts, 3, '**', '**', '...', {_SNIPPET_LEN // 5}) AS snippet
            FROM docs_chunks_fts AS fts
            JOIN docs_files f ON f.path = fts.path
            WHERE docs_chunks_fts MATCH ? {type_filter}
            ORDER BY rank
            LIMIT ?
            """,
            params,
        ) as cur:
            rows = await cur.fetchall()
    except Exception as e:
        logger.warning("FTS search failed, fallback to LIKE: %s", e)
        return await _search_docs_like(q_clean, doc_type, limit)

    result = [
        {"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 1.0}
        for r in (rows or [])
    ]
    if not result:
        # No FTS hits — let the substring fallback have a try.
        return await _search_docs_like(q_clean, doc_type, limit)
    return result


async def _search_docs_like(
    q: str,
    doc_type: Optional[str],
    limit: int,
) -> List[Dict[str, Any]]:
    """Fallback when FTS5 is unavailable: LIKE substring scan on chunk content.

    Score is fixed at 0.5 to mark results as weaker than FTS hits.
    """
    conn = await _db.get_db()
    like = f"%{q}%"
    params: List[Any] = [like]
    if doc_type:
        params.append(doc_type)
    params.append(limit)
    type_sql = "AND f.doc_type = ?" if doc_type else ""
    async with conn.execute(
        f"""
        SELECT DISTINCT f.path, f.title, substr(c.content, 1, {_SNIPPET_LEN}) AS snippet
        FROM docs_chunks c
        JOIN docs_files f ON f.path = c.path
        WHERE c.content LIKE ? {type_sql}
        LIMIT ?
        """,
        params,
    ) as cur:
        rows = await cur.fetchall()
    return [
        {"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 0.5}
        for r in (rows or [])
    ]
+ """, + params, + ) as cur: + rows = await cur.fetchall() + return [ + {"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 0.5} + for r in (rows or []) + ] + + +async def get_preview(path: str) -> Optional[Dict[str, Any]]: + """Return path, title, sections (heading + short excerpt).""" + conn = await _db.get_db() + async with conn.execute("SELECT path, title FROM docs_files WHERE path = ?", (path,)) as cur: + row = await cur.fetchone() + if not row: + return None + async with conn.execute( + "SELECT heading, content FROM docs_chunks WHERE path = ? ORDER BY chunk_index", + (path,), + ) as cur: + rows = await cur.fetchall() + sections = [ + {"heading": r[0] or "", "excerpt": (r[1] or "")[:400].strip()} + for r in (rows or []) + ] + return {"path": row[0], "title": row[1] or "", "sections": sections} + + +async def get_raw(path: str) -> Optional[str]: + """Return full content of first chunk or concatenated chunks (best-effort).""" + conn = await _db.get_db() + async with conn.execute( + "SELECT content FROM docs_chunks WHERE path = ? ORDER BY chunk_index", + (path,), + ) as cur: + rows = await cur.fetchall() + if not rows: + return None + return "\n\n".join(r[0] or "" for r in rows) diff --git a/services/sofiia-console/app/main.py b/services/sofiia-console/app/main.py index b9507d8d..82b05539 100644 --- a/services/sofiia-console/app/main.py +++ b/services/sofiia-console/app/main.py @@ -56,6 +56,7 @@ from .nodes import get_nodes_dashboard from .monitor import collect_all_nodes from .ops import run_ops_action, OPS_ACTIONS from .docs_router import docs_router +from .runbooks_router import runbooks_router from . 
import db as _app_db from .metrics import ( SOFIIA_SEND_REQUESTS_TOTAL, @@ -462,6 +463,8 @@ app.add_middleware( # Projects + Documents + Sessions + Dialog Map API app.include_router(docs_router) +# Runbooks / docs index (read-only search & preview, PR1.1) +app.include_router(runbooks_router) # ── WebSocket event bus ─────────────────────────────────────────────────────── _ws_clients: Set[WebSocket] = set() diff --git a/services/sofiia-console/app/runbooks_router.py b/services/sofiia-console/app/runbooks_router.py new file mode 100644 index 00000000..cb80f93d --- /dev/null +++ b/services/sofiia-console/app/runbooks_router.py @@ -0,0 +1,64 @@ +""" +Runbooks / docs index API — read-only search and preview (PR1.1). +GET /api/runbooks/search, /api/runbooks/preview, /api/runbooks/raw. +""" +from __future__ import annotations + +import re +from typing import Optional + +from fastapi import APIRouter, Depends, HTTPException, Query + +from .auth import require_auth +from . import docs_store as store + +runbooks_router = APIRouter(prefix="/api/runbooks", tags=["runbooks-docs"]) + + +def _safe_path(path: str) -> bool: + """Reject path traversal and non-docs paths.""" + if not path or ".." in path or path.startswith("/"): + return False + norm = path.replace("\\", "/").strip() + return bool(re.match(r"^(docs/|runbook/|release/)?[\w/\-.]+\.md$", norm, re.I)) + + +@runbooks_router.get("/search") +async def runbooks_search( + q: str = Query(..., min_length=1, max_length=200), + doc_type: Optional[str] = Query(None, description="runbook | release | spec"), + limit: int = Query(10, ge=1, le=50), + _auth: str = Depends(require_auth), +): + """Search runbooks/docs by full text. Returns path, title, snippet.""" + items = await store.search_docs(q=q, doc_type=doc_type, limit=limit) + return {"items": items} + + +@runbooks_router.get("/preview") +async def runbooks_preview( + path: str = Query(..., description="Relative path, e.g. 
runbook/rehearsal-v1-30min-checklist.md"), + _auth: str = Depends(require_auth), +): + """Get path, title, and sections (heading + excerpt) for a doc.""" + if not _safe_path(path): + raise HTTPException(status_code=400, detail="Invalid path") + out = await store.get_preview(path) + if not out: + raise HTTPException(status_code=404, detail="Not found") + return out + + +@runbooks_router.get("/raw") +async def runbooks_raw( + path: str = Query(..., description="Relative path to markdown file"), + _auth: str = Depends(require_auth), +): + """Get raw markdown content (read-only).""" + if not _safe_path(path): + raise HTTPException(status_code=400, detail="Invalid path") + content = await store.get_raw(path) + if content is None: + raise HTTPException(status_code=404, detail="Not found") + from fastapi.responses import PlainTextResponse + return PlainTextResponse(content, media_type="text/markdown; charset=utf-8") diff --git a/tests/test_sofiia_docs_search.py b/tests/test_sofiia_docs_search.py new file mode 100644 index 00000000..3d1797fb --- /dev/null +++ b/tests/test_sofiia_docs_search.py @@ -0,0 +1,95 @@ +""" +Tests for runbooks/docs search API (PR1.1): search and preview. +Uses tmp docs dir and rebuild_index; no network. 
+""" +from __future__ import annotations + +import asyncio +from pathlib import Path + +import httpx +import pytest +from httpx import ASGITransport + + +@pytest.fixture +def tmp_docs_with_rehearsal(tmp_path): + """Create tmp_path/docs/runbook with a rehearsal checklist file.""" + docs_root = tmp_path / "docs" + runbook_dir = docs_root / "runbook" + runbook_dir.mkdir(parents=True) + (runbook_dir / "rehearsal-v1-30min-checklist.md").write_text( + "# Rehearsal v1 — 30-minute execution plan\n\n" + "## Preflight\n\n" + "Run STRICT=1 bash ops/preflight_sofiia_console.sh\n\n" + "## Smoke\n\n" + "Idempotency and audit auth checks.", + encoding="utf-8", + ) + return docs_root + + +def test_runbooks_search_finds_rehearsal(sofiia_module, tmp_path, tmp_docs_with_rehearsal, monkeypatch): + """Search for 'rehearsal' returns the checklist path and snippet.""" + import app.docs_index as docs_index_mod + import app.docs_store as docs_store_mod + + monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data")) + loop = asyncio.get_event_loop() + + async def run(): + await docs_index_mod.rebuild_index(tmp_docs_with_rehearsal) + # Direct store call (same loop/conn) to verify index + items = await docs_store_mod.search_docs("rehearsal", limit=5) + return items + + items = loop.run_until_complete(run()) + assert len(items) >= 1, "search_docs should return at least one hit for 'rehearsal'" + paths = [x["path"] for x in items] + assert any("rehearsal" in p for p in paths), f"Expected path containing 'rehearsal', got {paths}" + first = items[0] + assert "path" in first and "title" in first and "snippet" in first + + +def test_runbooks_preview_returns_headings(sofiia_module, sofiia_client, tmp_path, tmp_docs_with_rehearsal, monkeypatch): + """Preview returns path, title, sections with heading and excerpt.""" + import app.docs_index as docs_index_mod + + monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data")) + loop = asyncio.get_event_loop() + 
loop.run_until_complete(docs_index_mod.rebuild_index(tmp_docs_with_rehearsal)) + + r = sofiia_client.get("/api/runbooks/preview?path=runbook/rehearsal-v1-30min-checklist.md") + assert r.status_code == 200, r.text + data = r.json() + assert data["path"] == "runbook/rehearsal-v1-30min-checklist.md" + assert "Rehearsal" in (data.get("title") or "") + assert "sections" in data + assert len(data["sections"]) >= 1 + assert any("Preflight" in (s.get("heading") or "") for s in data["sections"]) + + +def test_runbooks_search_filter_doc_type(sofiia_module, sofiia_client, tmp_path, tmp_docs_with_rehearsal, monkeypatch): + """Search with doc_type=runbook returns only runbook paths.""" + import app.docs_index as docs_index_mod + + monkeypatch.setenv("SOFIIA_DATA_DIR", str(tmp_path / "sofiia-data")) + loop = asyncio.get_event_loop() + loop.run_until_complete(docs_index_mod.rebuild_index(tmp_docs_with_rehearsal)) + + r = sofiia_client.get("/api/runbooks/search?q=rehearsal&doc_type=runbook&limit=5") + assert r.status_code == 200, r.text + for item in r.json().get("items", []): + assert "runbook" in item["path"] or item["path"].startswith("runbook/") + + +def test_runbooks_preview_404_for_unknown_path(sofiia_client): + """Preview returns 404 for path not in index.""" + r = sofiia_client.get("/api/runbooks/preview?path=runbook/nonexistent-file.md") + assert r.status_code == 404 + + +def test_runbooks_raw_400_for_invalid_path(sofiia_client): + """Raw returns 400 for path traversal attempt.""" + r = sofiia_client.get("/api/runbooks/raw?path=../../../etc/passwd") + assert r.status_code == 400