feat(sofiia-console): add docs index and runbook search API (FTS5)
Adds a SQLite docs index (files/chunks + FTS5) with a CLI rebuild command; exposes authenticated runbook search/preview/raw endpoints. Made-with: Cursor
This commit is contained in:
221
services/sofiia-console/app/docs_store.py
Normal file
221
services/sofiia-console/app/docs_store.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""
|
||||
Docs index store — SQLite tables docs_files, docs_chunks, docs_chunks_fts (FTS5).
|
||||
Read-only API: search, preview, raw. Index build in docs_index.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from . import db as _db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Max snippet length per fragment
|
||||
_SNIPPET_LEN = 120
|
||||
_CHUNK_SIZE = 3500 # ~2–4KB target
|
||||
|
||||
|
||||
def _doc_type_from_path(path: str) -> str:
|
||||
p = path.replace("\\", "/")
|
||||
if "/runbook/" in p or p.startswith("runbook/"):
|
||||
return "runbook"
|
||||
if "/release/" in p or p.startswith("release/"):
|
||||
return "release"
|
||||
return "spec" if "/docs/" in p or p.startswith("docs/") else "misc"
|
||||
|
||||
|
||||
def _extract_title(content: str, path: str) -> str:
|
||||
"""First # heading or filename."""
|
||||
for line in content.splitlines():
|
||||
line = line.strip()
|
||||
if line.startswith("# "):
|
||||
return line[2:].strip()[:200]
|
||||
return Path(path).stem.replace("-", " ").replace("_", " ").title()[:200]
|
||||
|
||||
|
||||
async def clear_docs_index() -> None:
    """Drop every row from the docs index (FTS mirror, chunks, files)."""
    conn = await _db.get_db()
    # Delete in reverse dependency order: FTS rows, then chunks, then files.
    for table in ("docs_chunks_fts", "docs_chunks", "docs_files"):
        await conn.execute(f"DELETE FROM {table}")
    await conn.commit()
    logger.info("Docs index cleared.")
|
||||
|
||||
|
||||
async def insert_docs_file(path: str, mtime: float, content: str) -> None:
    """Register one file and its chunks. Caller ensures path is normalized.

    Replaces any previous rows for *path* in docs_files, docs_chunks and the
    FTS mirror, then re-chunks *content* and inserts the new rows. Everything
    is committed once at the end so a failure mid-insert cannot leave the
    file row present with stale or partial chunks (the original committed
    between the deletes and the inserts, which left that window open).
    """
    conn = await _db.get_db()
    # Short content fingerprint; 16 hex chars is enough for change detection.
    sha = hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
    title = _extract_title(content, path)
    doc_type = _doc_type_from_path(path)

    await conn.execute(
        "INSERT OR REPLACE INTO docs_files(path, mtime, sha, title, doc_type) VALUES (?,?,?,?,?)",
        (path, mtime, sha, title, doc_type),
    )
    # Drop stale chunk rows before re-inserting; FTS mirror kept in lockstep.
    await conn.execute("DELETE FROM docs_chunks WHERE path = ?", (path,))
    await conn.execute("DELETE FROM docs_chunks_fts WHERE path = ?", (path,))

    for i, (heading, text) in enumerate(_chunk_content(content, path)):
        chunk_id = f"{path}:{i}"
        await conn.execute(
            "INSERT INTO docs_chunks(id, path, heading, chunk_index, content) VALUES (?,?,?,?,?)",
            (chunk_id, path, heading, i, text),
        )
        await conn.execute(
            "INSERT INTO docs_chunks_fts(chunk_id, path, heading, content) VALUES (?,?,?,?)",
            (chunk_id, path, heading, text),
        )
    # Single commit: the delete + re-insert is atomic per file.
    await conn.commit()
|
||||
|
||||
|
||||
def _chunk_content(content: str, path: str) -> List[tuple]:
|
||||
"""Split by headers, then by size. Returns [(heading, text), ...]."""
|
||||
sections: List[tuple] = []
|
||||
current_heading = ""
|
||||
current_lines: List[str] = []
|
||||
current_size = 0
|
||||
|
||||
for line in content.splitlines():
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("# "):
|
||||
if current_lines:
|
||||
sections.append((current_heading, "\n".join(current_lines)))
|
||||
current_heading = stripped[2:].strip()[:300]
|
||||
current_lines = [line]
|
||||
current_size = len(line) + 1
|
||||
elif stripped.startswith("## "):
|
||||
if current_lines:
|
||||
sections.append((current_heading, "\n".join(current_lines)))
|
||||
current_heading = stripped[3:].strip()[:300]
|
||||
current_lines = [line]
|
||||
current_size = len(line) + 1
|
||||
elif stripped.startswith("### "):
|
||||
if current_lines:
|
||||
sections.append((current_heading, "\n".join(current_lines)))
|
||||
current_heading = stripped[4:].strip()[:300]
|
||||
current_lines = [line]
|
||||
current_size = len(line) + 1
|
||||
else:
|
||||
current_lines.append(line)
|
||||
current_size += len(line) + 1
|
||||
if current_size >= _CHUNK_SIZE:
|
||||
sections.append((current_heading, "\n".join(current_lines)))
|
||||
current_lines = []
|
||||
current_size = 0
|
||||
if current_lines:
|
||||
sections.append((current_heading, "\n".join(current_lines)))
|
||||
return sections
|
||||
|
||||
|
||||
async def search_docs(
    q: str,
    doc_type: Optional[str] = None,
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """FTS search. Returns [{path, title, snippet, score}].

    Uses FTS5 when available, ordered by relevance (bm25 rank); falls back
    to a LIKE scan when FTS5 is unavailable, the MATCH fails, or yields no
    hits.
    """
    conn = await _db.get_db()
    # Strip embedded double quotes, then quote each term so FTS5 operator
    # characters in user input ('-', ':', '*', ...) cannot break the MATCH
    # expression; whitespace between quoted terms is an implicit AND.
    q_clean = q.strip().replace('"', " ")[:200]
    terms = q_clean.split()
    if not terms:
        return []
    match_expr = " ".join(f'"{t}"' for t in terms)

    # Filter by doc_type via join with docs_files
    type_filter = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [match_expr]
    if doc_type:
        params.append(doc_type)
    params.append(limit)

    try:
        # snippet(table, col_idx, left, right, ellipsis, max_tokens);
        # columns: 0=chunk_id, 1=path, 2=heading, 3=content → snippet col 3.
        # NOTE: once the FTS table is aliased "fts", snippet()/MATCH/rank
        # must go through the alias — referencing docs_chunks_fts directly
        # raised "no such column" and silently forced the LIKE fallback.
        async with conn.execute(
            f"""
            SELECT f.path, f.title, snippet(fts, 3, '**', '**', '...', {_SNIPPET_LEN//5}) AS snippet
            FROM docs_chunks_fts AS fts
            JOIN docs_files f ON f.path = fts.path
            WHERE fts MATCH ? {type_filter}
            ORDER BY fts.rank
            LIMIT ?
            """,
            params,
        ) as cur:
            rows = await cur.fetchall()
    except Exception as e:
        logger.warning("FTS search failed, fallback to LIKE: %s", e)
        return await _search_docs_like(q_clean, doc_type, limit)

    result = [
        {"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 1.0}
        for r in (rows or [])
    ]
    if not result:
        return await _search_docs_like(q_clean, doc_type, limit)
    return result
|
||||
|
||||
|
||||
async def _search_docs_like(
    q: str,
    doc_type: Optional[str],
    limit: int,
) -> List[Dict[str, Any]]:
    """Fallback when FTS5 is unavailable: substring (LIKE) scan over chunks.

    Results carry score 0.5 to distinguish them from real FTS hits.
    """
    conn = await _db.get_db()
    type_sql = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [f"%{q}%"]
    if doc_type:
        params.append(doc_type)
    params.append(limit)

    sql = f"""
        SELECT DISTINCT f.path, f.title, substr(c.content, 1, {_SNIPPET_LEN}) AS snippet
        FROM docs_chunks c
        JOIN docs_files f ON f.path = c.path
        WHERE c.content LIKE ? {type_sql}
        LIMIT ?
        """
    async with conn.execute(sql, params) as cur:
        rows = await cur.fetchall()

    hits: List[Dict[str, Any]] = []
    for doc_path, title, snippet in rows or []:
        hits.append(
            {
                "path": doc_path,
                "title": title or "",
                "snippet": (snippet or "").strip(),
                "score": 0.5,
            }
        )
    return hits
|
||||
|
||||
|
||||
async def get_preview(path: str) -> Optional[Dict[str, Any]]:
    """Return {path, title, sections} for an indexed file, or None.

    Each section is {"heading", "excerpt"} with the excerpt capped at
    400 characters.
    """
    conn = await _db.get_db()
    async with conn.execute(
        "SELECT path, title FROM docs_files WHERE path = ?", (path,)
    ) as cur:
        file_row = await cur.fetchone()
    if file_row is None:
        return None

    async with conn.execute(
        "SELECT heading, content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        chunk_rows = await cur.fetchall()

    sections: List[Dict[str, Any]] = []
    for heading, body in chunk_rows or []:
        sections.append({"heading": heading or "", "excerpt": (body or "")[:400].strip()})
    return {"path": file_row[0], "title": file_row[1] or "", "sections": sections}
|
||||
|
||||
|
||||
async def get_raw(path: str) -> Optional[str]:
    """Reassemble the document text from its stored chunks.

    Chunks are joined with a blank line, so this is a best-effort
    reconstruction rather than a byte-exact copy of the source file.
    Returns None when no chunks exist for *path*.
    """
    conn = await _db.get_db()
    async with conn.execute(
        "SELECT content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        chunk_rows = await cur.fetchall()
    if not chunk_rows:
        return None
    parts = [row[0] or "" for row in chunk_rows]
    return "\n\n".join(parts)
|
||||
Reference in New Issue
Block a user