Files
microdao-daarion/services/sofiia-console/app/docs_store.py
Apple ef3ff80645 feat(sofiia-console): add docs index and runbook search API (FTS5)
adds SQLite docs index (files/chunks + FTS5) and CLI rebuild

exposes authenticated runbook search/preview/raw endpoints

Made-with: Cursor
2026-03-03 04:26:34 -08:00

222 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Docs index store — SQLite tables docs_files, docs_chunks, docs_chunks_fts (FTS5).
Read-only API: search, preview, raw. Index build in docs_index.py.
"""
from __future__ import annotations
import hashlib
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from . import db as _db
# Module-level logger, namespaced to this package.
logger = logging.getLogger(__name__)
# Max snippet length per fragment, in characters; search_docs derives its
# FTS5 snippet token budget as _SNIPPET_LEN // 5.
_SNIPPET_LEN = 120
# Target section size, in characters, at which _chunk_content flushes a chunk.
# NOTE(review): original comment said "~24KB target", but 3500 chars is ~3.4 KiB
# — confirm the intended size.
_CHUNK_SIZE = 3500 # ~24KB target
def _doc_type_from_path(path: str) -> str:
p = path.replace("\\", "/")
if "/runbook/" in p or p.startswith("runbook/"):
return "runbook"
if "/release/" in p or p.startswith("release/"):
return "release"
return "spec" if "/docs/" in p or p.startswith("docs/") else "misc"
def _extract_title(content: str, path: str) -> str:
"""First # heading or filename."""
for line in content.splitlines():
line = line.strip()
if line.startswith("# "):
return line[2:].strip()[:200]
return Path(path).stem.replace("-", " ").replace("_", " ").title()[:200]
async def clear_docs_index() -> None:
    """Delete every row from the docs index: files, chunks, and the FTS mirror."""
    conn = await _db.get_db()
    # FTS rows first, then chunks, then file records.
    for table in ("docs_chunks_fts", "docs_chunks", "docs_files"):
        await conn.execute(f"DELETE FROM {table}")
    await conn.commit()
    logger.info("Docs index cleared.")
async def insert_docs_file(path: str, mtime: float, content: str) -> None:
    """Register one file and (re)build its chunk rows in the docs index.

    Caller ensures *path* is normalized. The file row upsert, the deletion of
    stale chunks, and the insertion of new chunks are committed as a single
    transaction, so a failure mid-way cannot leave the file row present with
    its chunks already deleted (the original committed between the DELETEs
    and the INSERTs, which was not atomic).

    Args:
        path: Normalized repo-relative file path (also the primary key).
        mtime: File modification time.
        content: Full text of the file.
    """
    conn = await _db.get_db()
    # Short content hash for change detection; 16 hex chars is enough here.
    sha = hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]
    title = _extract_title(content, path)
    doc_type = _doc_type_from_path(path)
    await conn.execute(
        "INSERT OR REPLACE INTO docs_files(path, mtime, sha, title, doc_type) VALUES (?,?,?,?,?)",
        (path, mtime, sha, title, doc_type),
    )
    # Drop stale chunk rows (both tables are kept in lockstep) before re-inserting.
    await conn.execute("DELETE FROM docs_chunks WHERE path = ?", (path,))
    await conn.execute("DELETE FROM docs_chunks_fts WHERE path = ?", (path,))
    chunk_rows = [
        (f"{path}:{i}", path, heading, i, text)
        for i, (heading, text) in enumerate(_chunk_content(content, path))
    ]
    await conn.executemany(
        "INSERT INTO docs_chunks(id, path, heading, chunk_index, content) VALUES (?,?,?,?,?)",
        chunk_rows,
    )
    await conn.executemany(
        "INSERT INTO docs_chunks_fts(chunk_id, path, heading, content) VALUES (?,?,?,?)",
        [(cid, p, h, t) for cid, p, h, _i, t in chunk_rows],
    )
    # Single commit: the whole reindex of this file is atomic.
    await conn.commit()
def _chunk_content(content: str, path: str) -> List[tuple]:
"""Split by headers, then by size. Returns [(heading, text), ...]."""
sections: List[tuple] = []
current_heading = ""
current_lines: List[str] = []
current_size = 0
for line in content.splitlines():
stripped = line.strip()
if stripped.startswith("# "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[2:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
elif stripped.startswith("## "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[3:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
elif stripped.startswith("### "):
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
current_heading = stripped[4:].strip()[:300]
current_lines = [line]
current_size = len(line) + 1
else:
current_lines.append(line)
current_size += len(line) + 1
if current_size >= _CHUNK_SIZE:
sections.append((current_heading, "\n".join(current_lines)))
current_lines = []
current_size = 0
if current_lines:
sections.append((current_heading, "\n".join(current_lines)))
return sections
async def search_docs(
    q: str,
    doc_type: Optional[str] = None,
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """Full-text search over doc chunks.

    Returns [{path, title, snippet, score}], most relevant first. Falls back
    to a LIKE scan when FTS5 is unavailable or produced no rows.

    Args:
        q: User query string (sanitized before the FTS MATCH).
        doc_type: Optional filter on docs_files.doc_type.
        limit: Max number of rows returned.
    """
    conn = await _db.get_db()
    # Sanitize FTS5 query: strip double quotes so user input cannot break
    # the MATCH expression, and cap length defensively.
    q_clean = q.strip().replace("\"", " ")[:200]
    if not q_clean:
        return []
    # Filter by doc_type via join with docs_files.
    type_filter = "AND f.doc_type = ?" if doc_type else ""
    params: List[Any] = [q_clean]
    if doc_type:
        params.append(doc_type)
    params.append(limit)
    try:
        # FTS5: snippet(table, col_idx, left, right, ellipsis, max_tokens)
        # columns: 0=chunk_id, 1=path, 2=heading, 3=content → snippet column 3.
        # ORDER BY fts.rank (bm25, best first) so LIMIT keeps the most
        # relevant rows — without it, LIMIT returns N arbitrary matches.
        async with conn.execute(
            f"""
            SELECT f.path, f.title, snippet(docs_chunks_fts, 3, '**', '**', '...', {_SNIPPET_LEN//5}) AS snippet
            FROM docs_chunks_fts AS fts
            JOIN docs_files f ON f.path = fts.path
            WHERE docs_chunks_fts MATCH ? {type_filter}
            ORDER BY fts.rank
            LIMIT ?
            """,
            params,
        ) as cur:
            rows = await cur.fetchall()
    except Exception as e:
        # FTS5 may be missing from the SQLite build, or the query may still
        # be malformed; degrade to the LIKE scan rather than failing.
        logger.warning("FTS search failed, fallback to LIKE: %s", e)
        return await _search_docs_like(q_clean, doc_type, limit)
    result = [
        {"path": r[0], "title": r[1] or "", "snippet": (r[2] or "").strip(), "score": 1.0}
        for r in (rows or [])
    ]
    if not result:
        return await _search_docs_like(q_clean, doc_type, limit)
    return result
async def _search_docs_like(
    q: str,
    doc_type: Optional[str],
    limit: int,
) -> List[Dict[str, Any]]:
    """Fallback when FTS5 unavailable: LIKE on content."""
    conn = await _db.get_db()
    pattern = f"%{q}%"
    bind: List[Any] = [pattern]
    type_sql = ""
    if doc_type:
        type_sql = "AND f.doc_type = ?"
        bind.append(doc_type)
    bind.append(limit)
    # Snippet is just the first _SNIPPET_LEN characters of a matching chunk.
    query = f"""
            SELECT DISTINCT f.path, f.title, substr(c.content, 1, {_SNIPPET_LEN}) AS snippet
            FROM docs_chunks c
            JOIN docs_files f ON f.path = c.path
            WHERE c.content LIKE ? {type_sql}
            LIMIT ?
            """
    async with conn.execute(query, bind) as cur:
        rows = await cur.fetchall()
    # score 0.5 marks these as weaker, unranked matches vs. FTS results (1.0).
    return [
        {"path": row[0], "title": row[1] or "", "snippet": (row[2] or "").strip(), "score": 0.5}
        for row in (rows or [])
    ]
async def get_preview(path: str) -> Optional[Dict[str, Any]]:
    """Return {path, title, sections} for *path*, or None when not indexed."""
    conn = await _db.get_db()
    async with conn.execute(
        "SELECT path, title FROM docs_files WHERE path = ?", (path,)
    ) as cur:
        file_row = await cur.fetchone()
    if not file_row:
        return None
    async with conn.execute(
        "SELECT heading, content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        chunk_rows = await cur.fetchall()
    # Each section carries its heading plus a short excerpt of the chunk body.
    sections = []
    for heading, body in chunk_rows or []:
        sections.append({"heading": heading or "", "excerpt": (body or "")[:400].strip()})
    return {"path": file_row[0], "title": file_row[1] or "", "sections": sections}
async def get_raw(path: str) -> Optional[str]:
    """Reconstruct the full document content from its ordered chunks.

    Returns None when *path* has no indexed chunks. Chunks are produced by
    _chunk_content, which partitions the document's lines into consecutive
    runs each joined with a single newline — so re-joining with "\\n" (not
    the original "\\n\\n") restores the text without injecting blank lines
    that were never in the document (modulo a trailing newline).
    """
    conn = await _db.get_db()
    async with conn.execute(
        "SELECT content FROM docs_chunks WHERE path = ? ORDER BY chunk_index",
        (path,),
    ) as cur:
        rows = await cur.fetchall()
    if not rows:
        return None
    return "\n".join(r[0] or "" for r in rows)