New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
131 lines
4.6 KiB
Python
"""
|
|
voice_utils.py — Voice pipeline utilities (importable without FastAPI).
|
|
|
|
Extracted from main.py to enable unit testing without full app startup.
|
|
"""
|
|
import re
|
|
|
|
_SENTENCE_SPLIT_RE = re.compile(
    r'(?<=[.!?…])\s+'        # standard sentence end
    r'|(?<=[,;:])\s{2,}'     # long pause after punctuation
    r'|(?<=\n)\s*(?=\S)'     # new paragraph
)

MIN_CHUNK_CHARS = 30        # avoid splitting "OK." into tiny TTS calls
MAX_CHUNK_CHARS = 250       # align with max_tts_chars in voice policy
MAX_TTS_SAFE_CHARS = 700    # hard server-side limit (memory-service accepts ≤700)

# Markdown/code patterns to strip before TTS
_MD_BOLD_RE = re.compile(r'\*\*(.+?)\*\*', re.DOTALL)
_MD_ITALIC_RE = re.compile(r'\*(.+?)\*', re.DOTALL)
_MD_HEADER_RE = re.compile(r'^#{1,6}\s+', re.MULTILINE)
_MD_LIST_RE = re.compile(r'^[\-\*]\s+', re.MULTILINE)
_MD_ORDERED_RE = re.compile(r'^\d+\.\s+', re.MULTILINE)
_MD_CODE_BLOCK_RE = re.compile(r'```.*?```', re.DOTALL)
_MD_INLINE_CODE_RE = re.compile(r'`[^`]+`')
_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\([^)]+\)')
_MD_URL_RE = re.compile(r'https?://\S+')
_MULTI_SPACE_RE = re.compile(r'[ \t]{2,}')
_MULTI_NEWLINE_RE = re.compile(r'\n{3,}')


def split_into_voice_chunks(text: str, max_chars: int = MAX_CHUNK_CHARS) -> list[str]:
    """Split text into TTS-friendly chunks (sentences / clauses).

    Rules:
    - Try sentence boundaries first.
    - Merge short fragments (< MIN_CHUNK_CHARS) with the next chunk.
    - Hard-split anything > max_chars on a word boundary.

    Returns a list of non-empty strings. Never loses content.
    """
    pieces = [frag.strip() for frag in _SENTENCE_SPLIT_RE.split(text) if frag.strip()]
    if not pieces:
        # No sentence boundaries found at all: emit the text as-is (or nothing).
        stripped = text.strip()
        return [stripped] if stripped else []

    assembled: list[str] = []
    pending = ""  # accumulator for sentences being packed up to max_chars
    for piece in pieces:
        combined = f"{pending} {piece}".strip() if pending else piece
        if len(combined) <= max_chars:
            # Still fits — keep packing.
            pending = combined
            continue
        # Overflow: flush what we have, then hard-split the oversized piece
        # on word boundaries until the remainder fits.
        if pending:
            assembled.append(pending)
        while len(piece) > max_chars:
            head = piece[:max_chars].rsplit(" ", 1)[0]
            assembled.append(head.strip())
            piece = piece[len(head):].strip()
        pending = piece
    if pending:
        assembled.append(pending)

    # Merge tiny trailing fragments into the previous chunk
    result: list[str] = []
    for segment in assembled:
        if result and len(segment) < MIN_CHUNK_CHARS:
            result[-1] = f"{result[-1]} {segment}"
        else:
            result.append(segment)
    return result
|
|
|
|
|
|
def clean_think_blocks(text: str) -> str:
    """Remove <think>...</think> reasoning blocks from LLM output.

    1. Strip complete blocks (case-insensitive, DOTALL so they may span lines).
    2. Fallback: if an unclosed <think> survives, discard everything from
       that tag onward.
    """
    without_blocks = re.sub(r"<think>.*?</think>", "", text,
                            flags=re.DOTALL | re.IGNORECASE)
    if "<think>" not in without_blocks.lower():
        return without_blocks.strip()
    # Dangling open tag: keep only the text that precedes it.
    return re.split(r"(?i)<think>", without_blocks)[0].strip()
|
|
|
|
|
|
def sanitize_for_voice(text: str, max_chars: int = MAX_TTS_SAFE_CHARS) -> str:
    """Server-side final barrier before TTS synthesis.

    Pipeline (order matters):
    1. Strip <think> blocks
    2. Strip markdown (code blocks first → inline → bold → italic → headers → lists → links → URLs)
    3. Collapse whitespace
    4. Hard-truncate to max_chars on sentence boundary when possible

    Args:
        text: raw LLM output (may contain markdown / reasoning blocks).
        max_chars: hard cap on the result length; output never exceeds it.

    Returns clean, TTS-ready plain text. Never raises.
    """
    if not text:
        return ""

    # 1. <think> blocks
    out = clean_think_blocks(text)

    # 2. Markdown stripping (order: fenced code before inline to avoid partial matches)
    out = _MD_CODE_BLOCK_RE.sub('', out)
    out = _MD_INLINE_CODE_RE.sub('', out)
    out = _MD_BOLD_RE.sub(r'\1', out)
    out = _MD_ITALIC_RE.sub(r'\1', out)
    out = _MD_HEADER_RE.sub('', out)
    out = _MD_LIST_RE.sub('', out)
    out = _MD_ORDERED_RE.sub('', out)
    out = _MD_LINK_RE.sub(r'\1', out)   # keep link text, drop URL
    out = _MD_URL_RE.sub('', out)       # remove bare URLs

    # 3. Whitespace normalisation
    out = _MULTI_SPACE_RE.sub(' ', out)
    out = _MULTI_NEWLINE_RE.sub('\n\n', out)
    out = out.strip()

    # 4. Hard-truncate preserving sentence boundary
    if len(out) > max_chars:
        # Try to cut at last sentence-ending punctuation before the limit
        cut = out[:max_chars]
        boundary = max(cut.rfind('.'), cut.rfind('!'), cut.rfind('?'), cut.rfind('…'))
        if boundary > max_chars // 2:
            out = out[:boundary + 1].strip()
        else:
            # BUGFIX: reserve one character for the ellipsis, otherwise the
            # result could be max_chars + 1 chars long (e.g. a full 700-char
            # cut with no trailing whitespace + '…' → 701), breaching the
            # documented ≤700 memory-service limit.
            out = out[:max(max_chars - 1, 0)].rstrip() + '…'

    return out
|