"""
voice_utils.py — Voice pipeline utilities (importable without FastAPI).

Extracted from main.py to enable unit testing without full app startup.
"""

import re

_SENTENCE_SPLIT_RE = re.compile(
    r'(?<=[.!?…])\s+'        # standard sentence end
    r'|(?<=[,;:])\s{2,}'     # long pause after punctuation
    r'|(?<=\n)\s*(?=\S)'     # new paragraph
)

MIN_CHUNK_CHARS = 30       # avoid splitting "OK." into tiny TTS calls
MAX_CHUNK_CHARS = 250      # align with max_tts_chars in voice policy
MAX_TTS_SAFE_CHARS = 700   # hard server-side limit (memory-service accepts ≤700)

# Reasoning blocks emitted by some LLMs; matched case-insensitively.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL | re.IGNORECASE)
_THINK_OPEN_RE = re.compile(r'<think>', re.IGNORECASE)

# Markdown/code patterns to strip before TTS
_MD_BOLD_RE = re.compile(r'\*\*(.+?)\*\*', re.DOTALL)
_MD_ITALIC_RE = re.compile(r'\*(.+?)\*', re.DOTALL)
_MD_HEADER_RE = re.compile(r'^#{1,6}\s+', re.MULTILINE)
_MD_LIST_RE = re.compile(r'^[\-\*]\s+', re.MULTILINE)
_MD_ORDERED_RE = re.compile(r'^\d+\.\s+', re.MULTILINE)
_MD_CODE_BLOCK_RE = re.compile(r'```.*?```', re.DOTALL)
_MD_INLINE_CODE_RE = re.compile(r'`[^`]+`')
_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\([^)]+\)')
_MD_URL_RE = re.compile(r'https?://\S+')
_MULTI_SPACE_RE = re.compile(r'[ \t]{2,}')
_MULTI_NEWLINE_RE = re.compile(r'\n{3,}')

# Characters accepted as a sentence end when truncating.
_SENTENCE_END_CHARS = ('.', '!', '?', '…')


def split_into_voice_chunks(text: str, max_chars: int = MAX_CHUNK_CHARS) -> list[str]:
    """Split text into TTS-friendly chunks (sentences / clauses).

    Rules:
    - Split on sentence / clause / paragraph boundaries first.
    - Accumulate consecutive fragments into one chunk while the result
      stays within ``max_chars``.
    - Hard-split any single fragment longer than ``max_chars`` on a space
      (or mid-word when the window contains no space).
    - Fold a tiny chunk (< MIN_CHUNK_CHARS) into the previous chunk, but
      only when the merged chunk still fits within ``max_chars``.

    Args:
        text: Text to split.
        max_chars: Maximum length of a returned chunk; must be >= 1.

    Returns:
        A list of non-empty, stripped strings. Whitespace between chunks is
        normalised; no other content is dropped.

    Raises:
        ValueError: If ``max_chars`` is less than 1 (the hard-split loop
            could never make progress).
    """
    if max_chars < 1:
        raise ValueError("max_chars must be >= 1")

    fragments = [piece.strip() for piece in _SENTENCE_SPLIT_RE.split(text) if piece.strip()]
    if not fragments:
        stripped = text.strip()
        return [stripped] if stripped else []

    chunks: list[str] = []
    buf = ""
    for part in fragments:
        candidate = f"{buf} {part}" if buf else part
        if len(candidate) <= max_chars:
            buf = candidate
            continue

        # The pending buffer cannot absorb this fragment: flush it first.
        if buf:
            chunks.append(buf)

        # Hard-split an oversized fragment. `part` is always stripped, so the
        # head taken before the last space in the window is never empty and
        # the loop always consumes at least one character.
        while len(part) > max_chars:
            head = part[:max_chars].rsplit(" ", 1)[0]
            chunks.append(head.strip())
            part = part[len(head):].strip()
        buf = part

    if buf:
        chunks.append(buf)

    # Fold tiny fragments into the previous chunk, but never let the merge
    # push a chunk past max_chars (that would undo the hard split above).
    merged: list[str] = []
    for chunk in chunks:
        fits_previous = (
            bool(merged)
            and len(chunk) < MIN_CHUNK_CHARS
            and len(merged[-1]) + 1 + len(chunk) <= max_chars
        )
        if fits_previous:
            merged[-1] = f"{merged[-1]} {chunk}"
        else:
            merged.append(chunk)
    return merged


def clean_think_blocks(text: str) -> str:
    """Remove <think>...</think> reasoning blocks from LLM output.

    1. Strip complete <think>...</think> blocks (multiline, case-insensitive).
    2. Fallback: if an unclosed <think> remains, drop it and everything
       after it.

    Args:
        text: Raw LLM output.

    Returns:
        The text without reasoning blocks, stripped of surrounding whitespace.
    """
    cleaned = _THINK_BLOCK_RE.sub("", text)
    dangling = _THINK_OPEN_RE.search(cleaned)
    if dangling:
        # An opening tag without a closing one: the reasoning never ended,
        # so nothing after it is speakable output.
        cleaned = cleaned[:dangling.start()]
    return cleaned.strip()


def sanitize_for_voice(text: str, max_chars: int = MAX_TTS_SAFE_CHARS) -> str:
    """Server-side final barrier before TTS synthesis.

    Pipeline (order matters):
      1. Strip <think> blocks.
      2. Strip markdown: fenced code → inline code → headers → list markers
         → bold → italic → links → bare URLs.
      3. Collapse whitespace.
      4. Truncate to ``max_chars``, on a sentence boundary when possible.

    Args:
        text: Raw LLM output; falsy input yields "".
        max_chars: Maximum length of the returned string. Values <= 0
            yield "".

    Returns:
        Clean, TTS-ready plain text of at most ``max_chars`` characters.
    """
    if not text or max_chars <= 0:
        return ""

    # 1. <think> blocks
    out = clean_think_blocks(text)

    # 2. Markdown stripping.
    # Fenced code goes before inline code to avoid partial matches.
    out = _MD_CODE_BLOCK_RE.sub('', out)
    out = _MD_INLINE_CODE_RE.sub('', out)
    # Line-anchored markers go before emphasis: otherwise the DOTALL italic
    # pattern pairs up "* " bullets across lines and leaves stray spaces.
    out = _MD_HEADER_RE.sub('', out)
    out = _MD_LIST_RE.sub('', out)
    out = _MD_ORDERED_RE.sub('', out)
    out = _MD_BOLD_RE.sub(r'\1', out)
    out = _MD_ITALIC_RE.sub(r'\1', out)
    out = _MD_LINK_RE.sub(r'\1', out)   # keep link text, drop URL
    out = _MD_URL_RE.sub('', out)       # remove bare URLs

    # 3. Whitespace normalisation
    out = _MULTI_SPACE_RE.sub(' ', out)
    out = _MULTI_NEWLINE_RE.sub('\n\n', out)
    out = out.strip()

    # 4. Truncate, preferring the last sentence end inside the limit.
    if len(out) > max_chars:
        window = out[:max_chars]
        boundary = max(window.rfind(mark) for mark in _SENTENCE_END_CHARS)
        if boundary > max_chars // 2:
            out = out[:boundary + 1].strip()
        else:
            # Reserve one character for the ellipsis so the result never
            # exceeds max_chars.
            out = out[:max_chars - 1].rstrip() + '…'

    return out