"""
voice_utils.py — Voice pipeline utilities (importable without FastAPI).
Extracted from main.py to enable unit testing without full app startup.
"""
import re
# Boundary pattern for chunking: a sentence end, a long pause (2+ spaces after
# clause punctuation), or a paragraph break.
_SENTENCE_SPLIT_RE = re.compile(
    r'(?<=[.!?…])\s+'     # standard sentence end
    r'|(?<=[,;:])\s{2,}'  # long pause after punctuation
    r'|(?<=\n)\s*(?=\S)'  # new paragraph
)

# Chunk sizing limits (in characters).
MIN_CHUNK_CHARS = 30      # avoid splitting "OK." into tiny TTS calls
MAX_CHUNK_CHARS = 250     # align with max_tts_chars in voice policy
MAX_TTS_SAFE_CHARS = 700  # hard server-side limit (memory-service accepts ≤700)

# Markdown/code patterns to strip before TTS
_MD_BOLD_RE = re.compile(r'\*\*(.+?)\*\*', re.DOTALL)    # **bold** -> keep text
_MD_ITALIC_RE = re.compile(r'\*(.+?)\*', re.DOTALL)      # *italic* -> keep text
_MD_HEADER_RE = re.compile(r'^#{1,6}\s+', re.MULTILINE)  # leading "# " markers
_MD_LIST_RE = re.compile(r'^[\-\*]\s+', re.MULTILINE)    # bullet markers
_MD_ORDERED_RE = re.compile(r'^\d+\.\s+', re.MULTILINE)  # numbered-list markers
_MD_CODE_BLOCK_RE = re.compile(r'```.*?```', re.DOTALL)  # fenced code blocks
_MD_INLINE_CODE_RE = re.compile(r'`[^`]+`')              # `inline code`
_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\([^)]+\)')       # [text](url)
_MD_URL_RE = re.compile(r'https?://\S+')                 # bare URLs
_MULTI_SPACE_RE = re.compile(r'[ \t]{2,}')               # runs of spaces/tabs
_MULTI_NEWLINE_RE = re.compile(r'\n{3,}')                # 3+ consecutive newlines
def split_into_voice_chunks(text: str, max_chars: int = MAX_CHUNK_CHARS) -> list[str]:
    """Split text into TTS-friendly chunks (sentences / clauses).

    Rules:
    - Try sentence boundaries first.
    - Merge short fragments (< MIN_CHUNK_CHARS) with the next chunk.
    - Hard-split anything > max_chars on a word boundary.

    Returns a list of non-empty strings. Never loses content.
    """
    segments = [seg.strip() for seg in _SENTENCE_SPLIT_RE.split(text) if seg.strip()]
    if not segments:
        stripped = text.strip()
        return [stripped] if stripped else []

    pieces: list[str] = []
    pending = ""  # accumulator for segments that still fit together
    for segment in segments:
        grown = f"{pending} {segment}".strip() if pending else segment
        if len(grown) <= max_chars:
            # Still fits — keep accumulating.
            pending = grown
            continue
        # Flush whatever we had, then hard-split the oversized segment
        # at word boundaries until the remainder fits.
        if pending:
            pieces.append(pending)
        while len(segment) > max_chars:
            head = segment[:max_chars].rsplit(" ", 1)[0]
            pieces.append(head.strip())
            segment = segment[len(head):].strip()
        pending = segment
    if pending:
        pieces.append(pending)

    # Fold tiny fragments into the chunk that precedes them.
    merged: list[str] = []
    for piece in pieces:
        if merged and len(piece) < MIN_CHUNK_CHARS:
            merged[-1] = merged[-1] + " " + piece
        else:
            merged.append(piece)
    return merged
def clean_think_blocks(text: str) -> str:
    """Remove <think>...</think> reasoning blocks from LLM output.

    Some models emit chain-of-thought wrapped in <think> tags; it must
    never reach TTS.

    1. Strip complete <think>...</think> blocks (DOTALL so blocks span lines).
    2. Fallback: if an unclosed <think> remains (model cut off mid-reasoning),
       drop everything from the opening tag onward.

    Returns the cleaned text with surrounding whitespace stripped. Never raises.
    """
    # NOTE(review): the <think> tag literals were missing from both regexes
    # (likely eaten by an HTML-stripping step). Restored here — without them,
    # re.sub was a no-op and re.split on an empty pattern returned "" for
    # every non-empty input.
    cleaned = re.sub(r"<think>.*?</think>", "", text,
                     flags=re.DOTALL | re.IGNORECASE)
    if "<think>" in cleaned.lower():
        cleaned = re.split(r"(?i)<think>", cleaned)[0]
    return cleaned.strip()
def sanitize_for_voice(text: str, max_chars: int = MAX_TTS_SAFE_CHARS) -> str:
    """Server-side final barrier before TTS synthesis.

    Pipeline (order matters):
    1. Strip <think> blocks
    2. Strip markdown (code blocks first → inline → bold → italic →
       headers → lists → links → URLs)
    3. Collapse whitespace
    4. Hard-truncate to max_chars on a sentence boundary when possible

    Returns clean, TTS-ready plain text of at most max_chars characters.
    Never raises.
    """
    if not text:
        return ""
    # 1. <think> blocks
    out = clean_think_blocks(text)
    # 2. Markdown stripping (order: fenced code before inline to avoid partial matches)
    out = _MD_CODE_BLOCK_RE.sub('', out)
    out = _MD_INLINE_CODE_RE.sub('', out)
    out = _MD_BOLD_RE.sub(r'\1', out)
    out = _MD_ITALIC_RE.sub(r'\1', out)
    out = _MD_HEADER_RE.sub('', out)
    out = _MD_LIST_RE.sub('', out)
    out = _MD_ORDERED_RE.sub('', out)
    out = _MD_LINK_RE.sub(r'\1', out)   # keep link text, drop URL
    out = _MD_URL_RE.sub('', out)       # remove bare URLs
    # 3. Whitespace normalisation
    out = _MULTI_SPACE_RE.sub(' ', out)
    out = _MULTI_NEWLINE_RE.sub('\n\n', out)
    out = out.strip()
    # 4. Hard-truncate preserving sentence boundary
    if len(out) > max_chars:
        # Try to cut at the last sentence-ending punctuation before the limit.
        cut = out[:max_chars]
        boundary = max(cut.rfind('.'), cut.rfind('!'), cut.rfind('?'), cut.rfind('…'))
        if boundary > max_chars // 2:
            out = out[:boundary + 1].strip()
        else:
            # FIX: reserve one character for the ellipsis — the old
            # `cut.rstrip() + '…'` could yield max_chars + 1 characters,
            # breaking the hard ≤max_chars guarantee.
            out = out[:max_chars - 1].rstrip() + '…'
    return out