New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
131 lines
4.6 KiB
Python
"""
|
|
voice_utils.py — Voice pipeline utilities (importable without FastAPI).
|
|
|
|
Extracted from main.py to enable unit testing without full app startup.
|
|
"""
|
|
import re
|
|
|
|
_SENTENCE_SPLIT_RE = re.compile(
    r'(?<=[.!?…])\s+'        # standard sentence end
    r'|(?<=[,;:])\s{2,}'     # long pause after punctuation
    r'|(?<=\n)\s*(?=\S)'     # new paragraph
)

MIN_CHUNK_CHARS = 30        # avoid splitting "OK." into tiny TTS calls
MAX_CHUNK_CHARS = 250       # align with max_tts_chars in voice policy
MAX_TTS_SAFE_CHARS = 700    # hard server-side limit (memory-service accepts ≤700)

# Markdown/code patterns to strip before TTS
_MD_BOLD_RE = re.compile(r'\*\*(.+?)\*\*', re.DOTALL)
_MD_ITALIC_RE = re.compile(r'\*(.+?)\*', re.DOTALL)
_MD_HEADER_RE = re.compile(r'^#{1,6}\s+', re.MULTILINE)
_MD_LIST_RE = re.compile(r'^[\-\*]\s+', re.MULTILINE)
_MD_ORDERED_RE = re.compile(r'^\d+\.\s+', re.MULTILINE)
_MD_CODE_BLOCK_RE = re.compile(r'```.*?```', re.DOTALL)
_MD_INLINE_CODE_RE = re.compile(r'`[^`]+`')
_MD_LINK_RE = re.compile(r'\[([^\]]+)\]\([^)]+\)')
_MD_URL_RE = re.compile(r'https?://\S+')
_MULTI_SPACE_RE = re.compile(r'[ \t]{2,}')
_MULTI_NEWLINE_RE = re.compile(r'\n{3,}')


def split_into_voice_chunks(text: str, max_chars: int = MAX_CHUNK_CHARS) -> list[str]:
    """Split text into TTS-friendly chunks (sentences / clauses).

    Rules:
    - Try sentence boundaries first.
    - Merge short fragments (< MIN_CHUNK_CHARS) with the next chunk.
    - Hard-split anything > max_chars on a word boundary.

    Returns a list of non-empty strings. Never loses content.
    """
    pieces = [frag.strip() for frag in _SENTENCE_SPLIT_RE.split(text) if frag.strip()]
    if not pieces:
        # No sentence boundaries found at all: emit the text as-is (or nothing).
        stripped = text.strip()
        return [stripped] if stripped else []

    assembled: list[str] = []
    pending = ""  # accumulator for sentences being packed up to max_chars
    for piece in pieces:
        combined = f"{pending} {piece}".strip() if pending else piece
        if len(combined) <= max_chars:
            # Still fits — keep packing.
            pending = combined
            continue
        # Overflow: flush what we have, then hard-split the oversized piece
        # on word boundaries until the remainder fits.
        if pending:
            assembled.append(pending)
        while len(piece) > max_chars:
            head = piece[:max_chars].rsplit(" ", 1)[0]
            assembled.append(head.strip())
            piece = piece[len(head):].strip()
        pending = piece
    if pending:
        assembled.append(pending)

    # Merge tiny trailing fragments into the previous chunk
    result: list[str] = []
    for segment in assembled:
        if result and len(segment) < MIN_CHUNK_CHARS:
            result[-1] = f"{result[-1]} {segment}"
        else:
            result.append(segment)
    return result
|
|
|
|
|
|
def clean_think_blocks(text: str) -> str:
    """Remove <think>...</think> reasoning blocks from LLM output.

    1. Strip complete blocks (case-insensitive, DOTALL so they may span lines).
    2. Fallback: if an unclosed <think> survives, discard everything from
       that tag onward.
    """
    without_blocks = re.sub(r"<think>.*?</think>", "", text,
                            flags=re.DOTALL | re.IGNORECASE)
    if "<think>" not in without_blocks.lower():
        return without_blocks.strip()
    # Dangling open tag: keep only the text that precedes it.
    return re.split(r"(?i)<think>", without_blocks)[0].strip()
|
|
|
|
|
|
def sanitize_for_voice(text: str, max_chars: int = MAX_TTS_SAFE_CHARS) -> str:
    """Server-side final barrier before TTS synthesis.

    Pipeline (order matters):
    1. Strip <think> blocks
    2. Strip markdown (code blocks first → inline → bold → italic → headers → lists → links → URLs)
    3. Collapse whitespace
    4. Hard-truncate to max_chars on sentence boundary when possible

    Args:
        text: raw LLM output (may contain markdown / reasoning blocks).
        max_chars: hard cap on the result length; output never exceeds it.

    Returns clean, TTS-ready plain text. Never raises.
    """
    if not text:
        return ""

    # 1. <think> blocks
    out = clean_think_blocks(text)

    # 2. Markdown stripping (order: fenced code before inline to avoid partial matches)
    out = _MD_CODE_BLOCK_RE.sub('', out)
    out = _MD_INLINE_CODE_RE.sub('', out)
    out = _MD_BOLD_RE.sub(r'\1', out)
    out = _MD_ITALIC_RE.sub(r'\1', out)
    out = _MD_HEADER_RE.sub('', out)
    out = _MD_LIST_RE.sub('', out)
    out = _MD_ORDERED_RE.sub('', out)
    out = _MD_LINK_RE.sub(r'\1', out)   # keep link text, drop URL
    out = _MD_URL_RE.sub('', out)       # remove bare URLs

    # 3. Whitespace normalisation
    out = _MULTI_SPACE_RE.sub(' ', out)
    out = _MULTI_NEWLINE_RE.sub('\n\n', out)
    out = out.strip()

    # 4. Hard-truncate preserving sentence boundary
    if len(out) > max_chars:
        # Try to cut at last sentence-ending punctuation before the limit
        cut = out[:max_chars]
        boundary = max(cut.rfind('.'), cut.rfind('!'), cut.rfind('?'), cut.rfind('…'))
        if boundary > max_chars // 2:
            out = out[:boundary + 1].strip()
        else:
            # BUGFIX: reserve one character for the ellipsis, otherwise the
            # result could be max_chars + 1 chars long (e.g. a full 700-char
            # cut with no trailing whitespace + '…' → 701), breaching the
            # documented ≤700 memory-service limit.
            out = out[:max(max_chars - 1, 0)].rstrip() + '…'

    return out
|