13. Multi-Language Support
Voice AI systems must detect, recognize, and respond in multiple languages. This requires pipeline modifications at transcription, intent classification, and synthesis stages.
Language Detection Pipeline
Real-time language detection uses a streaming model that maintains context while remaining responsive to language switches.
from pyannote.audio import Pipeline
import torch
class LanguageDetector:
    """Track the spoken language of a stream of 16-bit PCM audio chunks.

    The most recent confident prediction is kept in ``current_language``;
    it is only replaced when a new prediction's confidence exceeds
    ``confidence_threshold``.
    """

    def __init__(self, model_name: str = "pyannote/language-id", sample_rate: int = 16000):
        """Load the detection pipeline and move it to the GPU when available.

        Args:
            model_name: Identifier passed to ``Pipeline.from_pretrained``.
            sample_rate: Sampling rate in Hz of the chunks given to
                ``detect``. NOTE(review): 16 kHz is an assumed default --
                confirm against the actual capture format.
        """
        self.pipeline = Pipeline.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.pipeline.to(self.device)
        self.sample_rate = sample_rate
        self.current_language = "en"
        self.confidence_threshold = 0.7

    async def detect(self, audio_chunk: bytes) -> str:
        """Return the current language, updated from ``audio_chunk`` if confident.

        Args:
            audio_chunk: Raw 16-bit PCM samples.

        Returns:
            The language code stored in ``current_language`` after processing.
        """
        # Fewer than two bytes holds no complete int16 sample; skip inference
        # rather than hand the model a zero-length waveform.
        if len(audio_chunk) < 2:
            return self.current_language

        audio_tensor = self.bytes_to_tensor(audio_chunk)
        with torch.no_grad():
            # pyannote pipelines take in-memory audio as a mapping with a
            # (channel, time) "waveform" tensor and its "sample_rate".
            output = self.pipeline(
                {"waveform": audio_tensor, "sample_rate": self.sample_rate}
            )

        # NOTE(review): assumes the pipeline output is a mapping exposing
        # "language" and "confidence" -- confirm against the model used.
        language = output["language"]
        confidence = output["confidence"]
        # Low-confidence predictions are ignored so the language stays stable.
        if confidence > self.confidence_threshold:
            self.current_language = language
        return self.current_language

    def bytes_to_tensor(self, audio_bytes: bytes) -> torch.Tensor:
        """Convert 16-bit PCM bytes to a float32 tensor of shape ``(1, samples)``.

        Samples are scaled by 1/32768. A trailing odd byte (an incomplete
        sample) is ignored instead of raising ``ValueError``.
        """
        import numpy as np

        # ``count`` limits the read to whole samples, dropping a dangling byte.
        sample_count = len(audio_bytes) // 2
        audio_np = np.frombuffer(audio_bytes, dtype=np.int16, count=sample_count)
        audio_float = audio_np.astype(np.float32) / 32768.0
        return torch.from_numpy(audio_float).unsqueeze(0)
Language-Specific Models
Loading a separate model for each language multiplies memory requirements. Reduce this by loading one multilingual base model and attaching language-specific adapter weights to it.
class MultiLanguagePipeline:
    """Text generation with one base model and per-language PEFT adapters.

    All adapters are attached to a single PEFT wrapper (``self.model``) under
    the language code as adapter name, so switching language only changes the
    active adapter instead of loading another model.
    """

    def __init__(self, base_model: str, languages: list[str]):
        """Load the tokenizer, the base model, and one adapter per language.

        Args:
            base_model: Model identifier or path for the tokenizer and the
                causal language model.
            languages: Language codes; each adapter is read from
                ``./adapters/<code>``.
        """
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # ``generate`` needs a tokenizer; load the one matching the base model.
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        self.base_model = AutoModelForCausalLM.from_pretrained(base_model)
        # Replaced by the PEFT wrapper as soon as the first adapter is loaded.
        self.model = self.base_model
        self.adapters = {}
        for lang in languages:
            adapter_path = f"./adapters/{lang}"
            self.adapters[lang] = self.load_adapter(adapter_path, adapter_name=lang)

    def load_adapter(self, path: str, adapter_name: str = "default"):
        """Attach the adapter stored at ``path`` under ``adapter_name``.

        The first call wraps the base model in a ``PeftModel``; later calls
        add further named adapters to that same wrapper.

        Returns:
            The shared PEFT-wrapped model.
        """
        from peft import PeftModel

        if isinstance(self.model, PeftModel):
            self.model.load_adapter(path, adapter_name=adapter_name)
        else:
            self.model = PeftModel.from_pretrained(
                self.base_model, path, adapter_name=adapter_name
            )
        return self.model

    async def generate(self, prompt: str, language: str) -> str:
        """Generate up to 100 new tokens for ``prompt`` in ``language``.

        Uses the adapter registered for ``language``; for a language without
        an adapter, generation runs on the base weights with adapters disabled.

        Returns:
            The decoded first output sequence.
        """
        import contextlib

        if language in self.adapters:
            self.model.set_adapter(language)
            adapter_scope = contextlib.nullcontext()
        elif self.adapters:
            # No adapter for this language: temporarily bypass all adapters.
            adapter_scope = self.model.disable_adapter()
        else:
            # Nothing was ever attached; ``self.model`` is the plain base model.
            adapter_scope = contextlib.nullcontext()

        inputs = self.tokenizer(prompt, return_tensors="pt")
        with adapter_scope:
            outputs = self.model.generate(**inputs, max_new_tokens=100)
        return self.tokenizer.decode(outputs[0])
Cross-Language Synthesis
TTS systems trained on multiple languages require language tags to produce correct phonemes and prosody.
class MultilingualTTS:
    """Text-to-speech wrapper that prefixes the input with a language tag."""

    def __init__(self, model_path: str):
        """Load the model, tokenizer and processor from ``model_path``.

        NOTE(review): ``load_model`` is not defined in the visible source;
        presumably it is provided elsewhere -- confirm before use.
        """
        from vocos import Vocos
        from transformers import AutoTokenizer, AutoProcessor

        self.model = load_model(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.processor = AutoProcessor.from_pretrained(model_path)
        # Language code -> tag prepended to the text before processing.
        self.language_map = dict(
            en="<|en|>",
            es="<|es|>",
            fr="<|fr|>",
            de="<|de|>",
        )

    def synthesize(self, text: str, language: str) -> bytes:
        """Return synthesized audio for ``text`` tagged with ``language``.

        Languages missing from ``language_map`` fall back to the English tag.

        NOTE(review): ``decode_audio`` is not defined on this class in the
        visible source -- confirm where it comes from.
        """
        if language in self.language_map:
            tag = self.language_map[language]
        else:
            tag = "<|en|>"
        tagged_text = f"{tag}{text}<|eos|>"
        model_inputs = self.processor(text=tagged_text, return_tensors="pt")
        return self.decode_audio(self.model.generate(**model_inputs))
Language Switching Logic
async def process_stream(self, audio_chunks: list[bytes], target_lang: str):
    """Detect the language of a stream and answer it, or request a switch.

    The language is detected from the first chunk only, so ``audio_chunks``
    must be non-empty (an empty list raises ``IndexError``).

    Returns:
        ``{"action": "switch", "language": <detected>}`` when the detected
        language differs from ``target_lang``; otherwise ``{"audio": ...}``
        holding the synthesized response.
    """
    detected = await self.language_detector.detect(audio_chunks[0])
    if detected == target_lang:
        # Languages agree: transcribe, generate a reply, and speak it.
        text = await self.transcribe(audio_chunks, language=detected)
        reply = await self.generate(text, language=detected)
        return {"audio": self.synthesize(reply, language=detected)}
    # Mismatch: tell the caller which language to switch to.
    return {"action": "switch", "language": detected}
Language detection adds roughly 50–100 ms of latency per inference. Cache recent language decisions to avoid repeating inference on the same audio segment.
Exercise: Build a voice pipeline that runs language detection on every 5 seconds of audio and switches the TTS output between an English (US) voice and a Spanish (MX) voice using Coqui TTS. Log every language switch. Time: 15 minutes.