13. Multi-Language Support

Chapter 13 of 22 · 20 min

Multilingual voice AI systems must detect which language is being spoken, recognize speech in that language, and respond in it. This requires pipeline modifications at the transcription, intent classification, and synthesis stages.

Language Detection Pipeline

Real-time language detection uses a streaming model that maintains context while remaining responsive to language switches.

from pyannote.audio import Pipeline
import torch

class LanguageDetector:
    """Track the language of an audio stream, one chunk at a time.

    The most recent confident prediction is stored in ``current_language``
    and returned until another prediction exceeds ``confidence_threshold``,
    so low-confidence chunks do not flip the reported language.
    """

    def __init__(self, model_name: str = "pyannote/language-id"):
        """Load the detection pipeline and move it to CUDA when available.

        Args:
            model_name: Identifier passed to ``Pipeline.from_pretrained``.
        """
        self.pipeline = Pipeline.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.pipeline.to(self.device)
        # Reported until the first confident detection replaces it.
        self.current_language = "en"
        # Predictions must score strictly above this to be adopted.
        self.confidence_threshold = 0.7

    async def detect(self, audio_chunk: bytes) -> str:
        """Return the current language, updated from ``audio_chunk`` if confident.

        Args:
            audio_chunk: Raw audio bytes, interpreted as 16-bit integers by
                ``bytes_to_tensor``.

        Returns:
            The language from this chunk when its confidence exceeds
            ``confidence_threshold``; otherwise the previously stored one.

        Raises:
            ValueError: If the chunk length is not a multiple of two bytes
                (raised by ``numpy.frombuffer``).
        """
        # An empty chunk carries no evidence, so skip inference entirely
        # instead of feeding a zero-length tensor to the model.
        if not audio_chunk:
            return self.current_language

        audio_tensor = self.bytes_to_tensor(audio_chunk)

        # NOTE(review): the pipeline call is synchronous, so this coroutine
        # does not yield to the event loop while inference runs.
        with torch.no_grad():
            output = self.pipeline({"audio": audio_tensor})

        language = output["language"]
        confidence = output["confidence"]

        if confidence > self.confidence_threshold:
            self.current_language = language

        return self.current_language

    def bytes_to_tensor(self, audio_bytes: bytes) -> torch.Tensor:
        """Convert 16-bit integer samples to a float32 tensor of shape (1, n).

        Samples are read in native byte order and scaled by 1/32768, which
        maps the int16 range onto [-1.0, 1.0).
        """
        import numpy as np

        samples = np.frombuffer(audio_bytes, dtype=np.int16)
        # astype() copies, so the tensor does not alias the read-only buffer.
        scaled = samples.astype(np.float32) / 32768.0
        return torch.from_numpy(scaled).unsqueeze(0)

Language-Specific Models

Loading separate models per language increases memory requirements. Optimize by loading a multilingual base model and adding language-specific adapter weights.

class MultiLanguagePipeline:
    """One causal language model with a switchable adapter per language.

    All adapters are attached to a single shared ``PeftModel`` and registered
    under their language code, so switching language is a ``set_adapter``
    call rather than a reload.
    """

    def __init__(self, base_model: str, languages: list[str]):
        """Load the base model, its tokenizer, and one adapter per language.

        Args:
            base_model: Model identifier passed to ``from_pretrained``.
            languages: Language codes; each adapter is read from
                ``./adapters/<code>``.
        """
        from transformers import AutoModelForCausalLM, AutoTokenizer

        self.base_model = AutoModelForCausalLM.from_pretrained(base_model)
        # generate() needs a tokenizer; load the one matching the base model.
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        # Shared PEFT wrapper, created when the first adapter is loaded.
        self._peft_model = None
        self.adapters = {}

        for lang in languages:
            adapter_path = f"./adapters/{lang}"
            self.adapters[lang] = self.load_adapter(adapter_path, adapter_name=lang)

    def load_adapter(self, path: str, adapter_name: str = "default"):
        """Attach the adapter stored at ``path`` under ``adapter_name``.

        The first call wraps the base model in a ``PeftModel``; later calls
        add further adapters to that same wrapper, so adapters do not stack
        on top of each other or collide on PEFT's default adapter name.

        Args:
            path: Directory holding the adapter weights and config.
            adapter_name: Name used later with ``set_adapter``.

        Returns:
            The shared ``PeftModel`` that holds every loaded adapter.
        """
        from peft import PeftModel

        if self._peft_model is None:
            self._peft_model = PeftModel.from_pretrained(
                self.base_model, path, adapter_name=adapter_name
            )
        else:
            self._peft_model.load_adapter(path, adapter_name=adapter_name)
        return self._peft_model

    async def generate(self, prompt: str, language: str) -> str:
        """Generate up to 100 new tokens for ``prompt`` in ``language``.

        Uses the adapter registered for ``language`` when there is one, and
        otherwise falls back to the base weights with adapters disabled.

        Args:
            prompt: Input text for the model.
            language: Language code used to pick an adapter.

        Returns:
            The decoded first output sequence.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")

        if language in self.adapters:
            # Adapters were loaded once in __init__; only activate here.
            model = self.adapters[language]
            model.set_adapter(language)
            outputs = model.generate(**inputs, max_new_tokens=100)
        elif self._peft_model is not None:
            # Adapter layers are injected into the base model, so they must
            # be switched off explicitly to get base-model behaviour.
            with self._peft_model.disable_adapter():
                outputs = self._peft_model.generate(**inputs, max_new_tokens=100)
        else:
            outputs = self.base_model.generate(**inputs, max_new_tokens=100)

        # NOTE(review): causal LMs typically return the prompt tokens followed
        # by the completion, so this string likely includes the prompt and any
        # special tokens — confirm whether callers expect that.
        return self.tokenizer.decode(outputs[0])

Cross-Language Synthesis

TTS systems trained on multiple languages require language tags to produce correct phonemes and prosody.

class MultilingualTTS:
    """Speech synthesis wrapper that prefixes text with a language tag."""

    def __init__(self, model_path: str):
        """Load the model, tokenizer and processor found at ``model_path``.

        Args:
            model_path: Location passed to ``load_model`` and to the
                ``from_pretrained`` loaders.
        """
        from vocos import Vocos
        from transformers import AutoTokenizer, AutoProcessor

        # NOTE(review): load_model is not defined in this snippet —
        # presumably provided elsewhere; confirm.
        self.model = load_model(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.processor = AutoProcessor.from_pretrained(model_path)
        # Language code -> tag placed in front of the text to synthesize.
        self.language_map = {
            "en": "<|en|>",
            "es": "<|es|>",
            "fr": "<|fr|>",
            "de": "<|de|>",
        }

    def synthesize(self, text: str, language: str) -> bytes:
        """Synthesize ``text`` as speech in ``language``.

        Args:
            text: Text to speak.
            language: Language code; codes missing from ``language_map``
                fall back to the English tag.

        Returns:
            Whatever ``decode_audio`` produces for the model output.
        """
        if language in self.language_map:
            prefix = self.language_map[language]
        else:
            prefix = "<|en|>"

        tagged_text = f"{prefix}{text}<|eos|>"
        model_inputs = self.processor(text=tagged_text, return_tensors="pt")
        generated = self.model.generate(**model_inputs)

        # NOTE(review): decode_audio is not defined in this snippet —
        # presumably a method supplied elsewhere; confirm.
        return self.decode_audio(generated)

Language Switching Logic

async def process_stream(self, audio_chunks: list[bytes], target_lang: str):
    """Run one detect → transcribe → generate → synthesize pass.

    The language is detected from the first chunk only. When it differs
    from ``target_lang`` nothing is transcribed; the caller is told to
    switch instead.

    Args:
        audio_chunks: Audio chunks for this turn; must contain at least one.
        target_lang: Language the pipeline is currently set up for.

    Returns:
        ``{"action": "switch", "language": <detected>}`` on a mismatch,
        otherwise ``{"audio": <synthesized response>}``.

    Raises:
        IndexError: If ``audio_chunks`` is empty.
    """
    detected = await self.language_detector.detect(audio_chunks[0])

    if detected == target_lang:
        text = await self.transcribe(audio_chunks, language=detected)
        reply = await self.generate(text, language=detected)
        return {"audio": self.synthesize(reply, language=detected)}

    return {"action": "switch", "language": detected}

Language detection adds roughly 50–100 ms of latency per inference. Cache recent language decisions so that the detector is not run again on the same audio segment.

EXERCISE

Build a voice pipeline that detects language every 5 seconds of audio and switches the TTS voice between English (US) and Spanish (MX) voices using Coqui TTS. Log all language switches. Time: 15 minutes.