Voice Cloning — Voice AI with Local Models (Chapter 15)

Voice cloning generates synthetic speech matching a target speaker's characteristics from short audio samples. This enables personalized voice AI without recording dedicated voice talent.

Speaker Encoding

Speaker encoders extract embedding vectors representing vocal characteristics from audio.

from pathlib import Path

import numpy as np
import torch
import torchaudio
from speechbrain.inference import EncoderClassifier

class SpeakerEncoder:
    """Extract speaker embeddings from audio files with a SpeechBrain encoder.

    Audio is resampled to ``SAMPLE_RATE``, downmixed to mono and padded or
    cropped to ``CLIP_SECONDS`` before it is handed to the encoder.
    """

    SAMPLE_RATE = 16000  # Hz; every clip is resampled to this rate
    CLIP_SECONDS = 3  # every clip is zero-padded or cropped to this duration

    def __init__(self, model_path: str = "speechbrain/spkrec-xvect-voxceleb"):
        """Load the pretrained encoder, on the GPU when one is available.

        Args:
            model_path: Model source passed to ``EncoderClassifier.from_hparams``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The device is selected through run_opts rather than by calling .to()
        # on the returned object, so that the SpeechBrain wrapper's own device
        # setting and the location of its weights agree.
        self.encoder = EncoderClassifier.from_hparams(
            source=model_path,
            savedir="pretrained_models/spkrec-xvect",
            run_opts={"device": str(self.device)},
        )

    @torch.no_grad()
    def encode(self, audio_path: str) -> np.ndarray:
        """Return the speaker embedding of an audio file as a flat array.

        Args:
            audio_path: Path of the audio file to load with ``torchaudio.load``.

        Returns:
            The encoder output for the preprocessed clip, flattened to 1-D.
        """
        waveform, sample_rate = torchaudio.load(audio_path)
        waveform = self._preprocess(waveform, sample_rate)

        embeddings = self.encoder.encode_batch(waveform)
        return embeddings.cpu().numpy().flatten()

    def _preprocess(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
        """Normalize a ``(channels, samples)`` waveform to a fixed-size mono clip.

        Args:
            waveform: Audio tensor indexed as ``(channels, samples)``.
            sr: Sample rate of ``waveform`` in Hz.

        Returns:
            A ``(1, SAMPLE_RATE * CLIP_SECONDS)`` tensor on ``self.device``.
        """
        if sr != self.SAMPLE_RATE:
            resample = torchaudio.transforms.Resample(sr, self.SAMPLE_RATE)
            waveform = resample(waveform)

        # Average the channels so multi-channel input becomes a single mono row.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        target_length = self.SAMPLE_RATE * self.CLIP_SECONDS
        num_samples = waveform.shape[1]
        if num_samples < target_length:
            # Right-pad short clips with zeros (silence).
            waveform = torch.nn.functional.pad(waveform, (0, target_length - num_samples))
        else:
            waveform = waveform[:, :target_length]

        return waveform.to(self.device)

XTTS Voice Cloning

Coqui's XTTS provides high-quality multilingual voice cloning from just 6–30 seconds of reference audio.

from TTS.api import TTS

class VoiceCloner:
    """Clone voices with Coqui XTTS and synthesize speech in them.

    ``clone_voice`` computes the XTTS conditioning for a reference recording
    once and caches it under a speaker name; ``speak`` reuses the cached
    conditioning for every utterance.
    """

    def __init__(self, model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2"):
        """Load the XTTS model, on the GPU when one is available.

        Args:
            model_name: Coqui model identifier of an XTTS model.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(model_name).to(device)
        # speaker name -> (gpt_cond_latent, speaker_embedding)
        self.speakers = {}

    def clone_voice(self, reference_audio: str, speaker_name: str, language: str = "en"):
        """Compute and cache the voice conditioning for a reference recording.

        Args:
            reference_audio: Path of the reference recording.
            speaker_name: Key under which the cloned voice is stored.
            language: Currently unused; kept for interface compatibility.

        Returns:
            ``speaker_name``, for convenience.

        Raises:
            FileNotFoundError: If ``reference_audio`` is not an existing file.
        """
        # Fail early with a clear message instead of deep inside the audio loader.
        if not Path(reference_audio).is_file():
            raise FileNotFoundError(f"Reference audio not found: {reference_audio}")

        # The high-level TTS API has no embedding accessor; the conditioning
        # latents come from the underlying XTTS model.
        xtts = self.tts.synthesizer.tts_model
        gpt_cond_latent, speaker_embedding = xtts.get_conditioning_latents(
            audio_path=[reference_audio]
        )
        self.speakers[speaker_name] = (gpt_cond_latent, speaker_embedding)
        return speaker_name

    def speak(self, text: str, speaker_name: str, language: str = "en") -> np.ndarray:
        """Synthesize ``text`` in a previously cloned voice.

        Args:
            text: Text to speak.
            speaker_name: Name given to ``clone_voice``.
            language: Language code passed to the model.

        Returns:
            The synthesized waveform as a NumPy array.

        Raises:
            ValueError: If ``speaker_name`` has not been cloned yet.
        """
        if speaker_name not in self.speakers:
            raise ValueError(f"Speaker {speaker_name} not found. Clone first.")

        gpt_cond_latent, speaker_embedding = self.speakers[speaker_name]
        result = self.tts.synthesizer.tts_model.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
        )
        return np.asarray(result["wav"])

Low-Resource Voice Cloning

For systems without a GPU, use voice conversion models, which transform source audio to match the target speaker's characteristics.

class RVCInference:
    """Thin wrapper that forwards audio to an ``RVCPipeline`` for voice conversion."""

    def __init__(self, model_path: str, index_path: str):
        """Build the underlying pipeline.

        Args:
            model_path: Forwarded to ``RVCPipeline`` as ``model_path``.
            index_path: Forwarded to ``RVCPipeline`` as ``index_path``.
        """
        # Imported here, so rvc_inference is only required once an instance is created.
        from rvc_inference import RVCPipeline
        self.pipeline = RVCPipeline(model_path=model_path, index_path=index_path)
    
    def convert(self, source_audio: np.ndarray, pitch_adjust: float = 0) -> np.ndarray:
        """Run ``source_audio`` through the pipeline and return its result.

        Args:
            source_audio: Audio samples handed to ``RVCPipeline.process``.
            pitch_adjust: Pitch adjustment forwarded to the pipeline; presumably
                in semitones — TODO confirm against the pipeline's documentation.
        """
        # NOTE(review): the keyword name below mixes ASCII and Chinese
        # ("f0调整" ~ "f0 adjustment"). Confirm that it matches the actual
        # parameter name of RVCPipeline.process; it may be a transcription error.
        return self.pipeline.process(source_audio, f0调整=pitch_adjust)

Voice Consistency Maintenance

class ConsistentVoiceManager:
    """Keep a registry of voices and always synthesize with the active one.

    Each registered voice stores its speaker embedding (``voice_embeddings``)
    and the path of its reference recording (``voice_references``).
    """

    def __init__(self, encoder_model: str, tts_model: str):
        """Load the speaker encoder and the TTS model.

        Args:
            encoder_model: Model source for ``SpeakerEncoder``.
            tts_model: Coqui model identifier of a voice-cloning TTS model.
        """
        self.encoder = SpeakerEncoder(encoder_model)
        # Run synthesis on the same device the encoder selected.
        self.tts = TTS(tts_model).to(self.encoder.device)
        self.active_voice = None
        self.voice_embeddings = {}
        self.voice_references = {}

    def register_voice(self, name: str, audio_path: str):
        """Register a voice from a reference recording.

        The first voice registered becomes the active voice.

        Args:
            name: Key under which the voice is stored.
            audio_path: Path of the reference recording.

        Raises:
            FileNotFoundError: If ``audio_path`` is not an existing file.
        """
        if not Path(audio_path).is_file():
            raise FileNotFoundError(f"Reference audio not found: {audio_path}")

        self.voice_embeddings[name] = self.encoder.encode(audio_path)
        self.voice_references[name] = audio_path
        if self.active_voice is None:
            self.active_voice = name

    def set_active_voice(self, name: str):
        """Select which registered voice ``speak`` uses.

        Raises:
            ValueError: If ``name`` has not been registered.
        """
        if name not in self.voice_embeddings:
            raise ValueError(f"Voice {name} not registered")
        self.active_voice = name

    def speak(self, text: str, language: str = "en") -> np.ndarray:
        """Synthesize ``text`` in the active voice.

        Args:
            text: Text to speak.
            language: Language code, passed on only to multilingual models.

        Returns:
            The synthesized waveform as a NumPy array.

        Raises:
            RuntimeError: If no voice has been registered or selected.
        """
        if self.active_voice is None:
            raise RuntimeError("No active voice set")

        # Synthesis is conditioned on the reference recording: the TTS API takes
        # reference audio (speaker_wav), not an embedding produced by a separate
        # speaker encoder. The stored embedding remains available to callers.
        reference = self.voice_references[self.active_voice]
        extra = {"language": language} if self.tts.is_multi_lingual else {}
        wav = self.tts.tts(text=text, speaker_wav=reference, **extra)
        return np.asarray(wav)

Voice cloning requires careful attention to consent and ethical usage. Implement usage logging and prevent cloning of voices without the speaker's consent.