15. Voice Cloning
Voice cloning generates synthetic speech matching a target speaker's characteristics from short audio samples. This enables personalized voice AI without recording dedicated voice talent.
Speaker Encoding
Speaker encoders extract embedding vectors representing vocal characteristics from audio.
import numpy as np
import torch
import torchaudio
from speechbrain.inference import EncoderClassifier
class SpeakerEncoder:
    """Extract speaker embedding vectors from audio files.

    Wraps a SpeechBrain ``EncoderClassifier``. Every clip is resampled to
    ``TARGET_SAMPLE_RATE``, down-mixed to mono and padded or truncated to
    ``CLIP_SECONDS`` before it is encoded.
    """

    TARGET_SAMPLE_RATE = 16000  # Hz; clips at any other rate are resampled
    CLIP_SECONDS = 3  # every clip is padded/truncated to this duration

    def __init__(self, model_path: str = "speechbrain/spkrec-xvect-voxceleb"):
        """Load the pretrained encoder on the GPU when one is available.

        Args:
            model_path: Model source handed to ``EncoderClassifier.from_hparams``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The device is handed over through ``run_opts`` rather than a
        # trailing ``.to(...)``: SpeechBrain keeps its own device setting and
        # uses it to place inputs inside ``encode_batch``, so moving only the
        # weights afterwards leaves inputs and parameters on different devices.
        self.encoder = EncoderClassifier.from_hparams(
            source=model_path,
            savedir="pretrained_models/spkrec-xvect",
            run_opts={"device": str(self.device)},
        )

    @torch.no_grad()
    def encode(self, audio_path: str) -> np.ndarray:
        """Return the speaker embedding of an audio file as a flat array.

        Args:
            audio_path: Path of the audio file, read with ``torchaudio.load``.

        Returns:
            The encoder output moved to the CPU and flattened to one dimension.
        """
        waveform, sample_rate = torchaudio.load(audio_path)
        waveform = self._preprocess(waveform, sample_rate)
        embeddings = self.encoder.encode_batch(waveform)
        return embeddings.cpu().numpy().flatten()

    def _preprocess(self, waveform: torch.Tensor, sr: int) -> torch.Tensor:
        """Normalise a ``(channels, samples)`` waveform for the encoder.

        Args:
            waveform: Audio tensor indexed as ``(channels, samples)``.
            sr: Sample rate of ``waveform`` in Hz.

        Returns:
            A ``(1, TARGET_SAMPLE_RATE * CLIP_SECONDS)`` tensor on ``self.device``.
        """
        if sr != self.TARGET_SAMPLE_RATE:
            resample = torchaudio.transforms.Resample(sr, self.TARGET_SAMPLE_RATE)
            waveform = resample(waveform)

        # Average the channels so multi-channel input becomes a single mono row.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        target_length = self.TARGET_SAMPLE_RATE * self.CLIP_SECONDS
        missing = target_length - waveform.shape[1]
        if missing > 0:
            # Short clips are right-padded with zeros up to the fixed length.
            waveform = torch.nn.functional.pad(waveform, (0, missing))
        else:
            waveform = waveform[:, :target_length]
        return waveform.to(self.device)
XTTS Voice Cloning
Coqui's XTTS provides high-quality multilingual voice cloning from just 6-30 seconds of reference audio.
from TTS.api import TTS
class VoiceCloner:
    """Clone voices with Coqui XTTS and synthesise speech with them.

    The conditioning data of each registered speaker is computed once in
    ``clone_voice`` and reused by every later ``speak`` call.
    """

    def __init__(self, model_name: str = "tts_models/multilingual/multi-dataset/xtts"):
        """Load the TTS model on the GPU when available, otherwise on the CPU.

        Args:
            model_name: Coqui model identifier handed to ``TTS``.
        """
        # NOTE(review): Coqui publishes XTTS checkpoints under versioned names
        # (e.g. ".../xtts_v2") — confirm that this default resolves.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(model_name).to(device)
        self.speakers = {}

    def clone_voice(self, reference_audio: str, speaker_name: str, language: str = "en"):
        """Compute and store the XTTS conditioning data for a reference clip.

        Args:
            reference_audio: Path of the reference recording.
            speaker_name: Key under which the voice is stored in ``self.speakers``.
            language: Kept for interface compatibility; not used here.

        Returns:
            ``speaker_name``, unchanged.
        """
        xtts = self.tts.synthesizer.tts_model
        gpt_cond_latent, speaker_embedding = xtts.get_conditioning_latents(
            audio_path=[reference_audio]
        )
        self.speakers[speaker_name] = (gpt_cond_latent, speaker_embedding)
        return speaker_name

    def speak(self, text: str, speaker_name: str, language: str = "en") -> np.ndarray:
        """Synthesise ``text`` with a previously cloned voice.

        Args:
            text: Text to speak.
            speaker_name: Name previously passed to ``clone_voice``.
            language: Language code forwarded to the model.

        Returns:
            The generated waveform as a numpy array.

        Raises:
            ValueError: If ``speaker_name`` has not been cloned yet.
        """
        if speaker_name not in self.speakers:
            raise ValueError(f"Speaker {speaker_name} not found. Clone first.")

        gpt_cond_latent, speaker_embedding = self.speakers[speaker_name]
        result = self.tts.synthesizer.tts_model.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
        )
        return np.asarray(result["wav"])
Low-Resource Voice Cloning
For systems without a GPU, use voice conversion models that transform source audio to match the target speaker's characteristics.
class RVCInference:
    """Run voice conversion through an ``RVCPipeline``."""

    def __init__(self, model_path: str, index_path: str):
        """Build the conversion pipeline.

        Args:
            model_path: Forwarded to ``RVCPipeline`` as ``model_path``.
            index_path: Forwarded to ``RVCPipeline`` as ``index_path``.
        """
        # The import runs at construction time, not at module import time.
        from rvc_inference import RVCPipeline

        pipeline = RVCPipeline(model_path=model_path, index_path=index_path)
        self.pipeline = pipeline

    def convert(self, source_audio: np.ndarray, pitch_adjust: float = 0) -> np.ndarray:
        """Convert ``source_audio`` and return what the pipeline produces.

        Args:
            source_audio: Audio handed to the pipeline unchanged.
            pitch_adjust: Value forwarded as the pipeline's pitch keyword.
        """
        # NOTE(review): the keyword name below is non-ASCII — confirm it
        # matches the actual signature of ``RVCPipeline.process``.
        converted = self.pipeline.process(source_audio, f0调整=pitch_adjust)
        return converted
Voice Consistency Maintenance
class ConsistentVoiceManager:
    """Keep a registry of named voices and speak with the active one.

    Each registered voice stores the embedding produced by ``SpeakerEncoder``
    (in ``voice_embeddings``) and the path of its reference clip (in
    ``voice_references``). Synthesis conditions the TTS model on the
    reference clip.
    """

    def __init__(self, encoder_model: str, tts_model: str):
        """Create the encoder and the TTS model.

        Args:
            encoder_model: Model source handed to ``SpeakerEncoder``.
            tts_model: Model identifier handed to ``TTS``.
        """
        self.encoder = SpeakerEncoder(encoder_model)
        self.tts = TTS(tts_model)
        self.active_voice = None
        self.voice_embeddings = {}
        self.voice_references = {}

    def register_voice(self, name: str, audio_path: str):
        """Register a voice; the first one registered becomes active.

        Args:
            name: Key under which the voice is stored.
            audio_path: Reference recording of the voice.
        """
        self.voice_embeddings[name] = self.encoder.encode(audio_path)
        self.voice_references[name] = audio_path
        if self.active_voice is None:
            self.active_voice = name

    def set_active_voice(self, name: str):
        """Select which registered voice ``speak`` uses.

        Raises:
            ValueError: If ``name`` was never registered.
        """
        if name not in self.voice_embeddings:
            raise ValueError(f"Voice {name} not registered")
        self.active_voice = name

    def speak(self, text: str, language: "str | None" = None) -> np.ndarray:
        """Synthesise ``text`` with the active voice.

        Args:
            text: Text to speak.
            language: Optional language code forwarded to ``TTS.tts``; leave
                as ``None`` for single-language models.

        Returns:
            The generated waveform as a numpy array.

        Raises:
            RuntimeError: If no voice has been registered or selected.
        """
        if self.active_voice is None:
            raise RuntimeError("No active voice set")

        # ``TTS`` exposes no ``tts_with_speaker`` method; the documented way
        # to condition on a speaker is the ``speaker_wav`` argument of ``tts``.
        reference = self.voice_references[self.active_voice]
        wav = self.tts.tts(text=text, speaker_wav=reference, language=language)
        return np.asarray(wav)
Voice cloning requires careful attention to consent and ethical usage. Implement usage logging and prevent cloning of voices without the speaker's consent.
Clone a voice using XTTS with a 15-second audio sample, then generate three different sentences using the cloned voice. Measure average generation time on GPU. Time: 15 minutes.