17. Model Quantization for Voice
Chapter 17 of 22 · 25 min
Quantization reduces a model's memory footprint, and often increases inference throughput, by representing weights in lower precision. For voice models, INT8 and INT4 quantization usually preserve most of the output quality while enabling deployment on consumer GPUs.
Post-Training Quantization (PTQ)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
class QuantizedVoiceLLM:
    """Load a causal language model with bitsandbytes weight quantization.

    Both loaders build a ``BitsAndBytesConfig`` and hand it to
    ``AutoModelForCausalLM.from_pretrained`` with ``device_map="auto"``.
    """

    def __init__(self, model_name: str):
        """Store the model identifier that is later passed to ``from_pretrained``."""
        self.model_name = model_name

    def _load(self, config: "BitsAndBytesConfig") -> torch.nn.Module:
        """Load ``self.model_name`` with the given quantization config."""
        return AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=config,
            device_map="auto",
        )

    def load_int8(self) -> torch.nn.Module:
        """Return the model loaded with 8-bit weights (``load_in_8bit=True``)."""
        int8_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )
        return self._load(int8_config)

    def load_int4(self, compute_dtype: torch.dtype = torch.float16) -> torch.nn.Module:
        """Return the model loaded with 4-bit NF4 weights and double quantization.

        Args:
            compute_dtype: Forwarded as ``bnb_4bit_compute_dtype``.
        """
        int4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        return self._load(int4_config)
GPTQ Quantization
GPTQ provides better quality than naive round-to-nearest INT4 quantization at similar compression ratios.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
class GPTQQuantizer:
    """Quantize a causal language model with GPTQ (AutoGPTQ) and reload it."""

    def __init__(self, model_name: str):
        """Store the model identifier used for both tokenizer and model loading."""
        self.model_name = model_name

    def quantize(self, calibration_data: list[str], output_path: str, bits: int = 4):
        """Quantize ``self.model_name`` and save the result to ``output_path``.

        Args:
            calibration_data: Text samples tokenized and used as calibration
                examples.
            output_path: Directory passed to ``save_quantized``.
            bits: Bit width forwarded to ``BaseQuantizeConfig``.

        Returns:
            The quantized AutoGPTQ model.

        Raises:
            ValueError: If ``calibration_data`` is empty.
        """
        # Fail before any expensive model loading happens.
        if not calibration_data:
            raise ValueError("calibration_data must contain at least one text sample")

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoGPTQForCausalLM.from_pretrained(
            self.model_name,
            BaseQuantizeConfig(
                bits=bits,
                group_size=128,
                desc_act=True,  # Activation order for better quality
            ),
        )

        # AutoGPTQ reads both "input_ids" and "attention_mask" from every
        # calibration example, so pass mappings rather than bare id tensors.
        calibration_examples = []
        for text in calibration_data:
            encoded = tokenizer(text, return_tensors="pt")
            calibration_examples.append(
                {
                    "input_ids": encoded["input_ids"],
                    "attention_mask": encoded["attention_mask"],
                }
            )

        model.quantize(calibration_examples)
        model.save_quantized(output_path)
        return model

    def load_quantized(self, quantized_path: str) -> torch.nn.Module:
        """Load a previously saved quantized model onto the ``"cuda"`` device.

        Args:
            quantized_path: Directory previously written by ``save_quantized``.
        """
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint to run on load; only use with checkpoints you trust.
        return AutoGPTQForCausalLM.from_quantized(
            quantized_path,
            device="cuda",
            trust_remote_code=True,
        )
AWQ Quantization
Activation-aware weight quantization (AWQ) often outperforms GPTQ on voice tasks.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
class AWQQuantizer:
    """Quantize a causal language model with AWQ (AutoAWQ) and reload it."""

    def __init__(self, model_name: str):
        """Store the model identifier used for both tokenizer and model loading."""
        self.model_name = model_name

    def quantize(self, calibration_data: list[str], output_path: str):
        """Quantize ``self.model_name`` to 4-bit AWQ and save it to ``output_path``.

        Args:
            calibration_data: Text samples forwarded as ``calib_data``.
            output_path: Directory passed to ``save_quantized``.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        awq_model = AutoAWQForCausalLM.from_pretrained(self.model_name)

        # 4-bit weights, groups of 128, zero-point enabled, GEMM version.
        awq_model.quantize(
            tokenizer=tokenizer,
            quant_config={
                "zero_point": True,
                "q_group_size": 128,
                "w_bit": 4,
                "version": "GEMM",
            },
            calib_data=calibration_data,
        )
        awq_model.save_quantized(output_path)

    def load(self, quantized_path: str):
        """Return the AWQ model stored at ``quantized_path``."""
        loaded = AutoAWQForCausalLM.from_quantized(quantized_path)
        return loaded
Batch Quantization for Production
import os
from concurrent.futures import ThreadPoolExecutor
def quantize_multiple_models(
    models: list[str],
    output_dir: str,
    bits: int = 4,
    *,
    max_workers: int = 2,
):
    """GPTQ-quantize several models concurrently and return their output paths.

    Each model is written to ``output_dir`` under its name with ``/`` replaced
    by ``_``. Calibration text is read from ``./data/<model>/calibration`` via
    ``load_audio_transcripts``.

    NOTE(review): ``load_audio_transcripts`` is not defined in this snippet;
    presumably it returns a list of transcript strings. Confirm.

    Args:
        models: Model identifiers to quantize.
        output_dir: Parent directory for the quantized models.
        bits: Bit width forwarded to ``GPTQQuantizer.quantize``.
        max_workers: Number of quantization jobs run at the same time. Use 1
            to run them one after another, e.g. when they share a single GPU.

    Returns:
        Output paths in the same order as ``models``.
    """

    def quantize_single(model_path: str, output_path: str) -> str:
        # Quantize one model and report where it was written.
        quantizer = GPTQQuantizer(model_path)
        calibration_data = load_audio_transcripts(f"./data/{model_path}/calibration")
        quantizer.quantize(calibration_data, output_path, bits=bits)
        return output_path

    output_paths = [
        os.path.join(output_dir, model.replace("/", "_"))
        for model in models
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() preserves input order and re-raises the first worker error.
        return list(executor.map(quantize_single, models, output_paths))
Benchmarking Quantized Models
import time
def benchmark_model(model: torch.nn.Module, input_ids: torch.Tensor, iterations: int = 100):
    """Time forward passes of ``model`` on ``input_ids`` and summarise latency.

    The model is switched to eval mode, run 10 times as warmup, then timed
    for ``iterations`` forward passes under ``torch.no_grad()``. When CUDA is
    available, each timed pass is followed by ``torch.cuda.synchronize()`` so
    queued GPU work is included in the measurement.

    Args:
        model: Module called as ``model(input_ids)``.
        input_ids: Input passed unchanged to every forward pass.
        iterations: Number of timed forward passes; must be at least 1.

    Returns:
        Dict with ``mean_ms``, ``p50_ms``, ``p95_ms`` and ``std_ms``, all in
        milliseconds.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    # Imported here because the statistics below need numpy and the
    # surrounding snippet does not import it.
    import numpy as np

    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    use_cuda = torch.cuda.is_available()
    model.eval()

    with torch.no_grad():
        # Warmup
        for _ in range(10):
            model(input_ids)
        if use_cuda:
            torch.cuda.synchronize()

        times = []
        for _ in range(iterations):
            start = time.perf_counter()
            model(input_ids)
            if use_cuda:
                # Wait for the GPU so the timing covers the full forward pass.
                torch.cuda.synchronize()
            times.append(time.perf_counter() - start)

    return {
        "mean_ms": np.mean(times) * 1000,
        "p50_ms": np.percentile(times, 50) * 1000,
        "p95_ms": np.percentile(times, 95) * 1000,
        "std_ms": np.std(times) * 1000,
    }
# Compare models
# NOTE(review): `load_model`, `quantizer`, `gptq_quantizer` and `tokenizer`
# are not defined in this snippet; presumably they are created beforehand.
# Confirm before running.
base_model = load_model("meta-llama/Llama-2-7b")
int8_model = quantizer.load_int8()
int4_model = gptq_quantizer.load_quantized("./quantized/llama-2-7b-4bit")

# The same prompt is used for every model so the timings are comparable.
encoded_prompt = tokenizer("Hello world", return_tensors="pt")
test_input = encoded_prompt["input_ids"].cuda()

for label, candidate in (
    ("Base model:", base_model),
    ("INT8 model:", int8_model),
    ("INT4 model:", int4_model),
):
    print(label, benchmark_model(candidate, test_input))
Memory Requirements by Precision (approximate, weights only)
| Model Size | FP16 Memory | INT8 Memory | INT4 Memory |
|---|---|---|---|
| 7B | 14GB | 7GB | 4GB |
| 13B | 26GB | 13GB | 7GB |
| 70B | 140GB | 70GB | 35GB |
EXERCISE
Quantize a Whisper model to INT8 using BitsAndBytes, then compare transcription quality and inference speed against the FP16 baseline on a 60-second audio sample. Time: 15 minutes.