Model Quantization for Voice — Voice AI with Local Models (Chapter 17)

Quantization reduces model memory, and often increases inference throughput, by representing weights in lower precision. For voice models, INT8 and INT4 quantization typically preserve most of the output quality while enabling deployment on consumer GPUs.

Post-Training Quantization (PTQ)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class QuantizedVoiceLLM:
    """Load a causal language model with a ``BitsAndBytesConfig`` applied.

    Both loaders build their own quantization config and then hand it to
    ``AutoModelForCausalLM.from_pretrained`` together with
    ``device_map="auto"``.
    """

    def __init__(self, model_name: str):
        # Forwarded unchanged to ``from_pretrained`` by every loader.
        self.model_name = model_name

    def _from_pretrained(self, bnb_config: "BitsAndBytesConfig") -> torch.nn.Module:
        """Load ``self.model_name`` with the given quantization config."""
        return AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            device_map="auto",
        )

    def load_int8(self) -> torch.nn.Module:
        """Return the model loaded with ``load_in_8bit=True``."""
        return self._from_pretrained(
            BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False,
            )
        )

    def load_int4(self, compute_dtype: torch.dtype = torch.float16) -> torch.nn.Module:
        """Return the model loaded with ``load_in_4bit=True``.

        Args:
            compute_dtype: Value passed as ``bnb_4bit_compute_dtype``.
        """
        return self._from_pretrained(
            BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
        )

GPTQ Quantization

GPTQ provides better quality than naive round-to-nearest INT4 quantization at similar compression ratios.

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

class GPTQQuantizer:
    """Quantize a causal language model with GPTQ through ``auto_gptq``."""

    def __init__(self, model_name: str):
        # Used for both the tokenizer and the model in ``quantize``.
        self.model_name = model_name

    def quantize(self, calibration_data: list[str], output_path: str, bits: int = 4):
        """Quantize ``self.model_name`` and save the result to ``output_path``.

        Args:
            calibration_data: Text samples tokenized and used for calibration.
            output_path: Destination passed to ``save_quantized``.
            bits: Bit width passed to ``BaseQuantizeConfig``.

        Returns:
            The quantized model object.

        Raises:
            ValueError: If ``calibration_data`` is empty.
        """
        # Fail before loading the model: there is nothing to calibrate on.
        if not calibration_data:
            raise ValueError("calibration_data must contain at least one text sample")

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoGPTQForCausalLM.from_pretrained(
            self.model_name,
            BaseQuantizeConfig(
                bits=bits,
                group_size=128,
                desc_act=True,  # Activation order for better quality
            ),
        )

        # auto_gptq looks up "input_ids" and "attention_mask" on every
        # calibration example, so pass the complete tokenizer output rather
        # than the bare input_ids tensor.
        calibration_examples = [
            tokenizer(text, return_tensors="pt") for text in calibration_data
        ]

        model.quantize(calibration_examples)
        model.save_quantized(output_path)

        return model

    def load_quantized(self, quantized_path: str) -> torch.nn.Module:
        """Load a previously saved GPTQ checkpoint onto the CUDA device.

        Args:
            quantized_path: Location of the checkpoint to load.
        """
        # NOTE(security): trust_remote_code=True lets Python code shipped with
        # the checkpoint run locally; only load checkpoints you trust.
        return AutoGPTQForCausalLM.from_quantized(
            quantized_path,
            device="cuda",
            trust_remote_code=True,
        )

AWQ Quantization

Activation-aware weight quantization (AWQ) often outperforms GPTQ on voice tasks.

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

class AWQQuantizer:
    """Quantize and reload causal language models through ``AutoAWQForCausalLM``."""

    def __init__(self, model_name: str):
        # Used for both the tokenizer and the model in ``quantize``.
        self.model_name = model_name

    def quantize(self, calibration_data: list[str], output_path: str):
        """Quantize ``self.model_name`` and save the result to ``output_path``.

        Args:
            calibration_data: Passed to the model as ``calib_data``.
            output_path: Destination passed to ``save_quantized``.
        """
        awq_settings = {
            "zero_point": True,
            "q_group_size": 128,
            "w_bit": 4,
            "version": "GEMM",
        }

        awq_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        awq_model = AutoAWQForCausalLM.from_pretrained(self.model_name)

        awq_model.quantize(
            tokenizer=awq_tokenizer,
            quant_config=awq_settings,
            calib_data=calibration_data,
        )
        awq_model.save_quantized(output_path)

    def load(self, quantized_path: str):
        """Return the quantized model stored at ``quantized_path``."""
        loaded = AutoAWQForCausalLM.from_quantized(quantized_path)
        return loaded

Batch Quantization for Production

import os
from concurrent.futures import ThreadPoolExecutor

def quantize_multiple_models(
    models: list[str],
    output_dir: str,
    bits: int = 4,
    max_workers: int = 2,
):
    """Quantize several models with ``GPTQQuantizer`` on a thread pool.

    Args:
        models: Model identifiers; each is passed to ``GPTQQuantizer``.
        output_dir: Directory under which each model's output path is built.
        bits: Bit width forwarded to ``GPTQQuantizer.quantize``.
        max_workers: Number of quantization jobs allowed to run at once.
            Every job runs a complete ``GPTQQuantizer.quantize`` call, so
            lower this if concurrent jobs exhaust memory.

    Returns:
        The output paths, in the same order as ``models``.
    """
    if not models:
        return []

    def quantize_single(model_path: str, output_path: str) -> str:
        quantizer = GPTQQuantizer(model_path)

        # NOTE(review): load_audio_transcripts is not defined in the visible
        # source; presumably it returns a list of transcript strings - confirm.
        calibration_data = load_audio_transcripts(f"./data/{model_path}/calibration")

        quantizer.quantize(calibration_data, output_path, bits=bits)
        return output_path

    # "/" in a model identifier would otherwise create nested directories.
    output_paths = [
        os.path.join(output_dir, model.replace("/", "_")) for model in models
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(quantize_single, models, output_paths))

Benchmarking Quantized Models

import time

def benchmark_model(
    model: torch.nn.Module,
    input_ids: torch.Tensor,
    iterations: int = 100,
    warmup: int = 10,
):
    """Time repeated forward passes of ``model`` on ``input_ids``.

    Args:
        model: Module called as ``model(input_ids)``; switched to eval mode.
        input_ids: Input passed unchanged to every forward pass.
        iterations: Number of timed forward passes.
        warmup: Number of untimed forward passes run first.

    Returns:
        A dict with ``mean_ms``, ``p50_ms``, ``p95_ms`` and ``std_ms``, all in
        milliseconds.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    # Local import: the statistics below need NumPy, which this file does not
    # import at module level.
    import numpy as np

    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    cuda_available = torch.cuda.is_available()
    model.eval()

    with torch.no_grad():
        # Warmup
        for _ in range(warmup):
            model(input_ids)

        # Drain pending GPU work so it is not charged to the first sample.
        if cuda_available:
            torch.cuda.synchronize()

        times = []
        for _ in range(iterations):
            start = time.perf_counter()
            model(input_ids)
            # Wait for the GPU to finish before stopping the clock.
            if cuda_available:
                torch.cuda.synchronize()
            times.append(time.perf_counter() - start)

    return {
        "mean_ms": np.mean(times) * 1000,
        "p50_ms": np.percentile(times, 50) * 1000,
        "p95_ms": np.percentile(times, 95) * 1000,
        "std_ms": np.std(times) * 1000,
    }

# Compare models
# NOTE(review): confirm this repository id hosts transformers-format weights;
# Llama 2 checkpoints for transformers are commonly published with an "-hf"
# suffix.
MODEL_NAME = "meta-llama/Llama-2-7b"

# Build the tokenizer and loader objects the comparison relies on.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
quantizer = QuantizedVoiceLLM(MODEL_NAME)
gptq_quantizer = GPTQQuantizer(MODEL_NAME)

# FP16 baseline to compare the quantized variants against.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
int8_model = quantizer.load_int8()
int4_model = gptq_quantizer.load_quantized("./quantized/llama-2-7b-4bit")

test_input = tokenizer("Hello world", return_tensors="pt")["input_ids"].cuda()

print("Base model:", benchmark_model(base_model, test_input))
print("INT8 model:", benchmark_model(int8_model, test_input))
print("INT4 model:", benchmark_model(int4_model, test_input))

Memory Requirements by Precision

Model Size	FP16 Memory	INT8 Memory	INT4 Memory
7B	14GB	7GB	4GB
13B	26GB	13GB	7GB
70B	140GB	70GB	35GB