RUNLOCALAIv38
->Will it run?Best GPUCompareTroubleshootStartLearnPulseModelsHardwareToolsBench
Run check
RUNLOCALAI

Independently operated catalog for local-AI hardware and software. Hand-written verdicts. Source-cited claims. Reproducible commands when we have them.

OP·Fredoline Eruo
DIR
  • Models
  • Hardware
  • Tools
  • Benchmarks
TOOLS
  • Will it run?
  • Compare hardware
  • Cost vs cloud
  • Choose my GPU
  • Prompting kits
  • Quick answers
REF
  • All buyer guides
  • Learn local AI
  • Methodology
  • Glossary
  • Errors KB
  • Trust
EDITOR
  • About
  • Author
  • How we make money
  • Editorial policy
  • Contact
LEGAL
  • Privacy
  • Terms
  • Sitemap
MAIL · MONTHLY DIGEST
Get monthly local AI changes
Monthly recap. No spam.
DISCLOSURE

Some links on this site are affiliate links (Amazon Associates and other first-class retailers). When you buy through them, we earn a small commission at no extra cost to you. Affiliate links do not influence our verdicts — there are cards we rate highly that we don't have affiliate relationships with, and cards that sell well that we refuse to recommend. Read more →

© 2026 runlocalai.coIndependently operated
RUNLOCALAI · v38
  1. >
  2. Home
  3. /Learn
  4. /Courses
  5. /Voice AI with Local Models
  6. /Ch. 17
Voice AI with Local Models

17. Model Quantization for Voice

Chapter 17 of 22 · 25 min
KEY INSIGHT

Quantization reduces voice model memory footprint by 2-4x with INT8/INT4 precision, enabling deployment of larger models on consumer hardware with acceptable quality trade-offs.

Quantization reduces model memory and increases inference throughput by representing weights in lower precision. For voice models, INT8 and INT4 quantization maintain quality while enabling deployment on consumer GPUs.

Post-Training Quantization (PTQ)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class QuantizedVoiceLLM:
    """Load a causal language model with bitsandbytes weight quantization.

    Both loaders hand ``device_map="auto"`` to ``from_pretrained`` and differ
    only in the ``BitsAndBytesConfig`` they build.
    """

    def __init__(self, model_name: str):
        # Identifier later passed to ``AutoModelForCausalLM.from_pretrained``.
        self.model_name = model_name

    def _from_pretrained(self, bnb_config) -> torch.nn.Module:
        """Load ``self.model_name`` with the supplied ``BitsAndBytesConfig``."""
        return AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            device_map="auto",
        )

    def load_int8(self) -> torch.nn.Module:
        """Return the model loaded with 8-bit quantization enabled."""
        int8_options = {
            "load_in_8bit": True,
            "llm_int8_threshold": 6.0,
            "llm_int8_has_fp16_weight": False,
        }
        return self._from_pretrained(BitsAndBytesConfig(**int8_options))

    def load_int4(self, compute_dtype: torch.dtype = torch.float16) -> torch.nn.Module:
        """Return the model loaded with 4-bit NF4 quantization enabled.

        Args:
            compute_dtype: Forwarded as ``bnb_4bit_compute_dtype``.
        """
        int4_options = {
            "load_in_4bit": True,
            "bnb_4bit_compute_dtype": compute_dtype,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
        }
        return self._from_pretrained(BitsAndBytesConfig(**int4_options))

GPTQ Quantization

GPTQ provides better quality than naive round-to-nearest INT4 quantization at similar compression ratios.

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

class GPTQQuantizer:
    """Quantize a causal language model with AutoGPTQ and reload the result."""

    def __init__(self, model_name: str):
        # Identifier used for both the tokenizer and the model to quantize.
        self.model_name = model_name

    def quantize(self, calibration_data: list[str], output_path: str, bits: int = 4):
        """Quantize ``self.model_name`` with GPTQ and save it to ``output_path``.

        Args:
            calibration_data: Text samples tokenized into calibration examples.
            output_path: Directory passed to ``save_quantized``.
            bits: Bit width forwarded to ``BaseQuantizeConfig``.

        Returns:
            The quantized AutoGPTQ model object.

        Raises:
            ValueError: If ``calibration_data`` is empty.
        """
        # Fail before any model or tokenizer is loaded: quantization without
        # calibration examples cannot produce a meaningful result.
        if not calibration_data:
            raise ValueError("calibration_data must contain at least one text sample")

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoGPTQForCausalLM.from_pretrained(
            self.model_name,
            BaseQuantizeConfig(
                bits=bits,
                group_size=128,
                desc_act=True,  # Activation order for better quality
            ),
        )

        # AutoGPTQ reads each calibration example by key, so every example
        # must be a mapping with both "input_ids" and "attention_mask";
        # a bare input_ids tensor is not accepted.
        examples = []
        for text in calibration_data:
            encoded = tokenizer(text, return_tensors="pt")
            examples.append(
                {
                    "input_ids": encoded["input_ids"],
                    "attention_mask": encoded["attention_mask"],
                }
            )

        model.quantize(examples)
        model.save_quantized(output_path)

        return model

    def load_quantized(self, quantized_path: str) -> torch.nn.Module:
        """Load a previously saved GPTQ model from ``quantized_path`` onto CUDA."""
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint to run; only point this at checkpoints you trust.
        return AutoGPTQForCausalLM.from_quantized(
            quantized_path,
            device="cuda",
            trust_remote_code=True,
        )

AWQ Quantization

Activation-aware weight quantization (AWQ) often outperforms GPTQ on voice tasks.

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

class AWQQuantizer:
    """Quantize a causal language model with AutoAWQ and reload the result."""

    def __init__(self, model_name: str):
        # Identifier used for both the tokenizer and the model to quantize.
        self.model_name = model_name

    def quantize(self, calibration_data: list[str], output_path: str):
        """Quantize ``self.model_name`` with 4-bit AWQ and save to ``output_path``.

        Args:
            calibration_data: Forwarded to ``model.quantize`` as ``calib_data``.
            output_path: Destination passed to ``save_quantized``.
        """
        source = self.model_name
        awq_tokenizer = AutoTokenizer.from_pretrained(source)
        awq_model = AutoAWQForCausalLM.from_pretrained(source)

        awq_settings = dict(
            zero_point=True,
            q_group_size=128,
            w_bit=4,
            version="GEMM",
        )
        awq_model.quantize(
            tokenizer=awq_tokenizer,
            quant_config=awq_settings,
            calib_data=calibration_data,
        )
        awq_model.save_quantized(output_path)

    def load(self, quantized_path: str):
        """Return the AWQ model stored at ``quantized_path``."""
        restored = AutoAWQForCausalLM.from_quantized(quantized_path)
        return restored

Batch Quantization for Production

import os
from concurrent.futures import ThreadPoolExecutor

def quantize_multiple_models(models: list[str], output_dir: str, bits: int = 4):
    """GPTQ-quantize several models using a pool of two worker threads.

    Each model is written to ``output_dir`` under its own name with ``/``
    replaced by ``_``. Calibration text for a model is read from
    ``./data/<model>/calibration`` via ``load_audio_transcripts``.

    Args:
        models: Model identifiers to quantize.
        output_dir: Parent directory for the quantized outputs.
        bits: Bit width forwarded to ``GPTQQuantizer.quantize``.

    Returns:
        The output paths, in the same order as ``models``.
    """

    def _run_job(job):
        source, destination = job
        gptq = GPTQQuantizer(source)

        # Calibration transcripts live next to the model's data directory.
        transcripts = load_audio_transcripts(f"./data/{source}/calibration")

        gptq.quantize(transcripts, destination, bits=bits)
        return destination

    jobs = []
    for name in models:
        flattened = name.replace("/", "_")
        jobs.append((name, os.path.join(output_dir, flattened)))

    with ThreadPoolExecutor(max_workers=2) as pool:
        finished = list(pool.map(_run_job, jobs))

    return finished

Benchmarking Quantized Models

import time

def benchmark_model(
    model: torch.nn.Module,
    input_ids: torch.Tensor,
    iterations: int = 100,
    warmup: int = 10,
):
    """Measure forward-pass latency of ``model`` on ``input_ids``.

    The model is switched to eval mode and called under ``torch.no_grad()``.
    When CUDA is available, the device is synchronized after warmup and after
    every timed call so that each sample covers the completed forward pass.

    Args:
        model: Module to benchmark; called as ``model(input_ids)``.
        input_ids: Input passed unchanged to every call.
        iterations: Number of timed forward passes (must be >= 1).
        warmup: Number of untimed forward passes run first (must be >= 0).

    Returns:
        Dict with ``mean_ms``, ``p50_ms``, ``p95_ms`` and ``std_ms`` as floats,
        all in milliseconds.

    Raises:
        ValueError: If ``iterations`` < 1 or ``warmup`` < 0.
    """
    # Imported here because numpy is not among the imports visible at file level.
    import numpy as np

    if iterations < 1:
        raise ValueError("iterations must be at least 1")
    if warmup < 0:
        raise ValueError("warmup must be non-negative")

    # Availability does not change during the run, so query it once.
    use_cuda = torch.cuda.is_available()

    model.eval()

    with torch.no_grad():
        # Warmup
        for _ in range(warmup):
            model(input_ids)

        if use_cuda:
            torch.cuda.synchronize()

        samples = []
        for _ in range(iterations):
            start = time.perf_counter()
            model(input_ids)
            if use_cuda:
                # Wait for queued GPU work so the sample is not just launch time.
                torch.cuda.synchronize()
            samples.append(time.perf_counter() - start)

    latencies_ms = np.asarray(samples, dtype=float) * 1000.0

    # Plain floats keep the printed/serialized result free of numpy scalar types.
    return {
        "mean_ms": float(np.mean(latencies_ms)),
        "p50_ms": float(np.percentile(latencies_ms, 50)),
        "p95_ms": float(np.percentile(latencies_ms, 95)),
        "std_ms": float(np.std(latencies_ms)),
    }

# Compare models
# Every name used below is defined here so the snippet runs on its own,
# using the classes and imports from the earlier sections of this file.
# NOTE(review): Transformers-format Llama 2 weights are usually published
# under a "-hf" suffixed repository — confirm this identifier loads.
MODEL_NAME = "meta-llama/Llama-2-7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
quantizer = QuantizedVoiceLLM(MODEL_NAME)
gptq_quantizer = GPTQQuantizer(MODEL_NAME)

# FP16 baseline, loaded the same way as the quantized variants.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
int8_model = quantizer.load_int8()
# Expects a GPTQ checkpoint already written to this path by GPTQQuantizer.quantize.
int4_model = gptq_quantizer.load_quantized("./quantized/llama-2-7b-4bit")

test_input = tokenizer("Hello world", return_tensors="pt")["input_ids"].cuda()

print("Base model:", benchmark_model(base_model, test_input))
print("INT8 model:", benchmark_model(int8_model, test_input))
print("INT4 model:", benchmark_model(int4_model, test_input))

Memory Requirements by Precision

| Model Size | FP16 Memory | INT8 Memory | INT4 Memory |
|------------|-------------|-------------|-------------|
| 7B         | 14GB        | 7GB         | 4GB         |
| 13B        | 26GB        | 13GB        | 7GB         |
| 70B        | 140GB       | 70GB        | 35GB        |
EXERCISE

Quantize a Whisper model to INT8 using BitsAndBytes, then compare transcription quality and inference speed against the FP16 baseline on a 60-second audio sample. Time: 15 minutes.

← Chapter 16
Low-Latency Optimization
Chapter 18 →
Error Handling