Integration with Runtimes — Custom Quantization and Kernels (Chapter 17)

Custom kernels integrate with inference runtimes through standardized interfaces, enabling deployment in production environments.

ONNX Runtime Integration

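ONNX Runtime is extended through execution providers, which select the hardware backend, and custom operators, which add kernels that the built-in operator set lacks. Custom operators are compiled into a shared library that is registered on the session options before the session is created: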

import ctypes

import onnxruntime as ort
from onnxruntime.capi import _ld_preload  # Preload CUDA libraries

class QuantizedOp:
    """Custom-operator descriptor that hands out quantized GEMM kernels.

    NOTE(review): the original header read ``class QuantizedOp OrtCustomOp:``,
    which is a syntax error. ``OrtCustomOp`` is not imported anywhere in this
    snippet (it appears to refer to the struct of that name in the ONNX
    Runtime C API), so the class is declared without a base here. Confirm
    whether a Python base class was intended.
    """

    def __init__(self, kernel_library_path):
        """Remember where the compiled kernel library lives.

        Args:
            kernel_library_path: Path that is handed to ``QuantizedKernel``
                whenever a kernel is created.
        """
        self.kernel_lib = kernel_library_path

    def create_kernel(self, session_options, provider_options):
        """Return a new ``QuantizedKernel`` bound to the stored library path.

        ``session_options`` and ``provider_options`` are accepted for
        interface compatibility but are not used.
        """
        return QuantizedKernel(self.kernel_lib)

class QuantizedKernel:
    """Thin ctypes wrapper around a shared library's quantized GEMM launcher."""

    def __init__(self, lib_path):
        """Load the shared library at ``lib_path`` with ``ctypes.CDLL``."""
        self.lib = ctypes.CDLL(lib_path)

    def compute(self, args, outputs):
        """Launch the quantized GEMM on the supplied buffers.

        Args:
            args: Sequence whose first five items are, in order, the int8
                ``a`` operand, the int8 ``b`` operand, ``scale_a``,
                ``scale_b`` (numpy arrays) and the output scale (a Python
                float, numpy scalar, or single-element array).
            outputs: Sequence of preallocated numpy arrays; ``outputs[0]``
                is passed to the kernel as the int32 result buffer.

        Raises:
            ValueError: If ``args`` holds fewer than five items.
        """
        a_int8, b_int8, scale_a, scale_b, out_scale = args[:5]
        c_int32 = outputs[0]

        # The last kernel argument is passed by value, so it has to be a C
        # float rather than a numpy object. ``item()`` unwraps numpy scalars
        # and single-element arrays; plain Python numbers pass through.
        scale_value = out_scale.item() if hasattr(out_scale, "item") else out_scale

        # Resolve the launcher from the library loaded in __init__; the bare
        # name used previously was never defined or imported in this snippet.
        # NOTE(review): assumes the library exports ``launch_quantized_gemm``
        # with this argument order - confirm against the kernel source.
        self.lib.launch_quantized_gemm(
            a_int8.ctypes.data_as(ctypes.POINTER(ctypes.c_int8)),
            b_int8.ctypes.data_as(ctypes.POINTER(ctypes.c_int8)),
            c_int32.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            scale_a.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            scale_b.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.c_float(scale_value),
        )

# Register custom op
# Build the session options first so the compiled custom-op library is
# registered before the session is created.
ort_session_options = ort.SessionOptions()
ort_session_options.register_custom_ops_library("lib/quantized_ops.so")

# Usage: create the inference session for the model, requesting the CUDA
# execution provider and passing the options configured above.
session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider"],
    sess_options=ort_session_options,
)

TorchScript Integration

import torch
from torch.utils.cpp_extension import load_inline

# C++ binding source handed to load_inline as cpp_sources. It allocates the
# int32 result tensor and forwards the raw buffers to quantized_gemm_kernel.
quantized_source = """
#include <torch/extension.h>
#include <cuda_runtime.h>

// NOTE(review): quantized_gemm_kernel is not declared in this source. It
// must be declared before use (for example via a forward declaration that
// matches its definition in the CUDA source) -- confirm.

torch::Tensor quantized_gemm(
    torch::Tensor a, torch::Tensor b,
    torch::Tensor scale_a, torch::Tensor scale_b,
    float output_scale) {

    // The output shape below is {a.size(0), b.size(1)}, so a must be (M, K)
    // and b must be (K, N).
    TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && a.size(1) == b.size(0),
                "quantized_gemm expects a of shape (M, K) and b of shape (K, N)");

    // torch::zeros takes dtype and device through TensorOptions, not as
    // separate positional arguments.
    auto c = torch::zeros({a.size(0), b.size(1)},
                          torch::TensorOptions()
                              .dtype(torch::kInt32)
                              .device(a.device()));

    quantized_gemm_kernel(
        a.data_ptr<int8_t>(),
        b.data_ptr<int8_t>(),
        c.data_ptr<int32_t>(),
        scale_a.data_ptr<float>(),
        scale_b.data_ptr<float>(),
        output_scale
    );

    return c;
}
"""

# Compile the C++ binding together with the CUDA source and expose
# quantized_gemm as a function of the resulting extension module.
# verbose=True prints the build log while the extension compiles.
quantized_module = load_inline(
    name="quantized_ops",
    functions=["quantized_gemm"],
    cpp_sources=quantized_source,
    cuda_sources=cuda_kernel_source,
    verbose=True,
)

# Wrap in TorchScript-compatible class
class QuantizedGEMMWrapper(torch.nn.Module):
    """``nn.Module`` facade over the compiled ``quantized_gemm`` function.

    NOTE(review): the surrounding text calls this TorchScript-compatible.
    Confirm that ``torch.jit.script`` accepts a call into an extension
    module loaded this way.
    """

    def __init__(self):
        """Initialise the module and keep a handle to ``quantized_module``."""
        super().__init__()
        self.module = quantized_module

    def forward(self, a, b, scale_a, scale_b, output_scale):
        """Forward all five arguments unchanged to ``quantized_gemm``.

        Returns whatever the extension function returns.
        """
        gemm = self.module.quantized_gemm
        result = gemm(a, b, scale_a, scale_b, output_scale)
        return result

Local verification checkpoint

Run the smallest example from this chapter in a local workspace and record the package version, runtime, data path, and observed output. If the result depends on model size, vector count, CPU/GPU backend, or available memory, note that constraint beside the exercise so the lesson remains reproducible.