17. Integration with Runtimes
Custom kernels integrate with inference runtimes through standardized interfaces, enabling deployment in production environments.
ONNX Runtime Integration
ONNX Runtime is extended through execution providers and custom operators; custom operators are loaded from a shared library that is registered on the session options:
import ctypes

import onnxruntime as ort
from onnxruntime.capi import _ld_preload  # Preload CUDA libraries
class QuantizedOp(OrtCustomOp):
    """Custom-op descriptor that creates kernels backed by a shared library.

    NOTE(review): ``OrtCustomOp`` is neither defined nor imported in the
    visible part of this file -- confirm where the base class comes from.
    """

    def __init__(self, kernel_library_path):
        """Store the path of the compiled kernel library.

        Args:
            kernel_library_path: Path handed to ``QuantizedKernel`` when a
                kernel is created.
        """
        self.kernel_lib = kernel_library_path

    def create_kernel(self, session_options, provider_options):
        """Return a new ``QuantizedKernel`` for the stored library path.

        ``session_options`` and ``provider_options`` are accepted but not
        used by this implementation.
        """
        return QuantizedKernel(self.kernel_lib)
class QuantizedKernel:
    """ctypes wrapper around the quantized GEMM launcher of a shared library."""

    def __init__(self, lib_path):
        """Load the shared library at ``lib_path`` with ``ctypes.CDLL``."""
        self.lib = ctypes.CDLL(lib_path)

    def compute(self, args, outputs):
        """Launch the quantized GEMM on the supplied buffers.

        Args:
            args: Sequence of NumPy arrays. The first five items are, in
                order: the int8 ``a`` operand, the int8 ``b`` operand, the
                float ``scale_a`` and ``scale_b`` arrays, and the output
                scale (a single value).
            outputs: Sequence of preallocated NumPy arrays; the first item
                is the int32 result buffer handed to the launcher.

        Raises:
            ValueError: If the output scale holds more than one element.
        """
        a_int8, b_int8, scale_a, scale_b, out_scale = args[:5]
        c_int32 = outputs[0]

        int8_ptr = ctypes.POINTER(ctypes.c_int8)
        int32_ptr = ctypes.POINTER(ctypes.c_int32)
        float_ptr = ctypes.POINTER(ctypes.c_float)

        # ctypes cannot marshal a NumPy object as a call argument, so the
        # scale is unwrapped to a Python number and passed by value.
        if hasattr(out_scale, "item"):
            out_scale = out_scale.item()

        # Resolve the launcher from the library opened in __init__ rather
        # than from an unrelated bare name.
        self.lib.launch_quantized_gemm(
            a_int8.ctypes.data_as(int8_ptr),
            b_int8.ctypes.data_as(int8_ptr),
            c_int32.ctypes.data_as(int32_ptr),
            scale_a.ctypes.data_as(float_ptr),
            scale_b.ctypes.data_as(float_ptr),
            ctypes.c_float(out_scale),
        )
# Attach the compiled custom-op library to a fresh set of session options.
ort_session_options = ort.SessionOptions()
ort_session_options.register_custom_ops_library("lib/quantized_ops.so")

# Create the inference session with those options on the CUDA provider.
session = ort.InferenceSession(
    "model.onnx",
    sess_options=ort_session_options,
    providers=["CUDAExecutionProvider"],
)
TorchScript Integration
import torch
from torch.utils.cpp_extension import load_inline
# C++ entry point for the extension: allocates the int32 result tensor and
# forwards the raw data pointers to the kernel launcher.
quantized_source = """
#include <torch/extension.h>
#include <cuda_runtime.h>

// NOTE(review): quantized_gemm_kernel must be declared before this point
// (it is not declared in this source) -- confirm against the CUDA source.
torch::Tensor quantized_gemm(
    torch::Tensor a, torch::Tensor b,
    torch::Tensor scale_a, torch::Tensor scale_b,
    float output_scale) {
  // torch::zeros takes dtype and device bundled in a TensorOptions object.
  auto c = torch::zeros(
      {a.size(0), b.size(1)},
      torch::TensorOptions().dtype(torch::kInt32).device(a.device()));
  quantized_gemm_kernel(
      a.data_ptr<int8_t>(),
      b.data_ptr<int8_t>(),
      c.data_ptr<int32_t>(),
      scale_a.data_ptr<float>(),
      scale_b.data_ptr<float>(),
      output_scale
  );
  return c;
}
"""
# Build the extension from the C++ and CUDA sources and expose
# ``quantized_gemm`` on the resulting module.
quantized_module = load_inline(
    name="quantized_ops",
    cpp_sources=quantized_source,
    cuda_sources=cuda_kernel_source,
    functions=["quantized_gemm"],
    verbose=True,
)
# Wrap in TorchScript-compatible class
class QuantizedGEMMWrapper(torch.nn.Module):
    """``nn.Module`` facade over the extension's ``quantized_gemm`` function."""

    def __init__(self):
        """Initialise the module and keep a handle to the compiled extension."""
        super().__init__()
        self.module = quantized_module

    def forward(self, a, b, scale_a, scale_b, output_scale):
        """Pass all arguments unchanged to ``quantized_gemm`` and return its result."""
        gemm = self.module.quantized_gemm
        return gemm(a, b, scale_a, scale_b, output_scale)
Local verification checkpoint
Run the smallest example from this chapter in a local workspace and record the package version, runtime, data path, and observed output. If the result depends on model size, vector count, CPU/GPU backend, or available memory, note that constraint beside the exercise so the lesson remains reproducible.
Integrate your custom quantized kernels into ONNX Runtime and benchmark against standard operators on a transformer model.