Quantize and Compile Models
Furiosa Model Zoo provides pre-compiled binaries that can be used directly with the NPU. However, we also offer the original model files and related metadata so that different compiler options and calibration methods can be applied. In this document, we will explore how to use the following two fields of the Model object:
tensor_name_to_range
origin
To learn more about quantization and performance optimization, refer to the relevant SDK documentation pages.
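As a quick orientation, here is a minimal sketch of what these two fields contain (the loop bound of 3 is arbitrary and only meant to keep the output short): origin holds the original float32 ONNX model serialized as bytes, and tensor_name_to_range maps each tensor name to its calibrated (min, max) range.

from furiosa.models import vision

model = vision.ResNet50()

# `origin` is the original float32 ONNX model, serialized as bytes
print("Origin model size (bytes):", len(model.origin))

# `tensor_name_to_range` maps tensor names to calibrated (min, max) ranges
for name, (min_value, max_value) in list(model.tensor_name_to_range.items())[:3]:
    print(name, min_value, max_value)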
Now, we will run the ResNet50 model without any further optimizations.
In [1]:
from furiosa.models import vision
from furiosa.quantizer import quantize
from furiosa.runtime.sync import create_runner

import onnx
import numpy as np
from time import perf_counter

model = vision.ResNet50()

# `origin` is the original float32 ONNX model; quantize it with the bundled calibration ranges
f32_onnx_model = onnx.load_from_string(model.origin)
quantized_onnx = quantize(f32_onnx_model, model.tensor_name_to_range)

print("Example field of calibration ranges:", next(iter(model.tensor_name_to_range.items())))

with create_runner(quantized_onnx) as runner:
    runner.model.print_summary()
    input_tensor_desc = runner.model.inputs()
    fake_input = [
        np.asarray(np.random.randint(256, size=desc.shape), dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]
    starting_time = perf_counter()
    for _ in range(1000):
        runner.run(fake_input)
    # Total elapsed seconds over 1,000 runs equals the average latency in milliseconds
    print("Average inference time:", perf_counter() - starting_time, "ms")
libfuriosa_hal.so --- v0.11.0, built @ 43c901f
Example field of calibration ranges: ('input_tensor:0', (-123.5584560111165, 150.34208860248327))
Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=FLOAT32, format=NCHW, size=602112, len=150528)}
Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)}
Average inference time: 5.456097726011649 ms
According to the performance tuning guide, we can remove the input tensors' Quantize operator to optimize the model.
Please note that the input tensors' data type has changed from float32 to uint8; a short sketch of preparing such an input follows the results below.
In [2]:
from copy import deepcopy

from furiosa.quantizer import ModelEditor, get_pure_input_names, TensorType

# Convert the model's pure inputs to uint8 so the input Quantize operator is removed
model_wo_input_quantize = deepcopy(f32_onnx_model)
editor = ModelEditor(model_wo_input_quantize)
for input_name in get_pure_input_names(model_wo_input_quantize):
    editor.convert_input_type(input_name, TensorType.UINT8)
quantized_onnx_wo_input_quantize = quantize(model_wo_input_quantize, model.tensor_name_to_range)

with create_runner(quantized_onnx_wo_input_quantize) as runner:
    input_tensor_desc = runner.model.inputs()
    runner.model.print_summary()
    fake_input = [
        np.random.randint(256, size=desc.shape, dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]
    starting_time = perf_counter()
    for _ in range(1000):
        runner.run(fake_input)
    # Total elapsed seconds over 1,000 runs equals the average latency in milliseconds
    print("Average inference time:", perf_counter() - starting_time, "ms")
Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=UINT8, format=NCHW, size=150528, len=150528)}
Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)}
Average inference time: 2.715405730996281 ms
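As the summary shows, the optimized runner now expects a uint8 NCHW tensor of shape (1, 3, 224, 224), and the average latency dropped roughly in half. As a minimal sketch of feeding it real data (the array below is a random stand-in for a decoded 224x224 RGB image in HWC layout), only a layout change is needed to match the expected input shape and dtype:

import numpy as np

# Stand-in for a decoded 224x224 RGB image in HWC layout
hwc_image = np.random.randint(256, size=(224, 224, 3), dtype=np.uint8)

# Rearrange to CHW and add a batch dimension -> (1, 3, 224, 224), uint8
nchw_input = hwc_image.transpose(2, 0, 1)[np.newaxis, ...]
print(nchw_input.shape, nchw_input.dtype)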