Quantize and Compile Models
Furiosa Model Zoo provides pre-compiled binaries that can be used directly with the NPU. However, we also offer the original model files and related metadata so that different compiler options and calibration methods can be applied. In this document, we will explore how to use the following two fields of the Model object:
tensor_name_to_range
origin
To learn more about quantization and performance optimization, refer to the relevant SDK documentation pages.
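As a quick orientation, here is a minimal sketch of what these two fields contain (the loop bound of 3 is arbitrary and only meant to keep the output short): origin holds the original float32 ONNX model serialized as bytes, and tensor_name_to_range maps each tensor name to its calibrated (min, max) range.

from furiosa.models import vision

model = vision.ResNet50()

# `origin` is the original float32 ONNX model, serialized as bytes
print("Origin model size (bytes):", len(model.origin))

# `tensor_name_to_range` maps tensor names to calibrated (min, max) ranges
for name, (min_value, max_value) in list(model.tensor_name_to_range.items())[:3]:
    print(name, min_value, max_value)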
Now, we will run the ResNet50 model without any further optimizations.
In [1]:
from furiosa.models import vision
from furiosa.quantizer import quantize
from furiosa.runtime.sync import create_runner

import onnx
import numpy as np
from time import perf_counter

model = vision.ResNet50()

# `origin` is the original float32 ONNX model; quantize it with the bundled calibration ranges
f32_onnx_model = onnx.load_from_string(model.origin)
quantized_onnx = quantize(f32_onnx_model, model.tensor_name_to_range)

print("Example field of calibration ranges:", next(iter(model.tensor_name_to_range.items())))

with create_runner(quantized_onnx) as runner:
    runner.model.print_summary()
    input_tensor_desc = runner.model.inputs()
    fake_input = [
        np.asarray(np.random.randint(256, size=desc.shape), dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]
    starting_time = perf_counter()
    for _ in range(1000):
        runner.run(fake_input)
    # Total elapsed seconds over 1,000 runs equals the average latency in milliseconds
    print("Average inference time:", perf_counter() - starting_time, "ms")
libfuriosa_hal.so --- v0.11.0, built @ 43c901f
Example field of calibration ranges: ('input_tensor:0', (-123.5584560111165, 150.34208860248327))
Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=FLOAT32, format=NCHW, size=602112, len=150528)}
Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)}
Average inference time: 5.456097726011649 ms
According to the performance tuning guide, we can remove the input tensors' Quantize operator to optimize the model.
Please note that the input tensors' data type has changed from float32 to uint8; a short sketch of preparing such an input follows the results below.
In [2]:
from copy import deepcopy

from furiosa.quantizer import ModelEditor, get_pure_input_names, TensorType

# Convert the model's pure inputs to uint8 so the input Quantize operator is removed
model_wo_input_quantize = deepcopy(f32_onnx_model)
editor = ModelEditor(model_wo_input_quantize)
for input_name in get_pure_input_names(model_wo_input_quantize):
    editor.convert_input_type(input_name, TensorType.UINT8)
quantized_onnx_wo_input_quantize = quantize(model_wo_input_quantize, model.tensor_name_to_range)

with create_runner(quantized_onnx_wo_input_quantize) as runner:
    input_tensor_desc = runner.model.inputs()
    runner.model.print_summary()
    fake_input = [
        np.random.randint(256, size=desc.shape, dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]
    starting_time = perf_counter()
    for _ in range(1000):
        runner.run(fake_input)
    # Total elapsed seconds over 1,000 runs equals the average latency in milliseconds
    print("Average inference time:", perf_counter() - starting_time, "ms")
Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=UINT8, format=NCHW, size=150528, len=150528)}
Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)}
Average inference time: 2.715405730996281 ms
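As the summary shows, the optimized runner now expects a uint8 NCHW tensor of shape (1, 3, 224, 224), and the average latency dropped roughly in half. As a minimal sketch of feeding it real data (the array below is a random stand-in for a decoded 224x224 RGB image in HWC layout), only a layout change is needed to match the expected input shape and dtype:

import numpy as np

# Stand-in for a decoded 224x224 RGB image in HWC layout
hwc_image = np.random.randint(256, size=(224, 224, 3), dtype=np.uint8)

# Rearrange to CHW and add a batch dimension -> (1, 3, 224, 224), uint8
nchw_input = hwc_image.transpose(2, 0, 1)[np.newaxis, ...]
print(nchw_input.shape, nchw_input.dtype)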