Quantizing and Compiling Models¶
Related fields:
- tensor_name_to_range
- origin
See the Model's input/output summary
In [1]:
Copied!
from furiosa.models import vision
from furiosa.quantizer import quantize
from furiosa.runtime.sync import create_runner
import onnx
# Load the packaged ResNet50 model; it exposes the serialized ONNX model
# (`origin`) and the calibration ranges (`tensor_name_to_range`).
model = vision.ResNet50()
print("First field of calibration ranges:", next(iter(model.tensor_name_to_range.items())))

# Deserialize the float32 ONNX model and quantize it with the bundled ranges.
f32_onnx_model = onnx.load_from_string(model.origin)
quantized_onnx = quantize(f32_onnx_model, model.tensor_name_to_range)

# The runner is a context manager, so it is released when the block exits.
with create_runner(quantized_onnx) as runner:
    # Print the model's input/output tensor descriptions.
    runner.model.print_summary()
from furiosa.models import vision
from furiosa.quantizer import quantize
from furiosa.runtime.sync import create_runner
import onnx
# Load the packaged ResNet50 model; it exposes the serialized ONNX model
# (`origin`) and the calibration ranges (`tensor_name_to_range`).
model = vision.ResNet50()
print("First field of calibration ranges:", next(iter(model.tensor_name_to_range.items())))

# Deserialize the float32 ONNX model and quantize it with the bundled ranges.
f32_onnx_model = onnx.load_from_string(model.origin)
quantized_onnx = quantize(f32_onnx_model, model.tensor_name_to_range)

# The runner is a context manager, so it is released when the block exits.
with create_runner(quantized_onnx) as runner:
    # Print the model's input/output tensor descriptions.
    runner.model.print_summary()
libfuriosa_hal.so --- v0.11.0, built @ 43c901f libfuriosa_hal.so --- v0.11.0, built @ 43c901f
First field of calibration ranges: ('input_tensor:0', (-123.5584560111165, 150.34208860248327)) 2023-08-28T01:28:05.078547Z INFO furiosa_rt_core::driver::event_driven::coord: FuriosaRT (v0.10.0, rev: ac1a04a8d, built at: 2023-08-08T12:15:46Z) bootstrapping ... 2023-08-28T01:28:05.084541Z INFO furiosa_rt_core::driver::event_driven::coord: Found furiosa-compiler (v0.10.0, rev: f8f05c8, built at: 2023-08-08T11:58:09Z) 2023-08-28T01:28:05.084554Z INFO furiosa_rt_core::driver::event_driven::coord: Found libhal (type: warboy, v0.11.0, rev: 43c901f built at: 2023-08-08T12:07:35Z) 2023-08-28T01:28:05.084557Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-0] detected 1 NPU device(s): 2023-08-28T01:28:05.094238Z INFO furiosa_rt_core::driver::event_driven::coord: - [0] npu:2:0-1 (warboy-b0-2pe, 128dpes, firmware: 1.7.0, f7b0f28) 2023-08-28T01:28:05.094462Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-0] started 2023-08-28T01:28:09.761370Z INFO furiosa::runtime: Saving the compilation log into /root/.local/state/furiosa/logs/compiler-20230828102809-v88fdx.log 2023-08-28T01:28:09.761655Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-0] created Sess-08a8e56d using npu:2:0-1 2023-08-28T01:28:09.773352Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-08a8e56d] compiling the model (target: warboy-b0-2pe, 128dpes, size: 102.2 MB) 2023-08-28T01:28:12.780705Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-08a8e56d] the model compile is successful (took 3 secs) 2023-08-28T01:28:13.227285Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-0] created 1 NPU threads on npu:2:0-1 (DRAM: 180.0 kiB/16.0 GiB, SRAM: 31.4 MiB/128.0 MiB) Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=FLOAT32, format=NCHW, size=602112, len=150528)} Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)} 2023-08-28T01:28:13.413781Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-08a8e56d] terminated 
2023-08-28T01:28:13.417749Z INFO furiosa_rt_core::npu::raw: NPU (npu:2:0-1) has been closed 2023-08-28T01:28:13.419580Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-0] stopped
Run inferences with scaling
In [2]:
Copied!
import numpy as np
from time import perf_counter
# Quantize the float32 model with the calibration ranges shipped with the model.
quantized_onnx = quantize(f32_onnx_model, model.tensor_name_to_range)

with create_runner(quantized_onnx) as runner:
    input_tensor_desc = runner.model.inputs()
    runner.model.print_summary()

    # Random pixel-like values in [0, 256). randint only yields integer
    # dtypes, so the array is cast afterwards to the dtype the model reports.
    fake_input = [
        np.asarray(np.random.randint(256, size=desc.shape), dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]

    num_runs = 1000
    starting_time = perf_counter()
    for _ in range(num_runs):
        runner.run(fake_input)
    elapsed_seconds = perf_counter() - starting_time

    # perf_counter() measures seconds: convert the total to a per-run average
    # in milliseconds explicitly, so the label stays correct if num_runs changes.
    print("Average inference time:", elapsed_seconds / num_runs * 1000, "ms")
import numpy as np
from time import perf_counter
# Quantize the float32 model with the calibration ranges shipped with the model.
quantized_onnx = quantize(f32_onnx_model, model.tensor_name_to_range)

with create_runner(quantized_onnx) as runner:
    input_tensor_desc = runner.model.inputs()
    runner.model.print_summary()

    # Random pixel-like values in [0, 256). randint only yields integer
    # dtypes, so the array is cast afterwards to the dtype the model reports.
    fake_input = [
        np.asarray(np.random.randint(256, size=desc.shape), dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]

    num_runs = 1000
    starting_time = perf_counter()
    for _ in range(num_runs):
        runner.run(fake_input)
    elapsed_seconds = perf_counter() - starting_time

    # perf_counter() measures seconds: convert the total to a per-run average
    # in milliseconds explicitly, so the label stays correct if num_runs changes.
    print("Average inference time:", elapsed_seconds / num_runs * 1000, "ms")
2023-08-28T01:28:13.806497Z INFO furiosa_rt_core::driver::event_driven::coord: FuriosaRT (v0.10.0, rev: ac1a04a8d, built at: 2023-08-08T12:15:46Z) bootstrapping ... 2023-08-28T01:28:13.811804Z INFO furiosa_rt_core::driver::event_driven::coord: Found furiosa-compiler (v0.10.0, rev: f8f05c8, built at: 2023-08-08T11:58:09Z) 2023-08-28T01:28:13.811809Z INFO furiosa_rt_core::driver::event_driven::coord: Found libhal (type: warboy, v0.11.0, rev: 43c901f built at: 2023-08-08T12:07:35Z) 2023-08-28T01:28:13.811811Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-1] detected 1 NPU device(s): 2023-08-28T01:28:13.823402Z INFO furiosa_rt_core::driver::event_driven::coord: - [0] npu:2:0-1 (warboy-b0-2pe, 128dpes, firmware: 1.7.0, f7b0f28) 2023-08-28T01:28:13.823534Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-1] started 2023-08-28T01:28:17.728906Z INFO furiosa::runtime: Saving the compilation log into /root/.local/state/furiosa/logs/compiler-20230828102817-okpycl.log 2023-08-28T01:28:17.729397Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-1] created Sess-541713a4 using npu:2:0-1 2023-08-28T01:28:17.743437Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-541713a4] compiling the model (target: warboy-b0-2pe, 128dpes, size: 102.2 MB) 2023-08-28T01:28:20.754124Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-541713a4] the model compile is successful (took 3 secs) 2023-08-28T01:28:21.203223Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-1] created 1 NPU threads on npu:2:0-1 (DRAM: 180.0 kiB/16.0 GiB, SRAM: 31.4 MiB/128.0 MiB) Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=FLOAT32, format=NCHW, size=602112, len=150528)} Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)} Average inference time: 5.614574640989304 ms 2023-08-28T01:28:27.071158Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-541713a4] terminated 2023-08-28T01:28:27.080271Z INFO furiosa_rt_core::npu::raw: NPU 
(npu:2:0-1) has been closed 2023-08-28T01:28:27.085561Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-1] stopped
Run inferences without input scaling (and quantization).
See performance tuning guide for more details.
Please note that the input data type has changed from FLOAT32 to UINT8 (see the model summary in the output below).
In [3]:
Copied!
from copy import deepcopy
from furiosa.quantizer import ModelEditor, get_pure_input_names, TensorType
# Work on a copy so the original float32 model stays untouched.
model_wo_input_quantize = deepcopy(f32_onnx_model)

# Switch every pure input of the model to UINT8 before quantizing.
editor = ModelEditor(model_wo_input_quantize)
for input_name in get_pure_input_names(model_wo_input_quantize):
    editor.convert_input_type(input_name, TensorType.UINT8)

quantized_onnx_wo_input_quantize = quantize(model_wo_input_quantize, model.tensor_name_to_range)

with create_runner(quantized_onnx_wo_input_quantize) as runner:
    input_tensor_desc = runner.model.inputs()
    runner.model.print_summary()

    # Random values in [0, 256), generated directly in the dtype the model reports.
    fake_input = [
        np.random.randint(256, size=desc.shape, dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]

    num_runs = 1000
    starting_time = perf_counter()
    for _ in range(num_runs):
        runner.run(fake_input)
    elapsed_seconds = perf_counter() - starting_time

    # perf_counter() measures seconds: convert the total to a per-run average
    # in milliseconds explicitly, so the label stays correct if num_runs changes.
    print("Average inference time:", elapsed_seconds / num_runs * 1000, "ms")
from copy import deepcopy
from furiosa.quantizer import ModelEditor, get_pure_input_names, TensorType
# Work on a copy so the original float32 model stays untouched.
model_wo_input_quantize = deepcopy(f32_onnx_model)

# Switch every pure input of the model to UINT8 before quantizing.
editor = ModelEditor(model_wo_input_quantize)
for input_name in get_pure_input_names(model_wo_input_quantize):
    editor.convert_input_type(input_name, TensorType.UINT8)

quantized_onnx_wo_input_quantize = quantize(model_wo_input_quantize, model.tensor_name_to_range)

with create_runner(quantized_onnx_wo_input_quantize) as runner:
    input_tensor_desc = runner.model.inputs()
    runner.model.print_summary()

    # Random values in [0, 256), generated directly in the dtype the model reports.
    fake_input = [
        np.random.randint(256, size=desc.shape, dtype=desc.dtype.numpy)
        for desc in input_tensor_desc
    ]

    num_runs = 1000
    starting_time = perf_counter()
    for _ in range(num_runs):
        runner.run(fake_input)
    elapsed_seconds = perf_counter() - starting_time

    # perf_counter() measures seconds: convert the total to a per-run average
    # in milliseconds explicitly, so the label stays correct if num_runs changes.
    print("Average inference time:", elapsed_seconds / num_runs * 1000, "ms")
2023-08-28T01:28:27.838330Z INFO furiosa_rt_core::driver::event_driven::coord: FuriosaRT (v0.10.0, rev: ac1a04a8d, built at: 2023-08-08T12:15:46Z) bootstrapping ... 2023-08-28T01:28:27.844387Z INFO furiosa_rt_core::driver::event_driven::coord: Found furiosa-compiler (v0.10.0, rev: f8f05c8, built at: 2023-08-08T11:58:09Z) 2023-08-28T01:28:27.844399Z INFO furiosa_rt_core::driver::event_driven::coord: Found libhal (type: warboy, v0.11.0, rev: 43c901f built at: 2023-08-08T12:07:35Z) 2023-08-28T01:28:27.844403Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-2] detected 1 NPU device(s): 2023-08-28T01:28:27.854235Z INFO furiosa_rt_core::driver::event_driven::coord: - [0] npu:2:0-1 (warboy-b0-2pe, 128dpes, firmware: 1.7.0, f7b0f28) 2023-08-28T01:28:27.854453Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-2] started 2023-08-28T01:28:32.712311Z INFO furiosa::runtime: Saving the compilation log into /root/.local/state/furiosa/logs/compiler-20230828102832-nwtxhl.log 2023-08-28T01:28:32.712548Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-2] created Sess-78e2493d using npu:2:0-1 2023-08-28T01:28:32.733493Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-78e2493d] compiling the model (target: warboy-b0-2pe, 128dpes, size: 102.2 MB) 2023-08-28T01:28:35.824911Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-78e2493d] the model compile is successful (took 3 secs) 2023-08-28T01:28:36.227750Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-2] created 1 NPU threads on npu:2:0-1 (DRAM: 180.0 kiB/16.0 GiB, SRAM: 31.4 MiB/128.0 MiB) Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=UINT8, format=NCHW, size=150528, len=150528)} Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)} Average inference time: 2.5746346139349043 ms 2023-08-28T01:28:39.026687Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-78e2493d] terminated 2023-08-28T01:28:39.036203Z INFO furiosa_rt_core::npu::raw: NPU 
(npu:2:0-1) has been closed 2023-08-28T01:28:39.041580Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-2] stopped
You can also compile the model for, and run it on, only a single PE (1pe).
In [4]:
Copied!
from furiosa.runtime.sync import Runtime
# Open a runtime bound to an explicit device specification.
# NOTE(review): "warboy(1)*1" presumably selects one single-PE Warboy device,
# as the section heading says — confirm against the device string docs.
with Runtime(device="warboy(1)*1") as runtime:
    with runtime.create_runner(quantized_onnx_wo_input_quantize) as runner:
        input_tensor_desc = runner.model.inputs()
        runner.model.print_summary()

        # Random values in [0, 256), generated directly in the dtype the model reports.
        fake_input = [
            np.random.randint(256, size=desc.shape, dtype=desc.dtype.numpy)
            for desc in input_tensor_desc
        ]

        num_runs = 1000
        starting_time = perf_counter()
        for _ in range(num_runs):
            runner.run(fake_input)
        elapsed_seconds = perf_counter() - starting_time

        # perf_counter() measures seconds: convert the total to a per-run average
        # in milliseconds explicitly, so the label stays correct if num_runs changes.
        print("Average inference time:", elapsed_seconds / num_runs * 1000, "ms")
from furiosa.runtime.sync import Runtime
# Open a runtime bound to an explicit device specification.
# NOTE(review): "warboy(1)*1" presumably selects one single-PE Warboy device,
# as the section heading says — confirm against the device string docs.
with Runtime(device="warboy(1)*1") as runtime:
    with runtime.create_runner(quantized_onnx_wo_input_quantize) as runner:
        input_tensor_desc = runner.model.inputs()
        runner.model.print_summary()

        # Random values in [0, 256), generated directly in the dtype the model reports.
        fake_input = [
            np.random.randint(256, size=desc.shape, dtype=desc.dtype.numpy)
            for desc in input_tensor_desc
        ]

        num_runs = 1000
        starting_time = perf_counter()
        for _ in range(num_runs):
            runner.run(fake_input)
        elapsed_seconds = perf_counter() - starting_time

        # perf_counter() measures seconds: convert the total to a per-run average
        # in milliseconds explicitly, so the label stays correct if num_runs changes.
        print("Average inference time:", elapsed_seconds / num_runs * 1000, "ms")
2023-08-28T01:28:39.079503Z INFO furiosa_rt_core::driver::event_driven::coord: FuriosaRT (v0.10.0, rev: ac1a04a8d, built at: 2023-08-08T12:15:46Z) bootstrapping ... 2023-08-28T01:28:39.084505Z INFO furiosa_rt_core::driver::event_driven::coord: Found furiosa-compiler (v0.10.0, rev: f8f05c8, built at: 2023-08-08T11:58:09Z) 2023-08-28T01:28:39.084523Z INFO furiosa_rt_core::driver::event_driven::coord: Found libhal (type: warboy, v0.11.0, rev: 43c901f built at: 2023-08-08T12:07:35Z) 2023-08-28T01:28:39.084529Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-3] detected 1 NPU device(s): 2023-08-28T01:28:39.094433Z INFO furiosa_rt_core::driver::event_driven::coord: - [0] npu:2:0 (warboy-b0, 64dpes, firmware: 1.7.0, f7b0f28) 2023-08-28T01:28:39.094599Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-3] started 2023-08-28T01:28:44.681264Z INFO furiosa::runtime: Saving the compilation log into /root/.local/state/furiosa/logs/compiler-20230828102844-v2bdzh.log 2023-08-28T01:28:44.681540Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-3] created Sess-660528cf using npu:2:0 2023-08-28T01:28:44.693511Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-660528cf] compiling the model (target: warboy-b0, 64dpes, size: 102.2 MB) 2023-08-28T01:28:47.133498Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-660528cf] the model compile is successful (took 2 secs) 2023-08-28T01:28:47.336202Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-3] created 1 NPU threads on npu:2:0 (DRAM: 14.0 MiB/16.0 GiB, SRAM: 16.0 MiB/64.0 MiB) Inputs: {0: TensorDesc(shape=(1, 3, 224, 224), dtype=UINT8, format=NCHW, size=150528, len=150528)} Outputs: {0: TensorDesc(shape=(1,), dtype=INT64, format=?, size=8, len=1)} Average inference time: 2.743666300084442 ms 2023-08-28T01:28:50.269460Z INFO furiosa_rt_core::driver::event_driven::coord: [Sess-660528cf] terminated 2023-08-28T01:28:50.279019Z INFO furiosa_rt_core::npu::raw: NPU (npu:2:0) has been 
closed 2023-08-28T01:28:50.284263Z INFO furiosa_rt_core::driver::event_driven::coord: [Runtime-3] stopped