Module nux.quantizer

Helper utility to transform ONNX models into quantized models

This module allows users to transform ONNX models into quantized models that can run in the Furiosa AI ecosystem. The quantizer is based on the static per-channel post-training 8-bit quantization scheme (https://www.tensorflow.org/lite/performance/quantization_spec).

post_training_quantization() is the main function to transform ONNX models into quantized models. The other functions are intermediate steps and are called by post_training_quantization(). We expose the intermediate steps as functions so that users can customize the algorithm that generates quantized models. Please take a look at the implementation of post_training_quantization() to build your own algorithm.
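A minimal usage sketch follows; the model file, the input tensor name 'input', and the data shapes are assumptions for illustration:

    import numpy as np
    import onnx
    from nux.quantizer import post_training_quantization

    model = onnx.load_model('model.onnx')  # hypothetical model path

    # Calibration data as a list of single-batch feed_dicts (shapes are assumed).
    calibration_data = [
        {'input': np.random.rand(1, 3, 224, 224).astype(np.float32)}
        for _ in range(100)
    ]

    quantized = post_training_quantization(model, ['input'], calibration_data)
    onnx.save_model(quantized, 'model_quantized.onnx')

In practice, replace the random arrays with representative samples from your dataset so that the collected min/max ranges reflect real activations.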
Source code
"""Helper Utility to transform onnx models to quantized models
This module allows users to transform Onnx models
to quantized models which can run in Furiosa AI eco-systems, which is based on
[static per-channel post-training 8-bit quantization scheme](https://www.tensorflow.org/lite/performance/quantization_spec)
`post_training_quantization()` is the main function to transform Onnx models to quantized models.
Other functions are intermediate steps, and are called by `post_training_quantization()`.
We expose intermediate steps as functions to allow users
to customize the algorithms to generate quantized models.
Please take a look at the implementation of `post_training_quantization()` to build your algorithm.
"""
from typing import List, Dict, Tuple, Union
import logging
import io
import numpy as np
import onnx
import onnxruntime as ort
import furiosacli
def post_training_quantization(
    model: onnx.ModelProto,
    input_tensors: List[str],
    # Dict[str, np.ndarray]: calibration data stacked as a single batch
    # List[Dict[str, np.ndarray]]: each element of the list is a single-batch 'feed_dict'
    calibration_data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]] = None,
) -> onnx.ModelProto:
    """Transforms an ONNX model into a quantized model using the given calibration data,
    which helps to adjust the min/max ranges of the quantization scheme.

    If `calibration_data` is not set, this function uses randomly generated calibration data.

    Returns:
        Quantized ONNX model
    """
    model = optimize(model)
    calibration_model = build_calibration_model(model, input_tensors)
    if calibration_data is None:
        logging.info("[Quantizer] will calibrate with random inputs")
        dynamic_ranges = calibrate_with_random_input(calibration_model)
    else:
        dynamic_ranges = calibrate(calibration_model, calibration_data)
    quantized_model = quantize(model, input_tensors, dynamic_ranges)
    return quantized_model
def optimize(model: onnx.ModelProto) -> onnx.ModelProto:
    """Optimizes an ONNX model by applying various optimization rules
    with tensor shape annotations

    Args:
        model: ONNX model
    Returns:
        Optimized ONNX model
    """
    session = furiosacli.clidriver.Session()
    serialized = model.SerializeToString()
    result = furiosacli.commands.Optimize.optimize(session, serialized)
    return onnx.load_model(io.BytesIO(result))
def build_calibration_model(model: onnx.ModelProto, input_tensors: List[str]) -> onnx.ModelProto:
    """Rewrites an ONNX model with additional operators that collect
    min/max values at each operator of the model network

    Args:
        model: ONNX model
        input_tensors: input tensor names
    Returns:
        Rewritten ONNX model with reduce-min/reduce-max nodes
    """
    session = furiosacli.clidriver.Session()
    serialized = model.SerializeToString()
    result = furiosacli.commands.BuildCalibrationModel.build_calibration_model(session, serialized, input_tensors)
    return onnx.load_model(io.BytesIO(result))
def quantize(model: onnx.ModelProto,
             input_tensors: List[str],
             dynamic_ranges: Dict[str, Tuple[float, float]]) -> onnx.ModelProto:
    """Transforms an ONNX model into a quantized model with the given dynamic ranges

    Args:
        model: ONNX model
        input_tensors: input tensor names
        dynamic_ranges: min/max range values
    Returns:
        Quantized ONNX model
    """
    session = furiosacli.clidriver.Session()
    serialized = model.SerializeToString()
    result = furiosacli.commands.Quantize.quantize(session, serialized, input_tensors, dynamic_ranges)
    return onnx.load_model(io.BytesIO(result))
def calibrate(
    model: onnx.ModelProto,
    # Dict[str, np.ndarray]: calibration data stacked as a single batch
    # List[Dict[str, np.ndarray]]: each element of the list is a single-batch 'feed_dict'
    calibration_data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
) -> Dict[str, Tuple[float, float]]:
    """Collects min/max values of each operator from `calibration_data` and returns the min/max ranges.

    Args:
        model: ONNX model
        calibration_data: sample input data
    Returns:
        min/max ranges of each node
    """
    # Log severity level 3 (Error) in order not to print warnings (level 2)
    ort.set_default_logger_severity(3)
    sess = ort.InferenceSession(model.SerializeToString(), None)
    # Observe the outputs of the inserted ReduceMin & ReduceMax nodes
    observers = [output.name for output in sess.get_outputs()
                 if 'ReduceMin' in output.name or 'ReduceMax' in output.name]
    if isinstance(calibration_data, dict):
        # Split the stacked batch into per-sample feed_dicts:
        # {'x': [[1,2,3], [4,5,6]]} -> [{'x': [[1,2,3]]}, {'x': [[4,5,6]]}]
        zipped = []
        for (k, v) in calibration_data.items():
            zipped.append(zip([k] * v.shape[0], np.split(v, v.shape[0], axis=0)))
        transposed = zip(*zipped)
        calibration_data = map(dict, transposed)
    # Run the calibration model over every feed_dict
    observed_vals = [sess.run(observers, feed_dict) for feed_dict in calibration_data]
    # Aggregate the observed values into per-node dynamic ranges
    val_dicts = dict(zip(observers, zip(*observed_vals)))
    node_names = [key.rpartition('_')[0] for key in val_dicts.keys() if 'ReduceMax' in key]
    min_values = [float(np.min(value)) for key, value in val_dicts.items() if 'ReduceMin' in key]
    max_values = [float(np.max(value)) for key, value in val_dicts.items() if 'ReduceMax' in key]
    return dict(zip(node_names, zip(min_values, max_values)))
def calibrate_with_random_input(model: onnx.ModelProto) -> Dict[str, Tuple[float, float]]:
    """It is almost the same as `calibrate()`, but it runs without `calibration_data`.
    Therefore, it should be used only for testing the model quantization algorithm
    without actual calibration data.

    Args:
        model: ONNX model
    Returns:
        min/max ranges of each node
    """
    # Log severity level 3 (Error) in order not to print warnings (level 2)
    ort.set_default_logger_severity(3)
    sess = ort.InferenceSession(model.SerializeToString(), None)
    # Generate 10 random feed_dicts matching the model's input signatures
    calibration_data: List[Dict[str, np.ndarray]] = []
    for _ in range(10):
        feed_dict: Dict[str, np.ndarray] = dict()
        for attr in sess.get_inputs():
            if attr.type == 'tensor(float)':
                dtype = np.float32
            elif attr.type == 'tensor(int64)':
                dtype = np.int64
            else:
                raise Exception(f'Unknown dtype: {attr.type}')
            feed_dict[attr.name] = np.random.random(attr.shape).astype(dtype)
        calibration_data.append(feed_dict)
    return calibrate(model, calibration_data)
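For reference, a custom pipeline can be assembled from the intermediate steps above, mirroring the body of post_training_quantization(). This is a sketch under an assumed model path and input name:

    import numpy as np
    import onnx
    from nux.quantizer import optimize, build_calibration_model, calibrate, quantize

    model = optimize(onnx.load_model('model.onnx'))  # hypothetical path
    calibration_model = build_calibration_model(model, ['input'])
    feed_dicts = [
        {'input': np.random.rand(1, 3, 224, 224).astype(np.float32)}
        for _ in range(10)
    ]  # substitute representative samples
    dynamic_ranges = calibrate(calibration_model, feed_dicts)
    # dynamic_ranges may be inspected or adjusted here before quantization,
    # which is the main reason to call the intermediate steps directly.
    quantized = quantize(model, ['input'], dynamic_ranges)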
Functions
def build_calibration_model(model: onnx.ModelProto, input_tensors: List[str]) -> onnx.ModelProto

    Rewrites an ONNX model with additional operators that collect min/max values at each operator of the model network.

    Args
        model: ONNX model
        input_tensors: input tensor names
    Returns
        Rewritten ONNX model with reduce-min/reduce-max nodes
def calibrate(model: onnx.ModelProto, calibration_data: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]]) -> Dict[str, Tuple[float, float]]

    Collects min/max values of each operator from calibration_data and returns the min/max ranges.

    Args
        model: ONNX model
        calibration_data: sample input data
    Returns
        min/max ranges of each node
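calibrate() accepts the sample data in either of the two forms noted in the source comments; a sketch with an assumed input name 'x' and shape:

    import numpy as np

    # Form 1: one dict whose arrays stack 8 samples along the batch axis.
    stacked = {'x': np.random.rand(8, 3, 224, 224).astype(np.float32)}

    # Form 2: a list of 8 single-batch feed_dicts.
    feed_dicts = [{'x': np.random.rand(1, 3, 224, 224).astype(np.float32)}
                  for _ in range(8)]

    # Both forms feed the same number of samples through the calibration model:
    # dynamic_ranges = calibrate(calibration_model, stacked)
    # dynamic_ranges = calibrate(calibration_model, feed_dicts)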
def calibrate_with_random_input(model: onnx.ModelProto) -> Dict[str, Tuple[float, float]]

    It is almost the same as calibrate(), but it runs without calibration_data. Therefore, it should be used only for testing the model quantization algorithm without actual calibration data.

    Args
        model: ONNX model
    Returns
        min/max ranges of each node
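A quick smoke test of the calibration path without real data; the model path and input name are assumptions:

    import onnx
    from nux.quantizer import optimize, build_calibration_model, calibrate_with_random_input

    calibration_model = build_calibration_model(optimize(onnx.load_model('model.onnx')), ['input'])
    dynamic_ranges = calibrate_with_random_input(calibration_model)  # runs 10 random feed_dicts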
def optimize(model: onnx.ModelProto) -> onnx.ModelProto

    Optimizes an ONNX model by applying various optimization rules with tensor shape annotations.

    Args
        model: ONNX model
    Returns
        Optimized ONNX model
def post_training_quantization(model: onnx.ModelProto, input_tensors: List[str], calibration_data: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]] = None) -> onnx.ModelProto

    Transforms an ONNX model into a quantized model using the given calibration data, which helps to adjust the min/max ranges of the quantization scheme. If calibration_data is not set, this function uses randomly generated calibration data.

    Returns
        Quantized ONNX model
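Omitting calibration_data exercises the random-input path, which is useful for checking that a model survives the full quantization flow; the model path is hypothetical:

    import onnx
    from nux.quantizer import post_training_quantization

    model = onnx.load_model('model.onnx')
    quantized = post_training_quantization(model, ['input'])  # calibrates with random inputs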
def quantize(model: onnx.ModelProto, input_tensors: List[str], dynamic_ranges: Dict[str, Tuple[float, float]]) -> onnx.ModelProto

    Transforms an ONNX model into a quantized model with the given dynamic ranges.

    Args
        model: ONNX model
        input_tensors: input tensor names
        dynamic_ranges: min/max range values
    Returns
        Quantized ONNX model
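The dynamic_ranges argument maps each node name to its observed (min, max) pair, as produced by calibrate(); the node names and values below are made up for illustration:

    dynamic_ranges = {
        'conv1': (-0.42, 3.97),  # hypothetical node name and observed range
        'relu1': (0.0, 3.97),
    }
    quantized = quantize(model, ['input'], dynamic_ranges)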