Module nux.quantizer

Helper utility to transform Onnx models into quantized models

This module allows users to transform Onnx models into quantized models that can run in the Furiosa AI ecosystem. It is based on a static per-channel post-training 8-bit quantization scheme.

post_training_quantization() is the main function to transform Onnx models into quantized models. The other functions are intermediate steps and are called by post_training_quantization().

We expose the intermediate steps as functions to allow users to customize the algorithm that generates quantized models. Please take a look at the implementation of post_training_quantization() to build your own algorithm.
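
A minimal end-to-end sketch, assuming the module is importable as nux.quantizer; the model path, input tensor name, and input shape below are illustrative:

import numpy as np
import onnx
from nux import quantizer

model = onnx.load_model('model.onnx')  # hypothetical path
# Ten single-batch feed_dicts for a hypothetical input tensor named 'input'
calibration_data = [{'input': np.random.rand(1, 3, 224, 224).astype(np.float32)}
                    for _ in range(10)]
quantized = quantizer.post_training_quantization(model, ['input'], calibration_data)
onnx.save_model(quantized, 'model_quantized.onnx')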

Expand source code
"""Helper Utility to transform onnx models to quantized models

This module allows users to transform Onnx models
to quantized models which can run in Furiosa AI eco-systems, which is based on
[static per-channel post-training 8-bit quantization scheme](https://www.tensorflow.org/lite/performance/quantization_spec)

`post_training_quantization()` is the main function to transform Onnx models to quantized models.
Other functions are intermediate steps, and are called by `post_training_quantization()`.

We expose intermediate steps as functions to allow users
to customize the algorithms to generate quantized models.
Please take a look at the implementation of `post_training_quantization()` to build your algorithm.
"""
from typing import List, Dict, Tuple, Union

import logging
import io

import numpy as np
import onnx
import onnxruntime as ort

import furiosacli


def post_training_quantization(
        model: onnx.ModelProto,
        input_tensors: List[str],
        # Dict[str, np.ndarray]: calibration data stacked along the batch axis
        # List[Dict[str, np.ndarray]]: each element of the list is a single-batch 'feed_dict'
        calibration_data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]] = None,
) -> onnx.ModelProto:
    """Transforms an Onnx model to a model quantized from a given calibration data, \
    which help to adjust the min/max ranges of quantization scheme.

    If calibration_data is not set, this function uses randomly generated calibration data.

    Returns:
        Quantized Onnx model
    """
    model = optimize(model)
    calibration_model = build_calibration_model(model, input_tensors)
    if calibration_data is None:
        logging.info("[Quantizer] will calibrate with random inputs")
        dynamic_ranges = calibrate_with_random_input(calibration_model)
    else:
        dynamic_ranges = calibrate(calibration_model, calibration_data)
    quantized_model = quantize(model, input_tensors, dynamic_ranges)
    return quantized_model


def optimize(model: onnx.ModelProto) -> onnx.ModelProto:
    """Optimizes an Onnx model by using various optimization \
    rules with tensor shape annotations

    Args:
        model: Onnx model

    Returns:
        Optimized Onnx model
    """
    session = furiosacli.clidriver.Session()
    model = model.SerializeToString()
    result = furiosacli.commands.Optimize.optimize(session, model)
    model = onnx.load_model(io.BytesIO(result))
    return model


def build_calibration_model(model: onnx.ModelProto, input_tensors: List[str]) -> onnx.ModelProto:
    """Rewrites an Onnx model to have additional operators \
    to collect min/max values at each operator from a model networks

    Args:
        model: Onnx model
        input_tensors: input tensor names

    Returns:
        Rewritten Onnx model with reduce-min/reduce-max nodes
    """
    session = furiosacli.clidriver.Session()
    model = model.SerializeToString()
    result = furiosacli.commands.BuildCalibrationModel.build_calibration_model(session, model, input_tensors)
    model = onnx.load_model(io.BytesIO(result))
    return model


def quantize(model: onnx.ModelProto,
             input_tensors: List[str],
             dynamic_ranges: Dict[str, Tuple[float, float]]) -> onnx.ModelProto:
    """Transform an Onnx model to a quantized model with the given dynamic ranges

    Args:
        model: Onnx model
        input_tensors: Input tensor names
        dynamic_ranges: Min/max range values

    Returns:
        Quantized Onnx model
    """
    session = furiosacli.clidriver.Session()
    model = model.SerializeToString()
    result = furiosacli.commands.Quantize.quantize(session, model, input_tensors, dynamic_ranges)
    model = onnx.load_model(io.BytesIO(result))
    return model


def calibrate(
        model: onnx.ModelProto,
        # Dict[str, np.ndarray]: calibration data stacked along the batch axis
        # List[Dict[str, np.ndarray]]: each element of the list is a single-batch 'feed_dict'
        calibration_data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
) -> Dict[str, Tuple[float, float]]:
    """Collects min/max values of each operator from `calibration_data` and return the min/max ranges.

    Args:
        model: Onnx model
        calibration_data: sample input data

    Returns:
        min-max ranges of each node
    """
    # Log severity level 3 (Error) to suppress warnings (level 2)
    ort.set_default_logger_severity(3)
    sess = ort.InferenceSession(model.SerializeToString(), None)

    # Observe ReduceMin & ReduceMax
    observers = [output.name for output in sess.get_outputs() if
                 'ReduceMin' in output.name or 'ReduceMax' in output.name]

    if isinstance(calibration_data, dict):
        zipped = []
        for (k, v) in calibration_data.items():
            # Zip, {'x': [[1,2,3], [4,5,6]]} -> [('x', [1,2,3]), ('x', [4,5,6])]
            zipped.append(zip([k] * v.shape[0], np.split(v, v.shape[0], axis=0)))
        # Transpose, so that each element becomes a single-batch feed_dict
        transposed = zip(*zipped)
        calibration_data = [dict(data) for data in transposed]

    # Calibrate
    observed_vals = [sess.run(observers, feed_dict) for feed_dict in calibration_data]

    # Calculate dynamic ranges
    val_dicts = dict(zip(observers, zip(*observed_vals)))
    node_names = [key.rpartition('_')[0] for key in val_dicts.keys() if 'ReduceMax' in key]
    min_vals = [float(np.min(value)) for key, value in val_dicts.items() if 'ReduceMin' in key]
    max_vals = [float(np.max(value)) for key, value in val_dicts.items() if 'ReduceMax' in key]
    return dict(zip(node_names, zip(min_vals, max_vals)))


def calibrate_with_random_input(model: onnx.ModelProto):
    """It is almost the same as `calibrate()`, \
    but it runs without `calibration_data`.

    So, it can be used for only testing the model quantization algorithm \
    without actual calibration data.

    Args:
        model: Onnx model
        calibration_data: sample input data

    Returns:
        dynamic ranges of input tensors
    """
    # Log severity level 3 (Error) to suppress warnings (level 2)
    ort.set_default_logger_severity(3)
    sess = ort.InferenceSession(model.SerializeToString(), None)

    calibration_data: List[Dict[str, np.ndarray]] = []
    for _ in range(10):
        feed_dict: Dict[str, np.ndarray] = dict()
        for attr in sess.get_inputs():
            if attr.type == 'tensor(float)':
                dtype = np.float32
            elif attr.type == 'tensor(int64)':
                dtype = np.int64
            else:
                raise Exception(f'Unknown dtype: {attr.type}')
            # Note: assumes static input shapes; for int64 inputs the random
            # floats in [0, 1) truncate to zeros after the cast
            feed_dict[attr.name] = np.random.random(attr.shape).astype(dtype)
        calibration_data.append(feed_dict)

    return calibrate(model, calibration_data)

Functions

def build_calibration_model(model: onnx.ModelProto, input_tensors: List[str]) ‑> onnx.ModelProto

Rewrites an Onnx model to include additional operators that collect min/max values at each operator of the model network

Args

model
Onnx model
input_tensors
input tensor names

Returns

Rewritten Onnx model with reduce-min/reduce-max nodes

Expand source code
def build_calibration_model(model: onnx.ModelProto, input_tensors: List[str]) -> onnx.ModelProto:
    """Rewrites an Onnx model to have additional operators \
    to collect min/max values at each operator from a model networks

    Args:
        model: Onnx model
        input_tensors: input tensor names

    Returns:
        Rewritten Onnx model with reduce-min/reduce-max nodes
    """
    session = furiosacli.clidriver.Session()
    model = model.SerializeToString()
    result = furiosacli.commands.BuildCalibrationModel.build_calibration_model(session, model, input_tensors)
    model = onnx.load_model(io.BytesIO(result))
    return model
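
A hypothetical sketch for inspecting the rewritten model, continuing from the module-level sketch above: per the source of calibrate(), the added observers appear as extra graph outputs whose names contain 'ReduceMin' or 'ReduceMax'.

calibration_model = build_calibration_model(model, ['input'])  # 'input' is an illustrative tensor name
observers = [output.name for output in calibration_model.graph.output
             if 'ReduceMin' in output.name or 'ReduceMax' in output.name]
print(observers)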
def calibrate(model: onnx.ModelProto, calibration_data: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]]) ‑> Dict[str, Tuple[float, float]]

Collects min/max values of each operator from calibration_data and returns the min/max ranges.

Args

model
Onnx model
calibration_data
sample input data

Returns

min-max ranges of each node

Expand source code
def calibrate(
        model: onnx.ModelProto,
        # Dict[str, np.ndarray]: calibration data stacked along the batch axis
        # List[Dict[str, np.ndarray]]: each element of the list is a single-batch 'feed_dict'
        calibration_data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
) -> Dict[str, Tuple[float, float]]:
    """Collects min/max values of each operator from `calibration_data` and return the min/max ranges.

    Args:
        model: Onnx model
        calibration_data: sample input data

    Returns:
        min-max ranges of each node
    """
    # Log severity level 3 (Error) to suppress warnings (level 2)
    ort.set_default_logger_severity(3)
    sess = ort.InferenceSession(model.SerializeToString(), None)

    # Observe ReduceMin & ReduceMax
    observers = [output.name for output in sess.get_outputs() if
                 'ReduceMin' in output.name or 'ReduceMax' in output.name]

    if isinstance(calibration_data, dict):
        zipped = []
        for (k, v) in calibration_data.items():
            # Zip, {'x': [[1,2,3], [4,5,6]]} -> [('x', [1,2,3]), ('x', [4,5,6])]
            zipped.append(zip([k] * v.shape[0], np.split(v, v.shape[0], axis=0)))
        # Transpose, so that each element becomes a single-batch feed_dict
        transposed = zip(*zipped)
        calibration_data = [dict(data) for data in transposed]

    # Calibrate
    observed_vals = [sess.run(observers, feed_dict) for feed_dict in calibration_data]

    # Calculate dynamic ranges
    val_dicts = dict(zip(observers, zip(*observed_vals)))
    node_names = [key.rpartition('_')[0] for key in val_dicts.keys() if 'ReduceMax' in key]
    min_vals = [float(np.min(value)) for key, value in val_dicts.items() if 'ReduceMin' in key]
    max_vals = [float(np.max(value)) for key, value in val_dicts.items() if 'ReduceMax' in key]
    return dict(zip(node_names, zip(min_vals, max_vals)))
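
A short sketch of the two accepted calibration_data forms, with calibration_model obtained from build_calibration_model() and a single hypothetical input named 'input'; for the same data both forms yield the same ranges:

stacked = {'input': np.random.rand(4, 3, 224, 224).astype(np.float32)}
per_batch = [{'input': x} for x in np.split(stacked['input'], 4, axis=0)]

ranges_from_stacked = calibrate(calibration_model, stacked)
ranges_from_list = calibrate(calibration_model, per_batch)
# Each result maps a node name to a (min, max) tuple, e.g. {'conv1': (-1.2, 3.4)}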
def calibrate_with_random_input(model: onnx.ModelProto)

Almost the same as calibrate(), but runs with randomly generated inputs instead of calibration_data.

As such, it is useful only for testing the model quantization algorithm without actual calibration data.

Args

model
Onnx model

Returns

min/max ranges of each node

Expand source code
def calibrate_with_random_input(model: onnx.ModelProto):
    """It is almost the same as `calibrate()`, \
    but it runs without `calibration_data`.

    So, it can be used for only testing the model quantization algorithm \
    without actual calibration data.

    Args:
        model: Onnx model
        calibration_data: sample input data

    Returns:
        dynamic ranges of input tensors
    """
    # Log severity level 3 (Error) to suppress warnings (level 2)
    ort.set_default_logger_severity(3)
    sess = ort.InferenceSession(model.SerializeToString(), None)

    calibration_data: List[Dict[str, np.ndarray]] = []
    for _ in range(10):
        feed_dict: Dict[str, np.ndarray] = dict()
        for attr in sess.get_inputs():
            if attr.type == 'tensor(float)':
                dtype = np.float32
            elif attr.type == 'tensor(int64)':
                dtype = np.int64
            else:
                raise Exception(f'Unknown dtype: {attr.type}')
            # Note: assumes static input shapes; for int64 inputs the random
            # floats in [0, 1) truncate to zeros after the cast
            feed_dict[attr.name] = np.random.random(attr.shape).astype(dtype)
        calibration_data.append(feed_dict)

    return calibrate(model, calibration_data)
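
A minimal smoke-test sketch; the random inputs are drawn from [0, 1), so the resulting ranges are not representative of real data:

calibration_model = build_calibration_model(model, ['input'])  # 'input' is illustrative
dynamic_ranges = calibrate_with_random_input(calibration_model)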
def optimize(model: onnx.ModelProto) ‑> onnx.ModelProto

Optimizes an Onnx model by applying various optimization rules together with tensor shape annotations

Args

model
Onnx model

Returns

Optimized Onnx model

Expand source code
def optimize(model: onnx.ModelProto) -> onnx.ModelProto:
    """Optimizes an Onnx model by using various optimization \
    rules with tensor shape annotations

    Args:
        model: Onnx model

    Returns:
        Optimized Onnx model
    """
    session = furiosacli.clidriver.Session()
    model = model.SerializeToString()
    result = furiosacli.commands.Optimize.optimize(session, model)
    model = onnx.load_model(io.BytesIO(result))
    return model
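
A one-line usage sketch: post_training_quantization() optimizes first and passes the optimized model to both build_calibration_model() and quantize(), so a custom pipeline should do the same.

model = optimize(onnx.load_model('model.onnx'))  # hypothetical path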
def post_training_quantization(model: onnx.ModelProto, input_tensors: List[str], calibration_data: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]] = None) ‑> onnx.ModelProto

Transforms an Onnx model into a quantized model using the given calibration data, which helps adjust the min/max ranges of the quantization scheme.

If calibration_data is not set, this function uses randomly generated calibration data.

Returns

Quantized Onnx model

Expand source code
def post_training_quantization(
        model: onnx.ModelProto,
        input_tensors: List[str],
        # Dict[str, np.ndarray]: calibration data stacked along the batch axis
        # List[Dict[str, np.ndarray]]: each element of the list is a single-batch 'feed_dict'
        calibration_data: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]] = None,
) -> onnx.ModelProto:
    """Transforms an Onnx model to a model quantized from a given calibration data, \
    which help to adjust the min/max ranges of quantization scheme.

    If calibration_data is not set, this function uses randomly generated calibration data.

    Returns:
        Quantized Onnx model
    """
    model = optimize(model)
    calibration_model = build_calibration_model(model, input_tensors)
    if calibration_data is None:
        logging.info("[Quantizer] will calibrate with random inputs")
        dynamic_ranges = calibrate_with_random_input(calibration_model)
    else:
        dynamic_ranges = calibrate(calibration_model, calibration_data)
    quantized_model = quantize(model, input_tensors, dynamic_ranges)
    return quantized_model
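
A sketch of a custom pipeline built from the intermediate steps, here clamping each range to be symmetric around zero before quantizing; the input name and the symmetric-clamping rule are illustrative, not part of the library:

model = optimize(model)
calibration_model = build_calibration_model(model, ['input'])
ranges = calibrate(calibration_model, calibration_data)
# Make each (min, max) symmetric around zero (an illustrative post-processing step)
sym_ranges = {name: (-max(abs(lo), abs(hi)), max(abs(lo), abs(hi)))
              for name, (lo, hi) in ranges.items()}
quantized_model = quantize(model, ['input'], sym_ranges)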
def quantize(model: onnx.ModelProto, input_tensors: List[str], dynamic_ranges: Dict[str, Tuple[float, float]]) ‑> onnx.ModelProto

Transforms an Onnx model into a quantized model using the given dynamic ranges

Args

model
Onnx model
input_tensors
Input tensor names
dynamic_ranges
Min/max range values

Returns

Quantized Onnx model

Expand source code
def quantize(model: onnx.ModelProto,
             input_tensors: List[str],
             dynamic_ranges: Dict[str, Tuple[float, float]]) -> onnx.ModelProto:
    """Transform an Onnx model to a quantized model with the given dynamic ranges

    Args:
        model: Onnx model
        input_tensors: Input tensor names
        dynamic_ranges: Min/max range values

    Returns:
        Quantized Onnx model
    """
    session = furiosacli.clidriver.Session()
    model = model.SerializeToString()
    result = furiosacli.commands.Quantize.quantize(session, model, input_tensors, dynamic_ranges)
    model = onnx.load_model(io.BytesIO(result))
    return model
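
A minimal sketch of the dynamic_ranges format: a dict mapping node names to (min, max) floats, matching what calibrate() returns; the node and input names below are hypothetical:

dynamic_ranges = {'conv1': (-0.5, 0.5), 'relu1': (0.0, 6.0)}
quantized_model = quantize(model, ['input'], dynamic_ranges)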