diff --git a/README.md b/README.md index 1e4c14b3b75e7765747f7d1d543fba8d90c15672..e4440f17094c7cfda668f5161c3c44adc9081780 100644 --- a/README.md +++ b/README.md @@ -42,17 +42,19 @@ A particular care is needed for the biases rescaling at each step. ## Doing quantization step by step -It is possible to perform the PTQ step by step, thank's to the exposed functions of the API. +It is possible to perform the PTQ step by step, thanks to the exposed functions of the API. In that case, here is the standard pipeline: -1) remove the flatten and dropout nodes -2) expand the meta-operators (if there are some) -3) insert the scaling nodes -4) perform the parameter normalization -5) perform the output value normalization, over a calibration dataset -6) quantize the normalized network + - Prepare the network for the PTQ (remove the flatten nodes, fuse the BatchNorms ...) + - Insert the scaling nodes that will allow the model calibration + - Perform the Cross Layer Equalization if possible + - Perform the parameter normalization + - Compute the node output ranges over an input calibration dataset + - Adjust the output ranges using a specified error metric (MSE, KL, ...) + - Perform the activation normalization + - Quantize the normalized network + - Convert the scaling factors to bit-shifting operations if needed ## Further work -* add smart clipping methods for the normalizations. -* add Quantization Aware Training (QAT). 
+* add Quantization Aware Training (QAT) \ No newline at end of file diff --git a/aidge_quantization/unit_tests/aidge_ptq.py b/aidge_quantization/unit_tests/aidge_ptq.py deleted file mode 100644 index 507a24b382fc915665a0fc82005420fd2af21dc5..0000000000000000000000000000000000000000 --- a/aidge_quantization/unit_tests/aidge_ptq.py +++ /dev/null @@ -1,142 +0,0 @@ -import gzip -import numpy as np -import matplotlib.pyplot as plt - -import aidge_core -import aidge_backend_cpu -import aidge_onnx -import aidge_quantization - -NB_SAMPLES = 100 # max : 1000 -NB_BITS = 4 - -# -------------------------------------------------------------- -# LOAD THE MODEL IN AIDGE -# -------------------------------------------------------------- - -aidge_model = aidge_onnx.load_onnx("assets/ConvNet.onnx", verbose=False) -aidge_core.remove_flatten(aidge_model) - -# -------------------------------------------------------------- -# LOAD THE SAMPLES / LABELS (NUMPY) -# -------------------------------------------------------------- - -samples = np.load(gzip.GzipFile('assets/mnist_samples.npy.gz', "r")) -labels = np.load(gzip.GzipFile('assets/mnist_labels.npy.gz', "r")) - -# -------------------------------------------------------------- -# SETUP THE AIDGE SCHEDULER -# -------------------------------------------------------------- - -# Create the Producer node -input_array = np.zeros(784).astype('float32') -input_tensor = aidge_core.Tensor(input_array) -input_node = aidge_core.Producer(input_tensor, "X") - -# Configuration for the inputs -input_node.get_operator().set_datatype(aidge_core.dtype.float32) -input_node.get_operator().set_backend("cpu") - -# Link Producer to the Graph -input_node.add_child(aidge_model) - -# Configuration for the model -aidge_model.set_datatype(aidge_core.dtype.float32) -aidge_model.set_backend("cpu") - -# Create the Scheduler -scheduler = aidge_core.SequentialScheduler(aidge_model) - -# -------------------------------------------------------------- -# RUN SOME EXAMPLE 
INFERENCES WITH AIDGE -# -------------------------------------------------------------- - -def propagate(model, scheduler, sample): - # Setup the input - input_tensor = aidge_core.Tensor(sample) - input_node.get_operator().set_output(0, input_tensor) - # Run the inference - scheduler.forward(verbose=False) - # Gather the results - output_node = model.get_output_nodes().pop() - output_tensor = output_node.get_operator().get_output(0) - return np.array(output_tensor) - -def bake_sample(sample): - sample = np.reshape(sample, (1, 1, 28, 28)) - return sample.astype('float32') - -print('\n EXAMPLE INFERENCES :') -for i in range(10): - input_array = bake_sample(samples[i]) - output_array = propagate(aidge_model, scheduler, input_array) - print(labels[i] , ' -> ', np.round(output_array, 2)) - -# -------------------------------------------------------------- -# COMPUTE THE MODEL ACCURACY -# -------------------------------------------------------------- - -def compute_accuracy(model, samples, labels): - acc = 0 - for i, sample in enumerate(samples): - x = bake_sample(sample) - y = propagate(model, scheduler, x) - if labels[i] == np.argmax(y): - acc += 1 - return acc / len(samples) - -accuracy = compute_accuracy(aidge_model, samples[0:NB_SAMPLES], labels) -print(f'\n MODEL ACCURACY : {accuracy * 100:.3f}%') - -# -------------------------------------------------------------- -# CREATE THE TENSOR SUBSET -# -------------------------------------------------------------- - -tensors = [] -for sample in samples[0:NB_SAMPLES]: - sample = bake_sample(sample) - tensor = aidge_core.Tensor(sample) - tensors.append(tensor) - -# -------------------------------------------------------------- -# APPLY THE PTQ TO THE MODEL -# -------------------------------------------------------------- - -aidge_quantization.quantize_network(aidge_model, NB_BITS, tensors) - -# -------------------------------------------------------------- -# UPDATE THE SCHEDULER -# 
-------------------------------------------------------------- - -scheduler = aidge_core.SequentialScheduler(aidge_model) - -# -------------------------------------------------------------- -# QUANTIZE THE INPUT TENSORS -# -------------------------------------------------------------- - -scaling = 2**(NB_BITS-1)-1 -for i in range(NB_SAMPLES): - samples[i] = np.round(samples[i]*scaling) - -# -------------------------------------------------------------- -# RUN SOME QUANTIZED INFERENCES WITH AIDGE -# -------------------------------------------------------------- - -print('\n EXAMPLE QUANTIZED INFERENCES :') -for i in range(10): - input_array = bake_sample(samples[i]) - output_array = propagate(aidge_model, scheduler, input_array) - print(labels[i] , ' -> ', np.round(output_array, 2)) - -# -------------------------------------------------------------- -# COMPUTE THE MODEL ACCURACY -# -------------------------------------------------------------- - -accuracy = compute_accuracy(aidge_model, samples[0:NB_SAMPLES], labels) -print(f'\n QUANTIZED MODEL ACCURACY : {accuracy * 100:.3f}%') - -# -------------------------------------------------------------- -# WORK IS DONE ! 
-# -------------------------------------------------------------- - -print('\n that\'s all folks !\n') diff --git a/aidge_quantization/unit_tests/assets/.gitattributes b/aidge_quantization/unit_tests/assets/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..0ff1cae46ef5b055e93771e54246ce59350758ab --- /dev/null +++ b/aidge_quantization/unit_tests/assets/.gitattributes @@ -0,0 +1,4 @@ +MiniResNet.onnx filter=lfs diff=lfs merge=lfs -text +ConvNet.onnx filter=lfs diff=lfs merge=lfs -text +mnist_labels.npy.gz filter=lfs diff=lfs merge=lfs -text +mnist_samples.npy.gz filter=lfs diff=lfs merge=lfs -text diff --git a/aidge_quantization/unit_tests/assets/ConvNet.onnx b/aidge_quantization/unit_tests/assets/ConvNet.onnx index 20e8c079e890606eb580d82bab8cfedc12958903..1b81da3ddd7fbe229074a60c607e3f84ca3c46e1 100644 Binary files a/aidge_quantization/unit_tests/assets/ConvNet.onnx and b/aidge_quantization/unit_tests/assets/ConvNet.onnx differ diff --git a/aidge_quantization/unit_tests/assets/MiniResNet.onnx b/aidge_quantization/unit_tests/assets/MiniResNet.onnx new file mode 100644 index 0000000000000000000000000000000000000000..def45161ac46d9c6d790b3fad94799d4b263e8c7 --- /dev/null +++ b/aidge_quantization/unit_tests/assets/MiniResNet.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ab12257830e35366eabd58e39dcc66c53c9f75fd490a271b97ed6225351e634 +size 440650 diff --git a/aidge_quantization/unit_tests/assets/mnist_labels.npy.gz b/aidge_quantization/unit_tests/assets/mnist_labels.npy.gz index 33a549d4ad2f14e618c3b722f52dc4ee55efc519..745b4cc58b43c66da2530e77fd3836584f5abfde 100644 Binary files a/aidge_quantization/unit_tests/assets/mnist_labels.npy.gz and b/aidge_quantization/unit_tests/assets/mnist_labels.npy.gz differ diff --git a/aidge_quantization/unit_tests/assets/mnist_samples.npy.gz b/aidge_quantization/unit_tests/assets/mnist_samples.npy.gz index 
08ade345cd98d61f297c6c3e787d395b3af51056..ad9ff4d8c05d99079a710048cd5a76d5abed94b0 100644 Binary files a/aidge_quantization/unit_tests/assets/mnist_samples.npy.gz and b/aidge_quantization/unit_tests/assets/mnist_samples.npy.gz differ diff --git a/aidge_quantization/unit_tests/test_ptq.py b/aidge_quantization/unit_tests/test_ptq.py new file mode 100644 index 0000000000000000000000000000000000000000..91f6d82ff7f844bc235cd7dd3391d8fe8d82bf88 --- /dev/null +++ b/aidge_quantization/unit_tests/test_ptq.py @@ -0,0 +1,112 @@ +import unittest +import gzip +import numpy as np + +import aidge_core +import aidge_backend_cpu +import aidge_onnx +import aidge_quantization + +# -------------------------------------------------------------- +# CONFIGS +# -------------------------------------------------------------- + +NB_SAMPLES = 1000 # max : 1000 +SAMPLE_SHAPE = (1, 1, 28, 28) +MODEL_NAME = 'MiniResNet.onnx' # 'ConvNet.onnx' +ACCURACIES = (95.4, 94.5) # (97.9, 97.7) +NB_BITS = 4 + +# -------------------------------------------------------------- +# UTILS +# -------------------------------------------------------------- + +def propagate(model, scheduler, sample): + input_tensor = aidge_core.Tensor(sample) + scheduler.forward(True, [input_tensor]) + output_node = model.get_output_nodes().pop() + output_tensor = output_node.get_operator().get_output(0) + return np.array(output_tensor) + +def prepare_sample(sample): + sample = np.reshape(sample, SAMPLE_SHAPE) + return sample.astype('float32') + +def compute_accuracy(model, samples, labels): + acc = 0 + scheduler = aidge_core.SequentialScheduler(model) + for i, sample in enumerate(samples): + x = prepare_sample(sample) + y = propagate(model, scheduler, x) + if labels[i] == np.argmax(y): + acc += 1 + return acc / len(samples) + +# -------------------------------------------------------------- +# TEST CLASS +# -------------------------------------------------------------- + +class test_ptq(unittest.TestCase): + + def setUp(self): + + # 
load the samples / labels (numpy) + + self.samples = np.load(gzip.GzipFile('assets/mnist_samples.npy.gz', "r")) + self.labels = np.load(gzip.GzipFile('assets/mnist_labels.npy.gz', "r")) + + # load the model in AIDGE + + self.model = aidge_onnx.load_onnx("assets/" + MODEL_NAME, verbose=False) + aidge_core.remove_flatten(self.model) + + self.model.set_datatype(aidge_core.dtype.float32) + self.model.set_backend("cpu") + + def tearDown(self): + pass + + + def test_model(self): + + # compute the base accuracy + + accuracy = compute_accuracy(self.model, self.samples[0:NB_SAMPLES], self.labels) + self.assertAlmostEqual(accuracy * 100, ACCURACIES[0], msg='base accuracy does not meet the baseline !', delta=0.1) + + def test_quant_model(self): + + # create the calibration dataset + + tensors = [] + for sample in self.samples[0:NB_SAMPLES]: + sample = prepare_sample(sample) + tensor = aidge_core.Tensor(sample) + tensors.append(tensor) + + # quantize the model + + aidge_quantization.quantize_network( + self.model, + NB_BITS, + tensors, + clipping_mode=aidge_quantization.Clipping.MSE, + apply_rounding=True, + optimize_signs=True, + single_shift=False + ) + + # rescale the inputs + + scaling = 2**(NB_BITS-1)-1 + for i in range(NB_SAMPLES): + self.samples[i] = self.samples[i]*scaling # XXX np.round ??? 
+ + # compute the quantized accuracy + + accuracy = compute_accuracy(self.model, self.samples, self.labels) + self.assertAlmostEqual(accuracy * 100, ACCURACIES[1], msg='quantized accuracy does not meet the baseline !', delta=0.1) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/include/aidge/QuantPTQ.hpp b/include/aidge/QuantPTQ.hpp deleted file mode 100644 index fdc9300e69ba0873409c5eed60e31773166a6947..0000000000000000000000000000000000000000 --- a/include/aidge/QuantPTQ.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2023 CEA-List - * - * This program and the accompanying materials are made available under the - * terms of the Eclipse Public License 2.0 which is available at - * http://www.eclipse.org/legal/epl-2.0. - * - * SPDX-License-Identifier: EPL-2.0 - * - ********************************************************************************/ - -#ifndef AIDGE_QUANTIZATION_QUANTPTQ_H_ -#define AIDGE_QUANTIZATION_QUANTPTQ_H_ - -#include <cstdint> // std::uint8_t -#include <map> -#include <memory> -#include <string> -#include <vector> - -#include "aidge/data/Tensor.hpp" -#include "aidge/graph/GraphView.hpp" - -namespace Aidge { - - /** - * @brief Determine whether an input GraphView can be quantized or not. - * @param graphView The GraphView to be checked. - * @return True if the GraphView can be quantized, else false. - */ - bool checkArchitecture(std::shared_ptr<GraphView> graphView); - - /** - * @brief Insert a scaling node after each affine node of the GraphView. - * @param graphView The GraphView containing the affine nodes. - */ - void insertScalingNodes(std::shared_ptr<GraphView> graphView); - - /** - * @brief Normalize the parameters of each parametrized node, so that they fit in the [-1:1] range. - * @param graphView The GraphView containing the parametrized nodes. 
- */ - void normalizeParameters(std::shared_ptr<GraphView> graphView); - - /** - * @brief Compute the value ranges of every affine node output, given an input dataset. - * @param graphView The GraphView containing the affine nodes, on which the inferences are performed. - * @param inputDataSet The input dataset, consisting of a vector of input samples. - * @return A map associating each affine node name to it's corresponding output range. - */ - std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet); - - /** - * @brief Normalize the activations of each affine node so that it become equal to one. - * This is done by reconfiguring the scaling nodes, as well as rescaling the weights and biases tensors. - * @param graphView The GraphView containing the affine nodes. - * @param valueRanges The node output value ranges computed over the calibration dataset. - */ - void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::string, float> valueRanges); - - - /** - * @brief Quantize an already normalized (in term of parameters and activations) network. - * @param graphView The GraphView to be quantized. - * @param nbBits The desired number of bits of the quantization. - */ - void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits); - - /** - * @brief Main quantization routine. Performs every step of the quantization pipeline. - * @param graphView The GraphView to be quantized. - * @param nbBits The desired number of bits of the quantization. - * @param inputDataSet The input dataset on which the value ranges are computed. - */ - void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool OptimizeCliping); - - /** - * @brief Compute the weight ranges of every affine node. Provided for debuging purposes. - * @param graphView The GraphView containing the affine nodes. 
- * @return A map associating each affine node name to it's corresponding weight range. - */ - std::map<std::string, float> getWeightRanges(std::shared_ptr<GraphView> graphView); - - /** - * @brief Clear the affine nodes biases. Provided form debuging purposes. - * @param graphView The GraphView containing the affine nodes. - */ - void clearBiases(std::shared_ptr<GraphView> graphView); - - void devPTQ(std::shared_ptr<GraphView> graphView); - - std::map<std::string, std::vector<int>> computeScalingHistograms(std::map<std::string, float> valueRanges, int nbBins, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet); - - float computeBestClipping(std::vector<int> histogram, std::uint8_t nbBits); - -} - -#endif /* AIDGE_QUANTIZATION_QUANTPTQ_H_ */ - diff --git a/include/aidge/quantization/PTQ/CLE.hpp b/include/aidge/quantization/PTQ/CLE.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d94b6e930209450bc5d33331832b81a2623c56a7 --- /dev/null +++ b/include/aidge/quantization/PTQ/CLE.hpp @@ -0,0 +1,37 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_QUANTIZATION_PTQ_CLE_H_ +#define AIDGE_QUANTIZATION_PTQ_CLE_H_ + +//#include <cstdint> +//#include <map> +//#include <memory> +//#include <string> +//#include <vector> + +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/GraphView.hpp" + +namespace Aidge +{ + + /** + * @brief Equalize the ranges of the nodes parameters by proceding iteratively. + * Can only be applied to single branch networks (otherwise does not edit the graphView). 
+ * @param graphView The GraphView to process. + * @param targetDelta the stopping criterion (typical value : 0.01) + */ + void crossLayerEqualization(std::shared_ptr<GraphView> graphView, float targetDelta = 0.01); + +} + +#endif /* AIDGE_QUANTIZATION_PTQ_CLE_H_ */ \ No newline at end of file diff --git a/include/aidge/quantization/PTQ/Clip.hpp b/include/aidge/quantization/PTQ/Clip.hpp new file mode 100644 index 0000000000000000000000000000000000000000..740f044faa9be15ab093d471bda0cabb04f3ce53 --- /dev/null +++ b/include/aidge/quantization/PTQ/Clip.hpp @@ -0,0 +1,75 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_QUANTIZATION_PTQ_CLIP_H_ +#define AIDGE_QUANTIZATION_PTQ_CLIP_H_ + +//#include <cstdint> +//#include <map> +//#include <memory> +//#include <string> +//#include <vector> + +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/GraphView.hpp" + +namespace Aidge +{ + /** + * @brief Kind of clipping policy to apply during the activation quantization + */ + enum Clipping {MAX = 1, MSE, AA, KL}; + + /** + * @brief Compute the histograms of the activations of each node contained in the map of the ranges (passed as argument). + * @param valueRanges A map associating each considered node name to its corresponding output range. + * @param nbBins Desired number of bins of the returned histograms. + * @param graphView The GraphView containing the considered nodes. + * @param inputDataSet The input dataset, consisting of a vector of input samples. + * @return A map associating each node name to it's corresponding activation histogram. 
+ */ + std::map<std::string, std::vector<int>> computeHistograms(std::map<std::string, float> valueRanges, int nbBins, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet); + + /** + * @brief Given an input activation histogram, compute the optimal clipping value in the sense of the Lp norm. + * @param histogram: The provided activation histogram. + * @param nbBits: The quantization number of bits. + * @param exponent: The exponent of the Lp norm (e.g. 2 for the MSE). + * @return The optimal clipping value. + */ + float computeMEClipping(std::vector<int> histogram, std::uint8_t nbBits, float exponent); + + /** + * @brief Given an input activation histogram, compute the optimal clipping value in the sense of the KL divergence. + * @param histogram: The provided activation histogram. + * @param nbBits: The quantization number of bits. + * @return The optimal clipping value. + */ + float computeKLClipping(std::vector<int> histogram, std::uint8_t nbBits); + + /** + * @brief Return a corrected map of the provided activation ranges. + * To do so compute the optimal clipping values for every node and multiply the input ranges by those values. + * The method used to compute the clippings can be either 'MSE', 'AA', 'KL' or 'MAX'. + * @param clippingMode The method used to compute the optimal clippings. + * @param valueRanges The map associating each affine node to its output range. + * @param nbBits The quantization number of bits. + * @param graphView The GraphView containing the considered nodes. + * @param inputDataSet The input dataset, consisting of a vector of input samples. + * @param verbose Whether to print the clipping values or not. + * @return The corrected map associating each provided node to its clipped range. 
+ */ + std::map<std::string, float> adjustRanges(Clipping clippingMode, std::map<std::string, float> valueRanges, std::uint8_t nbBits, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool verbose); + +} + +#endif /* AIDGE_QUANTIZATION_PTQ_CLIP_H_ */ + diff --git a/include/aidge/quantization/PTQ/PTQ.hpp b/include/aidge/quantization/PTQ/PTQ.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d24831c3460aac037091636450a7290c47e1775b --- /dev/null +++ b/include/aidge/quantization/PTQ/PTQ.hpp @@ -0,0 +1,161 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#ifndef AIDGE_QUANTIZATION_PTQ_PTQ_H_ +#define AIDGE_QUANTIZATION_PTQ_PTQ_H_ + +//#include <cstdint> +//#include <map> +//#include <memory> +//#include <string> +//#include <vector> + +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/GraphView.hpp" + +namespace Aidge { + + /** + * @brief Set of the types of the nodes which contain affine transforms (that is Y = A.X + B) + */ + static const std::set<std::string> affineNodeTypes({"FC", "Conv", "ConvDepthWise", "PaddedConv", "PaddedConvDepthWise"}); + + /** + * @brief Set of the types of the nodes which does not affect the PTQ process + */ + static const std::set<std::string> seamlessNodeTypes({"Pad", "MaxPooling", "AvgPooling", "PaddedMaxPooling", "PaddedAvgPooling", "GlobalAveragePooling", "Reshape", "Transpose", "Gather"}); + + /** + * @brief Set of the types of the nodes that merge multiple branches into one + */ + static const std::set<std::string> mergingNodeTypes({"Add", "Concat", "Sub"}); + + /** + * @brief 
Determine if a node contains an affine transform (that is Y = A.X + B) + * @param node The node to be checked + * @return True if the node is affine, else false. + */ + bool isAffine(std::shared_ptr<Node> node); + + /** + * @brief Determine if a node contains an operator that does not affect the PTQ process + * @param node The node to be checked + * @return True if the node is seamless, else false. + */ + bool isSeamless(std::shared_ptr<Node> node); + + /** + * @brief Determine if a node contains an operator that merges multiple branches into one + * @param node The node to be checked + * @return True if the node is merging, else false. + */ + bool isMerging(std::shared_ptr<Node> node); + + /** + * @brief Retrieve the scheduled vector of nodes of a graphView, without the Producer nodes. + * @param graphView The graphView containing the nodes + * @param verbose Whether to print the node vector or not + * @return The scheduled vector of nodes + */ + std::vector<std::shared_ptr<Node>> retrieveNodeVector(std::shared_ptr<GraphView> graphView, bool newSchedule = true, bool verbose = false); + + /** + * @brief Determine whether an input GraphView can be quantized or not. + * @param graphView The GraphView to be checked. + * @return True if the GraphView can be quantized, else false. + */ + bool checkArchitecture(std::shared_ptr<GraphView> graphView); + + /** + * @brief Insert a scaling node after each affine node of the GraphView. + * Also insert a scaling node in every purely residual branch. + * @param graphView The GraphView containing the affine nodes. + */ + void insertScalingNodes(std::shared_ptr<GraphView> graphView); + + /** + * @brief Normalize the parameters of each parametrized node, so that they fit in the [-1:1] range. + * @param graphView The GraphView containing the parametrized nodes. + */ + void normalizeParameters(std::shared_ptr<GraphView> graphView); + + /** + * @brief Compute the activation ranges of every affine node, given an input dataset. 
+ * @param graphView The GraphView containing the affine nodes, on which the inferences are performed. + * @param inputDataSet The input dataset, consisting of a vector of input samples. + * @param scalingNodesOnly Whether to restrict the retrieval of the ranges to scaling nodes only or not. + * @return A map associating each affine node name to its corresponding output range. + */ + std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool scalingNodesOnly); + + /** + * @brief Normalize the activations of each affine node so that they fit in the [-1:1] range. + * This is done by reconfiguring the scaling nodes, as well as rescaling the weights and biases tensors. + * @param graphView The GraphView containing the affine nodes. + * @param valueRanges The node output value ranges computed over the calibration dataset. + */ + void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::string, float> valueRanges); + + /** + * @brief For each node, compute the sign of its input and output values. + * The goal of the routine is to maximize the number of unsigned IOs in order to double the value resolution when possible. + * @param graphView The GraphView to analyze. + * @param verbose Whether to print the sign map or not. + * @return A map associating a pair of signs to each node of the GraphView (a sign for the input and one for the output). + */ + std::map<std::string, std::pair<bool, bool>> computeSignMap(std::shared_ptr<GraphView> graphView, bool verbose); + + /** + * @brief Quantize an already normalized (in terms of parameters and activations) network. + * @param graphView The GraphView to be quantized. + * @param nbBits The desired number of bits of the quantization. + * @param applyRounding Whether to apply the rounding operations or not. + * @param optimizeSigns Whether to take account of the IO signs of the operators or not. 
+ * @param verbose Whether to print the sign map or not. + */ + void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, bool applyRounding, bool optimizeSigns, bool verbose); + + /** + * @brief Main quantization routine. Performs every step of the quantization pipeline. + * @param graphView The GraphView to be quantized. + * @param nbBits The desired number of bits of the quantization. + * @param inputDataSet The input dataset on which the value ranges are computed. + * @param clippingMode Type of the clipping optimization. Can be either 'MAX', 'MSE', 'AA' or 'KL'. + * @param applyRounding Whether to apply the rounding operations or not. + * @param optimizeSigns Whether to take account of the IO signs of the operators or not. + * @param singleShift Whether to convert the scaling factors into powers of two. If true the approximations are compensated using the previous nodes weights. + * @param verbose Whether to print internal information about the quantization process. + */ + void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, Clipping clippingMode, bool applyRounding, bool optimizeSigns, bool singleShift, bool verbose); + + /** + * @brief Compute the weight ranges of every affine node. Provided for debugging purposes. + * @param graphView The GraphView containing the affine nodes. + * @return A map associating each affine node name to its corresponding weight range. + */ + std::map<std::string, float> getWeightRanges(std::shared_ptr<GraphView> graphView); + + /** + * @brief Clear the affine nodes biases. Provided for debugging purposes. + * @param graphView The GraphView containing the affine nodes. + */ + void clearBiases(std::shared_ptr<GraphView> graphView); + + /** + * @brief Development and test routine. + * @param graphView The GraphView under test. 
namespace Aidge {
/**
 * @brief Register the post-training quantization (PTQ) API on a Python module.
 *
 * Exposes the `Clipping` enum, then binds each C++ routine under a snake_case
 * Python name with named arguments. The raw string attached to every `m.def`
 * is the docstring that is visible from Python.
 *
 * @param m The pybind11 module that receives the bindings.
 */
void init_QuantPTQ(py::module &m) {

    // Clipping policies selectable from Python, one value per `Clipping` enumerator.
    py::enum_<Clipping>(m, "Clipping", "Kind of clipping policy to apply during the activation quantization")
        .value("MAX", Clipping::MAX)
        .value("MSE", Clipping::MSE)
        .value("AA" , Clipping::AA)
        .value("KL" , Clipping::KL);

    // ---- Step-by-step pipeline -------------------------------------------

    m.def("check_architecture", &checkArchitecture, py::arg("network"),
    R"mydelimiter(
    Determine whether an input GraphView can be quantized or not.
    :param network: The GraphView to be checked.
    :type network: :py:class:`aidge_core.GraphView`
    :return: True if the GraphView can be quantized, else False.
    :rtype: bool
    )mydelimiter");

    m.def("insert_scaling_nodes", &insertScalingNodes, py::arg("network"),
    R"mydelimiter(
    Insert a scaling node after each affine node of the GraphView.
    Also insert a scaling node in every purely residual branches.
    :param network: The GraphView containing the affine nodes.
    :type network: :py:class:`aidge_core.GraphView`
    )mydelimiter");

    m.def("normalize_parameters", &normalizeParameters, py::arg("network"),
    R"mydelimiter(
    Normalize the parameters of each parametrized node, so that they fit in the [-1:1] range.
    :param network: The GraphView containing the parametrized nodes.
    :type network: :py:class:`aidge_core.GraphView`
    )mydelimiter");

    // NOTE(review): docstring typos below ("retreival", "it's corresponding") are
    // runtime text and were left untouched here.
    m.def("compute_ranges", &computeRanges, py::arg("network"), py::arg("input_dataset"), py::arg("scaling_nodes_only"),
    R"mydelimiter(
    Compute the activation ranges of every affine node, given an input dataset.
    :param network: The GraphView containing the affine nodes, on which the inferences are performed.
    :type network: :py:class:`aidge_core.GraphView`
    :param input_dataset: The input dataset, consisting of a vector of input samples.
    :type input_dataset: list of :py:class:`aidge_core.Tensor`
    :param scaling_nodes_only: Whether to restrain the retreival of the ranges to scaling nodes only or not
    :type scaling_nodes_only: bool
    :return: A map associating each considered node name to it's corresponding output range.
    :rtype: dict
    )mydelimiter");

    // NOTE(review): `value_ranges` is documented as "list of float." while the other
    // bindings describe the ranges as a dict keyed by node name — presumably this
    // should read dict too; confirm against the normalizeActivations signature.
    m.def("normalize_activations", &normalizeActivations, py::arg("network"), py::arg("value_ranges"),
    R"mydelimiter(
    Normalize the activations of each affine node so that they fit in the [-1:1] range.
    This is done by reconfiguring the scaling nodes, as well as rescaling the weights and biases tensors.
    :param network: The GraphView containing the affine nodes.
    :type network: :py:class:`aidge_core.GraphView`
    :param value_ranges: The node output value ranges computed over the calibration dataset.
    :type value_ranges: list of float.
    )mydelimiter");

    m.def("quantize_normalized_network", &quantizeNormalizedNetwork, py::arg("network"), py::arg("nb_bits"), py::arg("apply_rounding"), py::arg("optimize_signs"), py::arg("verbose"),
    R"mydelimiter(
    Quantize an already normalized (in term of parameters and activations) network.
    :param network: The GraphView to be quantized.
    :type network: :py:class:`aidge_core.GraphView`
    :param nb_bits: The desired number of bits of the quantization.
    :type nb_bits: int
    :param apply_rounding: Whether to apply the rounding operations or not.
    :type apply_rounding: bool
    :param optimize_signs: Whether to take account of the IO signs of the operators or not.
    :type optimize_signs: bool
    :param verbose: Whether to print the sign map or not.
    :type verbose: bool
    )mydelimiter");

    // ---- All-in-one routine ----------------------------------------------

    // This is the only binding that declares default argument values.
    // NOTE(review): the default for `clipping_mode` is the string "MAX", while this
    // module exposes a `Clipping` enum and adjust_ranges documents its mode as an
    // enum. If quantizeNetwork takes a `Clipping`, the string default cannot be
    // converted when the argument is omitted — confirm against its signature.
    m.def("quantize_network", &quantizeNetwork ,py::arg("network"), py::arg("nb_bits"), py::arg("input_dataset"), py::arg("clipping_mode") = "MAX", py::arg("apply_rounding") = true, py::arg("optimize_signs") = false, py::arg("single_shift") = false, py::arg("verbose") = false,
    R"mydelimiter(
    Main quantization routine. Performs every step of the quantization pipeline.
    :param network: The GraphView to be quantized.
    :type network: :py:class:`aidge_core.GraphView`
    :param nb_bits: The desired number of bits of the quantization.
    :type nb_bits: int
    :param input_dataset: The input dataset on which the value ranges are computed.
    :type input_dataset: list of :py:class:`aidge_core.Tensor`
    :param clipping_mode: Type of the clipping optimization. Can be either 'MAX', 'MSE', 'AA' or 'KL'.
    :type clipping_mode: string
    :param apply_rounding: Whether to apply the rounding operations or not.
    :type apply_rounding: bool
    :param optimize_signs: Whether to take account of the IO signs of the operators or not.
    :type optimize_signs: bool
    :param single_shift: Whether to convert the scaling factors into powers of two. If true the approximations are compensated using the previous nodes weights.
    :type single_shift: bool
    :param verbose: Whether to print internal informations about the quantization process.
    :type verbose: bool
    )mydelimiter");

    // ---- Clipping helpers ------------------------------------------------

    m.def("compute_histograms", &computeHistograms, py::arg("value_ranges"), py::arg("nb_bins"), py::arg("network"), py::arg("input_dataset"),
    R"mydelimiter(
    Compute the histograms of the activations of each node contained in the map of the ranges (passed as argument).
    :param value_ranges: A map associating each considered node name to its corresponding output range.
    :type value_ranges: dict
    :param nb_bins: Desired number of bins of the returned histograms.
    :type nb_bins: int
    :param network: The GraphView containing the considered nodes.
    :type network: :py:class:`aidge_core.GraphView`
    :param input_dataset: The input dataset, consisting of a list of input samples.
    :type input_dataset: list of :py:class:`aidge_core.Tensor`
    :return: A map associating each node name to it's corresponding activation histogram.
    :rtype: dict
    )mydelimiter");

    // NOTE(review): `exponent` is documented as int; adjustRanges calls the C++
    // routine with 2.0 and 1.0, so it is presumably a float — confirm.
    m.def("compute_me_clipping", &computeMEClipping, py::arg("histogram"), py::arg("nb_bits"), py::arg("exponent"),
    R"mydelimiter(
    Given an input activation histogram, compute the optimal clipping value in the sense of the Lp norm.
    :param histogram: The provided activation histogram.
    :type histogram: list
    :param nb_bits: The quantization number of bits.
    :type nb_bits: int
    :param exponent: The exponent of the Lp norm (e.g. 2 for the MSE).
    :type exponent: int
    :return: The optimal clipping value.
    :rtype: float
    )mydelimiter");

    m.def("compute_kl_clipping", &computeKLClipping, py::arg("histogram"), py::arg("nb_bits"),
    R"mydelimiter(
    Given an input activation histogram, compute the optimal clipping value in the sense of the KL divergence.
    :param histogram: The provided activation histogram.
    :type histogram: list
    :param nb_bits: The quantization number of bits.
    :type nb_bits: int
    :return: The optimal clipping value.
    :rtype: float
    )mydelimiter");

    // NOTE(review): docstring typo below ("eihter") is runtime text, left untouched here.
    m.def("adjust_ranges", &adjustRanges, py::arg("clipping_mode"), py::arg("value_ranges"), py::arg("nb_bits"), py::arg("network"), py::arg("input_dataset"), py::arg("verbose"),
    R"mydelimiter(
    Return a corrected map of the provided activation ranges.
    To do so compute the optimal clipping values for every node and multiply the input ranges by those values.
    The method used to compute the clippings can be eihter 'MSE', 'AA', 'KL' or 'MAX'.
    :param clipping_mode: The method used to compute the optimal clippings.
    :type clipping_mode: enum
    :param value_ranges: The map associating each affine node to its output range.
    :type value_ranges: dict
    :param nb_bits: The quantization number of bits.
    :type nb_bits: int
    :param network: The GraphView containing the considered nodes.
    :type network: :py:class:`aidge_core.GraphView`
    :param input_dataset: The input dataset, consisting of a list of input samples.
    :type input_dataset: list of :py:class:`aidge_core.Tensor`
    :param verbose: Whether to print the clipping values or not.
    :type verbose: bool
    :return: The corrected map associating to each provided node its clipped range.
    :rtype: dict
    )mydelimiter");


    // ---- Sign analysis and cross-layer equalization -----------------------

    m.def("compute_sign_map", &computeSignMap, py::arg("network"), py::arg("verbose"),
    R"mydelimiter(
    For each node, compute the sign of its input and output values.
    The goal of the routine is to maximize the number of unsigned IOs in order to double the value resolution when possible.
    :param network: The GraphView to analyze.
    :type network: :py:class:`aidge_core.GraphView`
    :param verbose: Whether to print the sign map or not.
    :type verbose: bool
    :return: A map associating a pair of signs to each node of the GraphView (a sign for the input and one for the output).
    :rtype: dict
    )mydelimiter");

    m.def("cross_layer_equalization", &crossLayerEqualization, py::arg("network"), py::arg("target_delta"),
    R"mydelimiter(
    Equalize the ranges of the nodes parameters by proceding iteratively.
    Can only be applied to single branch networks (otherwise does not edit the graphView).
    :param network: The GraphView to process.
    :type network: :py:class:`aidge_core.GraphView`
    :param target_delta: the stopping criterion (typical value : 0.01)
    :type target_delta: float
    )mydelimiter");

    // ---- Debugging and development helpers --------------------------------

    m.def("get_weight_ranges", &getWeightRanges, py::arg("network"),
    R"mydelimiter(
    Compute the weight ranges of every affine nodes. Provided for debugging purposes.
    :param network: The GraphView containing the affine nodes.
    :type network: :py:class:`aidge_core.GraphView`
    :return: A map associating each affine node name to it's corresponding weight range.
    :rtype: dict
    )mydelimiter");

    // NOTE(review): docstring typo below ("Provided form") is runtime text, left untouched here.
    m.def("clear_biases", &clearBiases, py::arg("network"),
    R"mydelimiter(
    Clear the affine nodes biases. Provided form debugging purposes.
    :param network: The GraphView containing the affine nodes.
    :type network: :py:class:`aidge_core.GraphView`
    )mydelimiter");

    m.def("dev_ptq", &devPTQ, py::arg("network"),
    R"mydelimiter(
    Developement and test routine.
    :param network: The GraphView under test.
    :type network: :py:class:`aidge_core.GraphView`
    )mydelimiter");
}

// Entry point of the `aidge_quantization` Python extension: registers the PTQ bindings.
PYBIND11_MODULE(aidge_quantization, m) {
    init_QuantPTQ(m);
}

} // namespace Aidge
- :param network: The GraphView containing the parametrized nodes. - :type network: :py:class:`aidge_core.GraphView` - )mydelimiter"); - - m.def("compute_ranges", &computeRanges, py::arg("network"), py::arg("input_dataset"), - R"mydelimiter( - Compute the value ranges of every affine node output, given an input dataset. - :param network: The GraphView containing the affine nodes, on which the inferences are performed. - :type network: :py:class:`aidge_core.GraphView` - :param input_dataset: inputDataSet The input dataset, consisting of a vector of input samples. - :type input_dataset: A list of :py:class:`aidge_core.Tensor` - :return: A map associating each affine node name to it's corresponding output range. - :rtype: dict - )mydelimiter"); - - m.def("normalize_activations", &normalizeActivations, py::arg("network"), py::arg("value_ranges"), - R"mydelimiter( - Normalize the activations of each affine node so that it become equal to one. - This is done by reconfiguring the scaling nodes, as well as rescaling the weights and biases tensors. - :param network: The GraphView containing the affine nodes. - :type network: :py:class:`aidge_core.GraphView` - :param value_ranges: The node output value ranges computed over the calibration dataset. - :type value_ranges: list of float. - )mydelimiter"); - - m.def("quantize_normalized_network", &quantizeNormalizedNetwork, py::arg("network"), py::arg("nb_bits"), - R"mydelimiter( - Quantize an already normalized (in term of parameters and activations) network. - :param network: The GraphView to be quantized. - :type network: :py:class:`aidge_core.GraphView` - :param nb_bits: The desired number of bits of the quantization. - :type nb_bits: int - )mydelimiter"); - - m.def("quantize_network", &quantizeNetwork ,py::arg("network"), py::arg("nb_bits"), py::arg("input_dataset"), py::arg("optimize_cliping") = false, - R"mydelimiter( - Main quantization routine. Performs every step of the quantization pipeline. 
- :param network: The GraphView to be quantized. - :type network: :py:class:`aidge_core.GraphView` - :param nb_bits: The desired number of bits of the quantization. - :type nb_bits: int - :param input_dataset: The input dataset on which the value ranges are computed. - :type input_dataset: list of :py:class:`aidge_core.Tensor` - :param optimize_cliping: Whether to optimize the cliping values or not. - :type optimize_cliping: bool - )mydelimiter"); - - m.def("get_weight_ranges", &getWeightRanges, py::arg("network"), - R"mydelimiter( - Compute the weight ranges of every affine node. Provided for debuging purposes. - :param network: graphView The GraphView containing the affine nodes. - :type network: :py:class:`aidge_core.GraphView` - :return: A map associating each affine node name to it's corresponding weight range. - :rtype: dict - )mydelimiter"); - - m.def("clear_biases", &clearBiases, py::arg("network"), - R"mydelimiter( - Clear the affine nodes biases. Provided form debugging purposes. - :param network: The GraphView containing the affine nodes. 
- :type network: :py:class:`aidge_core.GraphView` - )mydelimiter"); - - m.def("compute_scaling_histograms", &computeScalingHistograms, py::arg("value_ranges"), py::arg("nb_bins"), py::arg("network"), py::arg("input_dataset"), "compute scaling histogram"); - m.def("compute_best_clipping", &computeBestClipping, py::arg("histogram"), py::arg("nb_bits"), "compute the best clipping for an histogram"); - - m.def("dev_ptq", &devPTQ, py::arg("network"), "dev ptq"); -} - -PYBIND11_MODULE(aidge_quantization, m) { - init_QuantPTQ(m); -} - -} // namespace Aidge diff --git a/src/PTQ/CLE.cpp b/src/PTQ/CLE.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df9b31edc614a8d220747d36832aff710f663bff --- /dev/null +++ b/src/PTQ/CLE.cpp @@ -0,0 +1,116 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/quantization/PTQ/CLE.hpp" +#include "aidge/quantization/PTQ/Clip.hpp" +#include "aidge/quantization/PTQ/PTQ.hpp" + +#include "aidge/graph/GraphView.hpp" +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/scheduler/Scheduler.hpp" +#include "aidge/utils/Log.hpp" +#include "aidge/operator/OperatorTensor.hpp" + +namespace Aidge +{ + +static std::shared_ptr<Tensor> getWeightTensor(std::shared_ptr<Node> node) +{ + return std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); +} + +static std::shared_ptr<Tensor> getBiasTensor(std::shared_ptr<Node> node) +{ + return std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); +} + +static void rescaleTensor(std::shared_ptr<Tensor> tensor, float scaling) +{ + // Get the tensor data pointer + float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); + + // Rescale the tensor + for(std::size_t i = 0; i < tensor->size(); i++) + castedTensor[i] *= scaling; +} + +static float getTensorAbsoluteMax(std::shared_ptr <Tensor> tensor) +{ + // Get the tensor data pointer and edit it + float * castedTensor = static_cast<float*>(tensor->getImpl()->rawPtr()); + + // Get the tensor absolute max value + float maxValue = 0.0f; + for(std::size_t i = 0; i < tensor->size(); ++i) { + if(std::fabs(castedTensor[i]) > maxValue) { + maxValue = std::fabs(castedTensor[i]); + } + } + return maxValue; +} + +void crossLayerEqualization(std::shared_ptr<GraphView> graphView, float targetDelta) +{ + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + // Check if the CLE can be applied ... + + for (std::shared_ptr<Node> node : nodeVector) + if (node->getChildren().size() > 1) + { + Log::info(" Network have multiple branches, skipping the CLE ... 
"); + return; + } + + Log::info(" Applying the Cross-Layer Equalization ... "); + + // Get the vector of affine nodes + + std::vector<std::shared_ptr<Node>> affineNodeVector; + for (std::shared_ptr<Node> node : nodeVector) + if (isAffine(node)) + affineNodeVector.push_back(node); + + float maxRangeDelta; + + do + { + maxRangeDelta = 0.0; + /* + std::cout << " ----- " << std::endl; + for (std::shared_ptr<Node> node : affineNodeVector) + std::cout << getTensorAbsoluteMax(getWeightTensor(node)) << std::endl; + */ + for (size_t i = 0; i < (affineNodeVector.size() - 1); i++) + { + std::shared_ptr<Node> n1 = affineNodeVector[i]; + std::shared_ptr<Node> n2 = affineNodeVector[i+1]; + + float r1 = getTensorAbsoluteMax(getWeightTensor(n1)); + float r2 = getTensorAbsoluteMax(getWeightTensor(n2)); + + float s1 = std::sqrt(r1 * r2) / r1; + float s2 = std::sqrt(r1 * r2) / r2; + + rescaleTensor(getWeightTensor(n1), s1); + rescaleTensor(getWeightTensor(n2), s2); + + rescaleTensor(getBiasTensor(n1), s1); + + float rangeDelta = std::abs(r1 - r2); + if (rangeDelta > maxRangeDelta) + maxRangeDelta = rangeDelta; + } + } + while (maxRangeDelta > targetDelta); +} + +} \ No newline at end of file diff --git a/src/PTQ/Clip.cpp b/src/PTQ/Clip.cpp new file mode 100644 index 0000000000000000000000000000000000000000..502685b12105c4746b0a509525a14cabce3a6eff --- /dev/null +++ b/src/PTQ/Clip.cpp @@ -0,0 +1,228 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/quantization/PTQ/CLE.hpp" +#include "aidge/quantization/PTQ/Clip.hpp" +#include "aidge/quantization/PTQ/PTQ.hpp" + +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/scheduler/Scheduler.hpp" + +namespace Aidge +{ + +std::map<std::string, std::vector<int>> computeHistograms(std::map<std::string, float> valueRanges, int nbBins, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet) +{ + std::shared_ptr<Node> firstNode = retrieveNodeVector(graphView)[0]; + + //std::cout << " COMPUTING HISTOGRAMS ... " << std::endl; + + std::map<std::string, std::vector<int>> histograms; + + SequentialScheduler scheduler(graphView); + scheduler.resetScheduling(); + + // Setup the histograms ... + + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + bool isInsideRanges = (valueRanges.find(node->name()) != valueRanges.end()); + if (isInsideRanges) + { + std::vector<int> histogram; + for (int i = 0; i < nbBins; i++) + histogram.push_back(0); + + histograms.insert(std::make_pair(node->name(), histogram)); + } + } + + // Fill the histograms ... + + scheduler.resetScheduling(); + + int it = 0; + + for (std::shared_ptr<Tensor> inputTensor : inputDataSet) + { + Log::info(" IT (BIS) : {}", it++); + + // Inference ... + + scheduler.forward(true, {inputTensor}); + + // Gather values ... 
+ for (std::shared_ptr<Node> node : graphView->getNodes()) + { + bool isInsideRanges = (valueRanges.find(node->name()) != valueRanges.end()); + if (isInsideRanges) + { + float valueRange = valueRanges[node->name()]; + + std::shared_ptr<Operator> nodeOperator = node->getOperator(); + std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); + + float * castedTensor = static_cast<float *> (valueTensor->getImpl()->rawPtr()); + for(std::size_t i = 0; i < valueTensor->size(); i++) + { + int bin = std::round(std::abs(castedTensor[i] / valueRange * nbBins)); + histograms[node->name()][bin]++; + } + } + } + } + + return histograms; +} + +float computeMEClipping(std::vector<int> histogram, std::uint8_t nbBits, float exponent) +{ + int nbBins = histogram.size(); + int nbIter = 100; + int signedMax = (1 << (nbBits - 1)) - 1; + + std::vector<float> clippingErrors; + for (int it = 1; it < nbIter; it++) + { + // Compute the rounding cost of this particular clipping ... 
+ float accumulatedError = 0.0; + float clipping = it / static_cast<float> (nbIter); + for (int bin = 0; bin < nbBins; bin++) + { + float value = (bin + 0.5) / nbBins; + float scaling = signedMax / clipping; + float rounded = std::round(value * scaling) / scaling; + float clipped = std::min(clipping, rounded); + + float approxError = std::abs(clipped - value); + accumulatedError += std::pow(approxError, exponent) * histogram[bin]; + } + clippingErrors.push_back(accumulatedError); + } + + std::vector<float>::iterator it = std::min_element(clippingErrors.begin(), clippingErrors.end()); + float bestClipping = static_cast<float> (std::distance(clippingErrors.begin(), it)) / static_cast<float> (nbIter); + + return bestClipping; +} + +float computeKLClipping(std::vector<int> refHistogram, std::uint8_t nbBits) +{ + // KL Clipping + + int nbIter = 100; + int signedMax = (1 << (nbBits - 1)) - 1; + + float refNorm = 0; + for (int n : refHistogram) + refNorm += static_cast<float> (n); + + std::vector<float> clippingErrors; + for (int it = 1; it < nbIter; it++) + { + float clipping = it / static_cast<float> (nbIter); + + // Create the histogram for this particular clipping ... 
+ + std::vector<int> quantHistogram; + for (int i = 0; i < signedMax; i++) + quantHistogram.push_back(0); + + for (std::size_t refBin = 0; refBin < refHistogram.size(); refBin++) + { + float value = (static_cast<float> (refBin) + 0.5f) / static_cast<float> (refHistogram.size()); + int quantBin = std::floor(value / clipping * signedMax); + quantBin = std::min(quantBin, signedMax-1); + quantHistogram[quantBin] += refHistogram[refBin]; + } + + // Compute the mass of the histogram + + float quantNorm = 0; + for (std::size_t refBin = 0; refBin < refHistogram.size(); refBin++) + { + float value = (static_cast<float> (refBin) + 0.5f) / static_cast<float> (refHistogram.size()); + int quantBin = std::floor(value / clipping * signedMax); + if (quantBin < static_cast<int> (quantHistogram.size())) + quantNorm += quantHistogram[quantBin]; + } + + // Compute the KL divergence + + float accumulatedError = 0.0; + for (std::size_t refBin = 0; refBin < refHistogram.size(); refBin++) + { + float value = (static_cast<float> (refBin) + 0.5f) / static_cast<float> (refHistogram.size()); + int quantBin = std::floor(value / clipping * signedMax); + + float p = static_cast<float> (refHistogram[refBin]) / refNorm; + float q = (quantBin < static_cast<int> (quantHistogram.size())) ? 
+ static_cast<float> (quantHistogram[quantBin]) / quantNorm : 0; + + if (p != 0 && q != 0) + accumulatedError += q * std::log(q / p); + } + + clippingErrors.push_back(accumulatedError); + } + + std::vector<float>::iterator it = std::min_element(clippingErrors.begin() + 1, clippingErrors.end()); + float bestClipping = static_cast<float> (std::distance(clippingErrors.begin(), it)) / static_cast<float> (nbIter); + + return bestClipping; +} + + +std::map<std::string, float> adjustRanges(Clipping clippingMode, std::map<std::string, float> valueRanges, std::uint8_t nbBits, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool verbose) +{ + /* + std::set<std::string> supportedModes({"MSE", "KL", "MAX", "AA"}); + bool isSupported = supportedModes.find(clippingMode) != supportedModes.end(); + if (!isSupported) + { + Log::info(" Clipping mode '{}' is not supported. No clipping will be applied ...", clippingMode); + return valueRanges; + } + */ + + float clipping = 1.0f; + int nbBins = (1 << (nbBits + 4)) ; // XXX Enhance this !!! 
+ + if (clippingMode != Clipping::MAX) + { + if (verbose) + Log::info(" === CLIPPING VALUES === "); + + std::map<std::string, std::vector<int>> histograms = computeHistograms(valueRanges, nbBins, graphView, inputDataSet); + + for (std::shared_ptr<Node> node : graphView->getNodes()) + if (node->type() == "Scaling") + { + std::vector<int> histogram = histograms[node->name()]; + + if (clippingMode == Clipping::MSE) + clipping = computeMEClipping(histogram, nbBits, 2.0); + if (clippingMode == Clipping::AA) + clipping = computeMEClipping(histogram, nbBits, 1.0); + if (clippingMode == Clipping::KL) + clipping = computeKLClipping(histogram, nbBits); + + if (verbose) + Log::info(" {:.6f} ({})", clipping, node->name()); + + valueRanges[node->name()] *= clipping; + } + } + + return valueRanges; +} + +} \ No newline at end of file diff --git a/src/PTQ/PTQ.cpp b/src/PTQ/PTQ.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6409b7bc2a783b62d74abef63e7aade33ac771df --- /dev/null +++ b/src/PTQ/PTQ.cpp @@ -0,0 +1,1031 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include "aidge/quantization/PTQ/CLE.hpp" +#include "aidge/quantization/PTQ/Clip.hpp" +#include "aidge/quantization/PTQ/PTQ.hpp" + +#include "aidge/data/Tensor.hpp" +#include "aidge/graph/GraphView.hpp" +#include "aidge/graph/Node.hpp" +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/scheduler/Scheduler.hpp" +#include "aidge/utils/Log.hpp" + +#include "aidge/operator/Producer.hpp" +#include "aidge/operator/Mul.hpp" +#include "aidge/operator/ReLU.hpp" +#include "aidge/operator/Scaling.hpp" +#include "aidge/recipes/Recipes.hpp" + +namespace Aidge +{ + +static std::string makeUniqueName(std::string baseName, std::shared_ptr<GraphView> graphView) +{ + std::set<std::string> existingNames; + for (std::shared_ptr<Node> node : graphView->getNodes()) + existingNames.insert(node->name()); + + bool isInside = (existingNames.find(baseName) != existingNames.end()); + + if (!isInside) + return baseName; + + int index = 1; + std::string newName = baseName; + while (isInside) + { + newName = baseName + "_" + std::to_string(index); + isInside = (existingNames.find(newName) != existingNames.end()); + index++; + } + + return newName; +} + +bool isAffine(std::shared_ptr<Node> node) +{ + return (affineNodeTypes.find(node->type()) != affineNodeTypes.end()); +} + +bool isSeamless(std::shared_ptr<Node> node) +{ + return (seamlessNodeTypes.find(node->type()) != seamlessNodeTypes.end()); +} + +bool isMerging(std::shared_ptr<Node> node) +{ + return (mergingNodeTypes.find(node->type()) != mergingNodeTypes.end()); +} + +bool checkArchitecture(std::shared_ptr<GraphView> graphView) +{ + std::set<std::string> otherNodeTypes({"Flatten", "Softmax", "ReLU", "Producer"}); + + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + bool isOther = otherNodeTypes.find(node->type()) != otherNodeTypes.end(); + if (!isOther && !isAffine(node) && 
!isSeamless(node) && !isMerging(node)) { + Log::warn(" GraphView can't be quantized : node type {} is not supported !", node->type()); + return false; + } + } + + return true; +} + +static void fillTensor(std::shared_ptr<Tensor> tensor, float value) +{ + // Get the tensor data pointer + float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); + + // Fill the tensor + for(std::size_t i = 0; i < tensor->size(); i++) + castedTensor[i] = value; +} + +static void rescaleTensor(std::shared_ptr<Tensor> tensor, float scaling) +{ + // Get the tensor data pointer + float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); + + // Rescale the tensor + for(std::size_t i = 0; i < tensor->size(); i++) + castedTensor[i] *= scaling; +} + +static void roundTensor(std::shared_ptr<Tensor> tensor) +{ + // Get the tensor data pointer + float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); + + // Rescale the tensor + for(std::size_t i = 0; i < tensor->size(); i++) + castedTensor[i] = std::round(castedTensor[i]); +} + +static float getTensorAbsoluteMax(std::shared_ptr <Tensor> tensor) +{ + // Get the tensor data pointer and edit it + float * castedTensor = static_cast<float*>(tensor->getImpl()->rawPtr()); + + // Get the tensor absolute max value + float maxValue = 0.0f; + for(std::size_t i = 0; i < tensor->size(); ++i) { + if(std::fabs(castedTensor[i]) > maxValue) { + maxValue = std::fabs(castedTensor[i]); + } + } + return maxValue; +} + +static void removeMatchingNodes(std::vector<std::shared_ptr<Node>>& nodeVector, std::string nodeType) +{ + std::vector<std::shared_ptr<Node>>::iterator iter = nodeVector.begin(); + while (iter != nodeVector.end()) + { + if ((*iter)->type() == nodeType) + iter = nodeVector.erase(iter); + else + ++iter; + } +} + +static void fixScheduling(std::vector<std::shared_ptr<Node>>& nodeVector) { + + std::vector<std::shared_ptr<Node>> correctedVector; + + for (int i = (nodeVector.size() - 1); i >= 0; --i) + 
{ + std::shared_ptr<Node> node = nodeVector[i]; + bool isAlreadyInside = (std::find(correctedVector.begin(), correctedVector.end(), node) != correctedVector.end()); + if (!isAlreadyInside) + correctedVector.push_back(node); + } + + std::reverse(correctedVector.begin(), correctedVector.end()); + + nodeVector = correctedVector; +} + +static std::shared_ptr<Tensor> getWeightTensor(std::shared_ptr<Node> node) +{ + return std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); +} + +static std::shared_ptr<Tensor> getBiasTensor(std::shared_ptr<Node> node) +{ + return std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); +} + +std::vector<std::shared_ptr<Node>> retrieveNodeVector(std::shared_ptr<GraphView> graphView, bool newSchedule, bool verbose) +{ + std::vector<std::shared_ptr<Node>> nodeVector; + + SequentialScheduler scheduler(graphView); + + if (newSchedule) + { + scheduler.resetScheduling(); + scheduler.generateScheduling(); // old way : scheduler.forward(); + } + + nodeVector = scheduler.getStaticScheduling(); + + fixScheduling(nodeVector); + removeMatchingNodes(nodeVector, "Producer"); + + if (verbose) + { + Log::info("NB OF NODES = {}", nodeVector.size()); + for (std::shared_ptr<Node> node : nodeVector) + Log::info("{} {}", node->type(), node->name()); + } + + return nodeVector; +} + +static std::shared_ptr<Node> getFirstNode(std::shared_ptr<GraphView> graphView) +{ + return retrieveNodeVector(graphView)[0]; +} + +static std::shared_ptr<Node> getLastNode(std::shared_ptr<GraphView> graphView) +{ + std::shared_ptr<Node> currNode = graphView->rootNode(); + while (currNode->getChildren().size() != 0) + currNode = (*currNode->getChildren().begin()); + + return currNode; +} + +static void popSoftMax(std::shared_ptr<GraphView> graphView) +{ + std::shared_ptr<Node> lastNode = getLastNode(graphView); + if (lastNode->type() == "Softmax") { + graphView->replace({lastNode}, {}); // remove does not work !!! 
+ } +} + +static void prepareNetwork(std::shared_ptr<GraphView> graphView) +{ + removeFlatten(graphView); + + bool containsBatchNorm = false; + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + for (std::shared_ptr<Node> node : nodeVector) + if (node->type() == "BatchNorm") + { + containsBatchNorm = true; + break; + } + + if (containsBatchNorm) + fuseBatchNorm(graphView); + + popSoftMax(graphView); +} + + +// XXX HERE : Branches containing only Seamless nodes should be considered as residual too !!! +void insertResidualNodes(std::shared_ptr<GraphView> graphView) +{ + // TODO: double check this ... + + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + for (std::shared_ptr<Node> node : nodeVector) + { + if (isMerging(node)) + { + int nbParents = node->getParents().size(); + for (int i = 0; i < nbParents; i++) + { + std::shared_ptr<Node> parentNode = node->getParent(i); + bool parentIsForking = (parentNode->getChildren().size() > 1); + if (parentIsForking) + { + // temporary verbose ... 
+ Log::info(" ### found residual branch at index {}", i); + Log::info(" ### inserting multiplicative node ..."); + + std::string residualNodeName = makeUniqueName(parentNode->name() + "_Res", graphView); + std::shared_ptr<Node> residualNode = Scaling(1.0, 0, false, residualNodeName); + residualNode->getOperator()->setDataType(DataType::Float32); + residualNode->getOperator()->setBackend("cpu"); + + graphView->insertParent(node, residualNode, i, 0, 0); + } + } + } + } +} + +static int getInputIndex(std::shared_ptr<Node> node, std::shared_ptr<Node> parentNode) +{ + int index = 0; + while (node->getParent(index) != parentNode) + index++; + return index; +} + +void insertScalingNodes(std::shared_ptr<GraphView> graphView) +{ + insertResidualNodes(graphView); + + std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); + + for (std::shared_ptr<Node> parentNode : nodeSet) + { + if (isAffine(parentNode) || isMerging(parentNode)) + { + std::string scalingNodeName = makeUniqueName(parentNode->name() + "_Scaling", graphView); + std::shared_ptr<Node> scalingNode = Scaling(1.0, 0, false, scalingNodeName); + scalingNode->getOperator()->setDataType(DataType::Float32); + scalingNode->getOperator()->setBackend("cpu"); + + if (parentNode->getChildren().size() > 0) + { + // SCALING NODE INSERTION + + // We always have one output from Affine and Add nodes, but possibly multiple childs + std::vector<std::shared_ptr<Node>> nextNodes = parentNode->getChildren(0); + + // For each node in nextNodes store the connexion index + std::vector<int> inputIndices(nextNodes.size()); + for (std::size_t i = 0; i < nextNodes.size(); i++) + inputIndices[i] = getInputIndex(nextNodes[i], parentNode); + + for (std::shared_ptr<Node> nextNode : nextNodes) + parentNode->removeChild(nextNode, 0); + + parentNode->addChild(scalingNode, 0, 0); + + for (std::size_t i = 0; i < nextNodes.size(); i++) + scalingNode->addChild(nextNodes[i], 0, inputIndices[i]); + + graphView->add(scalingNode); + } + else + { + 
// Log::info(" last node reached ! "); + graphView->addChild(scalingNode); + } + } + } +} + +static std::shared_ptr<Node> getPreviousScalingNode(std::shared_ptr<Node> mergingNode) +{ + std::shared_ptr<Node> currNode = mergingNode; + while(currNode->type() != "Scaling") + { + if (currNode->getParents().size() == 0) + { + Log::warn(" Warning : No previous Scaling node were found ! "); + break; + } + currNode = currNode->getParents()[0]; + } + return currNode; +} + +void normalizeParameters(std::shared_ptr<GraphView> graphView) +{ + // CREATE THE ACCUMULATED RATIO MAP /////////////////////////////////////// + + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + std::map<std::string, float> accumulatedRatios; + for (std::shared_ptr<Node> node : nodeVector) + { + accumulatedRatios.insert(std::make_pair(node->name(), 1.0)); + } + + // ITERATE OVER THE GRAPH ///////////////////////////////////////////////// + + std::shared_ptr<Node> firstNode = getFirstNode(graphView); + + for (std::shared_ptr<Node> node : nodeVector) + { + // Scaling nodes still have a ratio of 1, so they are seamless ... + if (node->type() == "ReLU" || node->type() == "Scaling" || isSeamless(node)) + { + if (node != firstNode) + { + std::shared_ptr<Node> prevNode = node->getParent(0); + accumulatedRatios[node->name()] = accumulatedRatios[prevNode->name()]; + } + } + + // Residual nodes should enter in this category but their ratio is 1 ... + if (isAffine(node)) + { + // Rescale the weight tensor + std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); + float scaling = getTensorAbsoluteMax(weightTensor); + float ratio = 1.0 / scaling; + rescaleTensor(weightTensor, ratio); + + // Accumulate the ratio + if (node == firstNode) + { + accumulatedRatios[node->name()] = ratio; + } + else + { + std::shared_ptr<Node> prevNode = node->getParent(0); + accumulatedRatios[node->name()] = accumulatedRatios[prevNode->name()] * ratio; + } + + // Handle the bias ... 
+ bool nodeHasBias = (node->getParents().size() == 3); + if (nodeHasBias) + { + std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); + + // Check that a bias is present (as it is optional) + if (biasTensor) + rescaleTensor(biasTensor, accumulatedRatios[node->name()] ); + } + } + + if (isMerging(node)) + { + std::vector<std::shared_ptr<Node>> mergingNodes = node->getParents(); + + // Compute the max ratio ... + float maxRatio = 0; + for (std::shared_ptr<Node> mergingNode : mergingNodes) + { + float merginNodeRatio = accumulatedRatios[mergingNode->name()]; + if (merginNodeRatio > maxRatio) + maxRatio = merginNodeRatio; + } + + accumulatedRatios[node->name()] = maxRatio; + + // Rescale the previous scaling Nodes + for (std::shared_ptr<Node> mergingNode : mergingNodes) + { + float mergingNodeRatio = accumulatedRatios[mergingNode->name()]; + float rescaling = mergingNodeRatio / maxRatio; + + std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); + + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); + scalingOperator->scalingFactor() /= rescaling; + accumulatedRatios[mergingNode->name()] /= rescaling; // optional ... + } + } + } +} + +std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> inputTensor, bool scalingNodesOnly) +{ + std::map<std::string, float> valueRanges; + + SequentialScheduler scheduler(graphView); + scheduler.resetScheduling(); + + // Inference ... + + scheduler.forward(true, {inputTensor}); + + // Gather ranges ... 
+ + std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); + for (std::shared_ptr<Node> node : nodeSet) + { + if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + { + std::shared_ptr<Operator> nodeOperator = node->getOperator(); + std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); + float range = getTensorAbsoluteMax(valueTensor); + + // Associate the value to the scaling node ... + valueRanges.insert(std::make_pair(node->name(), range)); + } + } + + return valueRanges; +} + +std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool scalingNodesOnly) +{ + std::map<std::string, float> valueRanges; + std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); + + // std::shared_ptr<Node> inputNode = getFirstNode(graphView); + + for (std::shared_ptr<Node> node : nodeSet) + if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + valueRanges.insert(std::make_pair(node->name(), 0)); + + SequentialScheduler scheduler(graphView); + scheduler.resetScheduling(); + + int it = 0; + + for (std::shared_ptr<Tensor> sample : inputDataSet) + { + Log::info(" IT : {}", it++); + + // Inference ... + + scheduler.forward(true, {sample}); + + // Gather the sample ranges ... + + std::map<std::string, float> sampleRanges; + for (std::shared_ptr<Node> node : nodeSet) + { + if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + { + std::shared_ptr<Operator> nodeOperator = node->getOperator(); + std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); + float range = getTensorAbsoluteMax(valueTensor); + + // Associate the value to the scaling node ... 
+ sampleRanges.insert(std::make_pair(node->name(), range)); + } + } + + // Update the global value ranges ... + + for (std::shared_ptr<Node> node : nodeSet) + { + if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + { + std::string nodeName = node->name(); + if (sampleRanges[nodeName] > valueRanges[nodeName]) + valueRanges[nodeName] = sampleRanges[nodeName]; + } + } + } + + return valueRanges; +} + +void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::string, float> valueRanges) +{ + std::shared_ptr<Node> firstNode = getFirstNode(graphView); + + // CREATE THE SCALING FACTOR MAP ////////////////////////////////////////// + + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + std::map<std::string, float> scalingFactors; + + for (std::shared_ptr<Node> node : nodeVector) + scalingFactors.insert(std::make_pair(node->name(), 1.0)); + + // ITERATE OVER THE GRAPH ///////////////////////////////////////////////// + + for (std::shared_ptr<Node> node : nodeVector) + { + // Seamless scaling factor propagation ... + + if (isAffine(node) || isSeamless(node) || node->type() == "ReLU") + { + if (node == firstNode) + { + scalingFactors[node->name()] = 1.0; + } + else + { + std::shared_ptr<Node> prevNode = node->getParent(0); + scalingFactors[node->name()] = scalingFactors[prevNode->name()]; + } + } + + // Here prevNode is either a 'Affine' or a 'Merging' + // => do not split the cases, just handle the bias ... + + if (node->type() == "Scaling") + { + // retrieve the previous scaling factor ... + std::shared_ptr<Node> prevNode = node->getParent(0); + float prevScalingFactor = scalingFactors[prevNode->name()]; + + // ValueRanges must contains all the scaling nodes !!! 
+ float scalingFactor = valueRanges[node->name()]; + + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator()); + scalingOperator->scalingFactor() /= (scalingFactor / prevScalingFactor); + + scalingFactors[node->name()] = scalingFactor; + + // If prevNode is Affine, fix the bias ... + + if (isAffine(prevNode)) + { + bool prevNodeHasBias = (prevNode->getParents().size() == 3); + if (prevNodeHasBias) { + std::shared_ptr<Tensor> biasTensor = getBiasTensor(prevNode); + rescaleTensor(biasTensor, 1.0 / prevScalingFactor); + } + } + } + + // Merging nodes handling : use a maximum arbritation ... + + if (isMerging(node)) + { + std::vector<std::shared_ptr<Node>> mergingNodes = node->getParents(); + + // Compute the max scaling ... + float maxScaling = 0; + for (std::size_t i = 0; i < mergingNodes.size(); i++) + { + float merginNodeScaling = scalingFactors[mergingNodes[i]->name()]; + if (merginNodeScaling > maxScaling) { + maxScaling = merginNodeScaling; + } + } + + scalingFactors[node->name()] = maxScaling; + + for (std::shared_ptr<Node> mergingNode : mergingNodes) + { + float mergingNodeScaling = scalingFactors[mergingNode->name()]; + float rescaling = mergingNodeScaling / maxScaling; + + std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); + //Log::info(" SCALING NODE : {} {}", scalingNode->type(), scalingNode->name()); + + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); + scalingOperator->scalingFactor() *= rescaling; + } + } + } +} + +std::map<std::string, std::pair<bool, bool>> computeSignMap(std::shared_ptr<GraphView> graphView, bool verbose) +{ + std::shared_ptr<Node> firstNode = getFirstNode(graphView); + + std::map<std::string, std::pair<bool, bool>> signMap; + + std::pair<bool, bool> unsignedPair(true, true); + for (std::shared_ptr<Node> node : graphView->getNodes()) + if (node->type() != "Producer") + 
signMap.insert(std::make_pair(node->name(), unsignedPair)); + + // ITERATE OVER THE GRAPH + + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + for (std::shared_ptr<Node> node : nodeVector) + { + bool isFirstNode = (node == firstNode); + + if (isAffine(node)) + { + // Affine nodes always have a single parent + if (!isFirstNode) + signMap[node->name()].first = signMap[node->getParent(0)->name()].second; + else + signMap[node->name()].first = false; + + signMap[node->name()].second = false; + } + + if (node->type() == "Scaling") + { + signMap[node->name()].second = false; + + // Scaling nodes always have a single parent + std::shared_ptr<Node> parent = node->getParent(0); + + bool allChildrenAreReLU = true; + allChildrenAreReLU &= !(node->getChildren().empty()); // a bit convoluted ... + for (std::shared_ptr <Node> child : node->getChildren()) + allChildrenAreReLU &= (child->type() == "ReLU"); + + // Correct the previous single node (when it is an Affine node) ... + if (allChildrenAreReLU) + if (isAffine(parent) || isMerging(parent)) + signMap[parent->name()].second = true; + + // Maintain unsigned output + if (signMap[parent->name()].second) + signMap[node->name()].second = true; + + // Set the link ... + signMap[node->name()].first = signMap[parent->name()].second; + } + + if (isMerging(node)) + { + std::vector<std::shared_ptr<Node>> parentNodes = node->getParents(); + + bool allParentAreSigned = true; + bool allParentAreUnsigned = true; + for(std::shared_ptr<Node> parent : parentNodes) + { + bool parentSign = signMap[parent->name()].second; + allParentAreSigned &= !parentSign; + allParentAreUnsigned &= parentSign; + } + + if (allParentAreSigned) + signMap[node->name()] = std::make_pair(false, false); + else if (allParentAreUnsigned) + signMap[node->name()] = std::make_pair(true, true); + else + { + // Arbitration : Signed type wins ! 
+ for(std::shared_ptr<Node> parent : parentNodes) + { + while (parent->type() != "Scaling") + { + signMap[parent->name()] = std::make_pair(false, false); + // We are on a branch so nodes always have 1 parent ... + parent = parent->getParent(0); + } + + signMap[parent->name()].second = false; + } + + signMap[node->name()].first = false; + } + } + + if (node->type() == "ReLU" || isSeamless(node)) + { + // Thoses nodes always have a single parent + std::shared_ptr<Node> parent = node->getParent(0); + signMap[node->name()].first = signMap[parent->name()].second; + signMap[node->name()].second = signMap[node->name()].first; + } + } + + // VERBOSE + + if (verbose) + { + Log::info(" === SIGN MAP === "); + for (std::shared_ptr<Node> node : nodeVector) + Log::info(" {}{} | {}", static_cast<int>(signMap[node->name()].first), static_cast<int>(signMap[node->name()].second), node->name()); + } + + // SANITY CHECK (TEMPORARY) + + for (std::shared_ptr<Node> node : nodeVector) + if (node != firstNode) + { + for (std::shared_ptr<Node> parent : node->getParents()) + if (parent->type() != "Producer") + if (signMap[parent->name()].second != signMap[node->name()].first) + Log::error(" computeSignMap : link is not sane ! 
({} -> {})", parent->name(), node->name()); + } + + return signMap; +} + + +void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, bool applyRounding, bool optimizeSigns, bool verbose) +{ + float signedMax = (1 << (nbBits - 1)) - 1; + float unsignedMax = (1 << nbBits) - 1; + + std::map<std::string, std::pair<bool, bool>> signMap; + + if (optimizeSigns) + signMap = computeSignMap(graphView, verbose); + else + { + std::pair<bool, bool> signedPair(false, false); + for (std::shared_ptr<Node> node : graphView->getNodes()) + if (node->type() != "Producer") + signMap.insert(std::make_pair(node->name(), signedPair)); + } + + // ITERATE OVER THE GRAPH ///////////////////////////////////////////////// + + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + for (std::shared_ptr<Node> node : nodeVector) + { + if (isAffine(node)) + { + // Rescale the weight tensor + + std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); + rescaleTensor(weightTensor, signedMax); + + if (applyRounding) + roundTensor(weightTensor); + + // Rescale the bias tensor + + bool nodeHasBias = (node->getParents().size() == 3); + if (nodeHasBias) + { + bool inputIsUnsigned = signMap[node->name()].first; + float rescaling = inputIsUnsigned ? unsignedMax * signedMax : signedMax * signedMax; + + std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); + rescaleTensor(biasTensor, rescaling); + + if (applyRounding) + roundTensor(biasTensor); + } + + // Compensate the rescaling using the next Scaling node + + float rescaling = 1.0 / signedMax; + + bool inputIsUnsigned = signMap[node->name()].first; + bool outputIsUnsigned = signMap[node->name()].second; + + rescaling /= inputIsUnsigned ? unsignedMax : signedMax; + rescaling *= outputIsUnsigned ? unsignedMax : signedMax; + + std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); // Assert if scalingNode is a Scaling ... 
+ std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); + scalingOperator->scalingFactor() *= rescaling; + } + + if (isMerging(node)) + { + float rescaling = 1.0; + + bool inputIsUnsigned = signMap[node->name()].first; + bool outputIsUnsigned = signMap[node->name()].second; + + rescaling /= inputIsUnsigned ? unsignedMax : signedMax; + rescaling *= outputIsUnsigned ? unsignedMax : signedMax; + + std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); // Assert if scalingNode is a Scaling ... + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); + scalingOperator->scalingFactor() *= rescaling; + } + + // Handle the Scaling Nodes ... + + if (node->type() == "Scaling") + { + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator()); + + if (optimizeSigns) + { + float rescaling = 1.0; + + bool inputIsUnsigned = signMap[node->name()].first; + bool outputIsUnsigned = signMap[node->name()].second; + + rescaling /= inputIsUnsigned ? unsignedMax : signedMax; + rescaling *= outputIsUnsigned ? unsignedMax : signedMax; + + scalingOperator->scalingFactor() *= rescaling; + + scalingOperator->isOutputUnsigned() = outputIsUnsigned; + } + + if (applyRounding) + scalingOperator->quantizedNbBits() = nbBits; + } + } +} + +static void insertCompensationNodes(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits) +{ + // XXX Use the signMap to increase the resolution when possible ... + float signedMax = (1 << (nbBits - 1)) - 1; + + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + for (std::shared_ptr<Node> node : nodeVector) + { + // A merging node is always followed by a scaling node at this point ... 
+ + if (node->type() == "Scaling") + { + bool prevNodeIsForking = ((node->getParent(0))->getChildren().size() > 1); + bool prevNodeIsAffine = isAffine(node->getParent(0)); + bool insertNode = prevNodeIsForking || !prevNodeIsAffine; + + if (insertNode) + { + // create and insert the multplicative node + + std::string mulNodeName = makeUniqueName(node->name() + "_Mul", graphView); + std::shared_ptr<Node> mulNode = Mul(mulNodeName); + mulNode->getOperator()->setDataType(DataType::Float32); + mulNode->getOperator()->setBackend("cpu"); + + graphView->insertParent(node, mulNode, 0, 0, 0); + + // create and insert the producer node + + std::shared_ptr<Tensor> inputTensor = std::static_pointer_cast<Tensor> (mulNode->getOperator()->getRawInput(0)); + + std::shared_ptr<Tensor> coeffTensor = std::make_shared<Tensor>(); + coeffTensor->setDataType(DataType::Float32); + coeffTensor->setBackend("cpu"); + coeffTensor->resize(inputTensor->dims()); + fillTensor(coeffTensor, 1); + + std::shared_ptr<Node> producerNode = Producer(coeffTensor, makeUniqueName("coeff", graphView)); + producerNode->addChild(mulNode); + graphView->add(producerNode); + + // rescale the coeffs and edit scaling factor + + fillTensor(coeffTensor, signedMax); + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator()); + scalingOperator->scalingFactor() /= signedMax; + + // TODO : double check this !!! 
+ //std::cout << getTensorAbsoluteMax(coeffTensor) << std::endl; + } + } + } +} + +void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool applyRounding) +{ + std::vector<std::shared_ptr<Node>> nodeVector = retrieveNodeVector(graphView); + + for (std::shared_ptr<Node> node : nodeVector) + { + if (isAffine(node) || (node->type() == "Mul")) + { + std::shared_ptr<Node> scalingNode = (*node->getChildren().begin()); + + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); + + float base = scalingOperator->scalingFactor(); + + float approx = std::pow(2, std::ceil(std::log2(base))); + + scalingOperator->scalingFactor() = approx; + + float ratio = base / approx; + + std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); + rescaleTensor(weightTensor, ratio); + if (applyRounding) + roundTensor(weightTensor); + + bool nodeHasBias = (node->getParents().size() == 3); + if (nodeHasBias) + { + std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); + rescaleTensor(biasTensor, ratio); + if (applyRounding) + roundTensor(biasTensor); + } + } + } +} + +static void printScalingFactors(std::shared_ptr<GraphView> graphView) +{ + Log::info(" === SCALING FACTORS === "); + for (auto node : retrieveNodeVector(graphView)) + if (node->type() == "Scaling") + { + std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator()); + float factor = scalingOperator->scalingFactor(); + Log::info(" {:.6f} ({})", factor, node->name()); + } +} + +void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, Clipping clippingMode, bool applyRounding, bool optimizeSigns, bool singleShift, bool verbose) +{ + Log::info(" === QUANT PTQ 0.2.19 === "); + + if (!checkArchitecture(graphView)) + return; + + Log::info(" Preparing the network for the PTQ ... 
"); + prepareNetwork(graphView); + + Log::info(" Inserting the scaling nodes ..."); + insertScalingNodes(graphView); + + crossLayerEqualization(graphView); + + Log::info(" Normalizing the parameters ..."); + normalizeParameters(graphView); + + Log::info(" Computing the value ranges ..."); + std::map<std::string, float> valueRanges = computeRanges(graphView, inputDataSet, true); + + Log::info(" Optimizing the clipping values ..."); + valueRanges = adjustRanges(clippingMode, valueRanges, nbBits, graphView, inputDataSet, verbose); + + Log::info(" Normalizing the activations ..."); + normalizeActivations(graphView, valueRanges); + + Log::info(" Quantizing the normalized network ..."); + quantizeNormalizedNetwork(graphView, nbBits, applyRounding, optimizeSigns, verbose); + + if (singleShift) + { + Log::info( " Inserting the compensation nodes ..."); + insertCompensationNodes(graphView, nbBits); + + Log::info(" Performing the Single-Shift approximation ..."); + performSingleShiftApproximation(graphView, applyRounding); + } + + if (verbose) + printScalingFactors(graphView); + + Log::info(" Resetting the scheduler ..."); + SequentialScheduler scheduler(graphView); + scheduler.resetScheduling(); + + Log::info(" Network is quantized !"); +} + +std::map<std::string, float> getWeightRanges(std::shared_ptr<GraphView> graphView) +{ + std::map<std::string, float> weightRanges; + + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + if (isAffine(node)) + { + std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); + float range = getTensorAbsoluteMax(weightTensor); + weightRanges.insert(std::make_pair(node->name(), range)); + } + } + + return weightRanges; +} + +void clearBiases(std::shared_ptr<GraphView> graphView) +{ + for (std::shared_ptr<Node> node : graphView->getNodes()) { + if (node->type() == "FC" || node->type() == "Conv") { + std::shared_ptr<Tensor> biasTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); + 
rescaleTensor(biasTensor, 0); + } + } +} + +void devPTQ(std::shared_ptr<GraphView> graphView) +{ + SequentialScheduler scheduler(graphView); + scheduler.generateScheduling(); + auto s = scheduler.getStaticScheduling(); + for (std::shared_ptr<Node> node : s) + std::cout << " UUU : " << node->name() << std::endl; +} + +} diff --git a/src/QuantPTQ.cpp b/src/QuantPTQ.cpp deleted file mode 100644 index ef28d72f4ffc037415bebd86c8ade16d8a4554a9..0000000000000000000000000000000000000000 --- a/src/QuantPTQ.cpp +++ /dev/null @@ -1,838 +0,0 @@ -/******************************************************************************** - * Copyright (c) 2023 CEA-List - * - * This program and the accompanying materials are made available under the - * terms of the Eclipse Public License 2.0 which is available at - * http://www.eclipse.org/legal/epl-2.0. - * - * SPDX-License-Identifier: EPL-2.0 - * - ********************************************************************************/ - -#include "aidge/QuantPTQ.hpp" - -#include <algorithm> // std::find, std::reverse -#include <cmath> // std::round -#include <cstddef> // std::size_t -#include <cstdint> // std::uint8_t -#include <map> -#include <memory> -#include <set> -#include <string> -#include <utility> // std::make_pair -#include <vector> - -#include "aidge/data/Tensor.hpp" -#include "aidge/graph/GraphView.hpp" -#include "aidge/graph/Node.hpp" -#include "aidge/scheduler/SequentialScheduler.hpp" -#include "aidge/utils/Log.hpp" - -#include "aidge/operator/Add.hpp" -#include "aidge/operator/Concat.hpp" -#include "aidge/operator/Conv.hpp" -#include "aidge/operator/ConvDepthWise.hpp" -#include "aidge/operator/FC.hpp" -#include "aidge/operator/Mul.hpp" -#include "aidge/operator/ReLU.hpp" -#include "aidge/operator/Scaling.hpp" -#include "aidge/recipes/Recipes.hpp" -#include "aidge/operator/MetaOperator.hpp" - -namespace Aidge{ - -static std::string makeUniqueName(std::string baseName, std::shared_ptr<GraphView> graphView) -{ - 
std::set<std::string> existingNames; - for (std::shared_ptr<Node> node : graphView->getNodes()) - existingNames.insert(node->name()); - - bool isInside = (existingNames.find(baseName) != existingNames.end()); - - if (!isInside) - return baseName; - - int index = 1; - std::string newName = baseName; - while (isInside) - { - newName = baseName + "_" + std::to_string(index); - isInside = (existingNames.find(newName) != existingNames.end()); - index++; - } - - return newName; -} - -static bool isAffine(std::shared_ptr<Node> node) -{ - std::set<std::string> affineNodeTypes({"FC", "Conv", "ConvDepthWise", "PaddedConv", "PaddedConvDepthWise"}); - return (affineNodeTypes.find(node->type()) != affineNodeTypes.end()); -} - -static bool isSeamless(std::shared_ptr<Node> node) -{ - std::set<std::string> seamlessNodeTypes({"Pad", "MaxPooling", "AvgPooling", "PaddedMaxPooling", "PaddedAvgPooling", "GlobalAveragePooling"}); - return (seamlessNodeTypes.find(node->type()) != seamlessNodeTypes.end()); -} - -bool checkArchitecture(std::shared_ptr<GraphView> graphView) -{ - std::set<std::string> otherNodeTypes({"Flatten", "Add", "Concat", "Softmax", "ReLU", "Producer"}); - - for (std::shared_ptr<Node> node : graphView->getNodes()) - { - bool isOther = otherNodeTypes.find(node->type()) != otherNodeTypes.end(); - if (!isOther && !isAffine(node) && !isSeamless(node)) { - Log::info(" GraphView can't be quantized : node type {} is not supported !", node->type()); - return false; - } - } - - return true; -} - -static std::shared_ptr<Node> getFirstNode(std::shared_ptr<GraphView> graphView) -{ - std::shared_ptr<Node> currNode = graphView->rootNode(); - if (currNode->type() == "Producer") - currNode = *(currNode->getChildren()).begin(); - - std::shared_ptr<Node> parentNode = currNode->getParent(0); - while (parentNode->type() != "Producer") { - currNode = parentNode; - parentNode = currNode->getParent(0); - } - - return currNode; -} - -static std::shared_ptr<Node> 
getLastNode(std::shared_ptr<GraphView> graphView) -{ - std::shared_ptr<Node> currNode = graphView->rootNode(); - while (currNode->getChildren().size() != 0) - currNode = (*currNode->getChildren().begin()); - - return currNode; -} - -static void popSoftMax(std::shared_ptr<GraphView> graphView) -{ - std::shared_ptr<Node> lastNode = getLastNode(graphView); - if (lastNode->type() == "Softmax") { - graphView->replace({lastNode}, {}); // remove does not work !!! - } -} - -static void fillTensor(std::shared_ptr<Tensor> tensor, float value) -{ - // Get the tensor data pointer - float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); - - // Fill the tensor - for(std::size_t i = 0; i < tensor->size(); i++) - castedTensor[i] = value; -} - -static void rescaleTensor(std::shared_ptr<Tensor> tensor, float scaling) -{ - // Get the tensor data pointer - float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); - - // Rescale the tensor - for(std::size_t i = 0; i < tensor->size(); i++) - castedTensor[i] *= scaling; -} - -static void roundTensor(std::shared_ptr<Tensor> tensor) -{ - // Get the tensor data pointer - float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); - - // Rescale the tensor - for(std::size_t i = 0; i < tensor->size(); i++) - castedTensor[i] = std::round(castedTensor[i]); -} - -static float getTensorAbsoluteMax(std::shared_ptr <Tensor> tensor) -{ - // Get the tensor data pointer and edit it - float * castedTensor = static_cast<float*>(tensor->getImpl()->rawPtr()); - - // Get the tensor absolute max value - float maxValue = 0.0f; - for(std::size_t i = 0; i < tensor->size(); ++i) { - if(std::fabs(castedTensor[i]) > maxValue) { - maxValue = std::fabs(castedTensor[i]); - } - } - return maxValue; -} - -static void removeMatchingNodes(std::vector<std::shared_ptr<Node>>& nodeVector, std::string nodeType) -{ - std::vector<std::shared_ptr<Node>>::iterator iter = nodeVector.begin(); - while (iter != 
nodeVector.end()) - { - if ((*iter)->type() == nodeType) - iter = nodeVector.erase(iter); - else - ++iter; - } -} - -static void fixScheduling(std::vector<std::shared_ptr<Node>>& nodeVector) { - - std::vector<std::shared_ptr<Node>> correctedVector; - - for (int i = (nodeVector.size() - 1); i >= 0; --i) - { - std::shared_ptr<Node> node = nodeVector[i]; - bool isAlreadyInside = (std::find(correctedVector.begin(), correctedVector.end(), node) != correctedVector.end()); - if (!isAlreadyInside) - correctedVector.push_back(node); - } - - std::reverse(correctedVector.begin(), correctedVector.end()); - - nodeVector = correctedVector; -} - -static std::shared_ptr<Tensor> getWeightTensor(std::shared_ptr<Node> node) -{ - return std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); -} - -static std::shared_ptr<Tensor> getBiasTensor(std::shared_ptr<Node> node) -{ - return std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); -} - - -void appendIdentity(std::shared_ptr<GraphView> graphView) { - - std::shared_ptr<Node> lastNode = getLastNode(graphView); - - int size = std::static_pointer_cast<OperatorTensor> (lastNode->getOperator())->getOutput(0)->size(); - - std::shared_ptr<Node> identityNode = FC(size, size, true, makeUniqueName("Identity", graphView)); - identityNode->getOperator()->setDataType(DataType::Float32); - identityNode->getOperator()->setBackend("cpu"); - - std::shared_ptr<Tensor> weightTensor = std::static_pointer_cast<Tensor> (identityNode->getOperator()->getRawInput(1)); - fillTensor(weightTensor, 0); - - float * castedWeightTensor = static_cast<float *> (weightTensor->getImpl()->rawPtr()); - for (int n = 0; n < size; n++) - castedWeightTensor[n + size * n] = 1.0; - - graphView->addChild(identityNode); -} - - -std::vector<std::shared_ptr<Node>> extractNodeVector(std::shared_ptr<GraphView> graphView, bool verbose) -{ - std::vector<std::shared_ptr<Node>> nodeVector; - - SequentialScheduler scheduler(graphView); - 
scheduler.forward(); - nodeVector = scheduler.getStaticScheduling(); - - //graphView->forwardDims(); - //scheduler.generateScheduling(); - //nodeVector = scheduler.getStaticScheduling(); - - fixScheduling(nodeVector); - - removeMatchingNodes(nodeVector, "Producer"); - - if (verbose) - { - Log::info("NB OF NODES = {}", nodeVector.size()); - for (std::shared_ptr<Node> node : nodeVector) - Log::info("{} {}", node->type(), node->name()); - } - - //for (auto node : nodeVector) - // std::cout << node->type() << std::endl; - - return nodeVector; -} - -void insertResidualNodes(std::shared_ptr<GraphView> graphView) -{ - std::vector<std::shared_ptr<Node>> nodeVector = extractNodeVector(graphView, false); - - for (std::shared_ptr<Node> node : nodeVector) - { - if (node->type() == "Add" || node->type() == "Concat") - { - int nbParents = node->getParents().size(); - for (int i = 0; i < nbParents; i++) - { - std::shared_ptr<Node> parentNode = node->getParent(i); - bool parentIsForking = (parentNode->getChildren().size() > 1); - if (parentIsForking) - { - Log::info(" ### found residual branch at index {}", i); - Log::info(" ### inserting multiplicative node ..."); - - std::string residualNodeName = makeUniqueName(parentNode->name() + "_Res", graphView); - std::shared_ptr<Node> residualNode = Scaling(1.0, 0, false, residualNodeName); - residualNode->getOperator()->setDataType(DataType::Float32); - residualNode->getOperator()->setBackend("cpu"); - - graphView->insertParent(node, residualNode, i, 0, 0); - } - } - } - } - - graphView->forwardDims(); -} - - -void insertScalingNodes(std::shared_ptr<GraphView> graphView) -{ - insertResidualNodes(graphView); - - std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); - - for (std::shared_ptr<Node> node : nodeSet) - { - if (isAffine(node)) - { - std::string scalingNodeName = makeUniqueName(node->name() + "_Scaling", graphView); - std::shared_ptr<Node> scalingNode = Scaling(1.0, 0, false, scalingNodeName); - 
scalingNode->getOperator()->setDataType(DataType::Float32); - scalingNode->getOperator()->setBackend("cpu"); - - if (node->getChildren().size() > 0) - { - std::shared_ptr<Node> nextNode = node->getChildren(0)[0]; - - // XXX TODO : be extra careful about this ... - int i = 0; - while (nextNode->getParent(i) != node) i++; - graphView->insertParent(nextNode, scalingNode, i, 0, 0); - } - else - { - // Log::info(" last node reached ! "); - graphView->addChild(scalingNode); - } - } - } - - graphView->forwardDims(); - - // XXX Append identity if needed ... - if (getLastNode(graphView)->type() == "Scaling") - appendIdentity(graphView); - -} - - -static std::shared_ptr<Node> getPreviousScalingNode(std::shared_ptr<Node> mergingNode) -{ - std::shared_ptr<Node> currNode = mergingNode; - while(currNode->type() != "Scaling") - { - if (currNode->getParents().size() == 0) - { - Log::warn(" Warning : No previous Scaling node were found ! "); - break; - } - currNode = currNode->getParents()[0]; - } - return currNode; -} - - -// Be more careful about the '*' and '/' ... -void normalizeParameters(std::shared_ptr<GraphView> graphView) -{ - // CREATE THE ACCUMULATED RATIO MAP /////////////////////////////////////// - - std::vector<std::shared_ptr<Node>> nodeVector = extractNodeVector(graphView, false); - - std::map<std::string, float> accumulatedRatios; - for (std::shared_ptr<Node> node : nodeVector) - accumulatedRatios.insert(std::make_pair(node->name(), 1.0)); - - // ITERATE OVER THE GRAPH ///////////////////////////////////////////////// - - for (std::shared_ptr<Node> node : nodeVector) - { - // Scaling nodes still have a ratio of 1, so they are seamless ... 
- if (node->type() == "ReLU" || node->type() == "Scaling" || isSeamless(node)) - { - if (node != getFirstNode(graphView)) - { - std::shared_ptr<Node> prevNode = node->getParent(0); - accumulatedRatios[node->name()] = accumulatedRatios[prevNode->name()]; - } - } - - // Residual nodes should enter in this category but their ratio is 1 ... - if (isAffine(node)) - { - // Rescale the weight tensor - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - float scaling = getTensorAbsoluteMax(weightTensor); - float ratio = 1.0 / scaling; - rescaleTensor(weightTensor, ratio); - - // Accumulate the ratio - if (node == getFirstNode(graphView)) - accumulatedRatios[node->name()] = ratio; - else - { - std::shared_ptr<Node> prevNode = node->getParent(0); - accumulatedRatios[node->name()] = accumulatedRatios[prevNode->name()] * ratio; - } - - // Handle the bias ... - bool nodeHasBias = (node->getParents().size() == 3); - if (nodeHasBias) - { - std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - // Check that a bias is present (as it is optional) - if (biasTensor) { - rescaleTensor(biasTensor, accumulatedRatios[node->name()] ); - } - } - - } - - if (node->type() == "Add" || node->type() == "Concat") - { - // We should assert if merging nodes are all scalings ! - std::vector<std::shared_ptr<Node>> mergingNodes = node->getParents(); - - // Compute the max ratio ... 
- float maxRatio = 0; - for (std::shared_ptr<Node> mergingNode : mergingNodes) - { - float merginNodeRatio = accumulatedRatios[mergingNode->name()]; - if (merginNodeRatio > maxRatio) - maxRatio = merginNodeRatio; - } - - accumulatedRatios[node->name()] = maxRatio; - - // Rescale the previous scaling Nodes - for (std::shared_ptr<Node> mergingNode : mergingNodes) - { - float mergingNodeRatio = accumulatedRatios[mergingNode->name()]; - float rescaling = mergingNodeRatio / maxRatio; - - std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); - - std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); - scalingOperator->scalingFactor() /= rescaling; - accumulatedRatios[mergingNode->name()] /= rescaling; // optional ... - } - } - } -} - -std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> inputTensor) -{ - std::map<std::string, float> valueRanges; - - SequentialScheduler scheduler(graphView); - - std::shared_ptr<Node> inputNode = getFirstNode(graphView); - - // Setup the input - std::shared_ptr<Node> inputProducer = inputNode->getParent(0); - inputProducer->getOperator()->setOutput(0, inputTensor); - - // Forward ... - scheduler.forward(); - - // Gather ranges ... - - std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); - for (std::shared_ptr<Node> node : nodeSet) - { - if (node->type() == "Scaling") // XXX (node->type() != "Producer") - { - std::shared_ptr<Operator> nodeOperator = node->getOperator(); - std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); - float range = getTensorAbsoluteMax(valueTensor); - - // Associate the value to the scaling node ... 
- valueRanges.insert(std::make_pair(node->name(), range)); - } - } - - return valueRanges; -} - -std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet) -{ - std::map<std::string, float> valueRanges; - std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); - - for (std::shared_ptr<Node> node : nodeSet) - if (node->type() == "Scaling") // XXX (node->type() != "Producer") - valueRanges.insert(std::make_pair(node->name(), 0)); - - //int i = 0; - for (std::shared_ptr<Tensor> sample : inputDataSet) - { - std::map<std::string, float> sampleRanges = computeRanges(graphView, sample); - for (std::shared_ptr<Node> node : nodeSet) - { - if (node->type() == "Scaling") // XXX (node->type() != "Producer") - { - std::string nodeName = node->name(); - if (sampleRanges[nodeName] > valueRanges[nodeName]) - valueRanges[nodeName] = sampleRanges[nodeName]; - } - } - } - - return valueRanges; -} - -void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::string, float> valueRanges) -{ - // CREATE THE SCALING FACTOR MAP ////////////////////////////////////////// - - std::vector<std::shared_ptr<Node>> nodeVector = extractNodeVector(graphView, false); - - std::map<std::string, float> scalingFactors; - - for (std::shared_ptr<Node> node : nodeVector) - scalingFactors.insert(std::make_pair(node->name(), 1.0)); - - // ITERATE OVER THE GRAPH ///////////////////////////////////////////////// - - for (std::shared_ptr<Node> node : nodeVector) - { - // Seamless scaling factor propagation ... - if (isAffine(node) || isSeamless(node) || node->type() == "ReLU") - { - if (node == getFirstNode(graphView)) - { - scalingFactors[node->name()] = 1.0; - } - else - { - std::shared_ptr<Node> prevNode = node->getParent(0); - scalingFactors[node->name()] = scalingFactors[prevNode->name()]; - } - } - - if (node->type() == "Scaling") - { - // Retreive the previous scaling factor ... 
- std::shared_ptr<Node> prevNode = node->getParent(0); - float prevScalingFactor = scalingFactors[prevNode->name()]; - - // XXX HERE : valueRanges must contains all the scaling nodes !!! - float scalingFactor = valueRanges[node->name()]; - - std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator()); - scalingOperator->scalingFactor() /= (scalingFactor / prevScalingFactor); - - scalingFactors[node->name()] = scalingFactor; - - // Fix the bias ... - bool prevNodeHasBias = (prevNode->getParents().size() == 3); - if (prevNodeHasBias) { - std::shared_ptr<Tensor> biasTensor = getBiasTensor(prevNode); - rescaleTensor(biasTensor, 1.0 / prevScalingFactor); - } - } - - if (node->type() == "Concat" || node->type() == "Add") - { - // We should assert if merging nodes are all scalings ! - std::vector<std::shared_ptr<Node>> mergingNodes = node->getParents(); - - // Compute the max scaling ... - float maxScaling = 0; - int maxNodeIndex = 0; - for (std::size_t i = 0; i < mergingNodes.size(); i++) - { - float merginNodeScaling = scalingFactors[mergingNodes[i]->name()]; - if (merginNodeScaling > maxScaling) { - maxScaling = merginNodeScaling; - maxNodeIndex = i; - } - } - - // Ensure that the adding node does not overflow ... 
- if (node->type() == "Add") { - std::shared_ptr<Node> maxNode = mergingNodes[maxNodeIndex]; - maxScaling /= valueRanges[getPreviousScalingNode(maxNode)->name()]; - maxScaling *= valueRanges[getPreviousScalingNode(node)->name()]; - } - - scalingFactors[node->name()] = maxScaling; - - for (std::shared_ptr<Node> mergingNode : mergingNodes) - { - float mergingNodeScaling = scalingFactors[mergingNode->name()]; - float rescaling = mergingNodeScaling / maxScaling; - - std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); - //Log::info(" SCALING NODE : {} {}", scalingNode->type(), scalingNode->name()); - - std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); - scalingOperator->scalingFactor() *= rescaling; - } - } - } -} - -void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits) -{ - float signedMax = (1 << (nbBits - 1)) - 1; - - // ITERATE OVER THE GRAPH ///////////////////////////////////////////////// - - std::vector<std::shared_ptr<Node>> nodeVector = extractNodeVector(graphView, false); - - for (std::shared_ptr<Node> node : nodeVector) - { - // XXX should be removed when the Scaling issue is fixed !!! 
- bool isLastIdentity = (node->type() == "FC") && (node == getLastNode(graphView)); - - if (isAffine(node) && !isLastIdentity) - { - // Rescale the weight tensor - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - rescaleTensor(weightTensor, signedMax); - roundTensor(weightTensor); - - // Rescale the bias tensor - bool nodeHasBias = (node->getParents().size() == 3); - if (nodeHasBias) { - std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, signedMax * signedMax); - roundTensor(biasTensor); - } - - std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); - std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (scalingNode->getOperator()); - scalingOperator->scalingFactor() /= signedMax; - scalingOperator->quantizedNbBits() = nbBits; - } - } - - // Ensure that residual scaling nodes are also quantized ... - for (std::shared_ptr<Node> node : nodeVector) - { - if (node->type() == "Scaling") - { - std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator()); - scalingOperator->quantizedNbBits() = nbBits; // XXX HERE !!! - } - } -} - -std::map<std::string, std::vector<int>> computeScalingHistograms(std::map<std::string, float> valueRanges, int nbBins, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet) -{ - //std::cout << " COMPUTING HISTOGRAMS ... " << std::endl; - - std::map<std::string, std::vector<int>> histograms; - - SequentialScheduler scheduler(graphView); - - std::shared_ptr<Node> inputNode = getFirstNode(graphView); - - // Setup the histograms ... - - for (std::shared_ptr<Node> node : graphView->getNodes()) - { - if (node->type() == "Scaling") - { - std::vector<int> histogram; - for (int i = 0; i < nbBins; i++) - histogram.push_back(0); - - histograms.insert(std::make_pair(node->name(), histogram)); - } - } - - // Fill the histograms ... 
- - for (std::shared_ptr<Tensor> inputTensor : inputDataSet) - { - // Setup the input - std::shared_ptr<Node> inputProducer = inputNode->getParent(0); - inputProducer->getOperator()->setOutput(0, inputTensor); - - // Forward ... - scheduler.forward(); - - // Gather values ... - for (std::shared_ptr<Node> node : graphView->getNodes()) - { - if (node->type() == "Scaling") - { - float valueRange = valueRanges[node->name()]; - - std::shared_ptr<Operator> nodeOperator = node->getOperator(); - std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); - - float * castedTensor = static_cast<float *> (valueTensor->getImpl()->rawPtr()); - for(std::size_t i = 0; i < valueTensor->size(); i++) - { - int bin = std::round(std::abs(castedTensor[i] / valueRange * nbBins)); - histograms[node->name()][bin]++; - } - } - } - } - - return histograms; -} - -float computeBestClipping(std::vector<int> histogram, std::uint8_t nbBits) -{ - //std::cout << " TEST " << std::endl; - - int nbBins = histogram.size(); - int nbIter = 100; - int signedMax = (1 << (nbBits - 1)) - 1; - - // Compute the cumulative approximation error : - // At each iteration we test a clipping candidate and loop over - // the histogram to accumulate the squared error - - std::vector<float> clippingErrors; - for (int it = 0; it < nbIter; it++) - { - // Compute the rounding cost of this particular clipping ... 
- float acc = 0.0; - float clipping = it / static_cast<float> (nbIter); - for (int bin = 0; bin < nbBins; bin++) - { - float value = (bin + 0.5) / nbBins; - float scaling = signedMax / clipping; - float rounded = std::round(value * scaling) / scaling; - float clipped = std::min(clipping, rounded); - - float approxError = (clipped - value); - acc += (approxError * approxError) * histogram[bin]; - } - clippingErrors.push_back(acc); - } - - //for (int it = 0; it < nbIter; it++) - // std::cout << " it : " << it << " " << clippingErrors[it] << std::endl; - - float bestClipping = 0.0; - float minClippingError = clippingErrors[0]; - for (int it = 0; it < nbIter; it++) - if (clippingErrors[it] < minClippingError) - { - bestClipping = it / static_cast<float> (nbIter); - minClippingError = clippingErrors[it]; - } - - return bestClipping; -} - -void adjustRanges(std::map<std::string, float>& valueRanges, std::uint8_t nbBits, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet) -{ - //std::cout << " BEFORE CLIPING : " << std::endl; - //std::map<std::string, float>::iterator it; - //for (it = valueRanges.begin(); it != valueRanges.end(); it++) - // std::cout << it->first << " " << it->second << std::endl; - - int nbBins = (1 << (nbBits + 4)) ; // XXX Enhance this !!! 
- - std::map<std::string, std::vector<int>> histograms = computeScalingHistograms(valueRanges, nbBins, graphView, inputDataSet); - - for (std::shared_ptr<Node> node : graphView->getNodes()) - if (node->type() == "Scaling") - { - std::vector<int> histogram = histograms[node->name()]; - float cliping = computeBestClipping(histogram, nbBits); - //std::cout << " cliping " << node->name() << " " << cliping << std::endl; - valueRanges[node->name()] *= cliping; - } - - //std::cout << " AFTER CLIPING : " << std::endl; - //for (it = valueRanges.begin(); it != valueRanges.end(); it++) - // std::cout << it->first << " " << it->second << std::endl; -} - -void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool OptimizeCliping) -{ - Log::info(" === QUANT PTQ 0.2.8 === "); - - if (!checkArchitecture(graphView)) - return; - - Log::info(" Removing the flatten nodes ... "); - removeFlatten(graphView); - - Log::info(" Removing the Softmax node ... 
"); - popSoftMax(graphView); - - Log::info(" Inserting the scaling nodes ..."); - insertScalingNodes(graphView); - - Log::info(" Normalizing the parameters ..."); - normalizeParameters(graphView); - - Log::info(" Computing the value ranges ..."); - std::map<std::string, float> valueRanges = computeRanges(graphView, inputDataSet); - - if (OptimizeCliping) - { - Log::info(" Optimizing the cliping values ..."); - adjustRanges(valueRanges, nbBits, graphView, inputDataSet); - } - - Log::info(" Normalizing the activations ..."); - normalizeActivations(graphView, valueRanges); - - Log::info(" Quantizing the normalized network ..."); - quantizeNormalizedNetwork(graphView, nbBits); - - Log::info(" Network is quantized !"); -} - -std::map<std::string, float> getWeightRanges(std::shared_ptr<GraphView> graphView) -{ - std::map<std::string, float> weightRanges; - - for (std::shared_ptr<Node> node : graphView->getNodes()) - { - if (isAffine(node)) - { - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - float range = getTensorAbsoluteMax(weightTensor); - weightRanges.insert(std::make_pair(node->name(), range)); - } - } - - return weightRanges; -} - -void clearBiases(std::shared_ptr<GraphView> graphView) { - - for (std::shared_ptr<Node> node : graphView->getNodes()) { - if (node->type() == "FC" || node->type() == "Conv") { - std::shared_ptr<Tensor> biasTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); - rescaleTensor(biasTensor, 0); - } - } -} - -void devPTQ(std::shared_ptr<GraphView> graphView) -{ - for (std::shared_ptr<Node> node : graphView->getNodes()) - std::cout << " ### node : " << node->type() << std::endl; -} -} -