diff --git a/include/aidge/QuantPTQ.hpp b/include/aidge/QuantPTQ.hpp
index 499ded0d7f9054acddec09f6c0ed90b4ed4140a4..fdc9300e69ba0873409c5eed60e31773166a6947 100644
--- a/include/aidge/QuantPTQ.hpp
+++ b/include/aidge/QuantPTQ.hpp
@@ -54,9 +54,9 @@ namespace Aidge {
      * @brief Normalize the activations of each affine node so that they become equal to one.
      * This is done by reconfiguring the scaling nodes, as well as rescaling the weights and biases tensors.
      * @param graphView The GraphView containing the affine nodes.
-     * @param inputDataSet The input dataset on which the value ranges are computed.
+     * @param valueRanges The node output value ranges computed over the calibration dataset.
      */
-    void normalizeActivations(std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet);
+    void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::string, float> valueRanges);
 
     /**
@@ -72,7 +72,7 @@ namespace Aidge {
      * @param nbBits The desired number of bits of the quantization.
      * @param inputDataSet The input dataset on which the value ranges are computed.
+     * @param optimizeClipping Whether to optimize the clipping values or not.
      */
-    void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet);
+    void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool optimizeClipping);
 
     /**
      * @brief Compute the weight ranges of every affine node. Provided for debugging purposes.
@@ -88,6 +88,11 @@ namespace Aidge {
     void clearBiases(std::shared_ptr<GraphView> graphView);
 
     void devPTQ(std::shared_ptr<GraphView> graphView);
+
+    std::map<std::string, std::vector<int>> computeScalingHistograms(std::map<std::string, float> valueRanges, int nbBins, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet);
+
+    float computeBestClipping(std::vector<int> histogram, std::uint8_t nbBits);
+
 }
 
 #endif /* AIDGE_QUANTIZATION_QUANTPTQ_H_ */
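From the Python side, the pipeline this header now exposes reads as follows. This is a minimal sketch: the module name `aidge_quantization`, the availability of the existing `compute_ranges` binding, and the `model` / `calibration_set` variables (a loaded GraphView and a list of calibration tensors) are all assumptions, not part of this patch.

    import aidge_quantization

    # compute_ranges returns a dict mapping each Scaling node name to the
    # maximal absolute activation observed over the calibration dataset ...
    value_ranges = aidge_quantization.compute_ranges(model, calibration_set)

    # ... which is now handed explicitly to the normalization step.
    aidge_quantization.normalize_activations(model, value_ranges)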
)mydelimiter"); m.def("quantize_normalized_network", &quantizeNormalizedNetwork, py::arg("network"), py::arg("nb_bits"), @@ -76,7 +76,7 @@ void init_QuantPTQ(py::module &m) { :type nb_bits: int )mydelimiter"); - m.def("quantize_network", &quantizeNetwork ,py::arg("network"), py::arg("nb_bits"), py::arg("input_dataset"), + m.def("quantize_network", &quantizeNetwork ,py::arg("network"), py::arg("nb_bits"), py::arg("input_dataset"), py::arg("optimize_cliping") = false, R"mydelimiter( Main quantization routine. Performs every step of the quantization pipeline. :param network: The GraphView to be quantized. @@ -85,6 +85,8 @@ void init_QuantPTQ(py::module &m) { :type nb_bits: int :param input_dataset: The input dataset on which the value ranges are computed. :type input_dataset: list of :py:class:`aidge_core.Tensor` + :param optimize_cliping: Whether to optimize the cliping values or not. + :type optimize_cliping: bool )mydelimiter"); m.def("get_weight_ranges", &getWeightRanges, py::arg("network"), @@ -103,6 +105,9 @@ void init_QuantPTQ(py::module &m) { :type network: :py:class:`aidge_core.GraphView` )mydelimiter"); + m.def("compute_scaling_histograms", &computeScalingHistograms, py::arg("value_ranges"), py::arg("nb_bins"), py::arg("network"), py::arg("input_dataset"), "compute scaling histogram"); + m.def("compute_best_clipping", &computeBestClipping, py::arg("histogram"), py::arg("nb_bits"), "compute the best clipping for an histogram"); + m.def("dev_ptq", &devPTQ, py::arg("network"), "dev ptq"); } diff --git a/src/QuantPTQ.cpp b/src/QuantPTQ.cpp index 2f12d0febaae49b0c589295bf39521c4783b01d7..be929360ca3abba3af7ebc87aede5af9854d6762 100644 --- a/src/QuantPTQ.cpp +++ b/src/QuantPTQ.cpp @@ -78,7 +78,7 @@ static bool isSeamless(std::shared_ptr<Node> node) bool checkArchitecture(std::shared_ptr<GraphView> graphView) { - std::set<std::string> otherNodeTypes({"Add", "Concat", "Softmax", "ReLU", "Producer"}); + std::set<std::string> otherNodeTypes({"Flatten", "Add", "Concat", "Softmax", "ReLU", "Producer"}); for (std::shared_ptr<Node> node : graphView->getNodes()) { @@ -124,6 +124,16 @@ static void popSoftMax(std::shared_ptr<GraphView> graphView) } } +static void fillTensor(std::shared_ptr<Tensor> tensor, float value) +{ + // Get the tensor data pointer + float * castedTensor = static_cast <float *> (tensor->getImpl()->rawPtr()); + + // Fill the tensor + for(std::size_t i = 0; i < tensor->size(); i++) + castedTensor[i] = value; +} + static void rescaleTensor(std::shared_ptr<Tensor> tensor, float scaling) { // Get the tensor data pointer @@ -210,7 +220,7 @@ void appendIdentity(std::shared_ptr<GraphView> graphView) { identityNode->getOperator()->setBackend("cpu"); std::shared_ptr<Tensor> weightTensor = std::static_pointer_cast<Tensor> (identityNode->getOperator()->getRawInput(1)); - rescaleTensor(weightTensor, 0); + fillTensor(weightTensor, 0); float * castedWeightTensor = static_cast<float *> (weightTensor->getImpl()->rawPtr()); for (int n = 0; n < size; n++) @@ -222,10 +232,15 @@ void appendIdentity(std::shared_ptr<GraphView> graphView) { std::vector<std::shared_ptr<Node>> extractNodeVector(std::shared_ptr<GraphView> graphView, bool verbose) { + std::vector<std::shared_ptr<Node>> nodeVector; + SequentialScheduler scheduler(graphView); scheduler.forward(); + nodeVector = scheduler.getStaticScheduling(); - std::vector<std::shared_ptr<Node>> nodeVector = scheduler.getStaticScheduling(); + //graphView->forwardDims(); + //scheduler.generateScheduling(); + //nodeVector = scheduler.getStaticScheduling(); 
diff --git a/src/QuantPTQ.cpp b/src/QuantPTQ.cpp
index 2f12d0febaae49b0c589295bf39521c4783b01d7..be929360ca3abba3af7ebc87aede5af9854d6762 100644
--- a/src/QuantPTQ.cpp
+++ b/src/QuantPTQ.cpp
@@ -78,7 +78,7 @@ static bool isSeamless(std::shared_ptr<Node> node)
 
 bool checkArchitecture(std::shared_ptr<GraphView> graphView)
 {
-    std::set<std::string> otherNodeTypes({"Add", "Concat", "Softmax", "ReLU", "Producer"});
+    std::set<std::string> otherNodeTypes({"Flatten", "Add", "Concat", "Softmax", "ReLU", "Producer"});
 
     for (std::shared_ptr<Node> node : graphView->getNodes())
     {
@@ -124,6 +124,16 @@ static void popSoftMax(std::shared_ptr<GraphView> graphView)
     }
 }
 
+static void fillTensor(std::shared_ptr<Tensor> tensor, float value)
+{
+    // Get the tensor data pointer
+    float * castedTensor = static_cast<float *> (tensor->getImpl()->rawPtr());
+
+    // Fill the tensor
+    for (std::size_t i = 0; i < tensor->size(); i++)
+        castedTensor[i] = value;
+}
+
 static void rescaleTensor(std::shared_ptr<Tensor> tensor, float scaling)
 {
     // Get the tensor data pointer
@@ -210,7 +220,7 @@ void appendIdentity(std::shared_ptr<GraphView> graphView) {
     identityNode->getOperator()->setBackend("cpu");
 
     std::shared_ptr<Tensor> weightTensor = std::static_pointer_cast<Tensor> (identityNode->getOperator()->getRawInput(1));
-    rescaleTensor(weightTensor, 0);
+    fillTensor(weightTensor, 0);
 
     float * castedWeightTensor = static_cast<float *> (weightTensor->getImpl()->rawPtr());
     for (int n = 0; n < size; n++)
@@ -222,10 +232,15 @@ void appendIdentity(std::shared_ptr<GraphView> graphView) {
 
 std::vector<std::shared_ptr<Node>> extractNodeVector(std::shared_ptr<GraphView> graphView, bool verbose)
 {
+    std::vector<std::shared_ptr<Node>> nodeVector;
+
     SequentialScheduler scheduler(graphView);
     scheduler.forward();
+    nodeVector = scheduler.getStaticScheduling();
 
-    std::vector<std::shared_ptr<Node>> nodeVector = scheduler.getStaticScheduling();
+    // Alternative that avoids the forward pass :
+    //graphView->forwardDims();
+    //scheduler.generateScheduling();
+    //nodeVector = scheduler.getStaticScheduling();
 
     fixScheduling(nodeVector);
@@ -319,7 +337,14 @@ static std::shared_ptr<Node> getPreviousScalingNode(std::shared_ptr<Node> mergin
 {
     std::shared_ptr<Node> currNode = mergingNode;
     while (currNode->type() != "Scaling")
+    {
+        if (currNode->getParents().size() == 0)
+        {
+            Log::warn(" Warning : no previous Scaling node was found ! ");
+            break;
+        }
         currNode = currNode->getParents()[0];
+    }
     return currNode;
 }
 
@@ -429,7 +454,7 @@ std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView,
     std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes();
     for (std::shared_ptr<Node> node : nodeSet)
     {
-        if (node->type() != "Producer")
+        if (node->type() == "Scaling") // XXX (node->type() != "Producer")
         {
             std::shared_ptr<Operator> nodeOperator = node->getOperator();
             std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0));
@@ -449,32 +474,29 @@ std::map<std::string, float> computeRanges(std::shared_ptr<GraphView> graphView,
     std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes();
 
     for (std::shared_ptr<Node> node : nodeSet)
-        if (node->type() != "Producer")
+        if (node->type() == "Scaling") // XXX (node->type() != "Producer")
             valueRanges.insert(std::make_pair(node->name(), 0));
 
-    for (std::shared_ptr<Tensor> sample : inputDataSet) // XXX HERE const
+    for (std::shared_ptr<Tensor> sample : inputDataSet)
     {
         std::map<std::string, float> sampleRanges = computeRanges(graphView, sample);
 
         for (std::shared_ptr<Node> node : nodeSet)
         {
-            if (node->type() != "Producer")
+            if (node->type() == "Scaling") // XXX (node->type() != "Producer")
             {
                 std::string nodeName = node->name();
                 if (sampleRanges[nodeName] > valueRanges[nodeName])
                     valueRanges[nodeName] = sampleRanges[nodeName];
             }
         }
     }
 
     return valueRanges;
 }
 
-void normalizeActivations(std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet)
+void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::string, float> valueRanges)
 {
-    // EXTRACT VALUE RANGE MAP ////////////////////////////////////////////////
-
-    std::map<std::string, float> valueRanges = computeRanges(graphView, inputDataSet);
-
     // CREATE THE SCALING FACTOR MAP //////////////////////////////////////////
 
     std::vector<std::shared_ptr<Node>> nodeVector = extractNodeVector(graphView, false);
@@ -508,7 +530,8 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::vector<std:
             std::shared_ptr<Node> prevNode = node->getParent(0);
             float prevScalingFactor = scalingFactors[prevNode->name()];
 
-            float scalingFactor = valueRanges[node->name()]; // XXX HERE !!!
+            // XXX HERE : valueRanges must contain all the scaling nodes !!!
+            float scalingFactor = valueRanges[node->name()];
 
             std::shared_ptr<Scaling_Op> scalingOperator = std::static_pointer_cast<Scaling_Op> (node->getOperator());
             scalingOperator->getAttr<float>("scalingFactor") /= (scalingFactor / prevScalingFactor);
@@ -543,11 +566,10 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::vector<std:
 
         // Ensure that the adding node does not overflow ...
         if (node->type() == "Add")
         {
             std::shared_ptr<Node> maxNode = mergingNodes[maxNodeIndex];
-            maxScaling /= valueRanges[maxNode->name()] / valueRanges[node->name()];
+            maxScaling /= valueRanges[getPreviousScalingNode(maxNode)->name()];
+            maxScaling *= valueRanges[getPreviousScalingNode(node)->name()];
         }
 
-        // Log::info(" MAX SCALING : {} ", maxScaling);
-
         scalingFactors[node->name()] = maxScaling;
 
         for (std::shared_ptr<Node> mergingNode : mergingNodes)
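The `Add` fix above changes which ranges are compared: each branch is now looked up through its closest upstream Scaling node rather than through the merging node itself. The alignment logic around it is unchanged; reduced to a toy Python sketch of the intent (all names and numbers invented), it works like this:

    # Each branch reaching a merging node carries an accumulated scaling factor;
    # every branch is rescaled to the largest of them before the merge.
    branch_factors = {"branch_a": 2.0, "branch_b": 8.0}
    max_scaling = max(branch_factors.values())  # 8.0
    rescalings = {name: f / max_scaling for name, f in branch_factors.items()}
    # {'branch_a': 0.25, 'branch_b': 1.0} : branch_a's Scaling node gets an
    # extra 0.25 factor so both inputs reach the Add with a common scale.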
if (node->type() == "Add") { std::shared_ptr<Node> maxNode = mergingNodes[maxNodeIndex]; - maxScaling /= valueRanges[maxNode->name()] / valueRanges[node->name()]; + maxScaling /= valueRanges[getPreviousScalingNode(maxNode)->name()]; + maxScaling *= valueRanges[getPreviousScalingNode(node)->name()]; } - // Log::info(" MAX SCALING : {} ", maxScaling); - scalingFactors[node->name()] = maxScaling; for (std::shared_ptr<Node> mergingNode : mergingNodes) @@ -611,9 +633,138 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ } } -void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet) +std::map<std::string, std::vector<int>> computeScalingHistograms(std::map<std::string, float> valueRanges, int nbBins, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet) +{ + //std::cout << " COMPUTING HISTOGRAMS ... " << std::endl; + + std::map<std::string, std::vector<int>> histograms; + + SequentialScheduler scheduler(graphView); + + std::shared_ptr<Node> inputNode = getFirstNode(graphView); + + // Setup the histograms ... + + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + if (node->type() == "Scaling") + { + std::vector<int> histogram; + for (int i = 0; i < nbBins; i++) + histogram.push_back(0); + + histograms.insert(std::make_pair(node->name(), histogram)); + } + } + + // Fill the histograms ... + + for (std::shared_ptr<Tensor> inputTensor : inputDataSet) + { + // Setup the input + std::shared_ptr<Node> inputProducer = inputNode->getParent(0); + inputProducer->getOperator()->setOutput(0, inputTensor); + + // Forward ... + scheduler.forward(); + + // Gather values ... + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + if (node->type() == "Scaling") + { + float valueRange = valueRanges[node->name()]; + + std::shared_ptr<Operator> nodeOperator = node->getOperator(); + std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); + + float * castedTensor = static_cast<float *> (valueTensor->getImpl()->rawPtr()); + for(std::size_t i = 0; i < valueTensor->size(); i++) + { + int bin = std::round(std::abs(castedTensor[i] / valueRange * nbBins)); + histograms[node->name()][bin]++; + } + } + } + } + + return histograms; +} + +float computeBestClipping(std::vector<int> histogram, std::uint8_t nbBits) +{ + //std::cout << " TEST " << std::endl; + + int nbBins = histogram.size(); + int nbIter = 100; + int signedMax = (1 << (nbBits - 1)) - 1; + + // Compute the cumulative approximation error : + // At each iteration we test a clipping candidate and loop over + // the histogram to accumulate the squared error + + std::vector<float> clippingErrors; + for (int it = 0; it < nbIter; it++) + { + // Compute the rounding cost of this particular clipping ... 
+
+void adjustRanges(std::map<std::string, float>& valueRanges, std::uint8_t nbBits, std::shared_ptr<GraphView> graphView, std::vector<std::shared_ptr<Tensor>> inputDataSet)
 {
-    Log::info(" === QUANT PTQ 0.2.7 === ");
+    int nbBins = (1 << (nbBits + 4)); // XXX Enhance this !!!
+
+    std::map<std::string, std::vector<int>> histograms = computeScalingHistograms(valueRanges, nbBins, graphView, inputDataSet);
+
+    for (std::shared_ptr<Node> node : graphView->getNodes())
+        if (node->type() == "Scaling")
+        {
+            std::vector<int> histogram = histograms[node->name()];
+            float clipping = computeBestClipping(histogram, nbBits);
+            valueRanges[node->name()] *= clipping;
+        }
+}
+
+void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, bool optimizeClipping)
+{
+    Log::info(" === QUANT PTQ 0.2.8 === ");
 
     if (!checkArchitecture(graphView))
         return;
@@ -630,8 +781,17 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits,
     Log::info(" Normalizing the parameters ...");
     normalizeParameters(graphView);
 
+    Log::info(" Computing the value ranges ...");
+    std::map<std::string, float> valueRanges = computeRanges(graphView, inputDataSet);
+
+    if (optimizeClipping)
+    {
+        Log::info(" Optimizing the clipping values ...");
+        adjustRanges(valueRanges, nbBits, graphView, inputDataSet);
+    }
+
     Log::info(" Normalizing the activations ...");
-    normalizeActivations(graphView, inputDataSet);
+    normalizeActivations(graphView, valueRanges);
 
     Log::info(" Quantizing the normalized network ...");
     quantizeNormalizedNetwork(graphView, nbBits);
@@ -671,78 +831,5 @@ void devPTQ(std::shared_ptr<GraphView> graphView)
     for (std::shared_ptr<Node> node : graphView->getNodes())
         std::cout << " ### node : " << node->type() << std::endl;
 }
-
-
-}
-
-
-/*
-std::map<std::string, std::vector<int>> getValueHistograms(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> inputTensor, std::map<std::string, float> valueRanges)
-{
-    int nb_bins = 32;
-
-    std::map<std::string, std::vector<int>> valueHistograms;
-    std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes();
-
-    // SETUP THE HISTOGRAMS
////////////////////////////////////////////////////////
-
-    for (std::shared_ptr<Node> node : nodeSet)
-    {
-        if (node->type() == "Scaling")
-        {
-            std::vector<int> histogram;
-            for (int i = 0; i < nb_bins; i++)
-                histogram.push_back(0);
-
-            valueHistograms.insert(std::make_pair(node->name(), histogram));
-        }
-    }
-
-    // PROPAGATE THE INPUTS ////////////////////////////////////////////////////////
-
-    SequentialScheduler scheduler(graphView);
-
-    std::shared_ptr<Node> inputNode = getFirstNode(graphView);
-    std::shared_ptr<Node> inputProducer = inputNode->getParent(0);
-    inputProducer->getOperator()->setOutput(0, inputTensor);
-
-    scheduler.forward();
-
-    // GATHER THE RESULTS //////////////////////////////////////////////////////////
-
-    std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes();
-
-    for (std::shared_ptr<Node> node : nodeSet)
-    {
-        if (node->type() == "Scaling")
-        {
-            std::shared_ptr<Operator> nodeOperator = node->getOperator();
-            std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0));
-            float * castedTensor = static_cast<float *> (valueTensor->getImpl()->rawPtr());
-
-            // Fill the histogram ...
-
-            for (std::size_t i = 0; i < valueTensor->size(); i++)
-            {
-                float ratio = std::abs(castedTensor[i]) / valueRanges[node->name()];
-                int bin_index = (int) (ratio * (nb_bins - 1));
-                valueHistograms[node->name()][bin_index]++;
-            }
-        }
-    }
-
-    return valueHistograms;
-}
-
-void printHistograms(std::map<std::string, std::vector<int>> histograms)
-{
-    std::map<std::string, std::vector<int>>::iterator it;
-    for (it = histograms.begin(); it != histograms.end(); it++)
-    {
-        std::cout << " Node : " << it->first << " -> ";
-        for (std::size_t i = 0; i < it->second.size(); i++)
-            std::cout << (it->second)[i] << " ";
-        std::cout << std::endl;
-    }
-}
-*/
\ No newline at end of file