diff --git a/include/aidge/operator/PTQMetaOps.hpp b/include/aidge/operator/PTQMetaOps.hpp
index b9bad0d18f099e94d4c52254b08629c7f947db6a..9ca76fbd40b9366aa82c6521fba931d284da137a 100644
--- a/include/aidge/operator/PTQMetaOps.hpp
+++ b/include/aidge/operator/PTQMetaOps.hpp
@@ -29,14 +29,6 @@ namespace Aidge {
 /// @return A shared pointer to an instance of the meta-operator node.
 std::shared_ptr<Aidge::Node> Quantizer(double scalingFactor, double clipMin, double clipMax, const std::string& name);
 
-/// @brief The purpose of Scaling is to encapsulate the Mul operator and tag it as a PTQ node rather than a regular Mul operator.
-/// Therefore, this meta-operator consists solely of a [Mul] operation.
-///
-/// @param scalingFactor The scaling factor to apply to the input (a scalar to multiply the input with).
-/// @param name The name of the meta-operator node created.
-/// @return A shared pointer to an instance of the scaling node.
-std::shared_ptr<Aidge::Node> Scaling(double scalingFactor, const std::string& name = "");
-
 /// @brief Updates the scaling factor of a PTQ meta-operator node, allowing for dynamic adjustment of the scaling parameter.
 /// This function sets a new scaling factor for a specified meta-operator node, modifying the scalar applied in the [Mul] operation.
 /// The meta-operator node must be a PTQ-specific operator, such as a Quantizer or Scaling node.
diff --git a/include/aidge/quantization/PTQ/PTQ.hpp b/include/aidge/quantization/PTQ/PTQ.hpp
index bfe671e3556c3af2c367ce7f86708f01c8e3d3b5..1d1b71ba7501580ea99103d351eafac9a7f793d2 100644
--- a/include/aidge/quantization/PTQ/PTQ.hpp
+++ b/include/aidge/quantization/PTQ/PTQ.hpp
@@ -69,6 +69,26 @@ namespace Aidge {
      * @return The scheduled vector of nodes
      */
     std::vector<std::shared_ptr<Node>> retrieveNodeVector(std::shared_ptr<GraphView> graphView, bool newSchedule = true, bool verbose = false);
+
+    /**
+     * @brief Inserts a scaling node below the given producer node in the graph view.
+     * If the node is already a producer scaling node, it accumulates the scaling factor by multiplying its value directly.
+     *
+     * @param node A shared pointer to the producer node below which the scaling node will be inserted.
+     * @param scalingFactor The scaling factor to apply.
+     * @param graphView A shared pointer to the graph view in which the nodes are located.
+     * @return True if the scaling node was successfully inserted or the scaling factor was accumulated; False otherwise.
+     */
+    bool insertScalingBelowProducer(std::shared_ptr<Node> node, double scalingFactor, std::shared_ptr<GraphView> graphView);
+
+    /**
+     * @brief Inserts a rounding node below the given producer node (and below its own ProducerScaling node, if present) in the graph view.
+     *
+     * @param node A shared pointer to the producer node below which the rounding node will be inserted.
+     * @param graphView A shared pointer to the graph view in which the nodes are located.
+     * @return True if the rounding node was successfully inserted; False otherwise.
+     */
+    bool insertRoundBelowProducer(std::shared_ptr<Node> node, std::shared_ptr<GraphView> graphView);
 
     /**
      * @brief Determine whether an input GraphView can be quantized or not.
@@ -77,6 +97,14 @@ namespace Aidge {
      */
     bool checkArchitecture(std::shared_ptr<GraphView> graphView);
 
+    /**
+     * @brief Multiplies the existing scaling factor of a scaling node by a given coefficient. It verifies that the node is of type "Mul"
+     * and carries a scaling tag (`isScaling` or `isProducerScaling`); if these conditions are not met, an error is raised.
+     * @param node A shared pointer to an `Aidge::Node` object representing the node to modify.
+     * @param coeff A double representing the multiplication coefficient to apply to the scaling factor.
+     */
+    void multiplyScalingFactor(std::shared_ptr<Aidge::Node> node, double coeff);
+
     void prepareNetwork(std::shared_ptr<GraphView> graphView);
diff --git a/include/aidge/quantization/QAT/QAT_LSQ.hpp b/include/aidge/quantization/QAT/QAT_LSQ.hpp
index 922187abca915daa1c00f3949d0d791b0d3e1c39..4970be07fae8737a1c2863600757bb81ff3a65f9 100644
--- a/include/aidge/quantization/QAT/QAT_LSQ.hpp
+++ b/include/aidge/quantization/QAT/QAT_LSQ.hpp
@@ -9,29 +9,36 @@
  *
 ********************************************************************************/
 
-#ifndef AIDGE_QUANTIZATION_QUANTIZATION_QAT_LSQ_H_
-#define AIDGE_QUANTIZATION_QUANTIZATION_QAT_LSQ_H_
+#ifndef AIDGE_QUANTIZATION_QAT_LSQ_H_
+#define AIDGE_QUANTIZATION_QAT_LSQ_H_
 
-#include <cstddef>  // std::size_t
-#include <memory>
-
-#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
+#include "aidge/data/Tensor.hpp"
 
 namespace Aidge {
 namespace QuantLSQ {
 
 /**
- * @brief Given a GraphView with parameters properly initialized, insert
- * the LSQ quantizer nodes, and setup the adjustment their step-sizes.
- * @param graphView The GraphView containing the network to quantize.
+ * @brief Insert the LSQ quantizer nodes in a given GraphView.
+ * @param graphView The GraphView containing the graph to quantize.
  * @param nbBits Number of quantization bits.
+ * @param step_size Fixed step size assigned to the inserted quantizers.
  */
+void insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, float step_size);
 
-void setupQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits);
+/**
+ * @brief Given a GraphView with parameters properly initialized and some calibration data,
+ * insert the LSQ quantizer nodes, and adjust their step-sizes.
+ * @param graphView The GraphView containing the graph to quantize.
+ * @param nbBits Number of quantization bits.
+ * @param calibrationData Calibration data used to adjust the step sizes.
+ */
+void insertAndInitQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, std::shared_ptr<Tensor> calibrationData);
 
-} // namespace QuantLSQ
-} // namespace Aidge
+}
+}
 
-#endif /* AIDGE_QUANTIZATION_QUANTIZATION_QAT_LSQ_H_ */
+#endif /* AIDGE_QUANTIZATION_QAT_LSQ_H_ */
diff --git a/python_binding/pybind_PTQ.cpp b/python_binding/pybind_PTQ.cpp
index 1de797693468273814f4c5e82a161991648d06ff..ae0a0def28a861e2fc207adbc27c6af47dc0ded8 100644
--- a/python_binding/pybind_PTQ.cpp
+++ b/python_binding/pybind_PTQ.cpp
@@ -213,13 +213,6 @@ void init_PTQ(py::module &m) {
     :type network: :py:class:`aidge_core.GraphView`
     )mydelimiter");
 
-    m.def("dev_ptq", &devPTQ, py::arg("network"),
-    R"mydelimiter(
-    Developement and test routine.
-    :param network: The GraphView under test.
- :type network: :py:class:`aidge_core.GraphView` - )mydelimiter"); - m.def("prepare_network", &prepareNetwork, py::arg("network"), "prepare the network for the PTQ"); } diff --git a/python_binding/pybind_QAT_LSQ.cpp b/python_binding/pybind_QAT_LSQ.cpp index 4bba3b6baa5eda41a024399eb1be1402c74b2c1b..206985efe4558a84ce1ed67a1264bd6902213764 100644 --- a/python_binding/pybind_QAT_LSQ.cpp +++ b/python_binding/pybind_QAT_LSQ.cpp @@ -23,6 +23,8 @@ void init_QAT_LSQ(py::module &m) { auto mQuantLSQ = m.def_submodule("lsq"); - mQuantLSQ.def("setup_quantizers", &QuantLSQ::setupQuantizers, py::arg("network"), py::arg("nb_bits")); + mQuantLSQ.def("insert_quantizers", &QuantLSQ::insertQuantizers, py::arg("network"), py::arg("nb_bits"), py::arg("step_size")); + + mQuantLSQ.def("insert_and_init_quantizers", &QuantLSQ::insertAndInitQuantizers, py::arg("network"), py::arg("nb_bits"), py::arg("calibration_data")); } } // namespace Aidge diff --git a/src/PTQ/CLE.cpp b/src/PTQ/CLE.cpp index 28858d0e3c693a7620bc32806008523e0602faa9..2738f8a92154368962e9162fba62c41b7622d07c 100644 --- a/src/PTQ/CLE.cpp +++ b/src/PTQ/CLE.cpp @@ -20,7 +20,10 @@ #include "aidge/quantization/PTQ/PTQ.hpp" // retrieveNodeVector #include "aidge/graph/GraphView.hpp" -#include "aidge/graph/Node.hpp" + +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/scheduler/Scheduler.hpp" +#include "aidge/utils/Log.hpp" #include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/Log.hpp" @@ -30,6 +33,12 @@ #include "aidge/operator/Reshape.hpp" #include "aidge/operator/Round.hpp" +#include "aidge/operator/Mul.hpp" +#include "aidge/operator/ArgMax.hpp" +#include "aidge/operator/Abs.hpp" +#include "aidge/operator/Reshape.hpp" +#include "aidge/operator/Round.hpp" + namespace Aidge { @@ -49,7 +58,7 @@ static void rescaleTensor(std::shared_ptr<Tensor> tensor, double scaling) mulOp.setDataType(tensor->dataType()); mulOp.setBackend(tensor->backend()); - std::shared_ptr<Aidge::Tensor> scalingTensor = std::make_shared<Aidge::Tensor>(scaling); + std::shared_ptr<Aidge::Tensor> scalingTensor = std::make_shared<Aidge::Tensor>(Aidge::Array1D<double, 1> {scaling}); scalingTensor->setDataType(tensor->dataType()); scalingTensor->setBackend(tensor->backend()); @@ -67,7 +76,7 @@ static void rescaleTensor(std::shared_ptr<Tensor> tensor, double scaling) static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) { // get the abs tensor - + std::shared_ptr<Tensor> fallback; //Fallback tensor for refCastFR std::shared_ptr<Tensor> absTensor = std::make_shared<Tensor>(tensor->abs()); // flatten the abs tensor @@ -81,6 +90,7 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) reshapeOp.associateInput(0, absTensor); reshapeOp.forward(); std::shared_ptr<Tensor> flatTensor = reshapeOp.getOutput(0); + const Tensor& localFlatTensor = flatTensor->refCastFrom(fallback, DataType::Float64, "cpu"); // Get the argmax @@ -90,13 +100,24 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) argmaxOp.associateInput(0, flatTensor); argmaxOp.forward(); - std::shared_ptr<Tensor> argmaxTensor = argmaxOp.getOutput(0); + + const Tensor& argMaxTensor = argmaxOp.getOutput(0)->refCastFrom(fallback, DataType::Float64, "cpu"); // Return the max - int maxIndex = std::round(argmaxTensor->get<double>(0)); + int maxIndex = std::round(argMaxTensor.get<double>(0)); - return flatTensor->get<double>(maxIndex); + return localFlatTensor.get<double>(maxIndex); +} +//Function used to extraxt the local tensor (from a ProducerScalingNode) 
+std::shared_ptr<Aidge::Tensor> getLocalTensor(std::shared_ptr<Node> node) { + if (node->getParent(1)->attributes()->hasAttr("quantization.ptq.isProducerScaling")) { + std::shared_ptr<Aidge::OperatorTensor> operatorTensor = std::static_pointer_cast<OperatorTensor>(node->getParent(1)->getOperator()); + operatorTensor->forward();// We need the forward pass to compute the scaled value of the Tensor + return operatorTensor->getOutput(0); + } else { + return getWeightTensor(node); + } } void crossLayerEqualization(std::shared_ptr<GraphView> graphView, double targetDelta) @@ -131,16 +152,18 @@ void crossLayerEqualization(std::shared_ptr<GraphView> graphView, double targetD std::shared_ptr<Node> n1 = affineNodeVector[i]; std::shared_ptr<Node> n2 = affineNodeVector[i+1]; - double r1 = getTensorAbsoluteMax(getWeightTensor(n1)); - double r2 = getTensorAbsoluteMax(getWeightTensor(n2)); + std::shared_ptr<Aidge::Tensor> n1localTensor = getLocalTensor(n1); + std::shared_ptr<Aidge::Tensor> n2localTensor = getLocalTensor(n2); + + double r1 = getTensorAbsoluteMax(n1localTensor); + double r2 = getTensorAbsoluteMax(n2localTensor); double s1 = std::sqrt(r1 * r2) / r1; double s2 = std::sqrt(r1 * r2) / r2; - rescaleTensor(getWeightTensor(n1), s1); - rescaleTensor(getWeightTensor(n2), s2); - - rescaleTensor(getBiasTensor(n1), s1); + insertScalingBelowProducer(n1->getParent(1),s1,graphView); + insertScalingBelowProducer(n2->getParent(1),s2,graphView); + insertScalingBelowProducer(n1->getParent(2),s1,graphView); double rangeDelta = std::abs(r1 - r2); if (rangeDelta > maxRangeDelta) diff --git a/src/PTQ/Clipping.cpp b/src/PTQ/Clipping.cpp index 66b0ab36fba7634d7ee350cdccb27895ffa52da1..a4e7fed921604fcf9d18c6e50991220c4785f3bb 100644 --- a/src/PTQ/Clipping.cpp +++ b/src/PTQ/Clipping.cpp @@ -222,7 +222,7 @@ std::map<std::string, double> adjustRanges(Clipping clippingMode, std::map<std:: for (std::shared_ptr<Node> node : graphView->getNodes()) { - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { std::vector<int> histogram = histograms[node->name()]; diff --git a/src/PTQ/PTQ.cpp b/src/PTQ/PTQ.cpp index 7c29ee0b9178fbb07f4a2d5edf9f0ad7ac8dcac4..f03fc7bcea039a1939e116cc842f7062f28c5cae 100644 --- a/src/PTQ/PTQ.cpp +++ b/src/PTQ/PTQ.cpp @@ -14,7 +14,6 @@ #include "aidge/quantization/PTQ/PTQ.hpp" #include "aidge/operator/PTQMetaOps.hpp" - #include "aidge/data/Tensor.hpp" #include "aidge/graph/GraphView.hpp" #include "aidge/graph/Node.hpp" @@ -22,16 +21,15 @@ #include "aidge/scheduler/Scheduler.hpp" #include "aidge/utils/Log.hpp" + #include "aidge/operator/Producer.hpp" #include "aidge/operator/Mul.hpp" +#include "aidge/operator/Round.hpp" #include "aidge/operator/ReLU.hpp" #include "aidge/operator/BatchNorm.hpp" #include "aidge/operator/Conv.hpp" - #include "aidge/operator/ArgMax.hpp" -#include "aidge/operator/Abs.hpp" #include "aidge/operator/Reshape.hpp" -#include "aidge/operator/Round.hpp" #include "aidge/recipes/Recipes.hpp" @@ -55,59 +53,124 @@ bool isMerging(std::shared_ptr<Node> node) { return (mergingNodeTypes.find(node->type()) != mergingNodeTypes.end()); } +static int getInputIndex(std::shared_ptr<Node> node, std::shared_ptr<Node> parentNode) +{ + int index = 0; + while (node->getParent(index) != parentNode) + index++; + return index; +} -bool checkArchitecture(std::shared_ptr<GraphView> graphView) + +void multiplyScalingFactor(std::shared_ptr<Aidge::Node> node,double coeff) { - std::set<std::string> otherNodeTypes({"Flatten", "Softmax", "BatchNorm2D", "ReLU", 
"Producer"}); + AIDGE_ASSERT(node->type() == "Mul" && (node->attributes()->hasAttr("quantization.ptq.isProducerScaling") || node->attributes()->hasAttr("quantization.ptq.isScaling")), + "Cannot update the scaling factor on Node of type {} with no scaling tag",node->type()); + auto scalingFactorTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); + std::shared_ptr<Tensor> fallback; + const Tensor& localTensor = scalingFactorTensor->refCastFrom(fallback, DataType::Float64, "cpu"); + double previousScalingFactor = localTensor.get<double>(0); + std::shared_ptr<Tensor> finalTensor = std::make_shared<Tensor>(Array1D<double, 1> {previousScalingFactor * coeff}); + node->input(1).first->getOperator()->setOutput(0, finalTensor); +} +/* Util function to insert a node below another one already connected */ +void insertNodeBetween(std::shared_ptr<Node> parent, + std::shared_ptr<Node> newNode, + std::shared_ptr<GraphView> graphView) +{ + // Checking the parents always have at least 1 children + AIDGE_ASSERT(parent->getChildren().size() > 0, "The parent node must have at least one child to insert a new node."); + + // Retrieve children connection indexes + std::vector<std::shared_ptr<Node>> nextNodes = parent->getChildren(0); + std::vector<int> inputIndices(nextNodes.size()); + for (std::size_t i = 0; i < nextNodes.size(); i++) { + inputIndices[i] = getInputIndex(nextNodes[i], parent); + } - for (std::shared_ptr<Node> node : graphView->getNodes()) - { - bool isOther = otherNodeTypes.find(node->type()) != otherNodeTypes.end(); - if (!isOther && !isAffine(node) && !isSeamless(node) && !isMerging(node)) { - Log::warn(" GraphView can't be quantized : node type {} is not supported !", node->type()); - return false; - } + // Disconnect childs from parent + for (std::shared_ptr<Node> nextNode : nextNodes) { + parent->removeChild(nextNode, 0); } - return true; + // Insert the new node between the child and the parent + parent->addChild(newNode, 0, 0); + for (std::size_t i = 0; i < nextNodes.size(); i++) { + newNode->addChild(nextNodes[i], 0, inputIndices[i]); + } + + graphView->add(newNode); } -static void rescaleTensor(std::shared_ptr<Tensor> tensor, double scaling) +bool insertRoundBelowProducer(std::shared_ptr<Node> node,std::shared_ptr<GraphView> graphView) { - auto mulOp = Mul_Op(); - mulOp.setDataType(tensor->dataType()); - mulOp.setBackend(tensor->backend()); + if(node->attributes()->hasAttr("quantization.ptq.isProducerScaling") && node->type() != "Round") + { + std::shared_ptr<Aidge::Node> roundNode = Round(node->name() + "_Round"); + roundNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) + roundNode->getOperator()->setBackend(node->getOperator()->backend()); - std::shared_ptr<Aidge::Tensor> scalingTensor = std::make_shared<Aidge::Tensor>(scaling); - scalingTensor->setDataType(tensor->dataType()); - scalingTensor->setBackend(tensor->backend()); + insertNodeBetween(node,roundNode,graphView); + roundNode->attributes()->addAttr("quantization.ptq.isProducerRounding",0.0); + return true; + } + return false; +} +bool insertScalingBelowProducer(std::shared_ptr<Node> node,double scalingFactor, std::shared_ptr<GraphView> graphView) +{ + if(node->attributes()->hasAttr("quantization.ptq.isProducerRounding")) + { + //In this case we 'bump' the node to the one above him (an actual ProducerScaling) + // because the round node is not usable (only used when SSA is enabled) + node = node->getParent(0); + } + 
if(node->attributes()->hasAttr("quantization.ptq.isProducerScaling")) + { + // We accumulate the multiples scaling factors by multiplying the SF of the ProducerScaling node + // (adding new nodes each time would make the graph unusable) + multiplyScalingFactor(node,scalingFactor); + return true; + } + AIDGE_ASSERT(node->type() == "Producer","Cannot apply a scaling factor on node of type: {} which is not a producer", node->type()); + std::string scalingNodeName = makeUniqueName(node->name() + "_ProducerScaling", graphView); + + std::shared_ptr<Aidge::Node> scalingNode = Mul(scalingNodeName); + scalingNode->attributes()->addAttr("quantization.ptq.isProducerScaling",0.0); + + std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {scalingFactor}); + std::shared_ptr<Node> scalingFactorProducer = addProducer(scalingNode, 1, {1}, "Factor"); + scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); + graphView->add(scalingFactorProducer); + + scalingNode->getOperator()->setDataType(DataType::Float64); + std::string producerBackend = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getOutput(0)->backend(); + scalingNode->getOperator()->setBackend(producerBackend); - mulOp.associateInput(0, tensor); - mulOp.associateInput(1, scalingTensor); + insertNodeBetween(node, scalingNode, graphView); - mulOp.forward(); - - auto outTensor = mulOp.getOutput(0); - *tensor = *outTensor; + return true; } -static void roundTensor(std::shared_ptr<Tensor> tensor) +bool checkArchitecture(std::shared_ptr<GraphView> graphView) { - auto roundOp = Round_Op(); - roundOp.setDataType(tensor->dataType()); - roundOp.setBackend(tensor->backend()); + std::set<std::string> otherNodeTypes({"Flatten", "Softmax", "BatchNorm2D", "ReLU", "Producer"}); - roundOp.associateInput(0, tensor); - roundOp.forward(); - - auto outTensor = roundOp.getOutput(0); - *tensor = *outTensor; + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + bool isOther = otherNodeTypes.find(node->type()) != otherNodeTypes.end(); + if (!isOther && !isAffine(node) && !isSeamless(node) && !isMerging(node)) { + Log::warn(" GraphView can't be quantized : node type {} is not supported !", node->type()); + return false; + } + } + + return true; } -// TODO : make the retreival of argmax values backend independant (refCastFrom) static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) { // get the abs tensor + std::shared_ptr<Tensor> fallback; //Fallback tensor for refCastFR std::shared_ptr<Tensor> absTensor = std::make_shared<Tensor>(tensor->abs()); @@ -122,6 +185,7 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) reshapeOp.associateInput(0, absTensor); reshapeOp.forward(); std::shared_ptr<Tensor> flatTensor = reshapeOp.getOutput(0); + const Tensor& localFlatTensor = flatTensor->refCastFrom(fallback, DataType::Float64, "cpu"); // Get the argmax @@ -131,13 +195,13 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) argmaxOp.associateInput(0, flatTensor); argmaxOp.forward(); - std::shared_ptr<Tensor> argmaxTensor = argmaxOp.getOutput(0); + const Tensor& argMaxTensor = argmaxOp.getOutput(0)->refCastFrom(fallback, DataType::Float64, "cpu"); // Return the max - int maxIndex = std::round(argmaxTensor->get<double>(0)); + int maxIndex = std::round(argMaxTensor.get<double>(0)); - return flatTensor->get<double>(maxIndex); + return localFlatTensor.get<double>(maxIndex); } @@ -151,6 +215,15 @@ static std::vector<std::shared_ptr<Node>> 
removeMatchingNodes(std::vector<std::s return remainingNodes; } +static std::vector<std::shared_ptr<Node>> removeProdScalingNodes(std::vector<std::shared_ptr<Node>> nodeVector) +{ + std::vector<std::shared_ptr<Node>> remainingNodes; + for (std::shared_ptr<Node> node : nodeVector) + if (!node->attributes()->hasAttr("quantization.ptq.isProducerScaling")) + remainingNodes.push_back(node); + + return remainingNodes; +} static void fixScheduling(std::vector<std::shared_ptr<Node>>& nodeVector) { @@ -195,6 +268,7 @@ std::vector<std::shared_ptr<Node>> retrieveNodeVector(std::shared_ptr<GraphView> fixScheduling(nodeVector); nodeVector = removeMatchingNodes(nodeVector, "Producer"); + nodeVector = removeProdScalingNodes(nodeVector); if (verbose) { @@ -214,7 +288,6 @@ static std::shared_ptr<Node> getFirstNode(std::shared_ptr<GraphView> graphView) void prepareNetwork(std::shared_ptr<GraphView> graphView) { removeFlatten(graphView); - sanitizeNodeNames(graphView); bool containsBatchNorm = false; @@ -264,25 +337,26 @@ void insertResidualNodes(std::shared_ptr<GraphView> graphView) Log::info(" ### inserting multiplicative node ..."); std::string residualNodeName = makeUniqueName(parentNode->name() + "_Res", graphView); - std::shared_ptr<Node> residualNode = Scaling(1.0, residualNodeName); + std::shared_ptr<Node> residualNode = Mul(residualNodeName); + residualNode->attributes()->addAttr("quantization.ptq.isScaling", 0.0); + residualNode->attributes()->addAttr("quantization.ptq.isResidual", 0.0); + + //Adding the SF as a producer of the node + std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {1.0}); + std::shared_ptr<Node> scalingFactorProducer = addProducer(residualNode, 1, {1}, "ScalingFactor"); + scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); - residualNode->getOperator()->setDataType(DataType::Float64); //getDataType(parentNode) - residualNode->getOperator()->setBackend("cpu"); + residualNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) + residualNode->getOperator()->setBackend(parentNode->getOperator()->backend()); graphView->insertParent(node, residualNode, i, 0, 0); + graphView->add(scalingFactorProducer); } } } } } -static int getInputIndex(std::shared_ptr<Node> node, std::shared_ptr<Node> parentNode) -{ - int index = 0; - while (node->getParent(index) != parentNode) - index++; - return index; -} void insertScalingNodes(std::shared_ptr<GraphView> graphView) { @@ -295,37 +369,30 @@ void insertScalingNodes(std::shared_ptr<GraphView> graphView) if (isAffine(parentNode) || isMerging(parentNode)) { std::string scalingNodeName = makeUniqueName(parentNode->name() + "_Scaling", graphView); - std::shared_ptr<Node> scalingNode = Scaling(1.0, scalingNodeName); + //std::shared_ptr<Node> scalingNode = Scaling(1.0, scalingNodeName); + + //Adding Mul operator with tag "quantization.ptq.isScaling" + std::shared_ptr<Aidge::Node> scalingNode = Mul(scalingNodeName); + scalingNode->attributes()->addAttr("quantization.ptq.isScaling",0.0); + + //Adding the SF as a producer of the node + std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {1.0}); + std::shared_ptr<Node> scalingFactorProducer = addProducer(scalingNode, 1, {1}, "ScalingFactor"); + scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); scalingNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) - scalingNode->getOperator()->setBackend("cpu"); + 
scalingNode->getOperator()->setBackend(parentNode->getOperator()->backend()); if (parentNode->getChildren().size() > 0) { - // SCALING NODE INSERTION - - // We always have one output from Affine and Add nodes, but possibly multiple childs - std::vector<std::shared_ptr<Node>> nextNodes = parentNode->getChildren(0); - - // For each node in nextNodes store the connexion index - std::vector<int> inputIndices(nextNodes.size()); - for (std::size_t i = 0; i < nextNodes.size(); i++) - inputIndices[i] = getInputIndex(nextNodes[i], parentNode); - - for (std::shared_ptr<Node> nextNode : nextNodes) - parentNode->removeChild(nextNode, 0); - - parentNode->addChild(scalingNode, 0, 0); - - for (std::size_t i = 0; i < nextNodes.size(); i++) - scalingNode->addChild(nextNodes[i], 0, inputIndices[i]); - - graphView->add(scalingNode); + insertNodeBetween(parentNode,scalingNode,graphView); + graphView->add(scalingFactorProducer); } else { // Log::info(" last node reached ! "); parentNode->addChild(scalingNode, 0, 0); + graphView->add(scalingFactorProducer); graphView->add(scalingNode); } } @@ -335,7 +402,7 @@ void insertScalingNodes(std::shared_ptr<GraphView> graphView) static std::shared_ptr<Node> getPreviousScalingNode(std::shared_ptr<Node> mergingNode) { std::shared_ptr<Node> currNode = mergingNode; - while(currNode->type() != "Scaling") + while(!currNode->attributes()->hasAttr("quantization.ptq.isScaling")) { if (currNode->getParents().size() == 0) { @@ -378,7 +445,7 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) for (std::shared_ptr<Node> node : nodeVector) { // Scaling nodes still have a ratio of 1, so they are seamless ... - if (node->type() == "ReLU" || node->type() == "Scaling" || isSeamless(node)) + if (node->type() == "ReLU" || node->attributes()->hasAttr("quantization.ptq.isScaling") || isSeamless(node)) { if (node != firstNode) { @@ -394,7 +461,8 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); double scaling = getTensorAbsoluteMax(weightTensor); double ratio = 1.0 / scaling; - rescaleTensor(weightTensor, ratio); + //rescaleTensor(weightTensor, ratio); + insertScalingBelowProducer(node->getParent(1),ratio,graphView); // Accumulate the ratio if (node == firstNode) @@ -412,7 +480,8 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) if (nodeHasBias(node)) { std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, accumulatedRatios[node->name()] ); + //rescaleTensor(biasTensor, accumulatedRatios[node->name()] ); + insertScalingBelowProducer(node->getParent(2),accumulatedRatios[node->name()],graphView); } } @@ -439,8 +508,7 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); - double currScalingFactor = getScalingFactor(scalingNode); - updateScalingFactor(scalingNode, currScalingFactor / rescaling); + multiplyScalingFactor(scalingNode,1/rescaling); accumulatedRatios[mergingNode->name()] /= rescaling; // optional ... 
} @@ -465,7 +533,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); for (std::shared_ptr<Node> node : nodeSet) { - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) { std::shared_ptr<Operator> nodeOperator = node->getOperator(); std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); @@ -487,7 +555,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView // std::shared_ptr<Node> inputNode = getFirstNode(graphView); for (std::shared_ptr<Node> node : nodeSet) - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) valueRanges.insert(std::make_pair(node->name(), 0)); if (useCuda) @@ -514,7 +582,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView std::map<std::string, double> sampleRanges; for (std::shared_ptr<Node> node : nodeSet) { - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) { std::shared_ptr<Operator> nodeOperator = node->getOperator(); std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); @@ -536,7 +604,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView for (std::shared_ptr<Node> node : nodeSet) { - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) { std::string nodeName = node->name(); if (sampleRanges[nodeName] > valueRanges[nodeName]) @@ -572,7 +640,7 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st for (std::shared_ptr<Node> node : nodeVector) { // Seamless scaling factor propagation ... - + if (isAffine(node) || isSeamless(node) || node->type() == "ReLU") { if (node == firstNode) @@ -586,11 +654,13 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st } } + // Here prevNode is either a 'Affine' or a 'Merging' // => do not split the cases, just handle the bias ... - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { + // retrieve the previous scaling factor ... std::shared_ptr<Node> prevNode = node->getParent(0); double prevScalingFactor = scalingFactors[prevNode->name()]; @@ -598,8 +668,7 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st // ValueRanges must contains all the scaling nodes !!! 
double scalingFactor = valueRanges[node->name()]; - double currScalingFactor = getScalingFactor(node); - updateScalingFactor(node, currScalingFactor / (scalingFactor / prevScalingFactor)); + multiplyScalingFactor(node,1/(scalingFactor / prevScalingFactor)); scalingFactors[node->name()] = scalingFactor; @@ -607,11 +676,13 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st if (isAffine(prevNode)) { + bool prevNodeHasBias = nodeHasBias(prevNode); if (prevNodeHasBias) - { + { std::shared_ptr<Tensor> biasTensor = getBiasTensor(prevNode); - rescaleTensor(biasTensor, 1.0 / prevScalingFactor); + //rescaleTensor(biasTensor, 1.0 / prevScalingFactor); + insertScalingBelowProducer(prevNode->getParent(2),1.0 / prevScalingFactor,graphView); } } } @@ -641,9 +712,8 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); //Log::info(" SCALING NODE : {} {}", scalingNode->type(), scalingNode->name()); - - double currScalingFactor = getScalingFactor(scalingNode); - updateScalingFactor(scalingNode, currScalingFactor * rescaling); + + multiplyScalingFactor(scalingNode,rescaling) ; } } } @@ -679,7 +749,7 @@ std::map<std::string, std::pair<bool, bool>> computeSignMap(std::shared_ptr<Grap signMap[node->name()].second = false; } - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { signMap[node->name()].second = false; @@ -726,7 +796,7 @@ std::map<std::string, std::pair<bool, bool>> computeSignMap(std::shared_ptr<Grap // Arbitration : Signed type wins ! for(std::shared_ptr<Node> parent : parentNodes) { - while (parent->type() != "Scaling") + while (!parent->attributes()->hasAttr("quantization.ptq.isScaling")) { signMap[parent->name()] = std::make_pair(false, false); // We are on a branch so nodes always have 1 parent ... @@ -808,26 +878,23 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ if (isAffine(node)) { // Rescale the weight tensor - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - rescaleTensor(weightTensor, signedMax); + insertScalingBelowProducer(node->getParent(1),signedMax,graphView); if (!noQuant) - roundTensor(weightTensor); + insertRoundBelowProducer(node->getParent(1),graphView); // Rescale the bias tensor - if (nodeHasBias(node)) { bool inputIsUnsigned = signMap[node->name()].first; double rescaling = inputIsUnsigned ? unsignedMax * signedMax : signedMax * signedMax; - - + std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, rescaling); + insertScalingBelowProducer(node->getParent(2),rescaling,graphView); if (!noQuant) - roundTensor(biasTensor); + insertRoundBelowProducer(node->getParent(2),graphView); } // Compensate the rescaling using the next Scaling node @@ -842,8 +909,7 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); // Assert if scalingNode is a Scaling ... - double currScalingFactor = getScalingFactor(scalingNode); - updateScalingFactor(scalingNode, currScalingFactor * rescaling); + multiplyScalingFactor(scalingNode,rescaling) ; } if (isMerging(node)) @@ -858,23 +924,25 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); // Assert if scalingNode is a Scaling ... 
- double currScalingFactor = getScalingFactor(scalingNode); // XXX bad naming - updateScalingFactor(scalingNode, currScalingFactor * rescaling); + multiplyScalingFactor(scalingNode,rescaling) ; } // Handle the Scaling Nodes ... - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { if (!noQuant) { // Replace the Scaling Node by Quantizer + auto scalingFactorTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); + std::shared_ptr<Tensor> fallback; + const Tensor& localTensor = scalingFactorTensor->refCastFrom(fallback, DataType::Float64, "cpu"); + double old_sf = localTensor.get<double>(0);//!\\ - std::shared_ptr<Node> quantizerNode = Quantizer(getScalingFactor(node), -(signedMax + 1), signedMax, node->name()); + std::shared_ptr<Node> quantizerNode = Quantizer(old_sf, -(signedMax + 1), signedMax, node->name()); quantizerNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) - quantizerNode->getOperator()->setBackend("cpu"); - - graphView->replace({node}, {quantizerNode}); + quantizerNode->getOperator()->setBackend(node->getOperator()->backend()); + graphView->replace({node,node->getParent(1)}, {quantizerNode}); if (optimizeSigns) { @@ -888,6 +956,7 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ double currScalingFactor = getScalingFactor(quantizerNode); updateScalingFactor(quantizerNode, currScalingFactor * rescaling); + if(outputIsUnsigned) { @@ -910,41 +979,37 @@ static void insertCompensationNodes(std::shared_ptr<GraphView> graphView, std::u { // A merging node is always followed by a Quantizer node at this point - if (node->type() == "Quantizer") + if (node->type() == "Quantizer" && (node->attributes()->hasAttr("quantization.ptq.isResidual") || !isAffine(node->getParent(0)))) { - // check if the Quantizer is a residual one, and insert a compensation node if so ... - bool prevNodeIsForking = ((node->getParent(0))->getChildren().size() > 1); - bool prevNodeIsAffine = isAffine(node->getParent(0)); - bool insertNode = prevNodeIsForking || !prevNodeIsAffine; - - if (insertNode) - { - // create and insert the multplicative node before the Quantizer + // check if the Quantizer is a residual one, and insert a compensation node if so ... 
+ // create and insert the multplicative node before the Quantizer - std::string mulNodeName = makeUniqueName(node->name() + "_Mul", graphView); - std::shared_ptr<Node> mulNode = Mul(mulNodeName); - mulNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) - mulNode->getOperator()->setBackend("cpu"); + std::string mulNodeName = makeUniqueName(node->name() + "_Mul", graphView); + std::shared_ptr<Node> mulNode = Mul(mulNodeName); + + mulNode->attributes()->addAttr("quantization.ptq.isCompensation",0.0); + mulNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) + mulNode->getOperator()->setBackend(node->getOperator()->backend()); - graphView->insertParent(node, mulNode, 0, 0, 0); + graphView->insertParent(node, mulNode, 0, 0, 0); - // Add the coeff producer to the multiplier node + // Add the coeff producer to the multiplier node - std::shared_ptr<Node> coeffProducer = addProducer(mulNode, 1, {1}, ""); - std::shared_ptr<Tensor> coeffTensor = std::make_shared<Tensor>(signedMax); - coeffProducer->getOperator()->setOutput(0, coeffTensor); + std::shared_ptr<Node> coeffProducer = addProducer(mulNode, 1, {1}, ""); + std::shared_ptr<Tensor> coeffTensor = std::make_shared<Tensor>(Array1D<double, 1> {signedMax}); + coeffProducer->getOperator()->setOutput(0, coeffTensor); - coeffProducer->getOperator()->setDataType(DataType::Float64); - coeffProducer->getOperator()->setBackend("cpu"); + coeffProducer->getOperator()->setDataType(DataType::Float64); + coeffProducer->getOperator()->setBackend(node->getOperator()->backend()); - graphView->add(coeffProducer); // needed ? + graphView->add(coeffProducer); // needed ? - // Adapt the scaling factor value accordingly + // Adapt the scaling factor value accordingly - double currScalingFactor = getScalingFactor(node); - updateScalingFactor(node, currScalingFactor / signedMax); - } + double currScalingFactor = getScalingFactor(node); + updateScalingFactor(node, currScalingFactor / signedMax); + } } } @@ -955,9 +1020,7 @@ void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool for (std::shared_ptr<Node> node : nodeVector) { - // TODO : use Compensation nodes instead of Mul nodes - - if (isAffine(node) || (node->type() == "Mul")) + if (isAffine(node) || (node->type() == "Mul" && node->attributes()->hasAttr("quantization.ptq.isCompensation"))) { std::shared_ptr<Node> scalingNode = (*node->getChildren().begin()); @@ -965,21 +1028,20 @@ void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool double approx = std::pow(2, std::ceil(std::log2(base))); - updateScalingFactor(scalingNode, approx); + updateScalingFactor(scalingNode,approx); double ratio = base / approx; - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - rescaleTensor(weightTensor, ratio); + insertScalingBelowProducer(node->getParent(1),ratio,graphView); if (!noQuant) - roundTensor(weightTensor); + insertRoundBelowProducer(node->getParent(1),graphView); if (nodeHasBias(node)) { - std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, ratio); + insertScalingBelowProducer(node->getParent(2),ratio,graphView); + if (!noQuant) - roundTensor(biasTensor); + insertRoundBelowProducer(node->getParent(2),graphView); } } } @@ -988,7 +1050,7 @@ void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool static void printScalingFactors(std::shared_ptr<GraphView> graphView) { for (auto node : retrieveNodeVector(graphView)) - if (node->type() == "Scaling" || 
node->type() == "Quantizer") + if (node->attributes()->hasAttr("quantization.ptq.isScaling") || node->type() == "Quantizer") { double scalingFactor = getScalingFactor(node); Log::info(" {:.6f} ({})", scalingFactor, node->name()); @@ -1010,18 +1072,6 @@ static void setupDataType(std::shared_ptr<GraphView> graphView, std::vector<std: tensor->setDataType(dataType); } -static void printRanges(std::shared_ptr<GraphView> graphView, std::map<std::string, double> valueRanges) -{ - SequentialScheduler scheduler(graphView); - scheduler.resetScheduling(); - scheduler.generateScheduling(); - - auto scheduling = scheduler.getStaticScheduling(); - for (auto node : scheduling) - if (node->type() == "Scaling") - Log::info(" {} range = {} ", node->name(), valueRanges[node->name()]); -} - void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, Clipping clippingMode, bool noQuant, bool optimizeSigns, bool singleShift, bool useCuda, bool verbose) { Log::info(" === QUANT PTQ 0.2.21 === "); @@ -1041,7 +1091,6 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, insertScalingNodes(graphView); crossLayerEqualization(graphView); - Log::info(" Normalizing the parameters ..."); normalizeParameters(graphView); @@ -1049,14 +1098,12 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::map<std::string, double> valueRanges = computeRanges(graphView, inputDataSet, true, useCuda); //Log::info(" === RANGES (BEFORE ADJUST) ==="); - //printRanges(graphView, valueRanges); Log::info(" Optimizing the clipping values ..."); valueRanges = adjustRanges(clippingMode, valueRanges, nbBits, graphView, inputDataSet, useCuda, verbose); - //Log::info(" === RANGES (AFTER ADJUST) ==="); + //Log:debug("=== RANGES (AFTER ADJUST) ==="); //printRanges(graphView, valueRanges); - Log::info(" Normalizing the activations ..."); normalizeActivations(graphView, valueRanges); @@ -1075,17 +1122,9 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, if (verbose) printScalingFactors(graphView); - //Log::info(" === SCALINGS (BEFORE CAST) ==="); - //printScalingFactors(graphView); - - setupDataType(graphView, inputDataSet, initialDataType); - if (useCuda) graphView->setBackend("cuda"); - //Log::info(" === SCALINGS (AFTER CAST) ==="); - //printScalingFactors(graphView); - Log::info(" Reseting the scheduler ..."); SequentialScheduler scheduler(graphView); scheduler.resetScheduling(); @@ -1115,15 +1154,9 @@ void clearBiases(std::shared_ptr<GraphView> graphView) for (std::shared_ptr<Node> node : graphView->getNodes()) { if (node->type() == "FC" || node->type() == "Conv2D") { std::shared_ptr<Tensor> biasTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); - rescaleTensor(biasTensor, 0); + //rescaleTensor(biasTensor, 0); + insertScalingBelowProducer(node->getParent(2),0,graphView); } } } - -void devPTQ(std::shared_ptr<GraphView> graphView) -{ - for (std::shared_ptr<Node> node : graphView->getNodes()) - Log::info(" UUU : {}", node->name()); -} - } diff --git a/src/QAT/QAT_LSQ.cpp b/src/QAT/QAT_LSQ.cpp index 6eae077b060027eb4029f6b59f55376a1674df70..9b51e846df498a9303b7373ae1c86d4b007a96f0 100644 --- a/src/QAT/QAT_LSQ.cpp +++ b/src/QAT/QAT_LSQ.cpp @@ -21,152 +21,193 @@ #include "aidge/graph/Matching.hpp" #include "aidge/recipes/QuantRecipes.hpp" +namespace Aidge { -namespace Aidge +void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, 
float stepSize) { + const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|FC#)"); -static float getTensorAbsMean(std::shared_ptr<Tensor> tensor) -{ - auto valueTensor = (*tensor).abs().mean(); - std::shared_ptr<Tensor> fallback; - const Tensor& localTensor = valueTensor.refCastFrom(fallback, DataType::Float32, "cpu"); - return localTensor.get<float>(0); -} + for (const auto& match : matches) + { + auto linearNode = match.graph->rootNode(); -static float getTensorStd(std::shared_ptr<Tensor> tensor) -{ - auto valueTensor = (*tensor); - - auto skewedTensor = valueTensor - valueTensor.mean(); - auto squaredTensor = skewedTensor * skewedTensor; - auto varianceTensor = squaredTensor.mean(); + std::pair<int, int> signedRange = {-std::pow(2, nbBits - 1), std::pow(2, nbBits - 1) - 1}; + std::pair<int, int> unsignedRange = {0, std::pow(2, nbBits) - 1}; - std::shared_ptr<Tensor> fallback; - auto localTensor = varianceTensor.refCastFrom(fallback, DataType::Float32, "cpu"); - - float variance = localTensor.get<float>(0); - return std::sqrt(variance); -} + // INPUT QUANTIZERS INSERTION + // TODO : double check this, and use createUniqueName() + auto inputQuantizerName = makeUniqueName(linearNode->name() + "_lsq_i", graphView); + auto inputQuantizerNode = LSQ(signedRange, inputQuantizerName); -// INIT THE STEP SIZE OF A QUANTIZER NODE + // Set the step size -static bool initStepSize(std::shared_ptr<Node> quantizer) -{ - const auto quantizerOp = std::static_pointer_cast<LSQ_Op>(quantizer->getOperator()); + auto inputStepSizeOp = inputQuantizerNode->getParent(1)->getOperator(); + auto inputStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + inputStepSizeOp->setOutput(0, inputStepSizeTensor); - // This formula is the one proposed in the paper ... + // Absorb the ReLU when possible ... - // float inputAbsMean = getTensorAbsMean(quantizerOp->getInput(0)); - // float stepSize = 2.0f * (inputAbsMean / std::sqrt(quantizerOp->range().second)); + // XXX is this safe ??? + bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]); + // bool nodeHasParent = (linearNode->getParents().size() != 0); - // .. but this formula seems to work better !!! + if (nodeHasParent) { + auto parentNode = linearNode->getParents()[0]; + if (parentNode->type() == "ReLU") { + auto inputQuantizerOp = std::static_pointer_cast<LSQ_Op> (inputQuantizerNode->getOperator()); + inputQuantizerOp->range() = unsignedRange; + graphView->replace({parentNode}, {}); + } + } - float inputStd = getTensorStd(quantizerOp->getInput(0)); - float stepSize = 8.0f * (inputStd / (quantizerOp->range().second)); + // We need to handle the case where the linear node is the first one ... - // TODO : use the scalar constructor - auto stepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + if (nodeHasParent) { + graphView->insertParent(linearNode, inputQuantizerNode, 0, 0, 0); + } else { + inputQuantizerNode->addChild(graphView); + graphView->add(inputQuantizerNode); + } - // XXX Manage backend here ? 
- stepSizeTensor->setBackend(quantizerOp->getInput(0)->backend()); - stepSizeTensor->setDataType(quantizerOp->getInput(0)->dataType()); + // PARAM QUANTIZERS INSERTION - auto stepSizeProducer = quantizer->getParent(1); + // TODO : double check this, and use createUniqueName() + auto paramQuantizerName = makeUniqueName(linearNode->name() + "_lsq_p", graphView); + auto paramQuantizerNode = LSQ(signedRange, paramQuantizerName); + graphView->insertParent(linearNode, paramQuantizerNode, 1, 0, 0); - stepSizeProducer->getOperator()->setOutput(0, stepSizeTensor); + // Set the step size - Log::notice(" [ INIT STEP SIZE = {} ] ", stepSize); + auto paramStepSizeOp = paramQuantizerNode->getParent(1)->getOperator(); + auto paramStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + paramStepSizeOp->setOutput(0, paramStepSizeTensor); + } - return false; } -static void setupInputQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits) +static float getTensorAbsMean(std::shared_ptr<Tensor> tensor) { - const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|PaddedConv2D#|FC#)"); + auto backend = tensor->backend(); + if (backend == "cuda") + tensor->setBackend("cpu"); - for (const auto& match : matches) - { - auto linearNode = match.graph->rootNode(); + float acc = 0; + float* castedTensor = static_cast<float *> (tensor->getImpl()->rawPtr()); + for(std::size_t i = 0; i < tensor->size(); i++) + acc += std::abs(castedTensor[i]); + acc /= static_cast<float> (tensor->size()); - // Log::notice(" SET INPUT QUANTIZER : {} ", linearNode->type()); - - std::pair<int, int> signedRange = {-std::pow(2, nbBits - 1), std::pow(2, nbBits - 1) - 1}; - std::pair<int, int> unsignedRange = {0, std::pow(2, nbBits) - 1}; + if (backend == "cuda") + tensor->setBackend("cuda"); - // Create the input quantizer node - - auto quantizerName = makeUniqueName(linearNode->name() + "_lsq_i", graphView); - auto quantizerNode = LSQ(signedRange, quantizerName); + return acc; +} - // Init the step-size using the node call stack +static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> calibrationData, bool useCuda) +{ + // Propagate the calibration tensor - quantizerNode->addBeforeForward([quantizerNode](){ return initStepSize(quantizerNode); }); + SequentialScheduler scheduler(graphView); + scheduler.resetScheduling(); + scheduler.forward(true, {calibrationData}); - // Absorb the ReLU when possible ... + // Store the input tensor statistics - bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]); // XXX is this safe ? + if (useCuda) + graphView->setBackend("cpu"); - if (nodeHasParent) + std::map<std::string, float> inputStats; + for (auto node : graphView->getNodes()) + { + if (node->type() == "FC" || node->type() == "Conv2D") // TODO: use graph matching !!! { - bool allParentsAreReLU = true; - for (auto parentNode : linearNode->getParents()) - if (parentNode->type() != "ReLU") - allParentsAreReLU = false; - - if (allParentsAreReLU) { - auto quantizerOp = std::static_pointer_cast<LSQ_Op> (quantizerNode->getOperator()); - quantizerOp->range() = unsignedRange; - } - - // TODO : remove the ReLUs when possible + const auto op = std::static_pointer_cast<LSQ_Op>(node->getOperator()); + float inputAbsMean = getTensorAbsMean(op->getInput(0)); + inputStats.insert(std::make_pair(node->name(), inputAbsMean)); + fmt::println("{} -> {}", node->name(), inputAbsMean); } + } - // Insert the quantizer in the graphView ... 
- // (We need to handle the case where the linear node is the first one) + if (useCuda) + graphView->setBackend("cuda"); - if (nodeHasParent) { - graphView->insertParent(linearNode, quantizerNode, 0, 0, 0); - } else { - quantizerNode->addChild(graphView); - graphView->add(quantizerNode); + return inputStats; +} + +static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView> graphView, bool useCuda) +{ + if (useCuda) + graphView->setBackend("cpu"); + + std::map<std::string, float> paramStats; + for (auto node : graphView->getNodes()) + { + if (node->type() == "FC" || node->type() == "Conv2D") // TODO: use graph matching !!! + { + const auto op = std::static_pointer_cast<LSQ_Op>(node->getOperator()); + float paramAbsMean = getTensorAbsMean(op->getInput(1)); + paramStats.insert(std::make_pair(node->name(), paramAbsMean)); + fmt::println("{} -> {}", node->name(), paramAbsMean); } } -} + + if (useCuda) + graphView->setBackend("cuda"); -// PARAM QUANTIZERS INSERTION + return paramStats; +} -static void setupParamQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits) +static void adjustQuantizersStepSizes(std::shared_ptr<GraphView> graphView, std::map<std::string, float> inputStats, std::map<std::string, float> paramStats) { - const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|PaddedConv2D#|FC#)"); - - std::pair<int, int> signedRange = {-std::pow(2, nbBits - 1), std::pow(2, nbBits - 1) - 1}; + const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|FC#)"); for (const auto& match : matches) - { - auto linearNode = match.graph->rootNode(); + { + auto linearNode = match.graph->rootNode(); - // Log::notice(" SET PARAM QUANTIZER : {} ", linearNode->type()); + // INPUT QUANTIZERS STEP-SIZES - // TODO : double check this, and use createUniqueName() - auto quantizerName = makeUniqueName(linearNode->name() + "_lsq_p", graphView); - auto quantizerNode = LSQ(signedRange, quantizerName); + auto inputQuantNode = linearNode->getParent(0); + auto inputQuantOp = std::static_pointer_cast<LSQ_Op>(inputQuantNode->getOperator()); + + float absMean = inputStats[linearNode->name()]; + float stepSize = 2.0f * (absMean / std::sqrt(inputQuantOp->range().second)); - // Init the step-size using the node call stack + auto inputStepSizeOp = inputQuantNode->getParent(1)->getOperator(); + // XXX inputStepSizeOp->setOutput(0, std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}}))); + auto inputStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + inputStepSizeOp->setOutput(0, inputStepSizeTensor); - quantizerNode->addBeforeForward([quantizerNode](){ return initStepSize(quantizerNode); }); + // PARAM QUANTIZERS STEP-SIZES - // Insert the quantizer in the graphView + auto paramQuantNode = linearNode->getParent(1); + auto paramQuantOp = std::static_pointer_cast<LSQ_Op>(paramQuantNode->getOperator()); - graphView->insertParent(linearNode, quantizerNode, 1, 0, 0); + absMean = paramStats[linearNode->name()]; + stepSize = 2.0f * (absMean / std::sqrt(paramQuantOp->range().second)); + + auto paramStepSizeOp = paramQuantNode->getParent(1)->getOperator(); + // XXX paramStepSizeOp->setOutput(0, std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}}))); + auto paramStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + paramStepSizeOp->setOutput(0, paramStepSizeTensor); } } -void QuantLSQ::setupQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits) +void QuantLSQ::insertAndInitQuantizers(std::shared_ptr<GraphView> 
graphView, size_t nbBits, std::shared_ptr<Tensor> calibrationData) { - sanitizeNodeNames(graphView); - setupInputQuantizers(graphView, nbBits); - setupParamQuantizers(graphView, nbBits); + bool useCuda = (calibrationData->backend() == "cuda"); + + // Collect the tensor statisics + auto inputStats = collectInputStats(graphView, calibrationData, useCuda); + + auto paramStats = collectParamStats(graphView, useCuda); + + // Insert the quantizers + insertQuantizers(graphView, nbBits, 1.0); + + // Adjust the quantizers step-sizes + adjustQuantizersStepSizes(graphView, inputStats, paramStats); } } \ No newline at end of file diff --git a/src/operator/PTQMetaOps.cpp b/src/operator/PTQMetaOps.cpp index 56245da47076d8930ce29ab75e549d97d0d7493d..f86d454245a7fe088edd027732a91f5775cd2acf 100644 --- a/src/operator/PTQMetaOps.cpp +++ b/src/operator/PTQMetaOps.cpp @@ -60,23 +60,6 @@ std::shared_ptr<Node> Quantizer(double scalingFactor, double clipMin, double cli return metaopNode; } -std::shared_ptr<Node> Scaling(double scalingFactor, const std::string& name) -{ - std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {scalingFactor}); - - std::shared_ptr<Node> mulNode = Mul((!name.empty()) ? name + "_Scaling" : ""); - - std::shared_ptr<Node> scalingFactorProducer = addProducer<1>(mulNode, 1, {1}, "ScalingFactor"); - scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); - - std::shared_ptr<GraphView> graphView = Sequential({mulNode}); - std::shared_ptr<GraphView> connectedGraphView = getConnectedGraphView(mulNode); - - NodePtr metaopNode = MetaOperator("Scaling", connectedGraphView, {}, name); - - return metaopNode; -} - static std::shared_ptr<Node> getSubNode(std::shared_ptr<GraphView> graphView, std::string nodeType) { std::shared_ptr<Node> mulNode = nullptr; @@ -87,10 +70,12 @@ static std::shared_ptr<Node> getSubNode(std::shared_ptr<GraphView> graphView, st return mulNode; } + + void updateScalingFactor(std::shared_ptr<Node> metaOpNode, double scalingFactor) { if(metaOpNode->type() != "Scaling" && metaOpNode->type() != "Quantizer") - Log::warn(" Cannot update the scaling factor on Node of type {}", metaOpNode->type()); + Log::warn("Cannot update the scaling factor on Node of type {}", metaOpNode->type()); std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {scalingFactor}); @@ -99,7 +84,7 @@ void updateScalingFactor(std::shared_ptr<Node> metaOpNode, double scalingFactor) std::shared_ptr<Node> mulNode = getSubNode(metaOp->getMicroGraph(), "Mul"); if (!mulNode) - Log::warn(" Invalid PTQ MetaOperator, no Mul node found inside ! "); + Log::warn("Invalid PTQ MetaOperator, no Mul node found inside ! 
"); mulNode->input(1).first->getOperator()->setOutput(0, scalingFactorTensor); } @@ -107,7 +92,7 @@ void updateScalingFactor(std::shared_ptr<Node> metaOpNode, double scalingFactor) double getScalingFactor(std::shared_ptr<Node> MetaOpNode) { if (MetaOpNode->type() != "Scaling" && MetaOpNode->type() != "Quantizer") { - Log::warn(" Cannot get the scaling factor on Node of type {}", MetaOpNode->type()); + Log::warn("Cannot get the scaling factor on Node of type {}", MetaOpNode->type()); return 0; } @@ -116,7 +101,7 @@ double getScalingFactor(std::shared_ptr<Node> MetaOpNode) std::shared_ptr<Node> mulNode = getSubNode(metaOp->getMicroGraph(), "Mul"); if (!mulNode) { - Log::warn(" Invalid PTQ MetaOperator, no Mul found inside node of type {}", MetaOpNode->type()); + Log::warn("Invalid PTQ MetaOperator, no Mul found inside node of type {}", MetaOpNode->type()); return 0; } @@ -131,7 +116,7 @@ double getScalingFactor(std::shared_ptr<Node> MetaOpNode) void setClipRange(std::shared_ptr<Node> quantizerNode, double min, double max) { if (quantizerNode->type() != "Quantizer") { - Log::warn(" Cannot set the clipping range on Node of type {}", quantizerNode->type()); + Log::warn("Cannot set the clipping range on Node of type {}", quantizerNode->type()); return; } @@ -140,7 +125,7 @@ void setClipRange(std::shared_ptr<Node> quantizerNode, double min, double max) std::shared_ptr<Node> clipNode = getSubNode(metaOp->getMicroGraph(), "Clip"); if (!clipNode) { - Log::warn(" Invalid PTQ MetaOperator, no Clip found inside node of type {}", quantizerNode->type()); + Log::warn("Invalid PTQ MetaOperator, no Clip found inside node of type {}", quantizerNode->type()); return; }