diff --git a/include/aidge/operator/PTQMetaOps.hpp b/include/aidge/operator/PTQMetaOps.hpp
index b9bad0d18f099e94d4c52254b08629c7f947db6a..9ca76fbd40b9366aa82c6521fba931d284da137a 100644
--- a/include/aidge/operator/PTQMetaOps.hpp
+++ b/include/aidge/operator/PTQMetaOps.hpp
@@ -29,14 +29,6 @@ namespace Aidge {
 /// @return A shared pointer to an instance of the meta-operator node.
 std::shared_ptr<Aidge::Node> Quantizer(double scalingFactor, double clipMin, double clipMax, const std::string& name);
 
-/// @brief The purpose of Scaling is to encapsulate the Mul operator and tag it as a PTQ node rather than a regular Mul operator.
-/// Therefore, this meta-operator consists solely of a [Mul] operation.
-///
-/// @param scalingFactor The scaling factor to apply to the input (a scalar to multiply the input with).
-/// @param name The name of the meta-operator node created.
-/// @return A shared pointer to an instance of the scaling node.
-std::shared_ptr<Aidge::Node> Scaling(double scalingFactor, const std::string& name = "");
-
 /// @brief Updates the scaling factor of a PTQ meta-operator node, allowing for dynamic adjustment of the scaling parameter.
 /// This function sets a new scaling factor for a specified meta-operator node, modifying the scalar applied in the [Mul] operation.
 /// The meta-operator node must be a PTQ-specific operator, such as a Quantizer or Scaling node.
diff --git a/include/aidge/quantization/PTQ/PTQ.hpp b/include/aidge/quantization/PTQ/PTQ.hpp
index bfe671e3556c3af2c367ce7f86708f01c8e3d3b5..1d1b71ba7501580ea99103d351eafac9a7f793d2 100644
--- a/include/aidge/quantization/PTQ/PTQ.hpp
+++ b/include/aidge/quantization/PTQ/PTQ.hpp
@@ -69,6 +69,26 @@ namespace Aidge {
      * @return The scheduled vector of nodes
      */
     std::vector<std::shared_ptr<Node>> retrieveNodeVector(std::shared_ptr<GraphView> graphView, bool newSchedule = true, bool verbose = false);
+
+    /**
+     * @brief Inserts a scaling node below the given producer node in the graph view.
+     * If the node is already a producer scaling node, it accumulates the scaling factor by multiplying its value directly.
+     *
+     * @param node A shared pointer to the producer node below which the scaling node will be inserted.
+     * @param scalingFactor The scaling factor to apply.
+     * @param graphView A shared pointer to the graph view in which the nodes are located.
+     * @return True if the scaling node was successfully inserted or the scaling factor was accumulated; False otherwise.
+     */
+    bool insertScalingBelowProducer(std::shared_ptr<Node> node, double scalingFactor, std::shared_ptr<GraphView> graphView);
+
+    /**
+     * @brief Inserts a rounding node below the given producer node (and below its own ProducerScaling node, if present) in the graph view.
+     *
+     * @param node A shared pointer to the producer node below which the rounding node will be inserted.
+     * @param graphView A shared pointer to the graph view in which the nodes are located.
+     * @return True if the rounding node was successfully inserted; False otherwise.
+     */
+    bool insertRoundBelowProducer(std::shared_ptr<Node> node, std::shared_ptr<GraphView> graphView);
 
     /**
      * @brief Determine whether an input GraphView can be quantized or not.
@@ -77,6 +97,14 @@ namespace Aidge {
      */
     bool checkArchitecture(std::shared_ptr<GraphView> graphView);
 
+    /**
+     * @brief Multiplies the existing scaling factor of a scaling node by a given coefficient. It verifies that the node is of type "Mul"
+     * and carries a scaling tag (`isScaling` or `isProducerScaling`); if these conditions are not met, an error is raised.
+     * @param node A shared pointer to an `Aidge::Node` object representing the node to modify.
+     * @param coeff A double representing the multiplication coefficient to apply to the scaling factor.
+     */
+    void multiplyScalingFactor(std::shared_ptr<Aidge::Node> node, double coeff);
+
     void prepareNetwork(std::shared_ptr<GraphView> graphView);
diff --git a/include/aidge/quantization/QAT/QAT_LSQ.hpp b/include/aidge/quantization/QAT/QAT_LSQ.hpp
index 922187abca915daa1c00f3949d0d791b0d3e1c39..4970be07fae8737a1c2863600757bb81ff3a65f9 100644
--- a/include/aidge/quantization/QAT/QAT_LSQ.hpp
+++ b/include/aidge/quantization/QAT/QAT_LSQ.hpp
@@ -9,29 +9,36 @@
  *
 ********************************************************************************/
 
-#ifndef AIDGE_QUANTIZATION_QUANTIZATION_QAT_LSQ_H_
-#define AIDGE_QUANTIZATION_QUANTIZATION_QAT_LSQ_H_
+#ifndef AIDGE_QUANTIZATION_QAT_LSQ_H_
+#define AIDGE_QUANTIZATION_QAT_LSQ_H_
 
-#include <cstddef>  // std::size_t
-#include <memory>
-
-#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
+#include "aidge/data/Tensor.hpp"
 
 namespace Aidge {
 namespace QuantLSQ {
 
 /**
- * @brief Given a GraphView with parameters properly initialized, insert
- * the LSQ quantizer nodes, and setup the adjustment their step-sizes.
- * @param graphView The GraphView containing the network to quantize.
+ * @brief Insert the LSQ quantizer nodes in a given GraphView.
+ * @param graphView The GraphView containing the graph to quantize.
  * @param nbBits Number of quantization bits.
+ * @param step_size Fixed step size assigned to the inserted quantizers.
  */
+void insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, float step_size);
 
-void setupQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits);
+/**
+ * @brief Given a GraphView with parameters properly initialized and some calibration data,
+ * insert the LSQ quantizer nodes, and adjust their step-sizes.
+ * @param graphView The GraphView containing the graph to quantize.
+ * @param nbBits Number of quantization bits.
+ * @param calibrationData Calibration data used to adjust the step sizes.
+ */
+void insertAndInitQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, std::shared_ptr<Tensor> calibrationData);
 
-} // namespace QuantLSQ
-} // namespace Aidge
+}
+}
 
-#endif /* AIDGE_QUANTIZATION_QUANTIZATION_QAT_LSQ_H_ */
+#endif /* AIDGE_QUANTIZATION_QAT_LSQ_H_ */
diff --git a/python_binding/pybind_PTQ.cpp b/python_binding/pybind_PTQ.cpp
index 1de797693468273814f4c5e82a161991648d06ff..ae0a0def28a861e2fc207adbc27c6af47dc0ded8 100644
--- a/python_binding/pybind_PTQ.cpp
+++ b/python_binding/pybind_PTQ.cpp
@@ -213,13 +213,6 @@ void init_PTQ(py::module &m) {
     :type network: :py:class:`aidge_core.GraphView`
     )mydelimiter");
 
-    m.def("dev_ptq", &devPTQ, py::arg("network"),
-    R"mydelimiter(
-    Developement and test routine.
-    :param network: The GraphView under test.
- :type network: :py:class:`aidge_core.GraphView` - )mydelimiter"); - m.def("prepare_network", &prepareNetwork, py::arg("network"), "prepare the network for the PTQ"); } diff --git a/python_binding/pybind_QAT_LSQ.cpp b/python_binding/pybind_QAT_LSQ.cpp index 4bba3b6baa5eda41a024399eb1be1402c74b2c1b..206985efe4558a84ce1ed67a1264bd6902213764 100644 --- a/python_binding/pybind_QAT_LSQ.cpp +++ b/python_binding/pybind_QAT_LSQ.cpp @@ -23,6 +23,8 @@ void init_QAT_LSQ(py::module &m) { auto mQuantLSQ = m.def_submodule("lsq"); - mQuantLSQ.def("setup_quantizers", &QuantLSQ::setupQuantizers, py::arg("network"), py::arg("nb_bits")); + mQuantLSQ.def("insert_quantizers", &QuantLSQ::insertQuantizers, py::arg("network"), py::arg("nb_bits"), py::arg("step_size")); + + mQuantLSQ.def("insert_and_init_quantizers", &QuantLSQ::insertAndInitQuantizers, py::arg("network"), py::arg("nb_bits"), py::arg("calibration_data")); } } // namespace Aidge diff --git a/src/PTQ/CLE.cpp b/src/PTQ/CLE.cpp index 28858d0e3c693a7620bc32806008523e0602faa9..2738f8a92154368962e9162fba62c41b7622d07c 100644 --- a/src/PTQ/CLE.cpp +++ b/src/PTQ/CLE.cpp @@ -20,7 +20,10 @@ #include "aidge/quantization/PTQ/PTQ.hpp" // retrieveNodeVector #include "aidge/graph/GraphView.hpp" -#include "aidge/graph/Node.hpp" + +#include "aidge/scheduler/SequentialScheduler.hpp" +#include "aidge/scheduler/Scheduler.hpp" +#include "aidge/utils/Log.hpp" #include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/Log.hpp" @@ -30,6 +33,12 @@ #include "aidge/operator/Reshape.hpp" #include "aidge/operator/Round.hpp" +#include "aidge/operator/Mul.hpp" +#include "aidge/operator/ArgMax.hpp" +#include "aidge/operator/Abs.hpp" +#include "aidge/operator/Reshape.hpp" +#include "aidge/operator/Round.hpp" + namespace Aidge { @@ -49,7 +58,7 @@ static void rescaleTensor(std::shared_ptr<Tensor> tensor, double scaling) mulOp.setDataType(tensor->dataType()); mulOp.setBackend(tensor->backend()); - std::shared_ptr<Aidge::Tensor> scalingTensor = std::make_shared<Aidge::Tensor>(scaling); + std::shared_ptr<Aidge::Tensor> scalingTensor = std::make_shared<Aidge::Tensor>(Aidge::Array1D<double, 1> {scaling}); scalingTensor->setDataType(tensor->dataType()); scalingTensor->setBackend(tensor->backend()); @@ -67,7 +76,7 @@ static void rescaleTensor(std::shared_ptr<Tensor> tensor, double scaling) static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) { // get the abs tensor - + std::shared_ptr<Tensor> fallback; //Fallback tensor for refCastFR std::shared_ptr<Tensor> absTensor = std::make_shared<Tensor>(tensor->abs()); // flatten the abs tensor @@ -81,6 +90,7 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) reshapeOp.associateInput(0, absTensor); reshapeOp.forward(); std::shared_ptr<Tensor> flatTensor = reshapeOp.getOutput(0); + const Tensor& localFlatTensor = flatTensor->refCastFrom(fallback, DataType::Float64, "cpu"); // Get the argmax @@ -90,13 +100,24 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) argmaxOp.associateInput(0, flatTensor); argmaxOp.forward(); - std::shared_ptr<Tensor> argmaxTensor = argmaxOp.getOutput(0); + + const Tensor& argMaxTensor = argmaxOp.getOutput(0)->refCastFrom(fallback, DataType::Float64, "cpu"); // Return the max - int maxIndex = std::round(argmaxTensor->get<double>(0)); + int maxIndex = std::round(argMaxTensor.get<double>(0)); - return flatTensor->get<double>(maxIndex); + return localFlatTensor.get<double>(maxIndex); +} +//Function used to extraxt the local tensor (from a ProducerScalingNode) 
+std::shared_ptr<Aidge::Tensor> getLocalTensor(std::shared_ptr<Node> node) { + if (node->getParent(1)->attributes()->hasAttr("quantization.ptq.isProducerScaling")) { + std::shared_ptr<Aidge::OperatorTensor> operatorTensor = std::static_pointer_cast<OperatorTensor>(node->getParent(1)->getOperator()); + operatorTensor->forward();// We need the forward pass to compute the scaled value of the Tensor + return operatorTensor->getOutput(0); + } else { + return getWeightTensor(node); + } } void crossLayerEqualization(std::shared_ptr<GraphView> graphView, double targetDelta) @@ -131,16 +152,18 @@ void crossLayerEqualization(std::shared_ptr<GraphView> graphView, double targetD std::shared_ptr<Node> n1 = affineNodeVector[i]; std::shared_ptr<Node> n2 = affineNodeVector[i+1]; - double r1 = getTensorAbsoluteMax(getWeightTensor(n1)); - double r2 = getTensorAbsoluteMax(getWeightTensor(n2)); + std::shared_ptr<Aidge::Tensor> n1localTensor = getLocalTensor(n1); + std::shared_ptr<Aidge::Tensor> n2localTensor = getLocalTensor(n2); + + double r1 = getTensorAbsoluteMax(n1localTensor); + double r2 = getTensorAbsoluteMax(n2localTensor); double s1 = std::sqrt(r1 * r2) / r1; double s2 = std::sqrt(r1 * r2) / r2; - rescaleTensor(getWeightTensor(n1), s1); - rescaleTensor(getWeightTensor(n2), s2); - - rescaleTensor(getBiasTensor(n1), s1); + insertScalingBelowProducer(n1->getParent(1),s1,graphView); + insertScalingBelowProducer(n2->getParent(1),s2,graphView); + insertScalingBelowProducer(n1->getParent(2),s1,graphView); double rangeDelta = std::abs(r1 - r2); if (rangeDelta > maxRangeDelta) diff --git a/src/PTQ/Clipping.cpp b/src/PTQ/Clipping.cpp index 66b0ab36fba7634d7ee350cdccb27895ffa52da1..a4e7fed921604fcf9d18c6e50991220c4785f3bb 100644 --- a/src/PTQ/Clipping.cpp +++ b/src/PTQ/Clipping.cpp @@ -222,7 +222,7 @@ std::map<std::string, double> adjustRanges(Clipping clippingMode, std::map<std:: for (std::shared_ptr<Node> node : graphView->getNodes()) { - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { std::vector<int> histogram = histograms[node->name()]; diff --git a/src/PTQ/PTQ.cpp b/src/PTQ/PTQ.cpp index 7c29ee0b9178fbb07f4a2d5edf9f0ad7ac8dcac4..f03fc7bcea039a1939e116cc842f7062f28c5cae 100644 --- a/src/PTQ/PTQ.cpp +++ b/src/PTQ/PTQ.cpp @@ -14,7 +14,6 @@ #include "aidge/quantization/PTQ/PTQ.hpp" #include "aidge/operator/PTQMetaOps.hpp" - #include "aidge/data/Tensor.hpp" #include "aidge/graph/GraphView.hpp" #include "aidge/graph/Node.hpp" @@ -22,16 +21,15 @@ #include "aidge/scheduler/Scheduler.hpp" #include "aidge/utils/Log.hpp" + #include "aidge/operator/Producer.hpp" #include "aidge/operator/Mul.hpp" +#include "aidge/operator/Round.hpp" #include "aidge/operator/ReLU.hpp" #include "aidge/operator/BatchNorm.hpp" #include "aidge/operator/Conv.hpp" - #include "aidge/operator/ArgMax.hpp" -#include "aidge/operator/Abs.hpp" #include "aidge/operator/Reshape.hpp" -#include "aidge/operator/Round.hpp" #include "aidge/recipes/Recipes.hpp" @@ -55,59 +53,124 @@ bool isMerging(std::shared_ptr<Node> node) { return (mergingNodeTypes.find(node->type()) != mergingNodeTypes.end()); } +static int getInputIndex(std::shared_ptr<Node> node, std::shared_ptr<Node> parentNode) +{ + int index = 0; + while (node->getParent(index) != parentNode) + index++; + return index; +} -bool checkArchitecture(std::shared_ptr<GraphView> graphView) + +void multiplyScalingFactor(std::shared_ptr<Aidge::Node> node,double coeff) { - std::set<std::string> otherNodeTypes({"Flatten", "Softmax", "BatchNorm2D", "ReLU", 
"Producer"}); + AIDGE_ASSERT(node->type() == "Mul" && (node->attributes()->hasAttr("quantization.ptq.isProducerScaling") || node->attributes()->hasAttr("quantization.ptq.isScaling")), + "Cannot update the scaling factor on Node of type {} with no scaling tag",node->type()); + auto scalingFactorTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); + std::shared_ptr<Tensor> fallback; + const Tensor& localTensor = scalingFactorTensor->refCastFrom(fallback, DataType::Float64, "cpu"); + double previousScalingFactor = localTensor.get<double>(0); + std::shared_ptr<Tensor> finalTensor = std::make_shared<Tensor>(Array1D<double, 1> {previousScalingFactor * coeff}); + node->input(1).first->getOperator()->setOutput(0, finalTensor); +} +/* Util function to insert a node below another one already connected */ +void insertNodeBetween(std::shared_ptr<Node> parent, + std::shared_ptr<Node> newNode, + std::shared_ptr<GraphView> graphView) +{ + // Checking the parents always have at least 1 children + AIDGE_ASSERT(parent->getChildren().size() > 0, "The parent node must have at least one child to insert a new node."); + + // Retrieve children connection indexes + std::vector<std::shared_ptr<Node>> nextNodes = parent->getChildren(0); + std::vector<int> inputIndices(nextNodes.size()); + for (std::size_t i = 0; i < nextNodes.size(); i++) { + inputIndices[i] = getInputIndex(nextNodes[i], parent); + } - for (std::shared_ptr<Node> node : graphView->getNodes()) - { - bool isOther = otherNodeTypes.find(node->type()) != otherNodeTypes.end(); - if (!isOther && !isAffine(node) && !isSeamless(node) && !isMerging(node)) { - Log::warn(" GraphView can't be quantized : node type {} is not supported !", node->type()); - return false; - } + // Disconnect childs from parent + for (std::shared_ptr<Node> nextNode : nextNodes) { + parent->removeChild(nextNode, 0); } - return true; + // Insert the new node between the child and the parent + parent->addChild(newNode, 0, 0); + for (std::size_t i = 0; i < nextNodes.size(); i++) { + newNode->addChild(nextNodes[i], 0, inputIndices[i]); + } + + graphView->add(newNode); } -static void rescaleTensor(std::shared_ptr<Tensor> tensor, double scaling) +bool insertRoundBelowProducer(std::shared_ptr<Node> node,std::shared_ptr<GraphView> graphView) { - auto mulOp = Mul_Op(); - mulOp.setDataType(tensor->dataType()); - mulOp.setBackend(tensor->backend()); + if(node->attributes()->hasAttr("quantization.ptq.isProducerScaling") && node->type() != "Round") + { + std::shared_ptr<Aidge::Node> roundNode = Round(node->name() + "_Round"); + roundNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) + roundNode->getOperator()->setBackend(node->getOperator()->backend()); - std::shared_ptr<Aidge::Tensor> scalingTensor = std::make_shared<Aidge::Tensor>(scaling); - scalingTensor->setDataType(tensor->dataType()); - scalingTensor->setBackend(tensor->backend()); + insertNodeBetween(node,roundNode,graphView); + roundNode->attributes()->addAttr("quantization.ptq.isProducerRounding",0.0); + return true; + } + return false; +} +bool insertScalingBelowProducer(std::shared_ptr<Node> node,double scalingFactor, std::shared_ptr<GraphView> graphView) +{ + if(node->attributes()->hasAttr("quantization.ptq.isProducerRounding")) + { + //In this case we 'bump' the node to the one above him (an actual ProducerScaling) + // because the round node is not usable (only used when SSA is enabled) + node = node->getParent(0); + } + 
if(node->attributes()->hasAttr("quantization.ptq.isProducerScaling")) + { + // We accumulate the multiples scaling factors by multiplying the SF of the ProducerScaling node + // (adding new nodes each time would make the graph unusable) + multiplyScalingFactor(node,scalingFactor); + return true; + } + AIDGE_ASSERT(node->type() == "Producer","Cannot apply a scaling factor on node of type: {} which is not a producer", node->type()); + std::string scalingNodeName = makeUniqueName(node->name() + "_ProducerScaling", graphView); + + std::shared_ptr<Aidge::Node> scalingNode = Mul(scalingNodeName); + scalingNode->attributes()->addAttr("quantization.ptq.isProducerScaling",0.0); + + std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {scalingFactor}); + std::shared_ptr<Node> scalingFactorProducer = addProducer(scalingNode, 1, {1}, "Factor"); + scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); + graphView->add(scalingFactorProducer); + + scalingNode->getOperator()->setDataType(DataType::Float64); + std::string producerBackend = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getOutput(0)->backend(); + scalingNode->getOperator()->setBackend(producerBackend); - mulOp.associateInput(0, tensor); - mulOp.associateInput(1, scalingTensor); + insertNodeBetween(node, scalingNode, graphView); - mulOp.forward(); - - auto outTensor = mulOp.getOutput(0); - *tensor = *outTensor; + return true; } -static void roundTensor(std::shared_ptr<Tensor> tensor) +bool checkArchitecture(std::shared_ptr<GraphView> graphView) { - auto roundOp = Round_Op(); - roundOp.setDataType(tensor->dataType()); - roundOp.setBackend(tensor->backend()); + std::set<std::string> otherNodeTypes({"Flatten", "Softmax", "BatchNorm2D", "ReLU", "Producer"}); - roundOp.associateInput(0, tensor); - roundOp.forward(); - - auto outTensor = roundOp.getOutput(0); - *tensor = *outTensor; + for (std::shared_ptr<Node> node : graphView->getNodes()) + { + bool isOther = otherNodeTypes.find(node->type()) != otherNodeTypes.end(); + if (!isOther && !isAffine(node) && !isSeamless(node) && !isMerging(node)) { + Log::warn(" GraphView can't be quantized : node type {} is not supported !", node->type()); + return false; + } + } + + return true; } -// TODO : make the retreival of argmax values backend independant (refCastFrom) static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) { // get the abs tensor + std::shared_ptr<Tensor> fallback; //Fallback tensor for refCastFR std::shared_ptr<Tensor> absTensor = std::make_shared<Tensor>(tensor->abs()); @@ -122,6 +185,7 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) reshapeOp.associateInput(0, absTensor); reshapeOp.forward(); std::shared_ptr<Tensor> flatTensor = reshapeOp.getOutput(0); + const Tensor& localFlatTensor = flatTensor->refCastFrom(fallback, DataType::Float64, "cpu"); // Get the argmax @@ -131,13 +195,13 @@ static double getTensorAbsoluteMax(std::shared_ptr<Tensor> tensor) argmaxOp.associateInput(0, flatTensor); argmaxOp.forward(); - std::shared_ptr<Tensor> argmaxTensor = argmaxOp.getOutput(0); + const Tensor& argMaxTensor = argmaxOp.getOutput(0)->refCastFrom(fallback, DataType::Float64, "cpu"); // Return the max - int maxIndex = std::round(argmaxTensor->get<double>(0)); + int maxIndex = std::round(argMaxTensor.get<double>(0)); - return flatTensor->get<double>(maxIndex); + return localFlatTensor.get<double>(maxIndex); } @@ -151,6 +215,15 @@ static std::vector<std::shared_ptr<Node>> 
removeMatchingNodes(std::vector<std::s return remainingNodes; } +static std::vector<std::shared_ptr<Node>> removeProdScalingNodes(std::vector<std::shared_ptr<Node>> nodeVector) +{ + std::vector<std::shared_ptr<Node>> remainingNodes; + for (std::shared_ptr<Node> node : nodeVector) + if (!node->attributes()->hasAttr("quantization.ptq.isProducerScaling")) + remainingNodes.push_back(node); + + return remainingNodes; +} static void fixScheduling(std::vector<std::shared_ptr<Node>>& nodeVector) { @@ -195,6 +268,7 @@ std::vector<std::shared_ptr<Node>> retrieveNodeVector(std::shared_ptr<GraphView> fixScheduling(nodeVector); nodeVector = removeMatchingNodes(nodeVector, "Producer"); + nodeVector = removeProdScalingNodes(nodeVector); if (verbose) { @@ -214,7 +288,6 @@ static std::shared_ptr<Node> getFirstNode(std::shared_ptr<GraphView> graphView) void prepareNetwork(std::shared_ptr<GraphView> graphView) { removeFlatten(graphView); - sanitizeNodeNames(graphView); bool containsBatchNorm = false; @@ -264,25 +337,26 @@ void insertResidualNodes(std::shared_ptr<GraphView> graphView) Log::info(" ### inserting multiplicative node ..."); std::string residualNodeName = makeUniqueName(parentNode->name() + "_Res", graphView); - std::shared_ptr<Node> residualNode = Scaling(1.0, residualNodeName); + std::shared_ptr<Node> residualNode = Mul(residualNodeName); + residualNode->attributes()->addAttr("quantization.ptq.isScaling", 0.0); + residualNode->attributes()->addAttr("quantization.ptq.isResidual", 0.0); + + //Adding the SF as a producer of the node + std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {1.0}); + std::shared_ptr<Node> scalingFactorProducer = addProducer(residualNode, 1, {1}, "ScalingFactor"); + scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); - residualNode->getOperator()->setDataType(DataType::Float64); //getDataType(parentNode) - residualNode->getOperator()->setBackend("cpu"); + residualNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) + residualNode->getOperator()->setBackend(parentNode->getOperator()->backend()); graphView->insertParent(node, residualNode, i, 0, 0); + graphView->add(scalingFactorProducer); } } } } } -static int getInputIndex(std::shared_ptr<Node> node, std::shared_ptr<Node> parentNode) -{ - int index = 0; - while (node->getParent(index) != parentNode) - index++; - return index; -} void insertScalingNodes(std::shared_ptr<GraphView> graphView) { @@ -295,37 +369,30 @@ void insertScalingNodes(std::shared_ptr<GraphView> graphView) if (isAffine(parentNode) || isMerging(parentNode)) { std::string scalingNodeName = makeUniqueName(parentNode->name() + "_Scaling", graphView); - std::shared_ptr<Node> scalingNode = Scaling(1.0, scalingNodeName); + //std::shared_ptr<Node> scalingNode = Scaling(1.0, scalingNodeName); + + //Adding Mul operator with tag "quantization.ptq.isScaling" + std::shared_ptr<Aidge::Node> scalingNode = Mul(scalingNodeName); + scalingNode->attributes()->addAttr("quantization.ptq.isScaling",0.0); + + //Adding the SF as a producer of the node + std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {1.0}); + std::shared_ptr<Node> scalingFactorProducer = addProducer(scalingNode, 1, {1}, "ScalingFactor"); + scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); scalingNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) - scalingNode->getOperator()->setBackend("cpu"); + 
scalingNode->getOperator()->setBackend(parentNode->getOperator()->backend()); if (parentNode->getChildren().size() > 0) { - // SCALING NODE INSERTION - - // We always have one output from Affine and Add nodes, but possibly multiple childs - std::vector<std::shared_ptr<Node>> nextNodes = parentNode->getChildren(0); - - // For each node in nextNodes store the connexion index - std::vector<int> inputIndices(nextNodes.size()); - for (std::size_t i = 0; i < nextNodes.size(); i++) - inputIndices[i] = getInputIndex(nextNodes[i], parentNode); - - for (std::shared_ptr<Node> nextNode : nextNodes) - parentNode->removeChild(nextNode, 0); - - parentNode->addChild(scalingNode, 0, 0); - - for (std::size_t i = 0; i < nextNodes.size(); i++) - scalingNode->addChild(nextNodes[i], 0, inputIndices[i]); - - graphView->add(scalingNode); + insertNodeBetween(parentNode,scalingNode,graphView); + graphView->add(scalingFactorProducer); } else { // Log::info(" last node reached ! "); parentNode->addChild(scalingNode, 0, 0); + graphView->add(scalingFactorProducer); graphView->add(scalingNode); } } @@ -335,7 +402,7 @@ void insertScalingNodes(std::shared_ptr<GraphView> graphView) static std::shared_ptr<Node> getPreviousScalingNode(std::shared_ptr<Node> mergingNode) { std::shared_ptr<Node> currNode = mergingNode; - while(currNode->type() != "Scaling") + while(!currNode->attributes()->hasAttr("quantization.ptq.isScaling")) { if (currNode->getParents().size() == 0) { @@ -378,7 +445,7 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) for (std::shared_ptr<Node> node : nodeVector) { // Scaling nodes still have a ratio of 1, so they are seamless ... - if (node->type() == "ReLU" || node->type() == "Scaling" || isSeamless(node)) + if (node->type() == "ReLU" || node->attributes()->hasAttr("quantization.ptq.isScaling") || isSeamless(node)) { if (node != firstNode) { @@ -394,7 +461,8 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); double scaling = getTensorAbsoluteMax(weightTensor); double ratio = 1.0 / scaling; - rescaleTensor(weightTensor, ratio); + //rescaleTensor(weightTensor, ratio); + insertScalingBelowProducer(node->getParent(1),ratio,graphView); // Accumulate the ratio if (node == firstNode) @@ -412,7 +480,8 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) if (nodeHasBias(node)) { std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, accumulatedRatios[node->name()] ); + //rescaleTensor(biasTensor, accumulatedRatios[node->name()] ); + insertScalingBelowProducer(node->getParent(2),accumulatedRatios[node->name()],graphView); } } @@ -439,8 +508,7 @@ void normalizeParameters(std::shared_ptr<GraphView> graphView) std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); - double currScalingFactor = getScalingFactor(scalingNode); - updateScalingFactor(scalingNode, currScalingFactor / rescaling); + multiplyScalingFactor(scalingNode,1/rescaling); accumulatedRatios[mergingNode->name()] /= rescaling; // optional ... 
} @@ -465,7 +533,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView std::set<std::shared_ptr<Node>> nodeSet = graphView->getNodes(); for (std::shared_ptr<Node> node : nodeSet) { - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) { std::shared_ptr<Operator> nodeOperator = node->getOperator(); std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); @@ -487,7 +555,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView // std::shared_ptr<Node> inputNode = getFirstNode(graphView); for (std::shared_ptr<Node> node : nodeSet) - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) valueRanges.insert(std::make_pair(node->name(), 0)); if (useCuda) @@ -514,7 +582,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView std::map<std::string, double> sampleRanges; for (std::shared_ptr<Node> node : nodeSet) { - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) { std::shared_ptr<Operator> nodeOperator = node->getOperator(); std::shared_ptr<Tensor> valueTensor = std::static_pointer_cast<Tensor> (nodeOperator->getRawOutput(0)); @@ -536,7 +604,7 @@ std::map<std::string, double> computeRanges(std::shared_ptr<GraphView> graphView for (std::shared_ptr<Node> node : nodeSet) { - if ((scalingNodesOnly && (node->type() == "Scaling")) || (!scalingNodesOnly && (node->type() != "Producer"))) + if ((scalingNodesOnly && (node->attributes()->hasAttr("quantization.ptq.isScaling"))) || (!scalingNodesOnly && (node->type() != "Producer"))) { std::string nodeName = node->name(); if (sampleRanges[nodeName] > valueRanges[nodeName]) @@ -572,7 +640,7 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st for (std::shared_ptr<Node> node : nodeVector) { // Seamless scaling factor propagation ... - + if (isAffine(node) || isSeamless(node) || node->type() == "ReLU") { if (node == firstNode) @@ -586,11 +654,13 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st } } + // Here prevNode is either a 'Affine' or a 'Merging' // => do not split the cases, just handle the bias ... - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { + // retrieve the previous scaling factor ... std::shared_ptr<Node> prevNode = node->getParent(0); double prevScalingFactor = scalingFactors[prevNode->name()]; @@ -598,8 +668,7 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st // ValueRanges must contains all the scaling nodes !!! 
double scalingFactor = valueRanges[node->name()]; - double currScalingFactor = getScalingFactor(node); - updateScalingFactor(node, currScalingFactor / (scalingFactor / prevScalingFactor)); + multiplyScalingFactor(node,1/(scalingFactor / prevScalingFactor)); scalingFactors[node->name()] = scalingFactor; @@ -607,11 +676,13 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st if (isAffine(prevNode)) { + bool prevNodeHasBias = nodeHasBias(prevNode); if (prevNodeHasBias) - { + { std::shared_ptr<Tensor> biasTensor = getBiasTensor(prevNode); - rescaleTensor(biasTensor, 1.0 / prevScalingFactor); + //rescaleTensor(biasTensor, 1.0 / prevScalingFactor); + insertScalingBelowProducer(prevNode->getParent(2),1.0 / prevScalingFactor,graphView); } } } @@ -641,9 +712,8 @@ void normalizeActivations(std::shared_ptr<GraphView> graphView, std::map<std::st std::shared_ptr<Node> scalingNode = getPreviousScalingNode(mergingNode); //Log::info(" SCALING NODE : {} {}", scalingNode->type(), scalingNode->name()); - - double currScalingFactor = getScalingFactor(scalingNode); - updateScalingFactor(scalingNode, currScalingFactor * rescaling); + + multiplyScalingFactor(scalingNode,rescaling) ; } } } @@ -679,7 +749,7 @@ std::map<std::string, std::pair<bool, bool>> computeSignMap(std::shared_ptr<Grap signMap[node->name()].second = false; } - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { signMap[node->name()].second = false; @@ -726,7 +796,7 @@ std::map<std::string, std::pair<bool, bool>> computeSignMap(std::shared_ptr<Grap // Arbitration : Signed type wins ! for(std::shared_ptr<Node> parent : parentNodes) { - while (parent->type() != "Scaling") + while (!parent->attributes()->hasAttr("quantization.ptq.isScaling")) { signMap[parent->name()] = std::make_pair(false, false); // We are on a branch so nodes always have 1 parent ... @@ -808,26 +878,23 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ if (isAffine(node)) { // Rescale the weight tensor - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - rescaleTensor(weightTensor, signedMax); + insertScalingBelowProducer(node->getParent(1),signedMax,graphView); if (!noQuant) - roundTensor(weightTensor); + insertRoundBelowProducer(node->getParent(1),graphView); // Rescale the bias tensor - if (nodeHasBias(node)) { bool inputIsUnsigned = signMap[node->name()].first; double rescaling = inputIsUnsigned ? unsignedMax * signedMax : signedMax * signedMax; - - + std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, rescaling); + insertScalingBelowProducer(node->getParent(2),rescaling,graphView); if (!noQuant) - roundTensor(biasTensor); + insertRoundBelowProducer(node->getParent(2),graphView); } // Compensate the rescaling using the next Scaling node @@ -842,8 +909,7 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); // Assert if scalingNode is a Scaling ... - double currScalingFactor = getScalingFactor(scalingNode); - updateScalingFactor(scalingNode, currScalingFactor * rescaling); + multiplyScalingFactor(scalingNode,rescaling) ; } if (isMerging(node)) @@ -858,23 +924,25 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ std::shared_ptr<Node> scalingNode = *(node->getChildren().begin()); // Assert if scalingNode is a Scaling ... 
- double currScalingFactor = getScalingFactor(scalingNode); // XXX bad naming - updateScalingFactor(scalingNode, currScalingFactor * rescaling); + multiplyScalingFactor(scalingNode,rescaling) ; } // Handle the Scaling Nodes ... - if (node->type() == "Scaling") + if (node->attributes()->hasAttr("quantization.ptq.isScaling")) { if (!noQuant) { // Replace the Scaling Node by Quantizer + auto scalingFactorTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(1); + std::shared_ptr<Tensor> fallback; + const Tensor& localTensor = scalingFactorTensor->refCastFrom(fallback, DataType::Float64, "cpu"); + double old_sf = localTensor.get<double>(0);//!\\ - std::shared_ptr<Node> quantizerNode = Quantizer(getScalingFactor(node), -(signedMax + 1), signedMax, node->name()); + std::shared_ptr<Node> quantizerNode = Quantizer(old_sf, -(signedMax + 1), signedMax, node->name()); quantizerNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) - quantizerNode->getOperator()->setBackend("cpu"); - - graphView->replace({node}, {quantizerNode}); + quantizerNode->getOperator()->setBackend(node->getOperator()->backend()); + graphView->replace({node,node->getParent(1)}, {quantizerNode}); if (optimizeSigns) { @@ -888,6 +956,7 @@ void quantizeNormalizedNetwork(std::shared_ptr<GraphView> graphView, std::uint8_ double currScalingFactor = getScalingFactor(quantizerNode); updateScalingFactor(quantizerNode, currScalingFactor * rescaling); + if(outputIsUnsigned) { @@ -910,41 +979,37 @@ static void insertCompensationNodes(std::shared_ptr<GraphView> graphView, std::u { // A merging node is always followed by a Quantizer node at this point - if (node->type() == "Quantizer") + if (node->type() == "Quantizer" && (node->attributes()->hasAttr("quantization.ptq.isResidual") || !isAffine(node->getParent(0)))) { - // check if the Quantizer is a residual one, and insert a compensation node if so ... - bool prevNodeIsForking = ((node->getParent(0))->getChildren().size() > 1); - bool prevNodeIsAffine = isAffine(node->getParent(0)); - bool insertNode = prevNodeIsForking || !prevNodeIsAffine; - - if (insertNode) - { - // create and insert the multplicative node before the Quantizer + // check if the Quantizer is a residual one, and insert a compensation node if so ... 
+ // create and insert the multplicative node before the Quantizer - std::string mulNodeName = makeUniqueName(node->name() + "_Mul", graphView); - std::shared_ptr<Node> mulNode = Mul(mulNodeName); - mulNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) - mulNode->getOperator()->setBackend("cpu"); + std::string mulNodeName = makeUniqueName(node->name() + "_Mul", graphView); + std::shared_ptr<Node> mulNode = Mul(mulNodeName); + + mulNode->attributes()->addAttr("quantization.ptq.isCompensation",0.0); + mulNode->getOperator()->setDataType(DataType::Float64); // getDataType(parentNode) + mulNode->getOperator()->setBackend(node->getOperator()->backend()); - graphView->insertParent(node, mulNode, 0, 0, 0); + graphView->insertParent(node, mulNode, 0, 0, 0); - // Add the coeff producer to the multiplier node + // Add the coeff producer to the multiplier node - std::shared_ptr<Node> coeffProducer = addProducer(mulNode, 1, {1}, ""); - std::shared_ptr<Tensor> coeffTensor = std::make_shared<Tensor>(signedMax); - coeffProducer->getOperator()->setOutput(0, coeffTensor); + std::shared_ptr<Node> coeffProducer = addProducer(mulNode, 1, {1}, ""); + std::shared_ptr<Tensor> coeffTensor = std::make_shared<Tensor>(Array1D<double, 1> {signedMax}); + coeffProducer->getOperator()->setOutput(0, coeffTensor); - coeffProducer->getOperator()->setDataType(DataType::Float64); - coeffProducer->getOperator()->setBackend("cpu"); + coeffProducer->getOperator()->setDataType(DataType::Float64); + coeffProducer->getOperator()->setBackend(node->getOperator()->backend()); - graphView->add(coeffProducer); // needed ? + graphView->add(coeffProducer); // needed ? - // Adapt the scaling factor value accordingly + // Adapt the scaling factor value accordingly - double currScalingFactor = getScalingFactor(node); - updateScalingFactor(node, currScalingFactor / signedMax); - } + double currScalingFactor = getScalingFactor(node); + updateScalingFactor(node, currScalingFactor / signedMax); + } } } @@ -955,9 +1020,7 @@ void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool for (std::shared_ptr<Node> node : nodeVector) { - // TODO : use Compensation nodes instead of Mul nodes - - if (isAffine(node) || (node->type() == "Mul")) + if (isAffine(node) || (node->type() == "Mul" && node->attributes()->hasAttr("quantization.ptq.isCompensation"))) { std::shared_ptr<Node> scalingNode = (*node->getChildren().begin()); @@ -965,21 +1028,20 @@ void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool double approx = std::pow(2, std::ceil(std::log2(base))); - updateScalingFactor(scalingNode, approx); + updateScalingFactor(scalingNode,approx); double ratio = base / approx; - std::shared_ptr<Tensor> weightTensor = getWeightTensor(node); - rescaleTensor(weightTensor, ratio); + insertScalingBelowProducer(node->getParent(1),ratio,graphView); if (!noQuant) - roundTensor(weightTensor); + insertRoundBelowProducer(node->getParent(1),graphView); if (nodeHasBias(node)) { - std::shared_ptr<Tensor> biasTensor = getBiasTensor(node); - rescaleTensor(biasTensor, ratio); + insertScalingBelowProducer(node->getParent(2),ratio,graphView); + if (!noQuant) - roundTensor(biasTensor); + insertRoundBelowProducer(node->getParent(2),graphView); } } } @@ -988,7 +1050,7 @@ void performSingleShiftApproximation(std::shared_ptr<GraphView> graphView, bool static void printScalingFactors(std::shared_ptr<GraphView> graphView) { for (auto node : retrieveNodeVector(graphView)) - if (node->type() == "Scaling" || 
node->type() == "Quantizer") + if (node->attributes()->hasAttr("quantization.ptq.isScaling") || node->type() == "Quantizer") { double scalingFactor = getScalingFactor(node); Log::info(" {:.6f} ({})", scalingFactor, node->name()); @@ -1010,18 +1072,6 @@ static void setupDataType(std::shared_ptr<GraphView> graphView, std::vector<std: tensor->setDataType(dataType); } -static void printRanges(std::shared_ptr<GraphView> graphView, std::map<std::string, double> valueRanges) -{ - SequentialScheduler scheduler(graphView); - scheduler.resetScheduling(); - scheduler.generateScheduling(); - - auto scheduling = scheduler.getStaticScheduling(); - for (auto node : scheduling) - if (node->type() == "Scaling") - Log::info(" {} range = {} ", node->name(), valueRanges[node->name()]); -} - void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::vector<std::shared_ptr<Tensor>> inputDataSet, Clipping clippingMode, bool noQuant, bool optimizeSigns, bool singleShift, bool useCuda, bool verbose) { Log::info(" === QUANT PTQ 0.2.21 === "); @@ -1041,7 +1091,6 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, insertScalingNodes(graphView); crossLayerEqualization(graphView); - Log::info(" Normalizing the parameters ..."); normalizeParameters(graphView); @@ -1049,14 +1098,12 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, std::map<std::string, double> valueRanges = computeRanges(graphView, inputDataSet, true, useCuda); //Log::info(" === RANGES (BEFORE ADJUST) ==="); - //printRanges(graphView, valueRanges); Log::info(" Optimizing the clipping values ..."); valueRanges = adjustRanges(clippingMode, valueRanges, nbBits, graphView, inputDataSet, useCuda, verbose); - //Log::info(" === RANGES (AFTER ADJUST) ==="); + //Log:debug("=== RANGES (AFTER ADJUST) ==="); //printRanges(graphView, valueRanges); - Log::info(" Normalizing the activations ..."); normalizeActivations(graphView, valueRanges); @@ -1075,17 +1122,9 @@ void quantizeNetwork(std::shared_ptr<GraphView> graphView, std::uint8_t nbBits, if (verbose) printScalingFactors(graphView); - //Log::info(" === SCALINGS (BEFORE CAST) ==="); - //printScalingFactors(graphView); - - setupDataType(graphView, inputDataSet, initialDataType); - if (useCuda) graphView->setBackend("cuda"); - //Log::info(" === SCALINGS (AFTER CAST) ==="); - //printScalingFactors(graphView); - Log::info(" Reseting the scheduler ..."); SequentialScheduler scheduler(graphView); scheduler.resetScheduling(); @@ -1115,15 +1154,9 @@ void clearBiases(std::shared_ptr<GraphView> graphView) for (std::shared_ptr<Node> node : graphView->getNodes()) { if (node->type() == "FC" || node->type() == "Conv2D") { std::shared_ptr<Tensor> biasTensor = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getInput(2); - rescaleTensor(biasTensor, 0); + //rescaleTensor(biasTensor, 0); + insertScalingBelowProducer(node->getParent(2),0,graphView); } } } - -void devPTQ(std::shared_ptr<GraphView> graphView) -{ - for (std::shared_ptr<Node> node : graphView->getNodes()) - Log::info(" UUU : {}", node->name()); -} - } diff --git a/src/QAT/QAT_LSQ.cpp b/src/QAT/QAT_LSQ.cpp index 6eae077b060027eb4029f6b59f55376a1674df70..9b51e846df498a9303b7373ae1c86d4b007a96f0 100644 --- a/src/QAT/QAT_LSQ.cpp +++ b/src/QAT/QAT_LSQ.cpp @@ -21,152 +21,193 @@ #include "aidge/graph/Matching.hpp" #include "aidge/recipes/QuantRecipes.hpp" +namespace Aidge { -namespace Aidge +void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, 
float stepSize) { + const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|FC#)"); -static float getTensorAbsMean(std::shared_ptr<Tensor> tensor) -{ - auto valueTensor = (*tensor).abs().mean(); - std::shared_ptr<Tensor> fallback; - const Tensor& localTensor = valueTensor.refCastFrom(fallback, DataType::Float32, "cpu"); - return localTensor.get<float>(0); -} + for (const auto& match : matches) + { + auto linearNode = match.graph->rootNode(); -static float getTensorStd(std::shared_ptr<Tensor> tensor) -{ - auto valueTensor = (*tensor); - - auto skewedTensor = valueTensor - valueTensor.mean(); - auto squaredTensor = skewedTensor * skewedTensor; - auto varianceTensor = squaredTensor.mean(); + std::pair<int, int> signedRange = {-std::pow(2, nbBits - 1), std::pow(2, nbBits - 1) - 1}; + std::pair<int, int> unsignedRange = {0, std::pow(2, nbBits) - 1}; - std::shared_ptr<Tensor> fallback; - auto localTensor = varianceTensor.refCastFrom(fallback, DataType::Float32, "cpu"); - - float variance = localTensor.get<float>(0); - return std::sqrt(variance); -} + // INPUT QUANTIZERS INSERTION + // TODO : double check this, and use createUniqueName() + auto inputQuantizerName = makeUniqueName(linearNode->name() + "_lsq_i", graphView); + auto inputQuantizerNode = LSQ(signedRange, inputQuantizerName); -// INIT THE STEP SIZE OF A QUANTIZER NODE + // Set the step size -static bool initStepSize(std::shared_ptr<Node> quantizer) -{ - const auto quantizerOp = std::static_pointer_cast<LSQ_Op>(quantizer->getOperator()); + auto inputStepSizeOp = inputQuantizerNode->getParent(1)->getOperator(); + auto inputStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + inputStepSizeOp->setOutput(0, inputStepSizeTensor); - // This formula is the one proposed in the paper ... + // Absorb the ReLU when possible ... - // float inputAbsMean = getTensorAbsMean(quantizerOp->getInput(0)); - // float stepSize = 2.0f * (inputAbsMean / std::sqrt(quantizerOp->range().second)); + // XXX is this safe ??? + bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]); + // bool nodeHasParent = (linearNode->getParents().size() != 0); - // .. but this formula seems to work better !!! + if (nodeHasParent) { + auto parentNode = linearNode->getParents()[0]; + if (parentNode->type() == "ReLU") { + auto inputQuantizerOp = std::static_pointer_cast<LSQ_Op> (inputQuantizerNode->getOperator()); + inputQuantizerOp->range() = unsignedRange; + graphView->replace({parentNode}, {}); + } + } - float inputStd = getTensorStd(quantizerOp->getInput(0)); - float stepSize = 8.0f * (inputStd / (quantizerOp->range().second)); + // We need to handle the case where the linear node is the first one ... - // TODO : use the scalar constructor - auto stepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + if (nodeHasParent) { + graphView->insertParent(linearNode, inputQuantizerNode, 0, 0, 0); + } else { + inputQuantizerNode->addChild(graphView); + graphView->add(inputQuantizerNode); + } - // XXX Manage backend here ? 
- stepSizeTensor->setBackend(quantizerOp->getInput(0)->backend()); - stepSizeTensor->setDataType(quantizerOp->getInput(0)->dataType()); + // PARAM QUANTIZERS INSERTION - auto stepSizeProducer = quantizer->getParent(1); + // TODO : double check this, and use createUniqueName() + auto paramQuantizerName = makeUniqueName(linearNode->name() + "_lsq_p", graphView); + auto paramQuantizerNode = LSQ(signedRange, paramQuantizerName); + graphView->insertParent(linearNode, paramQuantizerNode, 1, 0, 0); - stepSizeProducer->getOperator()->setOutput(0, stepSizeTensor); + // Set the step size - Log::notice(" [ INIT STEP SIZE = {} ] ", stepSize); + auto paramStepSizeOp = paramQuantizerNode->getParent(1)->getOperator(); + auto paramStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + paramStepSizeOp->setOutput(0, paramStepSizeTensor); + } - return false; } -static void setupInputQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits) +static float getTensorAbsMean(std::shared_ptr<Tensor> tensor) { - const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|PaddedConv2D#|FC#)"); + auto backend = tensor->backend(); + if (backend == "cuda") + tensor->setBackend("cpu"); - for (const auto& match : matches) - { - auto linearNode = match.graph->rootNode(); + float acc = 0; + float* castedTensor = static_cast<float *> (tensor->getImpl()->rawPtr()); + for(std::size_t i = 0; i < tensor->size(); i++) + acc += std::abs(castedTensor[i]); + acc /= static_cast<float> (tensor->size()); - // Log::notice(" SET INPUT QUANTIZER : {} ", linearNode->type()); - - std::pair<int, int> signedRange = {-std::pow(2, nbBits - 1), std::pow(2, nbBits - 1) - 1}; - std::pair<int, int> unsignedRange = {0, std::pow(2, nbBits) - 1}; + if (backend == "cuda") + tensor->setBackend("cuda"); - // Create the input quantizer node - - auto quantizerName = makeUniqueName(linearNode->name() + "_lsq_i", graphView); - auto quantizerNode = LSQ(signedRange, quantizerName); + return acc; +} - // Init the step-size using the node call stack +static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> calibrationData, bool useCuda) +{ + // Propagate the calibration tensor - quantizerNode->addBeforeForward([quantizerNode](){ return initStepSize(quantizerNode); }); + SequentialScheduler scheduler(graphView); + scheduler.resetScheduling(); + scheduler.forward(true, {calibrationData}); - // Absorb the ReLU when possible ... + // Store the input tensor statistics - bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]); // XXX is this safe ? + if (useCuda) + graphView->setBackend("cpu"); - if (nodeHasParent) + std::map<std::string, float> inputStats; + for (auto node : graphView->getNodes()) + { + if (node->type() == "FC" || node->type() == "Conv2D") // TODO: use graph matching !!! { - bool allParentsAreReLU = true; - for (auto parentNode : linearNode->getParents()) - if (parentNode->type() != "ReLU") - allParentsAreReLU = false; - - if (allParentsAreReLU) { - auto quantizerOp = std::static_pointer_cast<LSQ_Op> (quantizerNode->getOperator()); - quantizerOp->range() = unsignedRange; - } - - // TODO : remove the ReLUs when possible + const auto op = std::static_pointer_cast<LSQ_Op>(node->getOperator()); + float inputAbsMean = getTensorAbsMean(op->getInput(0)); + inputStats.insert(std::make_pair(node->name(), inputAbsMean)); + fmt::println("{} -> {}", node->name(), inputAbsMean); } + } - // Insert the quantizer in the graphView ... 
- // (We need to handle the case where the linear node is the first one) + if (useCuda) + graphView->setBackend("cuda"); - if (nodeHasParent) { - graphView->insertParent(linearNode, quantizerNode, 0, 0, 0); - } else { - quantizerNode->addChild(graphView); - graphView->add(quantizerNode); + return inputStats; +} + +static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView> graphView, bool useCuda) +{ + if (useCuda) + graphView->setBackend("cpu"); + + std::map<std::string, float> paramStats; + for (auto node : graphView->getNodes()) + { + if (node->type() == "FC" || node->type() == "Conv2D") // TODO: use graph matching !!! + { + const auto op = std::static_pointer_cast<LSQ_Op>(node->getOperator()); + float paramAbsMean = getTensorAbsMean(op->getInput(1)); + paramStats.insert(std::make_pair(node->name(), paramAbsMean)); + fmt::println("{} -> {}", node->name(), paramAbsMean); } } -} + + if (useCuda) + graphView->setBackend("cuda"); -// PARAM QUANTIZERS INSERTION + return paramStats; +} -static void setupParamQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits) +static void adjustQuantizersStepSizes(std::shared_ptr<GraphView> graphView, std::map<std::string, float> inputStats, std::map<std::string, float> paramStats) { - const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|PaddedConv2D#|FC#)"); - - std::pair<int, int> signedRange = {-std::pow(2, nbBits - 1), std::pow(2, nbBits - 1) - 1}; + const auto matches = SinglePassGraphMatching(graphView).match("(Conv2D#|FC#)"); for (const auto& match : matches) - { - auto linearNode = match.graph->rootNode(); + { + auto linearNode = match.graph->rootNode(); - // Log::notice(" SET PARAM QUANTIZER : {} ", linearNode->type()); + // INPUT QUANTIZERS STEP-SIZES - // TODO : double check this, and use createUniqueName() - auto quantizerName = makeUniqueName(linearNode->name() + "_lsq_p", graphView); - auto quantizerNode = LSQ(signedRange, quantizerName); + auto inputQuantNode = linearNode->getParent(0); + auto inputQuantOp = std::static_pointer_cast<LSQ_Op>(inputQuantNode->getOperator()); + + float absMean = inputStats[linearNode->name()]; + float stepSize = 2.0f * (absMean / std::sqrt(inputQuantOp->range().second)); - // Init the step-size using the node call stack + auto inputStepSizeOp = inputQuantNode->getParent(1)->getOperator(); + // XXX inputStepSizeOp->setOutput(0, std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}}))); + auto inputStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + inputStepSizeOp->setOutput(0, inputStepSizeTensor); - quantizerNode->addBeforeForward([quantizerNode](){ return initStepSize(quantizerNode); }); + // PARAM QUANTIZERS STEP-SIZES - // Insert the quantizer in the graphView + auto paramQuantNode = linearNode->getParent(1); + auto paramQuantOp = std::static_pointer_cast<LSQ_Op>(paramQuantNode->getOperator()); - graphView->insertParent(linearNode, quantizerNode, 1, 0, 0); + absMean = paramStats[linearNode->name()]; + stepSize = 2.0f * (absMean / std::sqrt(paramQuantOp->range().second)); + + auto paramStepSizeOp = paramQuantNode->getParent(1)->getOperator(); + // XXX paramStepSizeOp->setOutput(0, std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}}))); + auto paramStepSizeTensor = std::make_shared<Tensor>(Array1D<float, 1>({{stepSize}})); + paramStepSizeOp->setOutput(0, paramStepSizeTensor); } } -void QuantLSQ::setupQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits) +void QuantLSQ::insertAndInitQuantizers(std::shared_ptr<GraphView> 
graphView, size_t nbBits, std::shared_ptr<Tensor> calibrationData) { - sanitizeNodeNames(graphView); - setupInputQuantizers(graphView, nbBits); - setupParamQuantizers(graphView, nbBits); + bool useCuda = (calibrationData->backend() == "cuda"); + + // Collect the tensor statisics + auto inputStats = collectInputStats(graphView, calibrationData, useCuda); + + auto paramStats = collectParamStats(graphView, useCuda); + + // Insert the quantizers + insertQuantizers(graphView, nbBits, 1.0); + + // Adjust the quantizers step-sizes + adjustQuantizersStepSizes(graphView, inputStats, paramStats); } } \ No newline at end of file diff --git a/src/operator/PTQMetaOps.cpp b/src/operator/PTQMetaOps.cpp index 56245da47076d8930ce29ab75e549d97d0d7493d..f86d454245a7fe088edd027732a91f5775cd2acf 100644 --- a/src/operator/PTQMetaOps.cpp +++ b/src/operator/PTQMetaOps.cpp @@ -60,23 +60,6 @@ std::shared_ptr<Node> Quantizer(double scalingFactor, double clipMin, double cli return metaopNode; } -std::shared_ptr<Node> Scaling(double scalingFactor, const std::string& name) -{ - std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {scalingFactor}); - - std::shared_ptr<Node> mulNode = Mul((!name.empty()) ? name + "_Scaling" : ""); - - std::shared_ptr<Node> scalingFactorProducer = addProducer<1>(mulNode, 1, {1}, "ScalingFactor"); - scalingFactorProducer->getOperator()->setOutput(0, scalingFactorTensor); - - std::shared_ptr<GraphView> graphView = Sequential({mulNode}); - std::shared_ptr<GraphView> connectedGraphView = getConnectedGraphView(mulNode); - - NodePtr metaopNode = MetaOperator("Scaling", connectedGraphView, {}, name); - - return metaopNode; -} - static std::shared_ptr<Node> getSubNode(std::shared_ptr<GraphView> graphView, std::string nodeType) { std::shared_ptr<Node> mulNode = nullptr; @@ -87,10 +70,12 @@ static std::shared_ptr<Node> getSubNode(std::shared_ptr<GraphView> graphView, st return mulNode; } + + void updateScalingFactor(std::shared_ptr<Node> metaOpNode, double scalingFactor) { if(metaOpNode->type() != "Scaling" && metaOpNode->type() != "Quantizer") - Log::warn(" Cannot update the scaling factor on Node of type {}", metaOpNode->type()); + Log::warn("Cannot update the scaling factor on Node of type {}", metaOpNode->type()); std::shared_ptr<Tensor> scalingFactorTensor = std::make_shared<Tensor>(Array1D<double, 1> {scalingFactor}); @@ -99,7 +84,7 @@ void updateScalingFactor(std::shared_ptr<Node> metaOpNode, double scalingFactor) std::shared_ptr<Node> mulNode = getSubNode(metaOp->getMicroGraph(), "Mul"); if (!mulNode) - Log::warn(" Invalid PTQ MetaOperator, no Mul node found inside ! "); + Log::warn("Invalid PTQ MetaOperator, no Mul node found inside ! 
"); mulNode->input(1).first->getOperator()->setOutput(0, scalingFactorTensor); } @@ -107,7 +92,7 @@ void updateScalingFactor(std::shared_ptr<Node> metaOpNode, double scalingFactor) double getScalingFactor(std::shared_ptr<Node> MetaOpNode) { if (MetaOpNode->type() != "Scaling" && MetaOpNode->type() != "Quantizer") { - Log::warn(" Cannot get the scaling factor on Node of type {}", MetaOpNode->type()); + Log::warn("Cannot get the scaling factor on Node of type {}", MetaOpNode->type()); return 0; } @@ -116,7 +101,7 @@ double getScalingFactor(std::shared_ptr<Node> MetaOpNode) std::shared_ptr<Node> mulNode = getSubNode(metaOp->getMicroGraph(), "Mul"); if (!mulNode) { - Log::warn(" Invalid PTQ MetaOperator, no Mul found inside node of type {}", MetaOpNode->type()); + Log::warn("Invalid PTQ MetaOperator, no Mul found inside node of type {}", MetaOpNode->type()); return 0; } @@ -131,7 +116,7 @@ double getScalingFactor(std::shared_ptr<Node> MetaOpNode) void setClipRange(std::shared_ptr<Node> quantizerNode, double min, double max) { if (quantizerNode->type() != "Quantizer") { - Log::warn(" Cannot set the clipping range on Node of type {}", quantizerNode->type()); + Log::warn("Cannot set the clipping range on Node of type {}", quantizerNode->type()); return; } @@ -140,7 +125,7 @@ void setClipRange(std::shared_ptr<Node> quantizerNode, double min, double max) std::shared_ptr<Node> clipNode = getSubNode(metaOp->getMicroGraph(), "Clip"); if (!clipNode) { - Log::warn(" Invalid PTQ MetaOperator, no Clip found inside node of type {}", quantizerNode->type()); + Log::warn("Invalid PTQ MetaOperator, no Clip found inside node of type {}", quantizerNode->type()); return; }