diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp
index 5cd1c804dcc53ae851b7b19bf4f5bb3c3d83fa6f..2c52893315385f65910e8322202fd26d67d1f24f 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp
@@ -17,21 +17,17 @@
 #include <tuple>
 #include <vector>
 
-#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class Conv_Op;
-
-// compute kernel registry for forward and backward
-// Conv 1D
-class ConvImpl1DForward_cpu
-    : public Registrable<ConvImpl1DForward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         std::function<void(const std::array<DimSize_t, 1>&,
+// Operator implementation entry point for the backend
+using Conv1D_Op = Conv_Op<1>;
+using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
+    void(const std::array<DimSize_t, 1>&,
                             const std::array<DimSize_t, 1>&,
                             const std::array<DimSize_t, 1>&,
                             const std::array<DimSize_t, 3> &,
@@ -39,31 +35,20 @@ class ConvImpl1DForward_cpu
                             const void *,
                             const void *,
                             const void *,
-                            void *)>> {};
-
-class ConvImpl1D_cpu : public OperatorImpl {
-   public:
-    ConvImpl1D_cpu(const Conv_Op<1>& op) : OperatorImpl(op, "cpu") {}
-
-    static std::unique_ptr<ConvImpl1D_cpu> create(const Conv_Op<1> &op) {
-        return std::make_unique<ConvImpl1D_cpu>(op);
-    }
+                            void *),
+    void(const std::array<DimSize_t, 1>&,
+                            const std::array<DimSize_t, 1>&,
+                            const std::array<DimSize_t, 1>&,
+                            bool,
+                            const std::array<DimSize_t, 3> &,
+                            const void *,
+                            const void *,
+                            const void *,
+                            void *)>;
 
-   public:
-    std::shared_ptr<ProdConso> getProdConso() const override { return std::make_unique<ProdConso>(mOp, true); };
-    void forward() override;
-};
-
-namespace {
-// add cpu backend to Conv_Op<1> implementation registry
-static Registrar<Conv_Op<1>> registrarConvImpl1D_cpu("cpu", Aidge::ConvImpl1D_cpu::create);
-}  // namespace
-
-// Conv 2D
-class ConvImpl2DForward_cpu
-    : public Registrable<ConvImpl2DForward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         std::function<void(const std::array<DimSize_t, 2>&,
+using Conv2D_Op = Conv_Op<2>;
+using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 4> &,
@@ -71,11 +56,8 @@ class ConvImpl2DForward_cpu
                             const void *,
                             const void *,
                             const void *,
-                            void *)>> {};
-class ConvImpl2DBackward_cpu
-    : public Registrable<ConvImpl2DBackward_cpu,
-                         std::tuple<DataType, DataType, DataType, DataType>,
-                         std::function<void(const std::array<DimSize_t, 2>&,
+                            void *),
+    void(const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             bool,
@@ -83,25 +65,198 @@ class ConvImpl2DBackward_cpu
                             const void *,
                             const void *,
                             const void *,
-                            void *)>> {};
+                            void *)>;
 
-class ConvImpl2D_cpu : public OperatorImpl {
-   public:
-    ConvImpl2D_cpu(const Conv_Op<2>& op) : OperatorImpl(op, "cpu") {}
+// Implementation entry point registration to Operator
+REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
+REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
 
-    static std::unique_ptr<ConvImpl2D_cpu> create(const Conv_Op<2> &op) {
-        return std::make_unique<ConvImpl2D_cpu>(op);
+////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @brief Forward kernel for 1D Convolution on CPU backend.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param strideDims Array of stride dimensions.
+ * @param inputDims Array of input dimensions.
+ * @param input_ const input Tensor.
+ * @param weights_ const weight Tensor.
+ * @param biases_ const Bias Tensor (may be nullptr).
+ * @param output_ Output Tensor.
+ */
+template <class I, class W, class B, class O>
+void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
+                            const std::array<DimSize_t, 1>& /*dilationDims*/,
+                            const std::array<DimSize_t, 1>& kernelDims,
+                            const std::array<DimSize_t, 3>& inputDims,
+                            DimSize_t outChannels,
+                            const void *input_,
+                            const void *weights_,
+                            const void *biases_,
+                            void *output_)
+{
+    // FIXME: missing convolution attributes as arguments
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    const B *biases = static_cast<const B *>(biases_);
+    O *output = static_cast<O *>(output_);
+
+    // output H size
+    const std::size_t oxSize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
+                                static_cast<float>(strideDims[0])));
+
+    // TODO: kernel computation
+    // output (batch, outCh, Xout)
+    // input  (batch, inCh, Xin)
+    // weight (outCh, inCh, kernelX)
+    // does not take Dilation attribute into account
+    using signedsize = std::make_signed<std::size_t>::type;
+    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
+            // If bias = nullptr, set B(0)
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
+            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
+                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                    const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                    const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    const std::size_t oIndexFull = oIndex + ox;
+                    const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
+
+                    for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
+                        output[oIndexFull] += weights[wIndex + sx] *
+                                                input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))];
+                    }
+                }
+            }
+        }
     }
+}
+
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int, int, int, int>, nullptr});
+REGISTRAR(ConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
 
-   public:
-    std::shared_ptr<ProdConso> getProdConso() const override { return std::make_unique<ProdConso>(mOp, true); };
-    void forward() override;
-};
 
-namespace {
-// add cpu backend to Conv_Op<2> implementation registry
-static Registrar<Conv_Op<2>> registrarConvImpl2D_cpu("cpu", Aidge::ConvImpl2D_cpu::create);
-}  // namespace
+/**
+ * @brief Forward kernel for 2D Convolution on CPU backend.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param strideDims Array of stride dimensions.
+ * @param inputDims Array of input dimensions.
+ * @param input_ const input Tensor.
+ * @param weights_ const weight Tensor.
+ * @param biases_ const Bias Tensor (may be nullptr).
+ * @param output_ Output Tensor.
+ */
+template <class I, class W, class B, class O>
+void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
+                            const std::array<DimSize_t, 2>& /*dilationDims*/,
+                            const std::array<DimSize_t, 2>& kernelDims,
+                            const std::array<DimSize_t, 4> &inputDims,
+                            DimSize_t outChannels,
+                            const void *input_,
+                            const void *weights_,
+                            const void *biases_,
+                            void *output_)
+{
+    // FIXME: missing convolution attributes as arguments
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    const B *biases = static_cast<const B *>(biases_);
+    O *output = static_cast<O *>(output_);
+
+    // output H size
+    const std::size_t oxSize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
+                                static_cast<float>(strideDims[0])));
+    // output W size
+    const std::size_t oySize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) /
+                                static_cast<float>(strideDims[1])));
+
+    // TODO: kernel computation
+    // output (batch, outCh, Xout, Yout)
+    // input  (batch, inCh, Xin, Yin)
+    // weight (outCh, inCh, kernelX, kernelY)
+    // does not take Dilation attribute into account
+    using signedsize = std::make_signed<std::size_t>::type;
+    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
+            // If bias = nullptr, set B(0)
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
+            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
+                    const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                    const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    for (std::size_t oy = 0; oy < oySize; ++oy) {
+                        const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
+                        const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                        const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
+                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
+                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
+
+                        if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
+                            output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
+                                                   weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
+                                                   weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
+                                                   weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
+                                                   weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
+                                                   weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
+                                                   weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
+                                                   weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
+                                                   weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
+                        } else {
+                            for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
+                                for (std::size_t sy = syMin; sy < syMax; ++sy) {
+                                    output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
+                                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>, nullptr});
+REGISTRAR(ConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp
deleted file mode 100644
index 88a71c47244788f2da5e576c8ad5170a92561909..0000000000000000000000000000000000000000
--- a/include/aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp
+++ /dev/null
@@ -1,259 +0,0 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_
-#define AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/ConvImpl.hpp"
-#include "aidge/data/half.hpp"
-#include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
-
-namespace Aidge {
-/**
- * @brief Forward kernel for 1D Convolution on CPU backend.
- * @tparam I Input data type.
- * @tparam W Weight data type.
- * @tparam B Bias data type.
- * @tparam O Output data type.
- * @param params tuple of Attributes from the Operator
- * @param inputDims Array of input dimensions.
- * @param input_ const input Tensor.
- * @param weights_ const weight Tensor.
- * @param biases_ const Biais Tensor.
- * @param output_ Output Tensor.
- */
-template <class I, class W, class B, class O>
-void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
-                            const std::array<DimSize_t, 1>& /*dilationDims*/,
-                            const std::array<DimSize_t, 1>& kernelDims,
-                            const std::array<DimSize_t, 3>& inputDims,
-                            DimSize_t outChannels,
-                            const void *input_,
-                            const void *weights_,
-                            const void *biases_,
-                            void *output_)
-{
-    // FIXME: missing convolution attributes as arguments
-    const I *input = static_cast<const I *>(input_);
-    const W *weights = static_cast<const W *>(weights_);
-    const B *biases = static_cast<const B *>(biases_);
-    O *output = static_cast<O *>(output_);
-
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, inCh, Xin, Yin)
-    // weight (outCh, inCh, kernelX, kernelY)
-    // does not take Dilation attribute into account
-    using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
-            // If bias = nullptr, set B(0)
-            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
-            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
-                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
-                for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
-                    const std::size_t oIndexFull = oIndex + ox;
-                    const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-
-                    for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                        output[oIndexFull] += weights[wIndex + sx] *
-                                                input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))];
-                    }
-                }
-            }
-        }
-    }
-}
-
-namespace {
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>);
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float16(
-        {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
-        Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::ConvImpl1D_cpu_forward_kernel<int, int, int, int>);
-static Registrar<ConvImpl1DForward_cpu> registrarConvImpl1DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
-
-
-/**
- * @brief Forward kernel for 2D Convolution on CPU backend.
- * @tparam I Input data type.
- * @tparam W Weight data type.
- * @tparam B Bias data type.
- * @tparam O Output data type.
- * @param params tuple of Attributes from the Operator
- * @param inputDims Array of input dimensions.
- * @param input_ const input Tensor.
- * @param weights_ const weight Tensor.
- * @param biases_ const Biais Tensor.
- * @param output_ Output Tensor.
- */
-template <class I, class W, class B, class O>
-void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                            const std::array<DimSize_t, 2>& /*dilationDims*/,
-                            const std::array<DimSize_t, 2>& kernelDims,
-                            const std::array<DimSize_t, 4> &inputDims,
-                            DimSize_t outChannels,
-                            const void *input_,
-                            const void *weights_,
-                            const void *biases_,
-                            void *output_)
-{
-    // FIXME: missing convolution attributes as arguments
-    const I *input = static_cast<const I *>(input_);
-    const W *weights = static_cast<const W *>(weights_);
-    const B *biases = static_cast<const B *>(biases_);
-    O *output = static_cast<O *>(output_);
-/*
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(static_cast<float>(inputDims[0] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0]));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(static_cast<float>(inputDims[1] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1]));
-
-    // TODO: kernel computation
-    // output (Xout, Yout, outCh, batch)
-    // input  (Xin, Yin, inCh, batch)
-    // weight (kernelX, kernelY, inCh, outCh)
-    // does not take Dilation attribute into account
-    for (std::size_t ox = 0; ox < oxSize; ++ox) {
-        for (std::size_t oy = 0; oy < oySize; ++oy) {
-            const std::size_t ix = ox * strideDims[0];
-            const std::size_t iy = oy * strideDims[1];
-
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-                const std::size_t oIndex = inputDims[3] * (outCh + outChannels * (oy + oySize * ox));
-                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                for (std::size_t batch = 0; batch < inputDims[3]; ++batch) {
-                    output[oIndex + batch] = biasVal;
-                }
-                for (std::size_t inCh = 0; inCh < inputDims[2]; ++inCh) {
-                    for (std::size_t sx = 0; sx < kernelDims[0]; ++sx) {
-                        for (std::size_t sy = 0; sy < kernelDims[1]; ++sy) {
-                            const std::size_t wIndex =
-                                    outCh + outChannels * (inCh + inputDims[2] * (sy + kernelDims[1] * sx));
-                            std::size_t iIndex = inputDims[3] * (inCh + inputDims[2] * ((iy + sy) + inputDims[1] * (ix + sx)));
-                            for (std::size_t batch = 0; batch < inputDims[3]; ++batch) {
-                                output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-*/
-
-
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
-
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, inCh, Xin, Yin)
-    // weight (outCh, inCh, kernelX, kernelY)
-    // does not take Dilation attribute into account
-    using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
-            // If bias = nullptr, set B(0)
-            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
-            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
-                for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
-                    for (std::size_t oy = 0; oy < oySize; ++oy) {
-                        const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                        const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                        const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
-                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-
-                        if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                            output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
-                        } else {
-                            for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                                for (std::size_t sy = syMin; sy < syMax; ++sy) {
-                                    output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-                                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-namespace {
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32(
-        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
-        Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>);
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16(
-        {DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
-        Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32(
-        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
-        Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>);
-static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float64(
-        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
-        Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>);
-}  // namespace
-}  // namespace Aidge
-
-#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ */
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index 3a6b331bd5e40a19113d231e22bb68dacc9fd914..b57ffd5fc2557d1f01582360f27ff83b40928f4e 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -18,35 +18,18 @@
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/backend/cpu/operator/ConvImpl_forward_kernels.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Types.h"
 
+template <>
 void Aidge::ConvImpl1D_cpu::forward() {
     const auto& op_ = static_cast<const Conv_Op<1>&>(mOp);
 
     // FIXME: uncomment the following code once memory handling will work
-AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
 
-    // Find the correct kernel type
-    const auto outputDataType = op_.getOutput(0)->dataType();
-    const Registrar<ConvImpl1DForward_cpu>::registrar_key registrarKey = {
-        op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        (op_.getInput(2) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
-        outputDataType};
-
-    Registrar<ConvImpl1DForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<ConvImpl1DForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<ConvImpl1DForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<ConvImpl1DForward_cpu>::create({
-            outputDataType, outputDataType, outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -58,7 +41,7 @@ AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
             op_.dilationDims(),
             op_.kernelDims(),
             op_.getInput(0)->template dims<3>(), // input dimensions
@@ -70,6 +53,12 @@ AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
             );
 }
 
+template <>
+void Aidge::ConvImpl1D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu");
+}
+
+template <>
 void Aidge::ConvImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const Conv_Op<2>&>(mOp);
 
@@ -77,24 +66,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
 
-    // Find the correct kernel type
-    const auto outputDataType = op_.getOutput(0)->dataType();
-    const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = {
-        op_.getInput(0)->dataType(),
-        op_.getInput(1)->dataType(),
-        (op_.getInput(2) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
-        outputDataType};
-
-    Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc;
-    if (Registrar<ConvImpl2DForward_cpu>::exists(registrarKey)) {
-        // One exists with the right inputs/output types
-        kernelFunc = Registrar<ConvImpl2DForward_cpu>::create(registrarKey);
-    }
-    else {
-        // Otherwise, fallback to the kernel with all types matching output type
-        kernelFunc = Registrar<ConvImpl2DForward_cpu>::create({
-            outputDataType, outputDataType, outputDataType, outputDataType});
-    }
+    const auto impl = Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Convert input data (no overhead if not needed!)
     // TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -106,7 +78,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    kernelFunc(op_.strideDims(),
+    impl.forward(op_.strideDims(),
             op_.dilationDims(),
             op_.kernelDims(),
             op_.getInput(0)->template dims<4>(), // input dimensions
@@ -117,3 +89,8 @@ void Aidge::ConvImpl2D_cpu::forward() {
             getCPUPtr(mOp.getRawOutput(0)) // output
             );
 }
+
+template <>
+void Aidge::ConvImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu");
+}