Commit f203e3ac authored by Olivier BICHLER

Finished operators adaptation

parent 2fe30cc0
2 merge requests: !93 Release v0.3.0, !79 Refactor OperatorImpl for backend/export
Pipeline #54307 failed
Showing changed files with 186 additions and 192 deletions
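
Every hunk below makes the same adaptation: the per-datatype kernel lookup (a Registrar keyed on explicit input/output data types, sometimes with a hand-written fallback) is replaced by a single Registrar<XImpl_cpu>::create(getBestMatch(getRequiredSpec())) call that returns an implementation with forward()/backward() members, and each operator gains an explicit backward() that throws when unimplemented. The following self-contained toy model sketches that dispatch-with-fallback idea; every name in it (ImplSpec, KernelRegistry, this getBestMatch) is illustrative and is not the actual Aidge API.

#include <cstddef>
#include <functional>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <tuple>

// Toy model of spec-based kernel dispatch with a fallback; not Aidge code.
struct ImplSpec { std::string inType, outType; };

bool operator<(const ImplSpec& a, const ImplSpec& b) {
    return std::tie(a.inType, a.outType) < std::tie(b.inType, b.outType);
}

struct Impl {
    std::function<void(std::size_t, const float*, float*)> forward;
};

struct KernelRegistry {
    std::map<ImplSpec, Impl> impls;

    // Exact spec if registered, otherwise fall back to the kernel whose types
    // all match the output type -- the fallback the removed code in AddImpl
    // and FCImpl implemented by hand.
    const Impl& getBestMatch(const ImplSpec& required) const {
        auto it = impls.find(required);
        if (it != impls.end()) return it->second;
        it = impls.find({required.outType, required.outType});
        if (it == impls.end()) throw std::runtime_error("no kernel registered");
        return it->second;
    }
};

int main() {
    KernelRegistry registry;
    registry.impls[{"float32", "float32"}] = {
        [](std::size_t n, const float* in, float* out) {
            for (std::size_t i = 0; i < n; ++i) out[i] = (in[i] < 0.f) ? -in[i] : in[i]; // Abs
        }};

    const float in[3] = {-1.f, 2.f, -3.f};
    float out[3];
    const auto& impl = registry.getBestMatch({"int32", "float32"}); // no exact match: falls back
    impl.forward(3, in, out);
    std::cout << out[0] << ' ' << out[1] << ' ' << out[2] << '\n'; // 1 2 3
}
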
@@ -14,24 +14,27 @@
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/AbsImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/AbsImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Abs.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::AbsImpl_cpu::forward() {
const Abs_Op& op = static_cast<const Abs_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<AbsImplForward_cpu>::create({
op.getInput(0)->dataType(),
op.getOutput(0)->dataType()
});
const auto impl = Registrar<AbsImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(
impl.forward(
op.getInput(0)->size(),
op.getInput(0)->getImpl()->rawPtr(),
op.getOutput(0)->getImpl()->rawPtr()
);
}
template <>
void Aidge::AbsImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Abs_Op on backend cpu");
}
@@ -16,64 +16,57 @@
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/AddImpl_kernels.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Types.h"
#include "aidge/utils/ErrorHandling.hpp"
template <>
void Aidge::AddImpl_cpu::forward() {
const auto& opTensor = static_cast<const OperatorTensor&>(mOp);
AIDGE_ASSERT(opTensor.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
assert(opTensor.getInput(0) && "missing input in Add operator");
DataType datatypeFirstInput = opTensor.getInput(0)->dataType();
for (IOIndex_t i = 1; i < opTensor.nbInputs(); ++i) {
AIDGE_ASSERT(opTensor.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
assert(opTensor.getInput(i) && "missing input in Add operator");
assert(opTensor.getInput(i)->dataType() == datatypeFirstInput);
const Add_Op& op = static_cast<const Add_Op&>(mOp);
// Check inputs
AIDGE_ASSERT(op.getInput(0), "missing input in Add operator");
AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
DataType datatypeFirstInput = op.getInput(0)->dataType();
for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
AIDGE_ASSERT(op.getInput(i), "missing input in Add operator");
AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two different data types.");
}
// Find the correct kernel type
const auto outputDataType = opTensor.getOutput(0)->dataType();
const Registrar<AddImplForward_cpu>::registrar_key registrarKey = {
datatypeFirstInput,
outputDataType};
Registrar<AddImplForward_cpu>::registrar_type kernelFunc;
if (Registrar<AddImplForward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<AddImplForward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<AddImplForward_cpu>::create({
outputDataType, outputDataType});
}
const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
const std::size_t nbDims = opTensor.getOutput(0)->nbDims();
const std::size_t nbDims = op.getOutput(0)->nbDims();
std::vector<std::vector<std::size_t>> inputsDims;
std::vector<const void*> opInputs;
std::vector<std::shared_ptr<Tensor>> inputsFallback(opTensor.nbInputs());
for (IOIndex_t i = 0; i < opTensor.nbInputs(); ++i) {
std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs());
for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
std::vector<std::size_t> inputDims(nbDims, 1);
auto dims = opTensor.getInput(i)->dims();
auto dims = op.getInput(i)->dims();
for(std::size_t j=dims.size()-1; j+1>0; --j)
{
std::size_t idx = nbDims - (dims.size()-j);
inputDims[idx] = dims[j];
}
inputsDims.push_back(inputDims);
const auto& input = opTensor.getInput(i)->refCastFrom(inputsFallback[i], *opTensor.getOutput(0));
const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0));
opInputs.push_back(input.getImpl()->rawPtr());
}
kernelFunc(opInputs,
impl.forward(opInputs,
inputsDims,
opTensor.getOutput(0)->size(),
opTensor.getOutput(0)->dims(),
getCPUPtr(opTensor.getRawOutput(0)));
op.getOutput(0)->size(),
op.getOutput(0)->dims(),
getCPUPtr(op.getRawOutput(0)));
}
template <>
void Aidge::AddImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Add_Op on backend cpu");
}
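
In AddImpl_cpu::forward() above, each input shape is right-aligned to the output rank by padding with leading 1s, so the kernel can broadcast over the missing leading dimensions. A standalone sketch of that alignment step (alignDims is a hypothetical helper, not part of Aidge; it assumes dims.size() <= nbDims):

#include <cstddef>
#include <vector>

// Right-align dims to nbDims by padding with leading 1s, mirroring the loop in
// AddImpl_cpu::forward() above.
std::vector<std::size_t> alignDims(const std::vector<std::size_t>& dims, std::size_t nbDims) {
    std::vector<std::size_t> aligned(nbDims, 1);
    for (std::size_t j = dims.size(); j-- > 0; ) {
        aligned[nbDims - (dims.size() - j)] = dims[j]; // copy dimensions from the right
    }
    return aligned;
}
// e.g. alignDims({3, 4}, 4) yields {1, 1, 3, 4}
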
@@ -21,25 +21,29 @@
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/backend/cpu/operator/AndImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/AndImpl_kernels.hpp"
template <>
void Aidge::AndImpl_cpu::forward() {
// Find the correct kernel type
auto kernelFunc = Registrar<AndImplForward_cpu>::create({
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
// Find the correct kernel type
const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(inputDims0,
impl.forward(inputDims0,
inputDims1,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::AndImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for And_Op on backend cpu");
}
@@ -16,19 +16,24 @@
#include "aidge/utils/Types.h"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/ArgMaxImpl_kernels.hpp"
template <>
void Aidge::ArgMaxImpl_cpu::forward() {
const ArgMax_Op& op_ = dynamic_cast<const ArgMax_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<ArgMaxImplForward_cpu>::create({
op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()});
const auto impl = Registrar<ArgMaxImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.axis(),
impl.forward(op_.axis(),
op_.selectLastIndex(),
op_.getInput(0)->dims(),
op_.getInput(0)->getImpl()->rawPtr(),
op_.getOutput(0)->getImpl()->rawPtr());
}
template <>
void Aidge::ArgMaxImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ArgMax_Op on backend cpu");
}
@@ -16,24 +16,29 @@
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/AvgPoolingImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/AvgPooling.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::AvgPoolingImpl2D_cpu::forward() {
const auto& op_ = dynamic_cast<const AvgPooling_Op<2>&>(mOp);
assert(op_.getInput(0) && "missing input #0");
// Find the correct kernel type
auto kernelFunc = Registrar<AvgPoolingImpl2DForward_cpu>::create(
{op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()});
const auto impl = Registrar<AvgPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.strideDims(),
impl.forward(op_.strideDims(),
op_.kernelDims(),
op_.getInput(0)->template dims<4>(),
getCPUPtr(op_.getInput(0)),
getCPUPtr(op_.getOutput(0)));
}
template <>
void Aidge::AvgPoolingImpl2D_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for AvgPooling_Op<2> on backend cpu");
}
@@ -19,8 +19,9 @@
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/operator/BatchNorm.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp"
template <>
void Aidge::BatchNormImpl2D_cpu::forward() {
const auto& op_ = dynamic_cast<const BatchNorm_Op<2>&>(mOp);
AIDGE_ASSERT(op_.getInput(0), "missing input #0 for BatchNorm Operator");
@@ -30,14 +31,12 @@ void Aidge::BatchNormImpl2D_cpu::forward() {
AIDGE_ASSERT(op_.getInput(4), "missing input #4 for BatchNorm Operator");
AIDGE_ASSERT(op_.getOutput(0)->nbDims() == 4, "");
// Find the correct kernel type
auto kernelFunc =
Registrar<BatchNormImpl2DForward_cpu>::create({op_.getInput(0)->dataType(),
op_.getInput(1)->dataType(),
op_.getOutput(0)->dataType()});
const auto impl = Registrar<BatchNormImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.epsilon(),
impl.forward(op_.epsilon(),
op_.momentum(),
op_.getInput(0)->template dims<4>(),
getCPUPtr(op_.getRawInput(0)),
@@ -48,3 +47,8 @@ void Aidge::BatchNormImpl2D_cpu::forward() {
getCPUPtr(op_.getRawOutput(0)),
true);
}
template <>
void Aidge::BatchNormImpl2D_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for BatchNorm_Op<2> on backend cpu");
}
@@ -15,12 +15,13 @@
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ConvDepthWise.hpp"
#include "aidge/utils/Log.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
const auto& op_ = dynamic_cast<const ConvDepthWise_Op<1>&>(mOp);
@@ -30,23 +31,7 @@ void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
AIDGE_ASSERT((op_.getInput(0)->nbDims() == 3), "support for 4-dimensions tensors only");
// Find the correct kernel type
const auto outputDataType = op_.getOutput(0)->dataType();
const Registrar<ConvDepthWiseImpl1DForward_cpu>::registrar_key registrarKey = {
op_.getInput(0)->dataType(),
op_.getInput(1)->dataType(),
((op_.getInput(2)) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
outputDataType};
Registrar<ConvDepthWiseImpl1DForward_cpu>::registrar_type kernelFunc;
if (Registrar<ConvDepthWiseImpl1DForward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<ConvDepthWiseImpl1DForward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<ConvDepthWiseImpl1DForward_cpu>::create({
outputDataType, outputDataType, outputDataType, outputDataType});
}
const auto impl = Registrar<ConvDepthWiseImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -58,7 +43,7 @@ void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
// Call kernel
kernelFunc(op_.strideDims(),
impl.forward(op_.strideDims(),
op_.dilationDims(),
op_.kernelDims(), // Conv attributes
op_.getInput(0)->template dims<3>(), // input dimensions
@@ -69,6 +54,12 @@ void Aidge::ConvDepthWiseImpl1D_cpu::forward() {
);
}
template <>
void Aidge::ConvDepthWiseImpl1D_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ConvDepthWise_Op<1> on backend cpu");
}
template <>
void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
const auto& op_ = dynamic_cast<const ConvDepthWise_Op<2>&>(mOp);
@@ -79,11 +70,7 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
AIDGE_ASSERT((op_.getInput(0)->nbDims() == 4), "support for 4-dimensions tensors only");
// Find the correct kernel type
auto kernelFunc = Registrar<ConvDepthWiseImpl2DForward_cpu>::create(
{op_.getInput(0)->dataType(),
op_.getInput(1)->dataType(),
op_.getInput(2)->dataType(),
op_.getOutput(0)->dataType()});
const auto impl = Registrar<ConvDepthWiseImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -95,7 +82,7 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
const auto& input2 = op_.getInput(2) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
// Call kernel
kernelFunc(op_.strideDims(),
impl.forward(op_.strideDims(),
op_.dilationDims(),
op_.kernelDims(),
op_.getInput(0)->template dims<4>(),
@@ -104,3 +91,8 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr,
getCPUPtr(op_.getRawOutput(0)));
}
template <>
void Aidge::ConvDepthWiseImpl2D_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ConvDepthWise_Op<2> on backend cpu");
}
@@ -30,6 +30,7 @@ void Aidge::ConvImpl1D_cpu::forward() {
AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
// Find the correct kernel type
const auto impl = Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
@@ -67,6 +68,7 @@ void Aidge::ConvImpl2D_cpu::forward() {
AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
// Find the correct kernel type
const auto impl = Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
......
@@ -15,10 +15,11 @@
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
#include "aidge/backend/cpu/operator/DivImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/DivImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::DivImpl_cpu::forward() {
// Find the correct kernel type
// auto kernelFunc = Registrar<DivImplForward_cpu>::create({
@@ -55,10 +56,7 @@ void Aidge::DivImpl_cpu::forward() {
const auto& opTensor = static_cast<const Div_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<DivImplForward_cpu>::create({
opTensor.getInput(0)->dataType(),
opTensor.getInput(1)->dataType(),
opTensor.getOutput(0)->dataType()});
const auto impl = Registrar<DivImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Compute compatible input dimensions
std::vector<std::size_t> dims0 = opTensor.getInput(0)->dims();
@@ -68,7 +66,7 @@ void Aidge::DivImpl_cpu::forward() {
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
kernelFunc(input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
impl.forward(input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
@@ -134,7 +132,7 @@ void Aidge::DivImpl_cpu::forward() {
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
kernelFunc(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
impl.forward(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size),
getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size),
getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size));
@@ -151,3 +149,8 @@ void Aidge::DivImpl_cpu::forward() {
}
}
}
template <>
void Aidge::DivImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Div_Op on backend cpu");
}
@@ -14,24 +14,27 @@
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/ErfImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/ErfImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Erf.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::ErfImpl_cpu::forward() {
const Erf_Op& op = static_cast<const Erf_Op&>(mOp);
// Find the correct kernel type
auto kernelFunc = Registrar<ErfImplForward_cpu>::create({
op.getInput(0)->dataType(),
op.getOutput(0)->dataType()
});
const auto impl = Registrar<ErfImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(
impl.forward(
op.getInput(0)->size(),
op.getInput(0)->getImpl()->rawPtr(),
op.getOutput(0)->getImpl()->rawPtr()
);
}
template <>
void Aidge::ErfImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Erf_Op on backend cpu");
}
@@ -17,37 +17,20 @@
#include <tuple>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/FCImpl_backward_kernels.hpp"
#include "aidge/backend/cpu/operator/FCImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/FCImpl_kernels.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::FCImpl_cpu::forward()
{
const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
AIDGE_ASSERT(op_.getInput(0), "missing input #0");
AIDGE_ASSERT(op_.getInput(1), "missing input #1");
// Find the correct kernel type
const auto outputDataType = op_.getOutput(0)->dataType();
const Registrar<FCImplForward_cpu>::registrar_key registrarKey = {
op_.getInput(0)->dataType(),
op_.getInput(1)->dataType(),
((op_.getInput(2)) ? op_.getInput(2)->dataType() : op_.getInput(1)->dataType()),
outputDataType};
Registrar<FCImplForward_cpu>::registrar_type kernelFunc;
if (Registrar<FCImplForward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<FCImplForward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<FCImplForward_cpu>::create({
outputDataType, outputDataType, outputDataType, outputDataType});
}
const auto impl = Registrar<FCImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -60,7 +43,7 @@ void Aidge::FCImpl_cpu::forward()
// Call kernel
const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
kernelFunc(batchSize,
impl.forward(batchSize,
input1.dims()[1], // nb input features
input1.dims()[0], // nb output features
input0.getImpl()->rawPtr(),
@@ -69,6 +52,7 @@ void Aidge::FCImpl_cpu::forward()
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::FCImpl_cpu::backward()
{
const FC_Op& op_ = dynamic_cast<const FC_Op&>(mOp);
@@ -77,23 +61,7 @@ void Aidge::FCImpl_cpu::backward()
AIDGE_ASSERT(op_.getInput(0)->grad(), "missing input #0 gradient");
AIDGE_ASSERT(op_.getInput(1)->grad(), "missing input #1 gradient");
// Find the correct kernel type
const Registrar<FCImplBackward_cpu>::registrar_key registrarKey = {
fc_grad->dataType(),
op_.getInput(1)->grad()->dataType(),
(op_.getInput(2)) ? op_.getInput(2)->grad()->dataType() : op_.getInput(1)->grad()->dataType(),
op_.getInput(0)->grad()->dataType()};
Registrar<FCImplBackward_cpu>::registrar_type kernelFunc;
if (Registrar<FCImplBackward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<FCImplBackward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<FCImplBackward_cpu>::create({
fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType(), fc_grad->dataType()});
}
const auto impl = Registrar<FCImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
@@ -106,7 +74,7 @@ void Aidge::FCImpl_cpu::backward()
// Call kernel
const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
kernelFunc(batchSize,
impl.backward(batchSize,
input1grad.dims()[1], // nb input features
input1grad.dims()[0], // nb output features
getCPUPtr(fc_grad),
......
@@ -20,18 +20,18 @@
#include "aidge/operator/Conv.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/backend/cpu/operator/FoldImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/FoldImpl_kernels.hpp"
template <>
void Aidge::FoldImpl2D_cpu::forward() {
const auto& op_ = static_cast<const Fold_Op<2>&>(mOp);
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
// Find the correct kernel type
auto kernelFunc =
Registrar<FoldImpl2DForward_cpu>::create({std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const auto impl = Registrar<FoldImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
const auto& op_ = static_cast<const Fold_Op<2>&>(mOp);
kernelFunc(op_.outputDims(),
impl.forward(op_.outputDims(),
op_.strideDims(),
op_.dilationDims(),
op_.kernelDims(),
@@ -39,3 +39,8 @@ void Aidge::FoldImpl2D_cpu::forward() {
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::FoldImpl2D_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Fold_Op<2> on backend cpu");
}
@@ -15,7 +15,7 @@
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
@@ -24,18 +24,23 @@
#include "aidge/utils/Types.h"
template <>
void Aidge::GlobalAveragePoolingImpl_cpu::forward()
{
const GlobalAveragePooling_Op& op_ = static_cast<const GlobalAveragePooling_Op&>(mOp);
// Check if input is provided
AIDGE_ASSERT(op_.getInput(0), "missing input 0");
// Create the forward kernal with the wanted types
auto kernelFunc = Registrar<GlobalAveragePoolingImplForward_cpu>::create({op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()});
// Find the correct kernel type
const auto impl = Registrar<GlobalAveragePoolingImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.getInput(0)->dims(),
impl.forward(op_.getInput(0)->dims(),
op_.getInput(0)->getImpl()->rawPtr(),
op_.getOutput(0)->getImpl()->rawPtr());
}
\ No newline at end of file
}
template <>
void Aidge::GlobalAveragePoolingImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for GlobalAveragePooling_Op on backend cpu");
}
@@ -14,14 +14,14 @@
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl_backward_kernels.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/LeakyReLU.hpp"
#include "aidge/utils/Log.hpp"
#include "aidge/utils/Types.h"
#include "aidge/utils/Registrar.hpp"
template <>
void Aidge::LeakyReLUImpl_cpu::forward() {
const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
@@ -30,17 +30,16 @@ void Aidge::LeakyReLUImpl_cpu::forward() {
AIDGE_ASSERT(in0, "missing input #0");
// Find the correct kernel type
auto kernelFunc = Registrar<LeakyReLUImplForward_cpu>::create({
in0->dataType(),
out0->dataType()});
const auto impl = Registrar<LeakyReLUImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.negativeSlope(),
impl.forward(op_.negativeSlope(),
in0->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::LeakyReLUImpl_cpu::backward() {
// reversing in and out Data for backprop
const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
@@ -49,12 +48,10 @@ void Aidge::LeakyReLUImpl_cpu::backward() {
AIDGE_ASSERT(in0, "missing input #0");
// Find the correct kernel type
auto kernelFunc = Registrar<LeakyReLUImplForward_cpu>::create({
in0->dataType(),
out0->dataType()});
const auto impl = Registrar<LeakyReLUImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.negativeSlope(),
impl.backward(op_.negativeSlope(),
in0->size(),
getCPUPtr(in0),
getCPUPtr(out0));
......
@@ -20,9 +20,9 @@
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
#include "aidge/backend/cpu/operator/LnImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/LnImpl_backward_kernels.hpp"
#include "aidge/backend/cpu/operator/LnImpl_kernels.hpp"
template <>
void Aidge::LnImpl_cpu::forward() {
const Ln_Op& op_ = static_cast<const Ln_Op&>(mOp);
std::shared_ptr<Tensor> in0 = op_.getInput(0);
@@ -30,16 +30,15 @@ void Aidge::LnImpl_cpu::forward() {
AIDGE_ASSERT(in0, "missing input #0");
// Find the correct kernel type
auto kernelFunc = Registrar<LnImplForward_cpu>::create({
in0->dataType(),
out0->dataType()});
const auto impl = Registrar<LnImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(in0->size(),
impl.forward(in0->size(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::LnImpl_cpu::backward() {
const Ln_Op& op_ = dynamic_cast<const Ln_Op&>(mOp);
std::shared_ptr<Tensor> in0 = op_.getInput(0);
@@ -49,12 +48,8 @@ void Aidge::LnImpl_cpu::backward() {
AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
// Find the correct kernel type
auto kernelFunc = Registrar<LnImplBackward_cpu>::create({
in0->dataType(),
gra_int0->dataType(),
gra_out0->dataType()
});
const auto impl = Registrar<LnImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
}
@@ -19,17 +19,16 @@
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl_kernels.hpp"
template <>
void Aidge::MatMulImpl_cpu::forward()
{
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0");
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) && "missing input #1");
// Find the correct kernel type
auto kernelFunc = Registrar<MatMulImplForward_cpu>::create(
{std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const auto impl = Registrar<MatMulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Compute compatible input dimensions
std::vector<std::size_t> dims0 = static_cast<const MatMul_Op&>(mOp).getInput(0)->dims();
@@ -91,7 +90,7 @@ void Aidge::MatMulImpl_cpu::forward()
const std::size_t matrix1Size = k*m;
const std::size_t matrixOutSize = n*m;
for (std::size_t stack = 0; stack < nbMatrices;) {
kernelFunc(n, k, m,
impl.forward(n, k, m,
getCPUPtr(mOp.getRawInput(0), offsetIn0*matrix0Size),
getCPUPtr(mOp.getRawInput(1), offsetIn1*matrix1Size),
getCPUPtr(mOp.getRawOutput(0), offsetOut*matrixOutSize));
@@ -126,3 +125,8 @@ void Aidge::MatMulImpl_cpu::forward()
// getCPUPtr(mOp.getRawInput(1)),
// getCPUPtr(mOp.getRawOutput(0)));
// }
template <>
void Aidge::MatMulImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MatMul_Op on backend cpu");
}
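
The MatMul loop above applies one 2-D matrix-product kernel per stacked matrix of the flattened batch, offsetting into the flat buffers between calls. A minimal self-contained sketch of that idea for the simple case where both inputs share the same batch layout (naive kernel and names are illustrative, not Aidge code):

#include <cstddef>
#include <iostream>
#include <vector>

// Naive (n x k) * (k x m) product, standing in for the registered kernel.
static void matmulKernel(std::size_t n, std::size_t k, std::size_t m,
                         const float* a, const float* b, float* c) {
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < m; ++j) {
            float sum = 0.f;
            for (std::size_t l = 0; l < k; ++l) sum += a[i*k + l] * b[l*m + j];
            c[i*m + j] = sum;
        }
}

int main() {
    const std::size_t nbMatrices = 2, n = 2, k = 3, m = 2;
    std::vector<float> a(nbMatrices * n * k, 1.f), b(nbMatrices * k * m, 2.f);
    std::vector<float> c(nbMatrices * n * m);
    // One kernel call per stacked matrix, advancing each pointer by one matrix
    // worth of elements, like the offset arithmetic in MatMulImpl_cpu::forward().
    for (std::size_t stack = 0; stack < nbMatrices; ++stack) {
        matmulKernel(n, k, m,
                     a.data() + stack * n * k,
                     b.data() + stack * k * m,
                     c.data() + stack * n * m);
    }
    std::cout << c[0] << '\n'; // 6 = sum over k of 1 * 2
}
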
@@ -14,26 +14,29 @@
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/MaxPoolingImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp"
#include "aidge/operator/MaxPooling.hpp"
#include "aidge/utils/Log.hpp"
#include "aidge/utils/Types.h"
template <>
void Aidge::MaxPoolingImpl2D_cpu::forward() {
const auto& op_ = dynamic_cast<const MaxPooling_Op<2>&>(mOp);
AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator.");
// Find the correct kernel type
auto kernelFunc = Registrar<MaxPoolingImpl2DForward_cpu>::create({
op_.getInput(0)->dataType(),
op_.getOutput(0)->dataType()
});
const auto impl = Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(op_.strideDims(),
impl.forward(op_.strideDims(),
op_.kernelDims(),
op_.ceilMode(),
op_.getInput(0)->template dims<4>(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::MaxPoolingImpl2D_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MaxPooling_Op<2> on backend cpu");
}
@@ -21,25 +21,28 @@
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp"
template <>
void Aidge::MulImpl_cpu::forward() {
// Find the correct kernel type
auto kernelFunc = Registrar<MulImplForward_cpu>::create({
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
// Find the correct kernel type
const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
kernelFunc(inputDims0,
impl.forward(inputDims0,
inputDims1,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
}
template <>
void Aidge::MulImpl_cpu::backward() {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Mul_Op on backend cpu");
}