diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp
index e480697b6452440f043901140a07cb643f3cbdb6..1e4bcd1b0a498e8359e2c79519d462d43e416ce4 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp
@@ -68,9 +68,34 @@ using ConvImpl2D_cpu = OperatorImpl_cpu<Conv2D_Op,
                                              void *,
                                              void *)>;
 
+using Conv3D_Op = Conv_Op<3>;
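+// CPU kernel signatures for Conv3D (see ConvImpl_kernels.hpp):
+//   forward : (strideDims, dilationDims, kernelDims, inputDims<5>,
+//              outputDims<5>, input, weights, biases, output)
+//   backward: same attributes and dims, followed by
+//             (input, weights, outputGrad, inputGrad, weightsGrad, biasesGrad)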
+using ConvImpl3D_cpu = OperatorImpl_cpu<Conv3D_Op,
+                                        void(const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 5> &,
+                                             const std::array<DimSize_t, 5> &,
+                                             const void *,
+                                             const void *,
+                                             const void *,
+                                             void *),
+                                        void(const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 5> &,
+                                             const std::array<DimSize_t, 5> &,
+                                             const void *,
+                                             const void *,
+                                             const void *,
+                                             void *,
+                                             void *,
+                                             void *)>;
+
 // Implementation entry point registration to Operator
 REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
 REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
+REGISTRAR(Conv3D_Op, "cpu", Aidge::ConvImpl3D_cpu::create);
+
 } // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index d2b942f6b6f72235f5d079c0fbb402b1b4ed1373..e764eecfd6746585a7526b8dc7c2a7295c242285 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -12,7 +12,9 @@
 #ifndef AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
 #define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
 
+#include <algorithm>
 #include <array>
+#include <cstddef>
 #include <cstdint>
 
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
@@ -234,7 +236,7 @@ static void conv1DBackwardWeights(const array<DimSize_t, 1> &stride,
                 for (DimSize_t kX = 0; kX < kDim[0]; ++kX) {
 
                     for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
-                        const DimSize_t iX = oX * stride[0] + kX * dilation[0] ;
+                        const DimSize_t iX = oX * stride[0] + kX * dilation[0];
 
                         weightsGrad[kOffsets[1] + kX] +=
                             input[iOffsets[1] + iX] * oGrad[oOffsets[1] + oX];
@@ -315,9 +317,9 @@ static void conv1DBackwardBias(const array<DimSize_t, 3> &oDims,
  * @param[inout] biasesGrad_ gradients of the kernel biases
  */
 template <class I, class W, class B, class O>
-void ConvImpl1D_cpu_backward_kernel(const array<DimSize_t,1> &stride,
-                                    const array<DimSize_t,1> &dilation,
-                                    const array<DimSize_t,1> &kernelDim,
+void ConvImpl1D_cpu_backward_kernel(const array<DimSize_t, 1> &stride,
+                                    const array<DimSize_t, 1> &dilation,
+                                    const array<DimSize_t, 1> &kernelDim,
                                     const array<DimSize_t, 3> &inputDims,
                                     const array<DimSize_t, 3> &outputDims,
                                     const void *input_,
@@ -1030,6 +1032,585 @@ REGISTRAR(ConvImpl2D_cpu,
                                           std::int32_t,
                                           std::int32_t,
                                           std::int32_t>});
+
+/**
+ * @brief Forward kernel for 3D Convolution on CPU backend.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param strideDims stride dimensions
+ * @param dilationDims dilation dimensions
+ * @param kDims kernel dimensions
+ * @param iDims input dimensions.
+ * @param oDims output dimensions.
+ * @param input_ const input Tensor.
+ * @param weights_ const weight Tensor.
+ * @param biases_ const bias Tensor.
+ * @param output_ Output Tensor.
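+ * @note padding is not handled by this kernel; output spatial dimensions are
+ * assumed to satisfy
+ * oDims[2 + i] = (iDims[2 + i] - dilationDims[i] * (kDims[i] - 1) - 1)
+ *                / strideDims[i] + 1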
+ */
+template <class I, class W, class B, class O>
+void ConvImpl3D_cpu_forward_kernel(const array<DimSize_t, 3> &strideDims,
+                                   const array<DimSize_t, 3> &dilationDims,
+                                   const array<DimSize_t, 3> &kDims,
+                                   const array<DimSize_t, 5> &iDims,
+                                   const array<DimSize_t, 5> &oDims,
+                                   const void *input_,
+                                   const void *weights_,
+                                   const void *biases_,
+                                   void *output_) {
+
+    ////////////////////////////////////////////////////////////////////////
+    // TENSOR CASTING
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    const B *biases = static_cast<const B *>(biases_);
+    O *output = static_cast<O *>(output_);
+
+    ////////////////////////////////////////////////////////////////////////
+    // strides
+    // for each tensor, entry i is the number of elements to skip to move one
+    // step along dimension i ({batch, channel, dim0, dim1} strides; the last
+    // spatial dimension is contiguous)
+    const array<DimSize_t, 4> iStride{
+        iDims[1] * iDims[2] * iDims[3] * iDims[4],
+        iDims[2] * iDims[3] * iDims[4],
+        iDims[3] * iDims[4],
+        iDims[4]};
+    const array<DimSize_t, 4> oStride{
+        oDims[1] * oDims[2] * oDims[3] * oDims[4],
+        oDims[2] * oDims[3] * oDims[4],
+        oDims[3] * oDims[4],
+        oDims[4]};
+    const array<DimSize_t, 4> kStride{
+        iDims[1] * kDims[0] * kDims[1] * kDims[2],
+        kDims[0] * kDims[1] * kDims[2],
+        kDims[1] * kDims[2],
+        kDims[2]};
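+    // e.g. for iDims = {N, C, D0, D1, D2} = {1, 2, 3, 4, 5}:
+    // iStride = {2 * 3 * 4 * 5, 3 * 4 * 5, 4 * 5, 5} = {120, 60, 20, 5}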
+
+    ////////////////////////////////////////////////////////////////////////
+    // index offsets
+    // NOTE:
+    // in/out dims = {batch, in/outChannels,
+    // in/outDims[0], in/outDims[1], in/outDims[2]}
+    array<DimSize_t, 4> iOffset{0, 0, 0, 0};
+    array<DimSize_t, 4> oOffset{0, 0, 0, 0};
+    // NOTE:
+    // kernel dims = {outChannels, inChannels, kernelDims[0],
+    //                kernelDims[1], kernelDims[2]}
+    array<DimSize_t, 4> kOffset{0, 0, 0, 0};
+    array<DimSize_t, 2> kDilOffset{0, 0};
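+    // e.g. oOffset[3] always points to output[batch][oChannel][oX][oY][0]
+    // for the loop indices currently being processed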
+
+    ////////////////////////////////////////////////////////////////////////
+    // COMPUTATION
+    for (DimSize_t batch = 0; batch < iDims[0];
+         ++batch, oOffset[0] += oStride[0], iOffset[0] += iStride[0]) {
+
+        oOffset[1] = oOffset[0];
+        kOffset[0] = 0;
+        for (DimSize_t oChannel = 0; oChannel < oDims[1];
+             ++oChannel, oOffset[1] += oStride[1], kOffset[0] += kStride[0]) {
+
+            // Filling given channel with corresponding bias value
+            if (biases != nullptr) {
+                B biasVal = biases[oChannel];
+                std::fill(output + oOffset[1],
+                          output + oOffset[1] + oStride[1],
+                          biasVal);
+            }
+
+            iOffset[1] = iOffset[0];
+            kOffset[1] = kOffset[0];
+            for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel,
+                           iOffset[1] += iStride[1],
+                           kOffset[1] += kStride[1]) {
+
+                iOffset[2] = iOffset[1];
+                oOffset[2] = oOffset[1];
+                for (DimSize_t oX = 0; oX < oDims[2]; ++oX,
+                               iOffset[2] += strideDims[0] * iStride[2],
+                               oOffset[2] += oStride[2]) {
+
+                    iOffset[3] = iOffset[2];
+                    oOffset[3] = oOffset[2];
+                    for (DimSize_t oY = 0; oY < oDims[3]; ++oY,
+                                   iOffset[3] += strideDims[1] * iStride[3],
+                                   oOffset[3] += oStride[3]) {
+
+                        for (DimSize_t iZ = 0, oZ = 0; oZ < oDims[4];
+                             ++oZ, iZ += strideDims[2]) {
+                            auto oIdx = oOffset[3] + oZ;
+                            auto iIdx = iOffset[3] + iZ;
+
+                            kOffset[2] = kOffset[1];
+                            kDilOffset[0] = 0;
+                            for (DimSize_t kX = 0; kX < kDims[0]; ++kX,
+                                           kOffset[2] += kStride[2],
+                                           kDilOffset[0] += dilationDims[0] *
+                                                                  iStride[2]) {
+
+                                kOffset[3] = kOffset[2];
+                                kDilOffset[1] = kDilOffset[0];
+                                for (DimSize_t kY = 0; kY < kDims[1];
+                                     ++kY,
+                                               kOffset[3] += kStride[3],
+                                               kDilOffset[1] +=
+                                               dilationDims[1] * iStride[3]) {
+
+                                    for (DimSize_t kZ = 0; kZ < kDims[2];
+                                         ++kZ) {
+                                        output[oIdx] +=
+                                            weights[kOffset[3] + kZ] *
+                                            input[iIdx + kDilOffset[1] +
+                                                  kZ * dilationDims[2]];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/**
+ * @brief perform backpropagation for the input
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * for i in 0..input_size:
+ *  for n in 0..output_size:
+ *    dL     dYn   dL
+ *   ---- = ---- -----
+ *    dXi    dXi   dYn
+ * with : dYn / dXi = w_k
+ * i.e. for each output gradient value, scatter it over every input
+ * value covered by the (dilated) kernel, weighted by the corresponding
+ * kernel value.
+ * @note reminder that kernel dimensions are
+ * {outChannels, inChannels, kernelDims[0], kernelDims[1], kernelDims[2]}
+ * <=> {oDims[1], iDims[1], kDims[0], kDims[1], kDims[2]}
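+ * @note per spatial dimension, the gradient of output element oX is
+ * scattered to input positions oX * stride[0] + kX * dilation[0],
+ * kX in [0, kDims[0]) (and similarly for the Y and Z dimensions).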
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride parameter of the convolution operator
+ * @param[in] dilation dilation parameter of the convolution operator
+ * @param[in] kDims dimension of the kernel
+ * @param[in] kStrides nb of elements contained per dimension of the kernel
+ * @param[in] weights weights values
+ * @param[in] oDims dimensions of the output
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * @param[in] oGrad output gradient
+ * @param[in] iDims input dimensions
+ * @param[in] iStrides nb of elements contained per dimension of the input
+ * @param[inout] iGrad gradients of the input to update
+ */
+template <class I, class W, class O>
+void conv3DBackwardInput(const array<DimSize_t, 3> &stride,
+                         const array<DimSize_t, 3> &dilation,
+                         const array<DimSize_t, 3> &kDims,
+                         const array<DimSize_t, 4> &kStrides,
+                         const W *weights,
+                         const array<DimSize_t, 5> &oDims,
+                         const array<DimSize_t, 4> &oStrides,
+                         const O *oGrad,
+                         const array<DimSize_t, 5> &iDims,
+                         const array<DimSize_t, 4> &iStrides,
+                         I *iGrad) {
+    // Running index offsets for every strided dimension (all dimensions
+    // except the last) of each parsed tensor. They serve as checkpoints to
+    // avoid recomputing indexes at every iteration.
+    array<DimSize_t, 4> iOffset{};
+    array<DimSize_t, 4> oOffset{};
+    array<DimSize_t, 4> kOffset{};
+    array<DimSize_t, 2> iDilkernelOffset{}; // input offset for dilated kernel
+
+    for (DimSize_t batch = 0; batch < iDims[0];
+         ++batch, iOffset[0] += iStrides[0], oOffset[0] += oStrides[0]) {
+
+        kOffset[0] = 0;
+        oOffset[1] = oOffset[0];
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel,
+                       oOffset[1] += oStrides[1],
+                       kOffset[0] += kStrides[0]) {
+
+            iOffset[1] = iOffset[0];
+            kOffset[1] = kOffset[0];
+            for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel,
+                           iOffset[1] += iStrides[1],
+                           kOffset[1] += kStrides[1]) {
+
+                oOffset[2] = oOffset[1];
+                iOffset[2] = iOffset[1];
+                for (DimSize_t oX = 0; oX < oDims[2]; ++oX,
+                               oOffset[2] += oStrides[2],
+                               iOffset[2] += stride[0] * iStrides[2]) {
+
+                    oOffset[3] = oOffset[2];
+                    iOffset[3] = iOffset[2];
+                    for (DimSize_t oY = 0; oY < oDims[3]; ++oY,
+                                   oOffset[3] += oStrides[3],
+                                   iOffset[3] += stride[1] * iStrides[3]) {
+
+                        DimSize_t iZ = 0;
+                        for (DimSize_t oZ = 0; oZ < oDims[4];
+                             ++oZ, iZ += stride[2]) {
+                            auto oIdx = oOffset[3] + oZ;
+                            auto iIdx = iOffset[3] + iZ;
+
+                            iDilkernelOffset[0] = 0;
+                            kOffset[2] = kOffset[1];
+                            for (DimSize_t kX = 0; kX < kDims[0]; ++kX,
+                                           iDilkernelOffset[0] += dilation[0] *
+                                                                  iStrides[2],
+                                           kOffset[2] += kStrides[2]) {
+
+                                kOffset[3] = kOffset[2];
+                                iDilkernelOffset[1] = iDilkernelOffset[0];
+                                for (DimSize_t kY = 0; kY < kDims[1];
+                                     ++kY,
+                                               kOffset[3] += kStrides[3],
+                                               iDilkernelOffset[1] +=
+                                               dilation[1] * iStrides[3]) {
+
+                                    for (DimSize_t kZ = 0; kZ < kDims[2];
+                                         ++kZ) {
+
+                                        iGrad[iIdx + iDilkernelOffset[1] +
+                                              kZ * dilation[2]] +=
+                                            weights[kOffset[3] + kZ] *
+                                            oGrad[oIdx];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/**
+ * @brief computes weight backpropagation for conv3D operation
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * for i in 0..weight_size:
+ *  for n in 0..output_size:
+ *    dL     dYn   dL
+ *   ---- = ---- -----
+ *   dwi     dwi   dYn
+ * with : dYn / dwi = x_k
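+ * i.e. weightsGrad[oCh][iCh][kX][kY][kZ] accumulates the products
+ * input[b][iCh][oX*s0 + kX*d0][oY*s1 + kY*d1][oZ*s2 + kZ*d2] *
+ * oGrad[b][oCh][oX][oY][oZ] summed over b and all output positions,
+ * with s = stride and d = dilation.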
+ * @tparam I input dtype
+ * @tparam W weight dtype
+ * @tparam O output dtype
+ * @param[in] iDims input data dimensions
+ * @param[in] iStrides nb of elements contained per dimension of the input
+ * @param[in] input input data
+ * @param[in] oDims output data dimensions
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * @param[in] oGrad gradients of output data
+ * @param[in] kDims kernel dimensions (excluding In/OutChannels)
+ * @param[in] kStrides nb of elements contained per dimension of the kernel
+ * (including In/OutChannels)
+ * @param[in] stride attribute of the convolution operator
+ * @param[in] dilation attribute of the convolution operator
+ * @param[inout] weightsGrad gradients of the kernel weights
+ */
+template <class I, class W, class O>
+void conv3DBackwardWeights(const array<DimSize_t, 5> &iDims,
+                           const array<DimSize_t, 4> &iStrides,
+                           const I *input,
+                           const array<DimSize_t, 5> &oDims,
+                           const array<DimSize_t, 4> &oStrides,
+                           const O *oGrad,
+                           const array<DimSize_t, 3> &kDims,
+                           const array<DimSize_t, 4> &kStrides,
+                           const array<DimSize_t, 3> &stride,
+                           const array<DimSize_t, 3> &dilation,
+                           W *weightsGrad) {
+    // records index offsets for every strided dimension (all dimensions
+    // except the last) of each parsed tensor
+    array<DimSize_t, 4> iOffsets{0, 0, 0, 0};
+    array<DimSize_t, 4> oOffsets{0, 0, 0, 0};
+    array<DimSize_t, 4> kOffsets{0, 0, 0, 0};
+    array<DimSize_t, 3> iDilKernelOffsets{0, 0, 0};
+
+    for (DimSize_t batch = 0; batch < iDims[0]; ++batch) {
+        iOffsets[0] = batch * iStrides[0];
+        oOffsets[0] = batch * oStrides[0];
+
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) {
+            oOffsets[1] = oChannel * oStrides[1] + oOffsets[0];
+            kOffsets[0] = oChannel * kStrides[0];
+
+            for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) {
+                iOffsets[1] = iChannel * iStrides[1] + iOffsets[0];
+                kOffsets[1] = iChannel * kStrides[1] + kOffsets[0];
+
+                for (DimSize_t kX = 0; kX < kDims[0]; ++kX) {
+                    kOffsets[2] = kX * kStrides[2] + kOffsets[1];
+                    iDilKernelOffsets[0] = kX * dilation[0] * iStrides[2];
+
+                    for (DimSize_t kY = 0; kY < kDims[1]; ++kY) {
+                        kOffsets[3] = kY * kStrides[3] + kOffsets[2];
+                        iDilKernelOffsets[1] = kY * dilation[1] * iStrides[3] +
+                                               iDilKernelOffsets[0];
+
+                        for (DimSize_t kZ = 0; kZ < kDims[2]; ++kZ) {
+                            iDilKernelOffsets[2] =
+                                kZ * dilation[2] + iDilKernelOffsets[1];
+
+                            for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
+                                oOffsets[2] = oX * oStrides[2] + oOffsets[1];
+                                iOffsets[2] =
+                                    oX * stride[0] * iStrides[2] + iOffsets[1];
+
+                                for (DimSize_t oY = 0; oY < oDims[3]; ++oY) {
+                                    oOffsets[3] =
+                                        oY * oStrides[3] + oOffsets[2];
+                                    iOffsets[3] =
+                                        oY * stride[1] * iStrides[3] +
+                                        iOffsets[2];
+
+                                    for (DimSize_t oZ = 0, iZ = 0;
+                                         oZ < oDims[4];
+                                         ++oZ) {
+
+                                        weightsGrad[kOffsets[3] + kZ] +=
+                                            input[iOffsets[3] + iZ +
+                                                  iDilKernelOffsets[2]] *
+                                            oGrad[oOffsets[3] + oZ];
+                                        iZ += stride[2];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/**
+ * @brief computes bias backpropagation for conv3D operation
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * Bias grad:
+ * for i in 0..bias_size:
+ *  for n in 0..output_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *   dbi     dbi  Yn
+ * with : dYn / dbi = 1
+ *
+ * Hence the partial derivative of the loss wrt each bias is the
+ * corresponding output gradient, and the bias gradient is the sum of
+ * the output gradients over the batch and spatial dimensions.
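+ * i.e. biasesGrad[c] = sum of oGrad[b][c][x][y][z] over b, x, y and z.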
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] oDims output tensor dimensions
+ * @param[in] oStrides nb of elements contained per dimension of the
+ * output
+ * @param[in] oGrad output tensor gradients
+ * @param[inout] biasesGrad biases gradients
+ */
+template <class B, class O>
+static void conv3DBackwardBias(const array<DimSize_t, 5> &oDims,
+                               const array<DimSize_t, 4> &oStrides,
+                               const O *oGrad,
+                               B *biasesGrad) {
+    // records all index offsets for output tensor
+    array<DimSize_t, 4> oOffsets{0, 0, 0, 0};
+    for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) {
+        oOffsets[0] = batchIdx * oStrides[0];
+
+        oOffsets[1] = oOffsets[0];
+        for (DimSize_t oChannel = 0; oChannel < oDims[1];
+             ++oChannel, oOffsets[1] += oStrides[1]) {
+
+            oOffsets[2] = oOffsets[1];
+            for (DimSize_t oX = 0; oX < oDims[2];
+                 ++oX, oOffsets[2] += oStrides[2]) {
+
+                oOffsets[3] = oOffsets[2];
+                for (DimSize_t oY = 0; oY < oDims[3];
+                     ++oY, oOffsets[3] += oStrides[3]) {
+                    for (DimSize_t oZ = 0; oZ < oDims[4]; ++oZ) {
+                        biasesGrad[oChannel] += oGrad[oOffsets[3] + oZ];
+                    }
+                }
+            }
+        }
+    }
+}
+
+/**
+ * @brief Backward kernel for 3D Convolution on CPU backend.
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ *
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride attribute of the conv operator
+ * @param[in] dilation dilation attribute of the conv operator
+ * @param[in] kernelDims kernel dimensions
+ * @param[in] inputDims input data dimensions
+ * @param[in] outputDims output data dimensions
+ * @param[in] input_ input tensor.
+ * @param[in] weights_ kernel tensor.
+ * @param[in] oGrad_ output tensor gradient.
+ * @param[inout] iGrad_ input tensor gradient.
+ * @param[inout] weightsGrad_ kernel weights tensor gradients
+ * @param[inout] biasesGrad_ kernel biases tensor gradients
+ */
+template <class I, class W, class B, class O>
+void ConvImpl3D_cpu_backward_kernel(const array<DimSize_t, 3> &stride,
+                                    const array<DimSize_t, 3> &dilation,
+                                    const array<DimSize_t, 3> &kernelDims,
+                                    const array<DimSize_t, 5> &inputDims,
+                                    const array<DimSize_t, 5> &outputDims,
+                                    const void *input_,
+                                    const void *weights_,
+                                    const void *oGrad_,
+                                    void *iGrad_,
+                                    void *weightsGrad_,
+                                    void *biasesGrad_) {
+
+    const I *input = static_cast<const I *>(input_);
+    I *iGrad = static_cast<I *>(iGrad_);
+    const O *outputGrad = static_cast<const O *>(oGrad_);
+    const W *weights = static_cast<const W *>(weights_);
+    W *weightsGrad = static_cast<W *>(weightsGrad_);
+
+    //////////////////////////////
+    // COMPUTING STRIDES
+    //////////////////////////////
+    // NOTE: the ...Strides arrays hold the number of values contained in
+    // each dimension; they are used to compute index offsets while
+    // iterating over each tensor.
+    // NOTE: they have one entry fewer than their corresponding tensor has
+    // dimensions, as the total number of elements is only needed for
+    // gradient initialization.
+
+    // {batch_stride, channel_stride, dim0_stride, dim1_stride}
+    const array<DimSize_t, 4> inputStrides{
+        inputDims[1] * inputDims[2] * inputDims[3] * inputDims[4],
+        inputDims[2] * inputDims[3] * inputDims[4],
+        inputDims[3] * inputDims[4],
+        inputDims[4]};
+    const DimSize_t nbEltsInput = inputDims[0] * inputStrides[0];
+
+    // {batch_stride, channel_stride, dim0_stride, dim1_stride}
+    const array<DimSize_t, 4> outputStrides{
+        outputDims[1] * outputDims[2] * outputDims[3] * outputDims[4],
+        outputDims[2] * outputDims[3] * outputDims[4],
+        outputDims[3] * outputDims[4],
+        outputDims[4]};
+
+    // NOTE: kernel dims = {outChannels, inChannels, kernelDim0, kernelDim1,
+    //                      kernelDim2}
+    // kernel_strides = {outChannel_stride, inChannel_stride,
+    //                   kernelDim0_stride, kernelDim1_stride}
+    const array<DimSize_t, 4> kernelStrides{
+        inputDims[1] * kernelDims[0] * kernelDims[1] * kernelDims[2],
+        kernelDims[0] * kernelDims[1] * kernelDims[2],
+        kernelDims[1] * kernelDims[2],
+        kernelDims[2]};
+
+    const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0];
+
+    ////////////////////////////
+    // prepping gradient arrays
+    std::fill(iGrad, iGrad + nbEltsInput, I(0));
+    std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0));
+
+    conv3DBackwardInput(stride,
+                        dilation,
+                        kernelDims,
+                        kernelStrides,
+                        weights,
+                        outputDims,
+                        outputStrides,
+                        outputGrad,
+                        inputDims,
+                        inputStrides,
+                        iGrad);
+
+    conv3DBackwardWeights(inputDims,
+                          inputStrides,
+                          input,
+                          outputDims,
+                          outputStrides,
+                          outputGrad,
+                          kernelDims,
+                          kernelStrides,
+                          stride,
+                          dilation,
+                          weightsGrad);
+
+    if (biasesGrad_ != nullptr) {
+        B *biasesGrad = static_cast<B *>(biasesGrad_);
+        std::fill(biasesGrad, biasesGrad + outputDims[1], B(0));
+        conv3DBackwardBias(outputDims, outputStrides, outputGrad, biasesGrad);
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ConvImpl3D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl3D_cpu_forward_kernel<float, float, float, float>,
+           ConvImpl3D_cpu_backward_kernel<float, float, float, float>});
+REGISTRAR(ConvImpl3D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float16, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl3D_cpu_forward_kernel<half_float::half,
+                                         half_float::half,
+                                         half_float::half,
+                                         half_float::half>,
+           ConvImpl3D_cpu_backward_kernel<half_float::half,
+                                          half_float::half,
+                                          half_float::half,
+                                          half_float::half>});
+REGISTRAR(ConvImpl3D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float64, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl3D_cpu_forward_kernel<double, double, double, double>,
+           ConvImpl3D_cpu_backward_kernel<double, double, double, double>});
+REGISTRAR(ConvImpl3D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Int32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl3D_cpu_forward_kernel<std::int32_t,
+                                         std::int32_t,
+                                         std::int32_t,
+                                         std::int32_t>,
+           ConvImpl3D_cpu_backward_kernel<std::int32_t,
+                                          std::int32_t,
+                                          std::int32_t,
+                                          std::int32_t>});
 } // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index eae5f109f6af8298b90cc8e505ff44eff51bab5c..22f28d504be2a071b5b9e06abbf8106cc836c32d 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -26,7 +26,6 @@ template <>
 void ConvImpl1D_cpu::forward() {
     const auto& op_ = static_cast<const Conv_Op<1>&>(mOp);
 
-    // FIXME: uncomment the following code once memory handling will work
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
 
@@ -104,7 +103,6 @@ template <>
 void ConvImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const Conv_Op<2>&>(mOp);
 
-    // FIXME: uncomment the following code once memory handling will work
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
 
@@ -178,4 +176,79 @@ void ConvImpl2D_cpu::backward() {
         op.getInput(2) ? inputBiasGrad.getImpl()->rawPtr() : nullptr);
 }
 
+template <>
+void ConvImpl3D_cpu::forward() {
+    const auto& op_ = dynamic_cast<const Conv_Op<3>&>(mOp);
+
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
+    AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0));
+    const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0));
+    const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ConvImpl3D_cpu>::create(getBestMatch(getRequiredSpec()));
+    // Call kernel
+    impl.forward(op_.strideDims(),
+                 op_.dilationDims(),
+                 op_.kernelDims(),
+                 op_.getInput(0)->template dims<5>(),  // input dimensions
+                 op_.getOutput(0)->template dims<5>(), // output dimensions
+                 input0.getImpl()->rawPtr(),           // input
+                 input1.getImpl()->rawPtr(),           // weight
+                 op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
+                 getCPUPtr(mOp.getRawOutput(0))        // output
+    );
+}
+
+template <>
+void ConvImpl3D_cpu::backward() {
+    const auto &op = dynamic_cast<const Conv3D_Op &>(mOp);
+    const auto &outputGrad = op.getOutput(0)->grad();
+    AIDGE_ASSERT(outputGrad, "{}: missing output #0 gradient", op.type());
+    AIDGE_ASSERT(op.getInput(0)->grad(),
+                 "{}: missing data input(#0) gradient",
+                 op.type());
+    AIDGE_ASSERT(op.getInput(1)->grad(),
+                 "{}: missing weight input(#1) gradient",
+                 op.type());
+
+    std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback,
+        inputBiasGradFallback;
+    const auto &inputDataGrad =
+        op.getInput(0)->grad()->refCastFrom(inputDataGradFallback,
+                                            *(op.getOutput(0)));
+    const auto &inputWeightGrad =
+        op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback,
+                                            *(op.getOutput(0)));
+    const auto &inputBiasGrad =
+        (op.getInput(2) && op.getInput(2)->grad())
+            ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback,
+                                                  *(op.getOutput(0)))
+            : Tensor();
+
+    // Call kernel
+    const auto impl =
+        Registrar<ConvImpl3D_cpu>::create(getBestMatch(getRequiredSpec()));
+    impl.backward(
+        op.strideDims(),
+        op.dilationDims(),
+        op.kernelDims(),
+        op.getInput(0)->template dims<5>(),
+        op.getOutput(0)->template dims<5>(),
+
+        getCPUPtr(op.getInput(0)),
+        getCPUPtr(op.getInput(1)),
+        getCPUPtr(outputGrad),
+        inputDataGrad.getImpl()->rawPtr(),
+        inputWeightGrad.getImpl()->rawPtr(),
+        op.getInput(2) ? inputBiasGrad.getImpl()->rawPtr() : nullptr);
+}
+
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index c7242bbb6f0c7ba6632d1d5937b72e2a0d5cc218..a47b315bd89ee7e9f054311dc88c04767e518c0a 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -9,13 +9,14 @@
  *
  ********************************************************************************/
 
+#include "aidge/utils/Types.h"
 #include <memory>
 
 #include <catch2/catch_test_macros.hpp>
 #include <fmt/core.h>
 
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
-#include "aidge/data/Data.hpp"  // DataType
+#include "aidge/data/Data.hpp" // DataType
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/graph/Node.hpp"
@@ -23,7 +24,45 @@
 #include "aidge/utils/TensorUtils.hpp"
 #include "aidge/operator/Pad.hpp"
 
-using namespace Aidge;
+namespace Aidge {
+
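+// Helper: builds a Conv<DIM> node with the given attributes, associates the
+// input, weight and bias tensors, propagates dimensions and returns the
+// operator. NOTE: the batchSize, dataSize and padding parameters are
+// currently unused by this helper.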
+template <DimSize_t DIM>
+static std::shared_ptr<OperatorTensor>
+setupTestConv(const DimSize_t batchSize,
+              const DimSize_t inChannels,
+              const DimSize_t outChannels,
+              const std::array<DimSize_t, DIM> kernelSize,
+              const std::array<DimSize_t, DIM> dataSize,
+              const std::array<DimSize_t, DIM> stride,
+              const std::array<DimSize_t, DIM> dilation,
+              const std::array<DimSize_t, 2 * DIM> padding,
+              const std::shared_ptr<Tensor> input,
+              const std::shared_ptr<Tensor> weights,
+              const std::shared_ptr<Tensor> biases) {
+    input->setBackend("cpu");
+    weights->setBackend("cpu");
+    biases->setBackend("cpu");
+    std::shared_ptr<Node> convNode;
+    convNode = Conv(inChannels,
+                    outChannels,
+                    kernelSize,
+                    "myconv",
+                    std::array<DimSize_t, DIM>({stride}),
+                    dilation);
+    auto op =
+        std::static_pointer_cast<OperatorTensor>(convNode->getOperator());
+
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    op->associateInput(0, input);
+    op->associateInput(1, weights);
+    op->associateInput(2, biases);
+
+    REQUIRE_NOTHROW(op->forwardDims(true));
+
+    return op;
+}
 
 /**
  * @brief ConvDepthWise reference cpp backend forward implmentation tests.
@@ -44,6 +83,7 @@ using namespace Aidge;
  *  stride [2,2], dilation [2,2]
  */
 TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
+    SECTION("2D") {
     SECTION("Conv with kernel [3,3]") {
         SECTION("No stride, no dilation") {
             std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv");
@@ -1714,47 +1754,216 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
             //fmt::print("{:.^20}\n", "truth");
             //(*expectedOutput).print();
             REQUIRE(*(conv_op.getOutput(0)) == *expectedOutput);
+            }
         }
     }
+    SECTION("3D") {
+        constexpr DimSize_t DIM = 3;
+        SECTION("minimal test, no stride, no dilation, 1 in/outChannel") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 1;
+            constexpr std::array<DimSize_t, DIM> kernelSize = {2, 2, 2};
+            constexpr std::array<DimSize_t, DIM> inDataSize = {3, 3, 3};
 
-}
+            constexpr std::array<DimSize_t, DIM> stride = {1, 1, 1};
+            constexpr std::array<DimSize_t, DIM> dilation = {1, 1, 1};
+            constexpr std::array<DimSize_t, 2 * DIM> padding(
+                {0, 0, 0, 0, 0, 0});
 
-template <DimSize_t DIM>
-std::shared_ptr<OperatorTensor>
-setupTestConv(const DimSize_t batchSize,
-                      const DimSize_t inChannels,
-                      const DimSize_t outChannels,
-                      const std::array<DimSize_t, DIM> kernelSize,
-                      const std::array<DimSize_t, DIM> dataSize,
-                      const std::array<DimSize_t, DIM> stride,
-                      const std::array<DimSize_t, DIM> dilation,
-                      const std::array<DimSize_t, 2 * DIM> padding,
-                      const std::shared_ptr<Tensor> input,
-                      const std::shared_ptr<Tensor> weights,
-                      const std::shared_ptr<Tensor> biases) {
-    input->setBackend("cpu");
-    weights->setBackend("cpu");
-    biases->setBackend("cpu");
-    std::shared_ptr<Node> convNode;
-    convNode = Conv(inChannels,
-                    outChannels,
-                    kernelSize,
-                    "myconv",
-                    std::array<DimSize_t, DIM>({stride}),
-                    dilation);
-    auto op =
-        std::static_pointer_cast<OperatorTensor>(convNode->getOperator());
+            constexpr std::array<DimSize_t, DIM> outDataSize = {2, 2, 2};
 
-    op->setDataType(DataType::Float32);
-    op->setBackend("cpu");
 
-    op->associateInput(0, input);
-    op->associateInput(1, weights);
-    op->associateInput(2, biases);
+            auto input = std::make_shared<Tensor>(Array5D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1],
+                                                          inDataSize[2]>(
+                {{{{{{1., 2., 3.}, {4., 5., 6.}, {7., 8., 9.}},
 
-    REQUIRE_NOTHROW(op->forwardDims(true));
+                    {{10., 11., 12.}, {13., 14., 15.}, {16., 17., 18.}},
 
-    return op;
+                    {{19., 20., 21.}, {22., 23., 24.}, {25., 26., 27.}}}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array5D<float,
+                        outChannels,
+                        inChannels,
+                        kernelSize[0],
+                        kernelSize[1],
+                        kernelSize[2]>({{{{{{0.1, 0.2}, {0.3, 0.4}},
+
+                                           {{0.5, 0.6}, {0.7, 0.8}}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01}}));
+
+            auto op = setupTestConv<DIM>(batchSize,
+                                         inChannels,
+                                         outChannels,
+                                         kernelSize,
+                                         inDataSize,
+                                         stride,
+                                         dilation,
+                                         padding,
+                                         input,
+                                         weights,
+                                         biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
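+            // Expected values hand-computed from the definition of the
+            // convolution; e.g. first output element:
+            // 1*0.1 + 2*0.2 + 4*0.3 + 5*0.4 + 10*0.5 + 11*0.6 + 13*0.7
+            //     + 14*0.8 + 0.01 (bias) = 35.61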
+            auto expectedOutput = Tensor(Array5D<float,
+                                                 batchSize,
+                                                 outChannels,
+                                                 outDataSize[0],
+                                                 outDataSize[1],
+                                                 outDataSize[2]>(
+                {{{{{{35.610001, 39.209999}, {46.410000, 50.010002}},
+
+                    {{68.010002, 71.610001}, {78.809998, 82.410004}}}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), expectedOutput));
+        }
+        SECTION("stride & dilation, multiple outChannels") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 2;
+            constexpr std::array<DimSize_t, DIM> kernelSize = {2, 2, 2};
+            constexpr std::array<DimSize_t, DIM> inDataSize = {8, 8, 8};
+
+            constexpr std::array<DimSize_t, DIM> stride = {2, 3, 4};
+            constexpr std::array<DimSize_t, DIM> dilation = {4, 3, 2};
+            constexpr std::array<DimSize_t, 2 * DIM> padding(
+                {0, 0, 0, 0, 0, 0});
+
+            constexpr std::array<DimSize_t, DIM> outDataSize = {2, 2, 2};
+
+            auto input = std::make_shared<Tensor>(Array5D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1],
+                                                          inDataSize[2]>(
+                {{{{{{1., 2., 3., 4., 5., 6., 7., 8.},
+                     {9., 10., 11., 12., 13., 14., 15., 16.},
+                     {17., 18., 19., 20., 21., 22., 23., 24.},
+                     {25., 26., 27., 28., 29., 30., 31., 32.},
+                     {33., 34., 35., 36., 37., 38., 39., 40.},
+                     {41., 42., 43., 44., 45., 46., 47., 48.},
+                     {49., 50., 51., 52., 53., 54., 55., 56.},
+                     {57., 58., 59., 60., 61., 62., 63., 64.}},
+
+                    {{65., 66., 67., 68., 69., 70., 71., 72.},
+                     {73., 74., 75., 76., 77., 78., 79., 80.},
+                     {81., 82., 83., 84., 85., 86., 87., 88.},
+                     {89., 90., 91., 92., 93., 94., 95., 96.},
+                     {97., 98., 99., 100., 101., 102., 103., 104.},
+                     {105., 106., 107., 108., 109., 110., 111., 112.},
+                     {113., 114., 115., 116., 117., 118., 119., 120.},
+                     {121., 122., 123., 124., 125., 126., 127., 128.}},
+
+                    {{129., 130., 131., 132., 133., 134., 135., 136.},
+                     {137., 138., 139., 140., 141., 142., 143., 144.},
+                     {145., 146., 147., 148., 149., 150., 151., 152.},
+                     {153., 154., 155., 156., 157., 158., 159., 160.},
+                     {161., 162., 163., 164., 165., 166., 167., 168.},
+                     {169., 170., 171., 172., 173., 174., 175., 176.},
+                     {177., 178., 179., 180., 181., 182., 183., 184.},
+                     {185., 186., 187., 188., 189., 190., 191., 192.}},
+
+                    {{193., 194., 195., 196., 197., 198., 199., 200.},
+                     {201., 202., 203., 204., 205., 206., 207., 208.},
+                     {209., 210., 211., 212., 213., 214., 215., 216.},
+                     {217., 218., 219., 220., 221., 222., 223., 224.},
+                     {225., 226., 227., 228., 229., 230., 231., 232.},
+                     {233., 234., 235., 236., 237., 238., 239., 240.},
+                     {241., 242., 243., 244., 245., 246., 247., 248.},
+                     {249., 250., 251., 252., 253., 254., 255., 256.}},
+
+                    {{257., 258., 259., 260., 261., 262., 263., 264.},
+                     {265., 266., 267., 268., 269., 270., 271., 272.},
+                     {273., 274., 275., 276., 277., 278., 279., 280.},
+                     {281., 282., 283., 284., 285., 286., 287., 288.},
+                     {289., 290., 291., 292., 293., 294., 295., 296.},
+                     {297., 298., 299., 300., 301., 302., 303., 304.},
+                     {305., 306., 307., 308., 309., 310., 311., 312.},
+                     {313., 314., 315., 316., 317., 318., 319., 320.}},
+
+                    {{321., 322., 323., 324., 325., 326., 327., 328.},
+                     {329., 330., 331., 332., 333., 334., 335., 336.},
+                     {337., 338., 339., 340., 341., 342., 343., 344.},
+                     {345., 346., 347., 348., 349., 350., 351., 352.},
+                     {353., 354., 355., 356., 357., 358., 359., 360.},
+                     {361., 362., 363., 364., 365., 366., 367., 368.},
+                     {369., 370., 371., 372., 373., 374., 375., 376.},
+                     {377., 378., 379., 380., 381., 382., 383., 384.}},
+
+                    {{385., 386., 387., 388., 389., 390., 391., 392.},
+                     {393., 394., 395., 396., 397., 398., 399., 400.},
+                     {401., 402., 403., 404., 405., 406., 407., 408.},
+                     {409., 410., 411., 412., 413., 414., 415., 416.},
+                     {417., 418., 419., 420., 421., 422., 423., 424.},
+                     {425., 426., 427., 428., 429., 430., 431., 432.},
+                     {433., 434., 435., 436., 437., 438., 439., 440.},
+                     {441., 442., 443., 444., 445., 446., 447., 448.}},
+
+                    {{449., 450., 451., 452., 453., 454., 455., 456.},
+                     {457., 458., 459., 460., 461., 462., 463., 464.},
+                     {465., 466., 467., 468., 469., 470., 471., 472.},
+                     {473., 474., 475., 476., 477., 478., 479., 480.},
+                     {481., 482., 483., 484., 485., 486., 487., 488.},
+                     {489., 490., 491., 492., 493., 494., 495., 496.},
+                     {497., 498., 499., 500., 501., 502., 503., 504.},
+                     {505., 506., 507., 508., 509., 510., 511., 512.}}}}}}));
+
+            auto weights = std::make_shared<Tensor>(Array5D<float,
+                                                            outChannels,
+                                                            inChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1],
+                                                            kernelSize[2]>(
+                {{{{{{0.1, 0.2}, {0.3, 0.4}}, {{0.5, 0.6}, {0.7, 0.8}}}},
+
+                  {{{{0.9, 1.0}, {1.1, 1.2}}, {{1.3, 1.4}, {1.5, 1.6}}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.02}}));
+
+            auto op = setupTestConv<DIM>(batchSize,
+                                         inChannels,
+                                         outChannels,
+                                         kernelSize,
+                                         inDataSize,
+                                         stride,
+                                         dilation,
+                                         padding,
+                                         input,
+                                         weights,
+                                         biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = Tensor(Array5D<float,
+                                                 batchSize,
+                                                 outChannels,
+                                                 outDataSize[0],
+                                                 outDataSize[1],
+                                                 outDataSize[2]>(
+                {{{{{{726.010010, 740.410034}, {812.409973, 826.809998}},
+
+                    {{1186.810059, 1201.210083}, {1273.210083, 1287.609985}}},
+
+                   {{{1634.820068, 1674.820068}, {1874.820068, 1914.819946}},
+
+                    {{2914.820312, 2954.820068},
+                     {3154.820068, 3194.819824}}}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), expectedOutput));
+        }
+    }
 }
 
 TEST_CASE("[cpu/operator] Conv(backward)", "[Conv][CPU]") {
@@ -2713,4 +2922,512 @@ TEST_CASE("[cpu/operator] Conv(backward)", "[Conv][CPU]") {
             }
         }
     }
+    SECTION("3D") {
+        constexpr DimSize_t DIM = 3;
+        SECTION("basic test, square kernel, stride, dilation") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 1;
+            constexpr std::array<DimSize_t, DIM> kernelSize = {2, 2, 2};
+            constexpr std::array<DimSize_t, DIM> inDataSize = {4, 4, 4};
+
+            constexpr std::array<DimSize_t, DIM> stride = {2, 2, 2};
+            constexpr std::array<DimSize_t, DIM> dilation = {2, 2, 2};
+            constexpr std::array<DimSize_t, 2 * DIM> padding(
+                {0, 0, 0, 0, 0, 0});
+
+            constexpr std::array<DimSize_t, DIM> outDataSize = {1, 1, 1};
+
+            auto input = std::make_shared<Tensor>(
+                Array5D<float,
+                        batchSize,
+                        inChannels,
+                        inDataSize[0],
+                        inDataSize[1],
+                        inDataSize[2]>({{{{{{1., 2., 3., 4.},
+                                            {5., 6., 7., 8.},
+                                            {9., 10., 11., 12.},
+                                            {13., 14., 15., 16.}},
+
+                                           {{17., 18., 19., 20.},
+                                            {21., 22., 23., 24.},
+                                            {25., 26., 27., 28.},
+                                            {29., 30., 31., 32.}},
+
+                                           {{33., 34., 35., 36.},
+                                            {37., 38., 39., 40.},
+                                            {41., 42., 43., 44.},
+                                            {45., 46., 47., 48.}},
+
+                                           {{49., 50., 51., 52.},
+                                            {53., 54., 55., 56.},
+                                            {57., 58., 59., 60.},
+                                            {61., 62., 63., 64.}}}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array5D<float,
+                        outChannels,
+                        inChannels,
+                        kernelSize[0],
+                        kernelSize[1],
+                        kernelSize[2]>({{{{{{0.1, 0.2}, {0.3, 0.4}},
+
+                                           {{0.5, 0.6}, {0.7, 0.8}}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01}}));
+
+            auto outputGrad = std::make_shared<Tensor>(
+                Array5D<float,
+                        batchSize,
+                        outChannels,
+                        outDataSize[0],
+                        outDataSize[1],
+                        outDataSize[2]>({{{{{{1.}}}}}}));
+
+            auto op = setupTestConv<DIM>(batchSize,
+                                         inChannels,
+                                         outChannels,
+                                         kernelSize,
+                                         inDataSize,
+                                         stride,
+                                         dilation,
+                                         padding,
+                                         input,
+                                         weights,
+                                         biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            op->getOutput(0)->setGrad(outputGrad);
+
+            REQUIRE_NOTHROW(op->backward());
+
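+            // With a single unit output gradient, stride 2 and dilation 2:
+            // - the input gradient is the kernel scattered at the dilated
+            //   input positions (kX*2, kY*2, kZ*2);
+            // - the weight gradient samples the input at those same
+            //   positions, e.g. weightsGrad[0][0][0][0][1] = input(0,0,2) = 3.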
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array5D<float,
+                            batchSize,
+                            inChannels,
+                            inDataSize[0],
+                            inDataSize[1],
+                            inDataSize[2]>({{{{{{0.1, 0.0, 0.2, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0},
+                                                {0.3, 0.0, 0.4, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0}},
+
+                                               {{0.0, 0.0, 0.0, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0}},
+
+                                               {{0.5, 0.0, 0.6, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0},
+                                                {0.7, 0.0, 0.8, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0}},
+
+                                               {{0.0, 0.0, 0.0, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0},
+                                                {0.0, 0.0, 0.0, 0.0}}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<Tensor>(
+                    Array5D<float,
+                            outChannels,
+                            inChannels,
+                            kernelSize[0],
+                            kernelSize[1],
+                            kernelSize[2]>({{{{{{1., 3.}, {9., 11.}},
+
+                                               {{33., 35.}, {41., 43.}}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({{1.}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+        SECTION("square kernel, multiple in/out channels") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 2;
+            constexpr DimSize_t outChannels = 1;
+            constexpr std::array<DimSize_t, DIM> kernelSize = {2, 2, 2};
+            constexpr std::array<DimSize_t, DIM> inDataSize = {2, 2, 2};
+
+            constexpr std::array<DimSize_t, DIM> stride = {1, 1, 1};
+            constexpr std::array<DimSize_t, DIM> dilation = {1, 1, 1};
+            constexpr std::array<DimSize_t, 2 * DIM> padding({0, 0, 0, 0, 0, 0});
+
+            constexpr std::array<DimSize_t, DIM> outDataSize = {1, 1, 1};
+
+            auto input = std::make_shared<Tensor>(Array5D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1],
+                                                          inDataSize[2]>(
+                {{{{{{1.000000, 2.000000}, {3.000000, 4.000000}},
+
+                    {{5.000000, 6.000000}, {7.000000, 8.000000}}},
+
+                   {{{9.000000, 10.000000}, {11.000000, 12.000000}},
+
+                    {{13.000000, 14.000000}, {15.000000, 16.000000}}}}}}));
+
+            auto weights = std::make_shared<Tensor>(Array5D<float,
+                                                            outChannels,
+                                                            inChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1],
+                                                            kernelSize[2]>(
+                {{{{{{0.100000, 0.200000}, {0.300000, 0.400000}},
+
+                    {{0.500000, 0.600000}, {0.700000, 0.800000}}},
+
+                   {{{0.900000, 1.000000}, {1.100000, 1.200000}},
+
+                    {{1.300000, 1.400000}, {1.500000, 1.600000}}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000}}));
+
+            auto outputGrad = std::make_shared<Tensor>(
+                Array5D<float,
+                        batchSize,
+                        outChannels,
+                        outDataSize[0],
+                        outDataSize[1],
+                        outDataSize[2]>({{{{{{1.000000}}}}}}));
+
+            auto op = setupTestConv<DIM>(batchSize,
+                                         inChannels,
+                                         outChannels,
+                                         kernelSize,
+                                         inDataSize,
+                                         stride,
+                                         dilation,
+                                         padding,
+                                         input,
+                                         weights,
+                                         biases);
+
+            ////////////////////////////////////
+            // set up gradients for backward
+            op->getOutput(0)->setGrad(outputGrad);
+
+            REQUIRE_NOTHROW(op->backward());
+
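+            // The 2x2x2 kernel covers the whole 2x2x2 input and the single
+            // output element carries a unit gradient, so the expected input
+            // gradient equals the weights, the weight gradient equals the
+            // input, and the bias gradient is 1.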
+            SECTION("Input Grad") {
+                auto expectedInputGrad =
+                    std::make_shared<Tensor>(Array5D<float,
+                                                     batchSize,
+                                                     inChannels,
+                                                     inDataSize[0],
+                                                     inDataSize[1],
+                                                     inDataSize[2]>(
+                        {{{{{{0.100000, 0.200000}, {0.300000, 0.400000}},
+
+                            {{0.500000, 0.600000}, {0.700000, 0.800000}}},
+
+                           {{{0.900000, 1.000000}, {1.100000, 1.200000}},
+
+                            {{1.300000, 1.400000}, {1.500000, 1.600000}}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad =
+                    std::make_shared<Tensor>(Array5D<float,
+                                                     outChannels,
+                                                     inChannels,
+                                                     kernelSize[0],
+                                                     kernelSize[1],
+                                                     kernelSize[2]>(
+                        {{{{{{1.000000, 2.000000}, {3.000000, 4.000000}},
+
+                            {{5.000000, 6.000000}, {7.000000, 8.000000}}},
+
+                           {{{9.000000, 10.000000}, {11.000000, 12.000000}},
+
+                            {{13.000000, 14.000000}, {15.000000, 16.000000}}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({{1.000000}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+        SECTION("non-square kernel, stride, dilation, multiple "
+                "in/out channels") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 2;
+            constexpr DimSize_t outChannels = 2;
+            constexpr std::array<DimSize_t, DIM> kernelSize = {1, 2, 3};
+            constexpr std::array<DimSize_t, DIM> inDataSize = {5, 5, 5};
+
+            constexpr std::array<DimSize_t, DIM> stride = {1, 2, 3};
+            constexpr std::array<DimSize_t, DIM> dilation = {3, 2, 1};
+            constexpr std::array<DimSize_t, 2 * DIM> padding({0, 0, 0, 0, 0, 0});
+
+            constexpr std::array<DimSize_t, DIM> outDataSize = {5, 2, 1};
+
+            auto input = std::make_shared<Tensor>(Array5D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1],
+                                                          inDataSize[2]>(
+                {{{{{{1., 2., 3., 4., 5.},
+                     {6., 7., 8., 9., 10.},
+                     {11., 12., 13., 14., 15.},
+                     {16., 17., 18., 19., 20.},
+                     {21., 22., 23., 24., 25.}},
+
+                    {{26., 27., 28., 29., 30.},
+                     {31., 32., 33., 34., 35.},
+                     {36., 37., 38., 39., 40.},
+                     {41., 42., 43., 44., 45.},
+                     {46., 47., 48., 49., 50.}},
+
+                    {{51., 52., 53., 54., 55.},
+                     {56., 57., 58., 59., 60.},
+                     {61., 62., 63., 64., 65.},
+                     {66., 67., 68., 69., 70.},
+                     {71., 72., 73., 74., 75.}},
+
+                    {{76., 77., 78., 79., 80.},
+                     {81., 82., 83., 84., 85.},
+                     {86., 87., 88., 89., 90.},
+                     {91., 92., 93., 94., 95.},
+                     {96., 97., 98., 99., 100.}},
+
+                    {{101., 102., 103., 104., 105.},
+                     {106., 107., 108., 109., 110.},
+                     {111., 112., 113., 114., 115.},
+                     {116., 117., 118., 119., 120.},
+                     {121., 122., 123., 124., 125.}}},
+
+                   {{{126., 127., 128., 129., 130.},
+                     {131., 132., 133., 134., 135.},
+                     {136., 137., 138., 139., 140.},
+                     {141., 142., 143., 144., 145.},
+                     {146., 147., 148., 149., 150.}},
+
+                    {{151., 152., 153., 154., 155.},
+                     {156., 157., 158., 159., 160.},
+                     {161., 162., 163., 164., 165.},
+                     {166., 167., 168., 169., 170.},
+                     {171., 172., 173., 174., 175.}},
+
+                    {{176., 177., 178., 179., 180.},
+                     {181., 182., 183., 184., 185.},
+                     {186., 187., 188., 189., 190.},
+                     {191., 192., 193., 194., 195.},
+                     {196., 197., 198., 199., 200.}},
+
+                    {{201., 202., 203., 204., 205.},
+                     {206., 207., 208., 209., 210.},
+                     {211., 212., 213., 214., 215.},
+                     {216., 217., 218., 219., 220.},
+                     {221., 222., 223., 224., 225.}},
+
+                    {{226., 227., 228., 229., 230.},
+                     {231., 232., 233., 234., 235.},
+                     {236., 237., 238., 239., 240.},
+                     {241., 242., 243., 244., 245.},
+                     {246., 247., 248., 249., 250.}}}}}}));
+
+            auto weights = std::make_shared<Tensor>(Array5D<float,
+                                                            outChannels,
+                                                            inChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1],
+                                                            kernelSize[2]>(
+                {{{{{{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}}},
+
+                   {{{0.7, 0.8, 0.9}, {1.0, 1.1, 1.2}}}},
+
+                  {{{{1.3, 1.4, 1.5}, {1.6, 1.7, 1.8}}},
+
+                   {{{1.9, 2.0, 2.1}, {2.2, 2.3, 2.4}}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.02}}));
+
+            auto outputGrad = std::make_shared<Tensor>(
+                Array5D<float,
+                        batchSize,
+                        outChannels,
+                        outDataSize[0],
+                        outDataSize[1],
+                        outDataSize[2]>({{{{{{1.}, {2.}},
+
+                                            {{3.}, {4.}},
+
+                                            {{5.}, {6.}},
+
+                                            {{7.}, {8.}},
+
+                                            {{9.}, {10.}}},
+
+                                           {{{11.}, {12.}},
+
+                                            {{13.}, {14.}},
+
+                                            {{15.}, {16.}},
+
+                                            {{17.}, {18.}},
+
+                                            {{19.}, {20.}}}}}}));
+
+            auto op = setupTestConv<DIM>(batchSize,
+                                         inChannels,
+                                         outChannels,
+                                         kernelSize,
+                                         inDataSize,
+                                         stride,
+                                         dilation,
+                                         padding,
+                                         input,
+                                         weights,
+                                         biases);
+
+            ////////////////////////////////////
+            // set up gradients for backward
+            op->getOutput(0)->setGrad(outputGrad);
+
+            REQUIRE_NOTHROW(op->backward());
+
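+            // The input gradient accumulates weight * output-gradient
+            // products at every input location the strided, dilated kernel
+            // touches; locations the kernel never reaches keep a zero
+            // gradient, which produces the zero rows and columns below.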
+            SECTION("Input Grad") {
+                auto expectedInputGrad =
+                    std::make_shared<Tensor>(Array5D<float,
+                                                     batchSize,
+                                                     inChannels,
+                                                     inDataSize[0],
+                                                     inDataSize[1],
+                                                     inDataSize[2]>(
+                        {{{{{{14.400001, 15.599999, 16.799999, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {33.800003, 36.400002, 39., 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {20., 21.400002, 22.800001, 0., 0.}},
+
+                            {{17.200001, 18.799999, 20.400000, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {40.599998, 44., 47.400002, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {24., 25.800001, 27.600000, 0., 0.}},
+
+                            {{20.000002, 22., 24., 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {47.400002, 51.599998, 55.800003, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {28., 30.200001, 32.400002, 0., 0.}},
+
+                            {{22.800001, 25.199999, 27.600000, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {54.200001, 59.200001, 64.200005, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {32., 34.600002, 37.200001, 0., 0.}},
+
+                            {{25.600002, 28.400000, 31.200001, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {61., 66.800003, 72.600006, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {36., 39., 42., 0., 0.}}},
+
+                           {{{21.600000, 22.799999, 24.000002, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {49.400002, 52., 54.600002, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {28.400002, 29.799999, 31.200001, 0., 0.}},
+
+                            {{26.799999, 28.400000, 30.000002, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {61., 64.400002, 67.800003, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {34.799999, 36.599998, 38.400002, 0., 0.}},
+
+                            {{32., 34., 36.000004, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {72.599998, 76.800003, 81., 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {41.200001, 43.400002, 45.600002, 0., 0.}},
+
+                            {{37.200001, 39.599998, 42.000004, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {84.199997, 89.199997, 94.200005, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {47.600002, 50.200001, 52.800003, 0., 0.}},
+
+                            {{42.399998, 45.200001, 48.000004, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {95.800003, 101.599998, 107.400009, 0., 0.},
+                             {0., 0., 0., 0., 0.},
+                             {54., 57., 60., 0., 0.}}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
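+            // Each weight gradient entry correlates the output gradient
+            // with the input values that weight saw during the forward
+            // pass, i.e. sampled with the same stride and dilation.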
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<
+                    Tensor>(Array5D<float,
+                                    outChannels,
+                                    inChannels,
+                                    kernelSize[0],
+                                    kernelSize[1],
+                                    kernelSize[2]>(
+                    {{{{{{4105., 4160., 4215.}, {4655., 4710., 4765.}}},
+
+                       {{{10980., 11035., 11090.}, {11530., 11585., 11640.}}}},
+
+                      {{{{9705., 9860., 10015.}, {11255., 11410., 11565.}}},
+
+                       {{{29080., 29235., 29390.},
+                         {30630., 30785., 30940.}}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
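+            // The bias gradient sums the output gradient over the batch and
+            // spatial dimensions: 1 + 2 + ... + 10 = 55 for the first
+            // output channel and 11 + 12 + ... + 20 = 155 for the second.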
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({{55., 155.}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+    }
 }
+
+} // namespace Aidge