Commit 2eff4d7f authored by Maxence Naud, committed by Olivier BICHLER

Add GridSample impl for 1D and 2D

parent 10ef1960
2 merge requests: !93 Release v0.3.0, !77 Add GridSample impl for 1D and 2D
Pipeline #54150 passed
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/GridSample.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// compute kernel registries for the forward implementations
class GridSampleImpl1DForward_cpu
: public Registrable<GridSampleImpl1DForward_cpu,
std::tuple<DataType, DataType>,
void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)> {};
class GridSampleImpl2DForward_cpu
: public Registrable<GridSampleImpl2DForward_cpu,
std::tuple<DataType, DataType>,
void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)> {};
class GridSampleImpl_cpu : public OperatorImpl {
public:
GridSampleImpl_cpu(const GridSample_Op& op) : OperatorImpl(op, "cpu") {}
static std::unique_ptr<GridSampleImpl_cpu> create(const GridSample_Op &op) {
return std::make_unique<GridSampleImpl_cpu>(op);
}
public:
Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
// add cpu backend to GridSample_Op implementation registry
static Registrar<GridSample_Op> registrarGridSampleImpl_cpu("cpu", Aidge::GridSampleImpl_cpu::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_FORWARD_KERNEL_H_
#include <algorithm> // std::max, std::min
#include <cmath> // std::fabs, std::floor, std::truncf, std::nearbyintf
#include <cstddef> // std::size_t
#include <cstdint> // std::int64_t
#include <cstdlib> // std::abs (integral overloads)
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
#include "aidge/data/half.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
// half-open bound check: lower_bound <= coord < upper_bound, so index 0 counts as in bound
static bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
return (coord >= lower_bound) && (coord < upper_bound);
}
static float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
}
// unused
// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
// return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
// }
static float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
: unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
}
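// Worked example of the mapping above (editor's illustration): for size = 4
// with align_corners, [-1, 1] maps onto the pixel centers [0, 3]:
// coord = -1 -> (-1+1)/2 * 3 = 0, coord = 1 -> (1+1)/2 * 3 = 3.
// Without align_corners it maps onto the pixel edges [-0.5, 3.5]:
// coord = 0 -> (0+1)/2 * 4 - 0.5 = 1.5.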
// unused
// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
// return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
// : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
// }
static float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, -1.0f, 1.0f)) {
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(-1.0f, coord), 1.0f);
}
else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
float abs_coord = std::fabs(coord);
float int_coord = std::truncf(abs_coord);
std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2);
float res = ((nb_refl + 1)*2) - abs_coord;
coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res) \
: (nb_refl % 2 == 0 ? -res : res);
}
}
return coord;
}
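// Worked example of the reflection branch (editor's illustration):
// coord = 1.7 gives abs_coord = 1.7, int_coord = 1, nb_refl = 0,
// res = 2 - 1.7 = 0.3, so 1.7 reflects to 0.3;
// coord = 3.5 gives nb_refl = 1, res = 4 - 3.5 = 0.5, and the odd
// reflection count flips the sign: reflected once past 1 and once
// past -1, the result is -0.5.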
static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(static_cast<float>(coord), 0.0f, static_cast<float>(size))) {
// out of bounds: apply the padding mode
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(std::int64_t(0), coord), size - std::int64_t(1));
} else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
const std::int64_t quotient = coord / (size - 1);
const std::int64_t remainder = std::abs(coord - quotient*(size - 1));
coord = (quotient % 2 == 0) ? remainder : size - 1 - remainder;
}
}
return coord;
}
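// Worked example (editor's illustration): size = 5 (valid indices 0..4),
// coord = 6. Border mode clamps to 4. Reflection mode computes
// quotient = 6/4 = 1, remainder = |6 - 4| = 2; the odd quotient gives
// coord = 4 - 2 = 2, i.e. 6 reflected back across index 4.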
namespace Aidge {
/**
* @brief Forward kernel for 1D GridSample on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param op GridSample Operator providing the attributes (mode, padding mode, align_corners).
* @param in0 const input Tensor.
* @param in1 const grid Tensor.
* @param out Output Tensor.
*/
template <class I, class O>
void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* const input = static_cast<const I*>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
const float* const grid = static_cast<const float*>(in1->getImpl()->rawPtr());
const float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
O* output_ptr = output;
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t grid_H = in1->dim(1);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
const float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
grid_ptr = grid_ptr_N;
for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
output_ptr = output_ptr_N + grid_x*out_H_s;
/*
* For each grid position:
* 1. remap the normalized coord according to padding_mode,
* 2. change its range from [-1, 1] to [0, H-1] or [-0.5, H-0.5]
* according to align_corners,
* 3. interpolate with the selected mode (nearest, linear or cubic);
* any sample falling outside the input bounds contributes 0.
*/
float x = *grid_ptr;
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
if (in_bound(x_rounded, 0, in_H)) {
input_ptr = input_ptr_N + x_rounded*in_H_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf = in_bound(x_inf, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
// weight each endpoint by its distance to the opposite one
*output_ptr = static_cast<O>(static_cast<I>(x_sup - x)*f_inf
+ static_cast<I>(x - x_inf)*f_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
const I x1 = static_cast<I>(x - static_cast<float>(x_inf));
const I x2 = x1 * x1;
const I x3 = x1 * x2;
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
const I m_inf = (f_sup - f_inf_inf) / I(2);
const I m_sup = (f_sup_sup - f_inf) / I(2);
*output_ptr = f_inf \
+ x1 * m_inf \
+ x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup) \
+ x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
grid_ptr += grid_H_s;
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
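// Sanity check of the linear branch above (editor's illustration): with
// in_H = 4 and align_corners, a grid value of 0.5 unnormalizes to
// x = (0.5+1)/2 * 3 = 2.25, so x_inf = 2, x_sup = 3 and the output is
// 0.75*f[2] + 0.25*f[3], i.e. the sample sits a quarter of the way from
// index 2 to index 3.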
namespace {
static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Float32(
{DataType::Float32, DataType::Float32},
Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>);
static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Float16(
{DataType::Float16, DataType::Float16},
Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>);
static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Int32(
{DataType::Int32, DataType::Int32},
Aidge::GridSampleImpl1D_cpu_forward_kernel<int, int>);
static Registrar<GridSampleImpl1DForward_cpu> registrarGridSampleImpl1DForward_cpu_Float64(
{DataType::Float64, DataType::Float64},
Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>);
/**
* @brief Forward kernel for 2D GridSample on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param op GridSample Operator providing the attributes (mode, padding mode, align_corners).
* @param in0 const input Tensor.
* @param in1 const grid Tensor.
* @param out Output Tensor.
*/
template <class I, class O>
void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* const input = static_cast<const I*>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
// the grid comes from the second input (in1), not from in0
const float* const grid = static_cast<const float*>(in1->getImpl()->rawPtr());
const float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t in_W = in0->dim(3);
const std::size_t grid_H = in1->dim(1);
const std::size_t grid_W = in1->dim(2);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t in_W_s = in0->stride(3);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t grid_W_s = in1->stride(2);
const std::size_t grid_Coord_s = in1->stride(3);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
const std::size_t out_W_s = out->stride(3);
const float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_x*out_W_s;
grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
/*
* For each grid position:
* 1. remap the normalized x and y coords according to padding_mode,
* 2. change their range from [-1, 1] to [0, W-1]/[0, H-1] or
* [-0.5, W-0.5]/[-0.5, H-0.5] according to align_corners,
* 3. interpolate with the selected mode (nearest, linear or cubic);
* any sample falling outside the input bounds contributes 0.
*/
float x = *grid_ptr;
float y = grid_ptr[grid_Coord_s];
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
y = update_normalized_coord_with_padding(y, op.paddingMode());
y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
const std::int64_t y_rounded = std::nearbyintf(y);
if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
const std::int64_t x_l = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // left
const std::int64_t x_r = update_unnormalized_coord_with_padding(x_l + 1, in_W, op.paddingMode()); // right
const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top
const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s + static_cast<std::size_t>(x_l)*in_W_s] : I(0);
const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s + static_cast<std::size_t>(x_r)*in_W_s] : I(0);
const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s + static_cast<std::size_t>(x_l)*in_W_s] : I(0);
const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s + static_cast<std::size_t>(x_r)*in_W_s] : I(0);
// bilinear: weight each corner by the area of the opposite sub-rectangle
const I w_tl = static_cast<I>((static_cast<float>(y_b) - y)*(static_cast<float>(x_r) - x));
const I w_tr = static_cast<I>((static_cast<float>(y_b) - y)*(x - static_cast<float>(x_l)));
const I w_bl = static_cast<I>((y - static_cast<float>(y_t))*(static_cast<float>(x_r) - x));
const I w_br = static_cast<I>((y - static_cast<float>(y_t))*(x - static_cast<float>(x_l)));
*output_ptr = static_cast<O>(w_tl*f_tl + w_tr*f_tr + w_bl*f_bl + w_br*f_br);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
/*
* .. .. .. .. .. ..
* .. 00 01 02 03 ..
* .. 10 11 12 13 ..
* .. 20 21 22 23 ..
* .. 30 31 32 33 ..
* .. .. .. .. .. ..
*/
const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
const I mx_11 = (f_21 - f_01) / I(2);
const I mx_12 = (f_22 - f_02) / I(2);
const I mx_21 = (f_31 - f_11) / I(2);
const I mx_22 = (f_32 - f_12) / I(2);
const I my_11 = (f_12 - f_10) / I(2);
const I my_12 = (f_13 - f_11) / I(2);
const I my_21 = (f_22 - f_20) / I(2);
const I my_22 = (f_23 - f_21) / I(2);
const I mxy_11 = (f_22 - f_20 - f_02 + f_00) / I(4);
const I mxy_12 = (f_23 - f_21 - f_03 + f_01) / I(4);
const I mxy_21 = (f_32 - f_30 - f_12 + f_10) / I(4);
const I mxy_22 = (f_33 - f_31 - f_13 + f_11) / I(4);
const I a_00 = f_11;
const I a_10 = mx_11;
const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
const I a_01 = my_11;
const I a_11 = mxy_11;
const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22));
// evaluate the bicubic surface at the local offsets within the cell,
// not at the global coordinates (matches x1 = x - x_inf in the 1D kernel)
const I tx = static_cast<I>(x - static_cast<float>(x_1));
const I tx2 = tx*tx;
const I tx3 = tx2*tx;
const I ty = static_cast<I>(y - static_cast<float>(y_1));
const I ty2 = ty*ty;
const I ty3 = ty2*ty;
*output_ptr = static_cast<O>(
a_00 + a_10*tx + a_20*tx2 + a_30*tx3
+ a_01*ty + a_11*tx*ty + a_21*tx2*ty + a_31*tx3*ty
+ a_02*ty2 + a_12*tx*ty2 + a_22*tx2*ty2 + a_32*tx3*ty2
+ a_03*ty3 + a_13*tx*ty3 + a_23*tx2*ty3 + a_33*tx3*ty3);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
}
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
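// Sanity check of the bilinear branch above (editor's illustration): with
// corners f_tl = 1, f_tr = 2, f_bl = 3, f_br = 4 and a sample at the cell
// center (x - x_l = 0.5, y - y_t = 0.5), all four weights equal 0.25 and
// the output is (1 + 2 + 3 + 4) / 4 = 2.5.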
static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Float32(
{DataType::Float32, DataType::Float32},
Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>);
static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Float16(
{DataType::Float16, DataType::Float16},
Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>);
static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Int32(
{DataType::Int32, DataType::Int32},
Aidge::GridSampleImpl2D_cpu_forward_kernel<int, int>);
static Registrar<GridSampleImpl2DForward_cpu> registrarGridSampleImpl2DForward_cpu_Float64(
{DataType::Float64, DataType::Float64},
Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
#include <functional>
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/GridSampleImpl_forward_kernels.hpp"
#include "aidge/operator/GridSample.hpp"
#include "aidge/utils/Types.h"
Aidge::Elts_t Aidge::GridSampleImpl_cpu::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
// this implementation can be in-place
return Elts_t::DataElts(0);
}
void Aidge::GridSampleImpl_cpu::forward() {
const auto& op_ = static_cast<const GridSample_Op&>(mOp);
// Find the correct kernel type
const auto outputDataType = op_.getOutput(0)->dataType();
const Registrar<GridSampleImpl1DForward_cpu>::registrar_key registrarKey = {
op_.getInput(0)->dataType(),
outputDataType};
std::function<void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)> kernelFunc;
const std::size_t nbSpatialFeat = op_.getInput(0)->nbDims() - 2; // exclude batch and channel dims
switch (nbSpatialFeat)
{
case 1:
kernelFunc = Registrar<GridSampleImpl1DForward_cpu>::create(registrarKey);
break;
case 2:
kernelFunc = Registrar<GridSampleImpl2DForward_cpu>::create(registrarKey);
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "No CPU {} kernel available for {} dimensions.", op_.type(), nbSpatialFeat);
break;
}
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::shared_ptr<Tensor> input0Fallback, input1Fallback;
const auto& input0 = std::make_shared<Tensor>(op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0)));
const auto& input1 = std::make_shared<Tensor>(op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0)));
// Call kernel
kernelFunc(op_,
input0, // input
input1, // grid
op_.getOutput(0) // output
);
}
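// Usage sketch (editor's illustration; assumes the operator's input, grid
// and output tensors are already set and use the cpu backend):
// Aidge::GridSample_Op op = ...; // hypothetical setup
// auto impl = Aidge::GridSampleImpl_cpu::create(op); // registered above as "cpu"
// impl->forward(); // picks the 1D or 2D kernel from the registrar key
// // {inputDataType, outputDataType}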