Commit 96a9b1c8 authored by Grégoire Kubler

Merge branch 'dev' into feat/release_pip

parents dfe4925e f5968e18
2 merge requests: !38 version 0.3.0, !19 feat : release_pip
Pipeline #54952 failed
Showing with 897 additions and 1 deletion
@@ -14,13 +14,21 @@
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/AddImpl.hpp"
#include "aidge/backend/cuda/operator/AndImpl.hpp"
#include "aidge/backend/cuda/operator/ArgMaxImpl.hpp"
#include "aidge/backend/cuda/operator/AvgPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/BatchNormImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#include "aidge/backend/cuda/operator/DivImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/backend/cuda/operator/LnImpl.hpp"
#include "aidge/backend/cuda/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cuda/operator/MulImpl.hpp"
#include "aidge/backend/cuda/operator/PadImpl.hpp"
#include "aidge/backend/cuda/operator/PowImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceMeanImpl.hpp"
#include "aidge/backend/cuda/operator/ReduceSumImpl.hpp"
#include "aidge/backend/cuda/operator/ReLUImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftMaxImpl.hpp"
#include "aidge/backend/cuda/operator/ShiftGELUImpl.hpp"
......
@@ -221,7 +221,39 @@ public:
&strides[0]));
}
}
else {
// Check whether the tensor shape has changed
cudnnDataType_t currentDataType;
int currentNbDims;
// The number of dimensions of the current descriptor is unknown, so query with CUDNN_DIM_MAX and strip the trailing zeros afterwards
std::vector<int> currentDims(CUDNN_DIM_MAX);
std::vector<int> currentStrides(CUDNN_DIM_MAX);
CHECK_CUDNN_STATUS(cudnnGetTensorNdDescriptor(mCudnnTensor, CUDNN_DIM_MAX, &currentDataType, &currentNbDims, currentDims.data(), currentStrides.data()));
// Remove the trailing zeros
currentDims.erase(std::find_if(currentDims.rbegin(), currentDims.rend(), [](int x) { return x != 0; }).base(),
currentDims.end());
std::vector<int> dims(tensor.dims().cbegin(), tensor.dims().cend());
if (dims.size() < 4) {
dims.resize(4, 1);
}
// Update descriptor if shape has changed
if (dims != currentDims) {
std::vector<int> strides(tensor.strides().cbegin(), tensor.strides().cend());
if (strides.size() < 4) {
strides.resize(4, 1);
}
CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
CudaContext::data_type<T>::value,
dims.size(),
&dims[0],
&strides[0]));
}
}
return mCudnnTensor;
}
......
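The shape comparison above queries the existing descriptor with CUDNN_DIM_MAX entries and then trims the zero padding before comparing against the tensor's broadcast dims. A minimal standalone sketch of that trailing-zero trimming idiom, using a plain std::vector and no cuDNN:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    // Simulate a descriptor query padded to CUDNN_DIM_MAX (8 here): real dims followed by zeros.
    std::vector<int> queried = {2, 3, 4, 1, 0, 0, 0, 0};

    // Erase everything after the last non-zero entry, as done in the descriptor-refresh code above.
    queried.erase(std::find_if(queried.rbegin(), queried.rend(),
                               [](int x) { return x != 0; }).base(),
                  queried.end());

    assert((queried == std::vector<int>{2, 3, 4, 1}));
    return 0;
}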
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/And.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class AndImpl_cuda : public OperatorImpl {
private:
public:
AndImpl_cuda(const And_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<AndImpl_cuda> create(const And_Op &op) {
return std::make_unique<AndImpl_cuda>(op);
}
public:
void forward();
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
};
namespace {
// add cuda backend to And_Op implementation registry
static Registrar<And_Op> registrarAndImpl_cuda("cuda", Aidge::AndImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ANDIMPL_H_ */
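Every operator header in this commit follows the same registration pattern: a file-scope static Registrar object in an anonymous namespace binds the "cuda" backend name to the implementation's create() at static-initialization time. A hypothetical, self-contained sketch of that idiom (this is not Aidge's actual Registrar API, only the general mechanism it relies on):

#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>

// Hypothetical operator and implementation base class, for illustration only.
struct OperatorImplBase { virtual ~OperatorImplBase() = default; };
struct FakeOp {};

// Minimal registry: backend name -> factory function.
using Creator = std::function<std::unique_ptr<OperatorImplBase>(const FakeOp&)>;

std::map<std::string, Creator>& registry() {
    static std::map<std::string, Creator> r;
    return r;
}

// Registering is just inserting into the map from a constructor.
struct Registrar {
    Registrar(const std::string& backend, Creator c) { registry()[backend] = std::move(c); }
};

struct FakeOpImpl_cuda : OperatorImplBase {
    static std::unique_ptr<FakeOpImpl_cuda> create(const FakeOp&) {
        return std::make_unique<FakeOpImpl_cuda>();
    }
};

namespace {
// Runs at static-initialization time, like the registrars in these headers.
static Registrar registrarFakeOpImpl_cuda("cuda",
    [](const FakeOp& op) -> std::unique_ptr<OperatorImplBase> { return FakeOpImpl_cuda::create(op); });
} // namespace

int main() {
    FakeOp op;
    auto impl = registry().at("cuda")(op);  // resolve the backend registered above
    return impl ? 0 : 1;
}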
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_ANDIMPL_FORWARD_KERNEL_H_
#define AIDGE_CUDA_OPERATOR_ANDIMPL_FORWARD_KERNEL_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
template <class T>
void AndForward(const T* input1, const T* input2, T* output,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize);
}
#endif /* AIDGE_CUDA_OPERATOR_ANDIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/ArgMax.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class ArgMaxImpl_cuda : public OperatorImpl {
private:
// CuDNN specific variables
std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
public:
ArgMaxImpl_cuda(const ArgMax_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<ArgMaxImpl_cuda> create(const ArgMax_Op &op) {
return std::make_unique<ArgMaxImpl_cuda>(op);
}
public:
void forward();
private:
template <class T> void forward_(const Tensor& input, std::int32_t axis, DimSize_t selectLastIdx);
};
namespace {
// add cuda backend to ArgMax_Op implementation registry
static Registrar<ArgMax_Op> registrarArgMaxImpl_cuda("cuda", Aidge::ArgMaxImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_ARGMAXIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_
#define AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge
{
template <class T>
void ArgMax_cuda_forward_kernel(const T* input, T* output,
const std::vector<int>& inputDims, const std::vector<int>& inputStrides,
int axis, int total_elems, std::size_t selectLastIdx);
}
#endif /* AIDGE_CUDA_OPERATOR_ARGMAXIMPL_KERNEL_H_ */
\ No newline at end of file
@@ -35,7 +35,7 @@ private:
// CuDNN specific variables
cudnnConvolutionDescriptor_t mConvDesc = nullptr;
cudnnFilterDescriptor_t mFilterDesc = nullptr;
cudnnConvolutionFwdAlgo_t mFwdAlgo;
cudnnConvolutionFwdAlgo_t mFwdAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
cudnnConvolutionBwdFilterAlgo_t mBwdFilterAlgo;
cudnnConvolutionBwdDataAlgo_t mBwdDataAlgo;
size_t mWorkspaceSize = 0;
......
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Div.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class DivImpl_cuda : public OperatorImpl {
private:
public:
DivImpl_cuda(const Div_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<DivImpl_cuda> create(const Div_Op &op) {
return std::make_unique<DivImpl_cuda>(op);
}
public:
void forward();
void backward();
// ~DivImpl_cuda();
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
template <class T> void backward_(const Tensor& outGrad);
};
namespace {
// add cuda backend to Div_Op implementation registry
static Registrar<Div_Op> registrarDivImpl_cuda("cuda", Aidge::DivImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_DIVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_
#define AIDGE_CUDA_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class T>
void divForward(const T* input1, T* output, const T* input2,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims, const std::vector<int>& outputDims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize);
}
#endif /* AIDGE_CUDA_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Ln.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class LnImpl_cuda : public OperatorImpl {
private:
std::shared_ptr<Tensor> mInputFallback;
std::shared_ptr<Tensor> mOutputGradFallback;
public:
LnImpl_cuda(const Ln_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<LnImpl_cuda> create(const Ln_Op &op) {
return std::make_unique<LnImpl_cuda>(op);
}
public:
void forward();
void backward();
// ~LnImpl_cuda();
private:
template <class T> void forward_(const Tensor& input);
template <class T> void backward_(const Tensor& output_grad);
};
namespace {
// add cuda backend to Ln_Op implementation registry
static Registrar<Ln_Op> registrarLnImpl_cuda("cuda", Aidge::LnImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_LNIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_LNIMPL_FORWARD_KERNEL_H_
#define AIDGE_CUDA_OPERATOR_LNIMPL_FORWARD_KERNEL_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class T>
void lnForward(const T* input, T* output, int size);
}
#endif /* AIDGE_CUDA_OPERATOR_LNIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Mul.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class MulImpl_cuda : public OperatorImpl {
private:
public:
MulImpl_cuda(const Mul_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<MulImpl_cuda> create(const Mul_Op &op) {
return std::make_unique<MulImpl_cuda>(op);
}
public:
void forward();
void backward();
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
template <class T> void backward_(const Tensor& outputGrad, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
};
namespace {
// add cuda backend to Mul_Op implementation registry
static Registrar<Mul_Op> registrarMulImpl_cuda("cuda", Aidge::MulImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_MULIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class PowImpl_cuda : public OperatorImpl {
private:
public:
PowImpl_cuda(const Pow_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<PowImpl_cuda> create(const Pow_Op &op) {
return std::make_unique<PowImpl_cuda>(op);
}
public:
void forward();
void backward();
// ~PowImpl_cuda();
private:
template <class T> void forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides);
template <class T> void backward_(const Tensor& outGrad);
};
namespace {
// add cuda backend to Pow_Op implementation registry
static Registrar<Pow_Op> registrarPowImpl_cuda("cuda", Aidge::PowImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_POWIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_POWIMPL_FORWARD_KERNEL_H_
#define AIDGE_CUDA_OPERATOR_POWIMPL_FORWARD_KERNEL_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
template <class T>
void powForward(const T* input, T* output, const T* exponent,
const std::vector<int>& inputDims, const std::vector<int>& exponentDims, const std::vector<int>& outputDims,
const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides, const std::vector<int>& outputStrides,
int outSize);
}
#endif /* AIDGE_CUDA_OPERATOR_POWIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_
#define AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge
{
template <class T>
void ReduceBackward(const T* input,
T* output,
const std::vector<std::size_t>& inputDims,
const std::vector<std::size_t>& outputDims,
const std::vector<int>& axes,
const std::vector<std::size_t>& factors,
int outSize);
}
#endif /* AIDGE_CUDA_OPERATOR_REDUCEIMPL_KERNEL_H_ */
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class ReduceMeanImpl_cuda : public OperatorImpl {
private:
// CuDNN specific variables
std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
public:
ReduceMeanImpl_cuda(const ReduceMean_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<ReduceMeanImpl_cuda> create(const ReduceMean_Op &op) {
return std::make_unique<ReduceMeanImpl_cuda>(op);
}
public:
void forward();
void backward();
// ~ReduceMeanImpl_cuda();
private:
template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims);
template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes);
};
namespace {
// add cuda backend to ReduceMean_Op implementation registry
static Registrar<ReduceMean_Op> registrarReduceMeanImpl_cuda("cuda", Aidge::ReduceMeanImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCEMEANIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
class ReduceSumImpl_cuda : public OperatorImpl {
private:
// CuDNN specific variables
std::shared_ptr<Tensor> mInputFallback, mOutputGradFallback;
public:
ReduceSumImpl_cuda(const ReduceSum_Op &op) : OperatorImpl(op, "cuda") {}
static std::unique_ptr<ReduceSumImpl_cuda> create(const ReduceSum_Op &op) {
return std::make_unique<ReduceSumImpl_cuda>(op);
}
public:
void forward();
void backward();
private:
template <class T> void forward_(const Tensor& input, const std::vector<int>& axes, bool keepDims);
template <class T> void backward_(const Tensor& output_grad, const std::vector<int>& axes);
};
namespace {
// add cuda backend to ReduceSum_Op implementation registry
static Registrar<ReduceSum_Op> registrarReduceSumImpl_cuda("cuda", Aidge::ReduceSumImpl_cuda::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_REDUCESUMIMPL_H_ */
@@ -44,6 +44,10 @@ void Aidge::AddImpl_cuda::forward() {
std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
if (dims[i].size() < 4) {
dims[i].resize(4, 1);
}
// Compute the corresponding strides
std::vector<int> tensorStrides(dims[i].size());
int product = 1;
......
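The stride computation started in this hunk (and repeated in full in AndImpl_cuda::forward() below) derives contiguous row-major strides from the broadcast, 4-padded dims. A standalone sketch of that step, assuming a contiguous layout:

#include <cassert>
#include <cstddef>
#include <vector>

// Row-major (contiguous) strides for a given shape, mirroring the loop used in
// AddImpl_cuda::forward() and AndImpl_cuda::forward().
std::vector<int> rowMajorStrides(const std::vector<int>& dims) {
    std::vector<int> strides(dims.size());
    int product = 1;
    for (std::size_t j = dims.size(); j > 0; --j) {
        strides[j - 1] = product;
        product *= dims[j - 1];
    }
    return strides;
}

int main() {
    // A broadcast input of shape {1, 3, 1, 5} gets strides {15, 5, 5, 1}.
    assert((rowMajorStrides({1, 3, 1, 5}) == std::vector<int>{15, 5, 5, 1}));
    return 0;
}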
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/AndImpl.hpp"
#include "aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/And.hpp"
#include "aidge/utils/Types.h"
void Aidge::AndImpl_cuda::forward() {
const And_Op& op = static_cast<const And_Op&>(mOp);
// Check inputs
AIDGE_ASSERT(op.getInput(0), "missing input in And operator");
AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation.");
DataType datatypeFirstInput = op.getInput(0)->dataType();
for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
AIDGE_ASSERT(op.getInput(i), "missing input in And operator");
AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run And forward because the {}-th input has no implementation.", i);
AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot compute And on inputs with two different data types.");
}
std::vector<std::shared_ptr<Tensor>> inputFallbacks(op.nbInputs());
std::vector<Tensor> inputs(op.nbInputs());
std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
std::vector<std::vector<int>> strides(op.nbInputs()); // For the corresponding strides
for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
// Get tensor dims and broadcast them
std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
if (dims[i].size() < 4) {
dims[i].resize(4, 1);
}
// Compute the corresponding strides
std::vector<int> tensorStrides(dims[i].size());
int product = 1;
for (size_t j = dims[i].size(); j > 0; --j) {
tensorStrides[j - 1] = product;
product *= dims[i][j - 1];
}
strides[i] = tensorStrides;
}
switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
case DataType::Float64:
forward_<double>(inputs, dims, strides);
break;
case DataType::Float32:
forward_<float>(inputs, dims, strides);
break;
case DataType::Float16:
forward_<half>(inputs, dims, strides);
break;
default:
AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by Backend Cuda");
}
}
template <class T>
void Aidge::AndImpl_cuda::forward_(const std::vector<Tensor>& inputs, const std::vector<std::vector<int>>& inputsDims, const std::vector<std::vector<int>>& inputsStrides) {
const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
if(op.getOutput(0)->nbDims()>1) {
for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) {
outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1];
}
}
Aidge::AndForward<T>(input1Ptr, input2Ptr, outputPtr,
inputsDims[0], inputsDims[1],
inputsStrides[0], inputsStrides[1], outputStrides,
static_cast<int>(op.getOutput(0)->size()));
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cuda_fp16.h>
#include "aidge/backend/cuda/operator/AndImpl_CUDA_kernels.hpp"
// Helper function for comparison
template <typename T>
__device__ bool compareE(T a, T b) {
return a == b;
}
template <>
__device__ bool compareE<half>(half a, half b) {
return __half2float(a) == __half2float(b);
}
template <typename T>
__global__ void and_cuda_Kernel(const T* input1, const T* input2, T* output,
int* input1_shape, int* input2_shape,
int* input1_strides, int* input2_strides, int* output_strides,
int num_dims, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
int input1_idx = 0, input2_idx = 0;
int temp_idx = idx;
for (int i = 0; i < num_dims; ++i) {
int dim = temp_idx / output_strides[i];
temp_idx %= output_strides[i];
input1_idx += (input1_shape[i] == 1 ? 0 : dim) * input1_strides[i];
input2_idx += (input2_shape[i] == 1 ? 0 : dim) * input2_strides[i];
}
output[idx] = static_cast<T>(compareE(input1[input1_idx], input2[input2_idx]));
}
template <typename T>
void Aidge::AndForward(const T* input1, const T* input2, T* output,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize)
{
int *d_input1_strides, *d_input2_strides, *d_output_strides, *d_input1_shape, *d_input2_shape;
// Allocate device memory
CHECK_CUDA_STATUS(cudaMalloc(&d_input1_shape, input1Dims.size() * sizeof(int)));
CHECK_CUDA_STATUS(cudaMalloc(&d_input2_shape, input2Dims.size() * sizeof(int)));
CHECK_CUDA_STATUS(cudaMalloc(&d_input1_strides, input1Strides.size() * sizeof(int)));
CHECK_CUDA_STATUS(cudaMalloc(&d_input2_strides, input2Strides.size() * sizeof(int)));
CHECK_CUDA_STATUS(cudaMalloc(&d_output_strides, outputStrides.size() * sizeof(int)));
// Copy data from host to device
CHECK_CUDA_STATUS(cudaMemcpy(d_input1_shape, input1Dims.data(), input1Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
CHECK_CUDA_STATUS(cudaMemcpy(d_input2_shape, input2Dims.data(), input2Dims.size() * sizeof(int), cudaMemcpyHostToDevice));
CHECK_CUDA_STATUS(cudaMemcpy(d_input1_strides, input1Strides.data(), input1Strides.size() * sizeof(int), cudaMemcpyHostToDevice));
CHECK_CUDA_STATUS(cudaMemcpy(d_input2_strides, input2Strides.data(), input2Strides.size() * sizeof(int), cudaMemcpyHostToDevice));
CHECK_CUDA_STATUS(cudaMemcpy(d_output_strides, outputStrides.data(), outputStrides.size() * sizeof(int), cudaMemcpyHostToDevice));
int blockSize = 256;
int numBlocks = (outSize + blockSize - 1) / blockSize;
int num_dims = input1Dims.size();
// Launch the kernel
and_cuda_Kernel<<<numBlocks, blockSize>>>(input1, input2, output,
d_input1_shape, d_input2_shape,
d_input1_strides, d_input2_strides, d_output_strides,
num_dims, outSize);
CHECK_CUDA_STATUS(cudaFree(d_input1_shape));
CHECK_CUDA_STATUS(cudaFree(d_input2_shape));
CHECK_CUDA_STATUS(cudaFree(d_input1_strides));
CHECK_CUDA_STATUS(cudaFree(d_input2_strides));
CHECK_CUDA_STATUS(cudaFree(d_output_strides));
}
template void Aidge::AndForward(const double* input1, const double* input2, double* output,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize);
template void Aidge::AndForward(const float* input1, const float* input2, float* output,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize);
template void Aidge::AndForward(const half* input1, const half* input2, half* output,
const std::vector<int>& input1Dims, const std::vector<int>& input2Dims,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides, const std::vector<int>& outputStrides,
int outSize);
\ No newline at end of file
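For reference, the per-thread index arithmetic in and_cuda_Kernel can be reproduced on the host: each flat output index is decomposed along the output strides, and any input dimension of size 1 is pinned to coordinate 0 so that it broadcasts. A host-side sketch of that mapping (plain C++, no CUDA):

#include <cassert>
#include <cstddef>
#include <vector>

// Host-side mirror of the index mapping done per-thread in and_cuda_Kernel.
int broadcastIndex(int flatOutIdx,
                   const std::vector<int>& inShape,
                   const std::vector<int>& inStrides,
                   const std::vector<int>& outStrides) {
    int inIdx = 0;
    int tmp = flatOutIdx;
    for (std::size_t i = 0; i < outStrides.size(); ++i) {
        int coord = tmp / outStrides[i];
        tmp %= outStrides[i];
        // Size-1 dimensions are broadcast: they always contribute coordinate 0.
        inIdx += (inShape[i] == 1 ? 0 : coord) * inStrides[i];
    }
    return inIdx;
}

int main() {
    // Output shape {2, 3}, input shape {1, 3} broadcast along the first axis.
    std::vector<int> outStrides = {3, 1};
    std::vector<int> inShape    = {1, 3};
    std::vector<int> inStrides  = {3, 1};
    // Output element (1, 2) has flat index 5 and maps to input element (0, 2), i.e. flat index 2.
    assert(broadcastIndex(5, inShape, inStrides, outStrides) == 2);
    return 0;
}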