Commit b488c7cd authored by Olivier Antoni

Add Exp operator

parent 02da21b0
2 merge requests: !90 v0.7.0, !89 Add Exp operator
Pipeline #81018 passed
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_EXPIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_EXPIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Exp.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class ExpImpl_cuda : public OperatorImpl {
public:
    ExpImpl_cuda(const Exp_Op& op) : OperatorImpl(op, "cuda") {}

    static std::unique_ptr<ExpImpl_cuda> create(const Exp_Op& op) {
        return std::make_unique<ExpImpl_cuda>(op);
    }

    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},
            {DataType::Float16},
        };
    }

    void forward() override;
    void backward() override;

private:
    std::shared_ptr<Tensor> mInputFallback;
    std::shared_ptr<Tensor> mOutputGradFallback;

    template <class T> void forward_(const Tensor& input);
    template <class T> void backward_(const Tensor& input, const Tensor& output_grad);
};
// Implementation entry point registration to Operator
REGISTRAR(Exp_Op, "cuda", Aidge::ExpImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_EXPIMPL_H_ */
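// ----------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the commit): once the REGISTRAR line
// above is compiled in, an Exp_Op whose backend is set to "cuda" resolves to
// ExpImpl_cuda. This mirrors the pattern used by the unit test at the end of
// this commit; the wrapper function name and the dims are hypothetical.
// ----------------------------------------------------------------------------
#include <memory>

#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Exp.hpp"

void expUsageSketch() {
    auto op = std::make_shared<Aidge::Exp_Op>();
    op->setDataType(Aidge::DataType::Float32);
    op->setBackend("cuda"); // dispatches to ExpImpl_cuda via the registrar

    auto input = std::make_shared<Aidge::Tensor>();
    input->setDataType(Aidge::DataType::Float32);
    input->setBackend("cuda");
    input->resize({2, 3}); // fill the tensor with real data before use
    op->associateInput(0, input);

    op->forward(); // launches the CUDA forward kernel
}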
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_EXPIMPL_KERNELS_H_
#define AIDGE_CUDA_OPERATOR_EXPIMPL_KERNELS_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
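// expForward / expBackward are host-side kernel launchers, defined in the .cu
// file of this commit: all pointers are device pointers to `size` contiguous
// elements, and each launcher picks its own grid/block configuration.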
template <class T>
void expForward(const T* input, T* output, int size);
template <class T>
void expBackward(const T* input, const T* outputGrad, T* inputGrad, int size);
}
#endif /* AIDGE_CUDA_OPERATOR_EXPIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>
#include <cuda_fp16.h>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/Exp.hpp"
#include "aidge/utils/Types.h"
void Aidge::ExpImpl_cuda::forward() {
    const Exp_Op& op = static_cast<const Exp_Op&>(mOp);

    AIDGE_ASSERT(op.getInput(0), "missing input #0");

    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));

    switch(op.getOutput(0)->dataType()) {
        case DataType::Float64:
            forward_<double>(input);
            break;
        case DataType::Float32:
            forward_<float>(input);
            break;
        case DataType::Float16:
            forward_<half>(input);
            break;
        default:
            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by the CUDA backend");
    }
}
template <class T>
void Aidge::ExpImpl_cuda::forward_(const Tensor& input)
{
    const Exp_Op& op = static_cast<const Exp_Op&>(mOp);
    const T* inputPtr = static_cast<const T*>(input.getImpl()->rawPtr());
    T* outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
    int size = op.getInput(0)->size();

    Aidge::expForward<T>(inputPtr, outputPtr, size);
}
void Aidge::ExpImpl_cuda::backward()
{
    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);

    AIDGE_ASSERT(op.getInput(0), "missing input #0");
    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output #0 grad");

    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());

    switch(op.getInput(0)->grad()->dataType()) {
        case DataType::Float64:
            backward_<double>(input, output_grad);
            break;
        case DataType::Float32:
            backward_<float>(input, output_grad);
            break;
        case DataType::Float16:
            backward_<half>(input, output_grad);
            break;
        default:
            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by the CUDA backend");
    }
}
template <class T>
void Aidge::ExpImpl_cuda::backward_(const Tensor& input, const Tensor& output_grad)
{
    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
    const T* inputPtr = static_cast<const T*>(input.getImpl()->rawPtr());
    const T* outputGradPtr = static_cast<const T*>(output_grad.getImpl()->rawPtr());
    T* inputGradPtr = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
    int size = op.getOutput(0)->size();

    Aidge::expBackward<T>(inputPtr, outputGradPtr, inputGradPtr, size);
}
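// Note on the backward computation: for y = exp(x), the chain rule gives
// dL/dx = dL/dy * exp(x); e.g. at x = 0, y = 1 and dL/dx = dL/dy. The CUDA
// kernel below therefore multiplies the incoming gradient by exp(input)
// (equivalently, by the forward output).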
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cuda/operator/ExpImpl_CUDA_kernels.hpp"
// Base template for floating-point types (float, double)
template<typename T>
__device__ T exp_helper(T x) {
    return std::exp(x); // std::exp works for both float and double
}

// Specialization for half-precision type using CUDA's half
template<>
__device__ half exp_helper<half>(half x) {
#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    return hexp(x);                          // native half-precision exp
#else
    float x_float = __half2float(x);         // Convert half to float
    return __float2half(std::exp(x_float));  // Compute exp and convert back to half
#endif
}
// Base template for floating-point types (float, double)
template<typename T>
__device__ T mul_helper_for_exp(T a, T b) {
    return a * b;
}

// Specialization for half-precision type using CUDA's half
template<>
__device__ half mul_helper_for_exp<half>(half a, half b) {
#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    return __hmul(a, b);                     // native half-precision multiply
#else
    float a_float = __half2float(a);         // Convert half to float
    float b_float = __half2float(b);         // Convert half to float
    return __float2half(a_float * b_float);  // Multiply and convert back to half
#endif
}
template <class T>
__global__ void expCUDAForwardKernel(const T* input, T* output, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;

    output[idx] = exp_helper(input[idx]);
}
template <class T>
void Aidge::expForward(const T* input, T* output, int size)
{
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    expCUDAForwardKernel<<<numBlocks, blockSize>>>(input, output, size);
}
template <class T>
__global__ void expCUDABackwardKernel(const T* input, const T* outputGrad, T* inputGrad, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;

    // Chain rule: d/dx exp(x) = exp(x), so inputGrad = exp(input) * outputGrad
    inputGrad[idx] = mul_helper_for_exp(exp_helper(input[idx]), outputGrad[idx]);
}
template <class T>
void Aidge::expBackward(const T* input, const T* outputGrad, T* inputGrad, int size)
{
    const int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    expCUDABackwardKernel<<<numBlocks, blockSize>>>(input, outputGrad, inputGrad, size);
}
template void Aidge::expForward<double>(const double* input, double* output, int size);
template void Aidge::expForward<float>(const float* input, float* output, int size);
template void Aidge::expForward<half>(const half* input, half* output, int size);
template void Aidge::expBackward<double>(const double* input, const double* outputGrad, double* inputGrad, int size);
template void Aidge::expBackward<float>(const float* input, const float* outputGrad, float* inputGrad, int size);
template void Aidge::expBackward<half>(const half* input, const half* outputGrad, half* inputGrad, int size);
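// ----------------------------------------------------------------------------
// Minimal standalone sketch (illustrative, not part of the commit) showing the
// launcher contract, assuming this is compiled with nvcc against the kernels
// above: pass device pointers and an element count, then synchronize before
// reading results back (kernel launches are asynchronous).
// ----------------------------------------------------------------------------
#include <cstdio>
#include <cuda_runtime.h>
#include "aidge/backend/cuda/operator/ExpImpl_CUDA_kernels.hpp"

int main() {
    const int n = 4;
    const float h_in[n] = {0.f, 1.f, 2.f, 3.f};
    float h_out[n];

    // Allocate device buffers and upload the input
    float *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    Aidge::expForward<float>(d_in, d_out, n); // launches the kernel
    cudaDeviceSynchronize();                  // wait for completion

    cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i)
        std::printf("exp(%g) = %g\n", h_in[i], h_out[i]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}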
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono>     // std::chrono::system_clock, std::chrono::milliseconds,
                      // std::chrono::duration_cast
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <cuda.h>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/ExpImpl.hpp"
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/operator/Exp.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace std::chrono;
namespace Aidge {
TEST_CASE("[gpu/operator] Exp", "[Exp][GPU]")
{
    // CONSTANTS
    constexpr std::uint16_t NB_TRIALS = 10;

    // SETUP RNGS
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(-10, 10);
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(20));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(6));

    for (std::uint16_t trial = 0; trial < NB_TRIALS; ++trial)
    {
        // PREPARE TEST DATA
        const std::size_t nbDims = nbDimsDist(gen);
        std::vector<std::size_t> dims;
        for (std::size_t i = 0; i < nbDims; ++i)
            dims.push_back(dimSizeDist(gen));
        const std::size_t nbElements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());

        float* rawData = new float[nbElements];
        for (std::size_t i = 0; i < nbElements; ++i)
            rawData[i] = valueDist(gen);

        // CPU FORWARD
        std::shared_ptr<Exp_Op> cpuOp = std::make_shared<Exp_Op>();
        cpuOp->setDataType(DataType::Float32);
        cpuOp->setBackend("cpu");

        std::shared_ptr<Tensor> cpuTensor = std::make_shared<Tensor>();
        cpuOp->associateInput(0, cpuTensor);
        cpuTensor->setDataType(DataType::Float32);
        cpuTensor->setBackend("cpu");
        cpuTensor->resize(dims);
        cpuTensor->getImpl()->setRawPtr(rawData, nbElements);

        auto startTime = std::chrono::system_clock::now();
        cpuOp->forward();
        auto endTime = std::chrono::system_clock::now();
        auto cpuElapsedTime = duration_cast<milliseconds>(endTime - startTime).count();
        Tensor cpuResult = *(cpuOp->getOutput(0));

        // CUDA FORWARD
        std::shared_ptr<Exp_Op> cudaOp = std::make_shared<Exp_Op>();
        cudaOp->setDataType(DataType::Float32);
        cudaOp->setBackend("cuda");

        std::shared_ptr<Tensor> cudaTensor = std::make_shared<Tensor>();
        cudaTensor->setDataType(DataType::Float32);
        cudaTensor->setBackend("cuda");
        cudaTensor->resize(dims);
        cudaOp->associateInput(0, cudaTensor);

        float* rawDataDevice;
        cudaMalloc(reinterpret_cast<void**>(&rawDataDevice), sizeof(float) * nbElements);
        cudaMemcpy(rawDataDevice, rawData, sizeof(float) * nbElements, cudaMemcpyHostToDevice);
        cudaTensor->getImpl()->setRawPtr(rawDataDevice, nbElements);

        startTime = std::chrono::system_clock::now();
        cudaOp->forward();
        cudaDeviceSynchronize(); // kernel launches are asynchronous; wait before timing
        endTime = std::chrono::system_clock::now();
        auto cudaElapsedTime = duration_cast<milliseconds>(endTime - startTime).count();

        std::shared_ptr<Tensor> fallback;
        Tensor& cudaResult = cudaOp->getOutput(0)->refCastFrom(fallback, DataType::Float32, "cpu");

        // COMPARE
        REQUIRE(approxEq<float>(cudaResult, cpuResult));

        // FREE MEMORY
        delete[] rawData;
        cudaFree(rawDataDevice);

        // LOG INFOS
        fmt::print(" Execution time on CPU : {} ms\n", cpuElapsedTime);
        fmt::print(" Execution time on CUDA : {} ms\n", cudaElapsedTime);
    }
}
} // namespace Aidge