Commit b488c7cd authored by Olivier Antoni

Add Exp operator

parent 02da21b0
2 merge requests: !90 v0.7.0, !89 Add Exp operator
Pipeline #81018 passed
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_BACKEND_CUDA_OPERATOR_EXPIMPL_H_
#define AIDGE_BACKEND_CUDA_OPERATOR_EXPIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include <cudnn.h>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Exp.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Operator implementation entry point for the backend
class ExpImpl_cuda : public OperatorImpl {
public:
    ExpImpl_cuda(const Exp_Op& op) : OperatorImpl(op, "cuda") {}

    static std::unique_ptr<ExpImpl_cuda> create(const Exp_Op& op) {
        return std::make_unique<ExpImpl_cuda>(op);
    }

    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        return {
            {DataType::Float64},
            {DataType::Float32},
            {DataType::Float16},
        };
    }

    void forward() override;
    void backward() override;

private:
    std::shared_ptr<Tensor> mInputFallback;
    std::shared_ptr<Tensor> mOutputGradFallback;

    template <class T> void forward_(const Tensor& input);
    template <class T> void backward_(const Tensor& input, const Tensor& output_grad);
};
// Implementation entry point registration to Operator
REGISTRAR(Exp_Op, "cuda", Aidge::ExpImpl_cuda::create);
} // namespace Aidge
#endif /* AIDGE_BACKEND_CUDA_OPERATOR_EXPIMPL_H_ */
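// ----------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the commit): once the REGISTRAR line
// above is compiled in, an Exp_Op whose backend is set to "cuda" resolves to
// ExpImpl_cuda. This mirrors the pattern used by the unit test at the end of
// this commit; the wrapper function name and the dims are hypothetical.
// ----------------------------------------------------------------------------
#include <memory>

#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Exp.hpp"

void expUsageSketch() {
    auto op = std::make_shared<Aidge::Exp_Op>();
    op->setDataType(Aidge::DataType::Float32);
    op->setBackend("cuda"); // dispatches to ExpImpl_cuda via the registrar

    auto input = std::make_shared<Aidge::Tensor>();
    input->setDataType(Aidge::DataType::Float32);
    input->setBackend("cuda");
    input->resize({2, 3}); // fill the tensor with real data before use
    op->associateInput(0, input);

    op->forward(); // launches the CUDA forward kernel
}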
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CUDA_OPERATOR_EXPIMPL_KERNELS_H_
#define AIDGE_CUDA_OPERATOR_EXPIMPL_KERNELS_H_
#include <stdexcept>
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
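// expForward / expBackward are host-side kernel launchers, defined in the .cu
// file of this commit: all pointers are device pointers to `size` contiguous
// elements, and each launcher picks its own grid/block configuration.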
template <class T>
void expForward(const T* input, T* output, int size);
template <class T>
void expBackward(const T* input, const T* outputGrad, T* inputGrad, int size);
}
#endif /* AIDGE_CUDA_OPERATOR_EXPIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>
#include <cuda_fp16.h>
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/operator/Exp.hpp"
#include "aidge/utils/Types.h"
void Aidge::ExpImpl_cuda::forward() {
    const Exp_Op& op = static_cast<const Exp_Op&>(mOp);

    AIDGE_ASSERT(op.getInput(0), "missing input #0");

    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));

    switch(op.getOutput(0)->dataType()) {
        case DataType::Float64:
            forward_<double>(input);
            break;
        case DataType::Float32:
            forward_<float>(input);
            break;
        case DataType::Float16:
            forward_<half>(input);
            break;
        default:
            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by the CUDA backend");
    }
}
template <class T>
void Aidge::ExpImpl_cuda::forward_(const Tensor& input)
{
    const Exp_Op& op = static_cast<const Exp_Op&>(mOp);
    const T* inputPtr = static_cast<const T*>(input.getImpl()->rawPtr());
    T* outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
    int size = op.getInput(0)->size();

    Aidge::expForward<T>(inputPtr, outputPtr, size);
}
void Aidge::ExpImpl_cuda::backward()
{
    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);

    AIDGE_ASSERT(op.getInput(0), "missing input #0");
    AIDGE_ASSERT(op.getOutput(0)->grad(), "missing output #0 grad");

    const auto& input = op.getInput(0)->refCastFrom(mInputFallback, *op.getOutput(0));
    const auto& output_grad = op.getOutput(0)->grad()->refCastFrom(mOutputGradFallback, *op.getOutput(0)->grad());

    switch(op.getInput(0)->grad()->dataType()) {
        case DataType::Float64:
            backward_<double>(input, output_grad);
            break;
        case DataType::Float32:
            backward_<float>(input, output_grad);
            break;
        case DataType::Float16:
            backward_<half>(input, output_grad);
            break;
        default:
            AIDGE_THROW_OR_ABORT(std::runtime_error, "Data type is not supported by the CUDA backend");
    }
}
template <class T>
void Aidge::ExpImpl_cuda::backward_(const Tensor& input, const Tensor& output_grad)
{
    const OperatorTensor& op = static_cast<const OperatorTensor&>(mOp);
    const T* inputPtr = static_cast<const T*>(input.getImpl()->rawPtr());
    const T* outputGradPtr = static_cast<const T*>(output_grad.getImpl()->rawPtr());
    T* inputGradPtr = static_cast<T*>(op.getInput(0)->grad()->getImpl()->rawPtr());
    int size = op.getOutput(0)->size();

    Aidge::expBackward<T>(inputPtr, outputGradPtr, inputGradPtr, size);
}
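// Note on the backward computation: for y = exp(x), the chain rule gives
// dL/dx = dL/dy * exp(x); e.g. at x = 0, y = 1 and dL/dx = dL/dy. The CUDA
// kernel below therefore multiplies the incoming gradient by exp(input)
// (equivalently, by the forward output).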
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cuda/operator/ExpImpl_CUDA_kernels.hpp"
// Base template for floating-point types (float, double)
template<typename T>
__device__ T exp_helper(T x) {
    return std::exp(x); // std::exp works for both float and double
}

// Specialization for half-precision type using CUDA's half
template<>
__device__ half exp_helper<half>(half x) {
#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    return hexp(x);                          // native half-precision exp
#else
    float x_float = __half2float(x);         // Convert half to float
    return __float2half(std::exp(x_float));  // Compute exp and convert back to half
#endif
}
// Base template for floating-point types (float, double)
template<typename T>
__device__ T mul_helper_for_exp(T a, T b) {
    return a * b;
}

// Specialization for half-precision type using CUDA's half
template<>
__device__ half mul_helper_for_exp<half>(half a, half b) {
#if __CUDA_ARCH__ >= 530 && defined(CUDART_VERSION) && CUDART_VERSION >= 9000
    return __hmul(a, b);                     // native half-precision multiply
#else
    float a_float = __half2float(a);         // Convert half to float
    float b_float = __half2float(b);         // Convert half to float
    return __float2half(a_float * b_float);  // Multiply and convert back to half
#endif
}
template <class T>
__global__ void expCUDAForwardKernel(const T* input, T* output, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;

    output[idx] = exp_helper(input[idx]);
}
template <class T>
void Aidge::expForward(const T* input, T* output, int size)
{
    int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    expCUDAForwardKernel<<<numBlocks, blockSize>>>(input, output, size);
}
template <class T>
__global__ void expCUDABackwardKernel(const T* input, const T* outputGrad, T* inputGrad, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size) return;

    // Chain rule: d/dx exp(x) = exp(x), so inputGrad = exp(input) * outputGrad
    inputGrad[idx] = mul_helper_for_exp(exp_helper(input[idx]), outputGrad[idx]);
}
template <class T>
void Aidge::expBackward(const T* input, const T* outputGrad, T* inputGrad, int size)
{
    const int blockSize = 256;
    int numBlocks = (size + blockSize - 1) / blockSize;
    expCUDABackwardKernel<<<numBlocks, blockSize>>>(input, outputGrad, inputGrad, size);
}
template void Aidge::expForward<double>(const double* input, double* output, int size);
template void Aidge::expForward<float>(const float* input, float* output, int size);
template void Aidge::expForward<half>(const half* input, half* output, int size);
template void Aidge::expBackward<double>(const double* input, const double* outputGrad, double* inputGrad, int size);
template void Aidge::expBackward<float>(const float* input, const float* outputGrad, float* inputGrad, int size);
template void Aidge::expBackward<half>(const half* input, const half* outputGrad, half* inputGrad, int size);
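// ----------------------------------------------------------------------------
// Minimal standalone sketch (illustrative, not part of the commit) showing the
// launcher contract, assuming this is compiled with nvcc against the kernels
// above: pass device pointers and an element count, then synchronize before
// reading results back (kernel launches are asynchronous).
// ----------------------------------------------------------------------------
#include <cstdio>
#include <cuda_runtime.h>
#include "aidge/backend/cuda/operator/ExpImpl_CUDA_kernels.hpp"

int main() {
    const int n = 4;
    const float h_in[n] = {0.f, 1.f, 2.f, 3.f};
    float h_out[n];

    // Allocate device buffers and upload the input
    float *d_in, *d_out;
    cudaMalloc(&d_in,  n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    Aidge::expForward<float>(d_in, d_out, n); // launches the kernel
    cudaDeviceSynchronize();                  // wait for completion

    cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i)
        std::printf("exp(%g) = %g\n", h_in[i], h_out[i]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}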
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono>     // std::chrono::system_clock, std::chrono::milliseconds,
                      // std::chrono::duration_cast
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <cuda.h>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/ExpImpl.hpp"
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ExpImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/operator/Exp.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace std::chrono;
namespace Aidge {
TEST_CASE("[gpu/operator] Exp", "[Exp][GPU]")
{
    // CONSTANTS
    constexpr std::uint16_t NB_TRIALS = 10;

    // SETUP RNGS
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(-10, 10);
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(1), std::size_t(20));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(6));

    for (std::uint16_t trial = 0; trial < NB_TRIALS; ++trial)
    {
        // PREPARE TEST DATA
        const std::size_t nbDims = nbDimsDist(gen);
        std::vector<std::size_t> dims;
        for (std::size_t i = 0; i < nbDims; ++i)
            dims.push_back(dimSizeDist(gen));
        const std::size_t nbElements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());

        float* rawData = new float[nbElements];
        for (std::size_t i = 0; i < nbElements; ++i)
            rawData[i] = valueDist(gen);

        // CPU FORWARD
        std::shared_ptr<Exp_Op> cpuOp = std::make_shared<Exp_Op>();
        cpuOp->setDataType(DataType::Float32);
        cpuOp->setBackend("cpu");

        std::shared_ptr<Tensor> cpuTensor = std::make_shared<Tensor>();
        cpuOp->associateInput(0, cpuTensor);
        cpuTensor->setDataType(DataType::Float32);
        cpuTensor->setBackend("cpu");
        cpuTensor->resize(dims);
        cpuTensor->getImpl()->setRawPtr(rawData, nbElements);

        auto startTime = std::chrono::system_clock::now();
        cpuOp->forward();
        auto endTime = std::chrono::system_clock::now();
        auto cpuElapsedTime = duration_cast<milliseconds>(endTime - startTime).count();
        Tensor cpuResult = *(cpuOp->getOutput(0));

        // CUDA FORWARD
        std::shared_ptr<Exp_Op> cudaOp = std::make_shared<Exp_Op>();
        cudaOp->setDataType(DataType::Float32);
        cudaOp->setBackend("cuda");

        std::shared_ptr<Tensor> cudaTensor = std::make_shared<Tensor>();
        cudaTensor->setDataType(DataType::Float32);
        cudaTensor->setBackend("cuda");
        cudaTensor->resize(dims);
        cudaOp->associateInput(0, cudaTensor);

        float* rawDataDevice;
        cudaMalloc(reinterpret_cast<void**>(&rawDataDevice), sizeof(float) * nbElements);
        cudaMemcpy(rawDataDevice, rawData, sizeof(float) * nbElements, cudaMemcpyHostToDevice);
        cudaTensor->getImpl()->setRawPtr(rawDataDevice, nbElements);

        startTime = std::chrono::system_clock::now();
        cudaOp->forward();
        cudaDeviceSynchronize(); // kernel launches are asynchronous; wait before timing
        endTime = std::chrono::system_clock::now();
        auto cudaElapsedTime = duration_cast<milliseconds>(endTime - startTime).count();

        std::shared_ptr<Tensor> fallback;
        Tensor& cudaResult = cudaOp->getOutput(0)->refCastFrom(fallback, DataType::Float32, "cpu");

        // COMPARE
        REQUIRE(approxEq<float>(cudaResult, cpuResult));

        // FREE MEMORY
        delete[] rawData;
        cudaFree(rawDataDevice);

        // LOG INFOS
        fmt::print(" Execution time on CPU : {} ms\n", cpuElapsedTime);
        fmt::print(" Execution time on CUDA : {} ms\n", cudaElapsedTime);
    }
}
} // namespace Aidge