Commit 12c56d5e authored by Houssem ROUIS

use cublas for gemm operator

parent 435b6c51
Part of merge requests !15 (version 0.2.0), !12 (Lenetop) and !10 (Lenet operators).
@@ -50,6 +50,7 @@ target_link_libraries(${module_name}
     PUBLIC
         _aidge_core # _ is added because we link the target not the project
         CUDA::cudart
+        CUDA::cublas
         cudnn
 )
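With CUDA::cublas newly linked, a quick way to verify the dependency actually resolves at link time is a cublasCreate/cublasDestroy round-trip. The snippet below is a standalone sanity-check sketch, not part of the commit:

```cpp
// Minimal link-sanity check for a new cuBLAS dependency:
// if this compiles, links and prints status 0, cuBLAS is wired up correctly.
#include <cublas_v2.h>
#include <cstdio>

int main()
{
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    std::printf("cublasCreate status: %d\n", static_cast<int>(status));
    if (status == CUBLAS_STATUS_SUCCESS) {
        cublasDestroy(handle);
    }
    return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
```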
@@ -16,20 +16,21 @@
 #include <cfloat>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#include <cuda_fp16.h>

 #include "aidge/data/Data.hpp"
-#include "aidge/backend/cuda/operator/FCImpl.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"

 namespace Aidge {

-template<class T>
-void fc_forward_cuda(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const void *input, const void *weights, const void *bias, void *output);
-
-namespace {
-static Registrar<FCImplForward_cuda> registrarFCImpl2DForward_cuda_Float32({DataType::Float32}, Aidge::fc_forward_cuda<float>);
-static Registrar<FCImplForward_cuda> registrarFCImpl2DForward_cuda_Float64({DataType::Float64}, Aidge::fc_forward_cuda<double>);
-} // namespace
+template <class T>
+cublasStatus_t cublasGemm(cublasHandle_t handle,
+                          cublasOperation_t transa, cublasOperation_t transb,
+                          int m, int n, int k,
+                          const T *alpha,
+                          const T *A, int lda,
+                          const T *B, int ldb,
+                          const T *beta,
+                          T *C, int ldc);
 }
 #endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
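The header now exposes a single templated entry point; explicit specializations (added in the .cu file further down) select the right typed cuBLAS routine at compile time, so call sites look identical for every precision. As an illustration of the pattern, here is a hypothetical standalone program — the 2x2 data and the float-only specialization are made up for the example, not taken from the commit:

```cpp
// Illustrative only: exercising a cublasGemm<T>-style wrapper with float data.
// Computes C = A * B for 2x2 column-major matrices on the GPU.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

template <class T>
cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k,
                          const T *alpha, const T *A, int lda,
                          const T *B, int ldb,
                          const T *beta, T *C, int ldc);

template <>
cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
                                 cublasOperation_t transa, cublasOperation_t transb,
                                 int m, int n, int k,
                                 const float *alpha, const float *A, int lda,
                                 const float *B, int ldb,
                                 const float *beta, float *C, int ldc)
{
    return cublasSgemm(handle, transa, transb, m, n, k,
                       alpha, A, lda, B, ldb, beta, C, ldc);
}

int main()
{
    cublasHandle_t handle;
    cublasCreate(&handle);

    const float hA[4] = {1.f, 2.f, 3.f, 4.f};   // column-major: A = [1 3; 2 4]
    const float hB[4] = {5.f, 6.f, 7.f, 8.f};   // column-major: B = [5 7; 6 8]
    float hC[4] = {};
    float *dA, *dB, *dC;
    cudaMalloc(&dA, sizeof(hA)); cudaMalloc(&dB, sizeof(hB)); cudaMalloc(&dC, sizeof(hC));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    const float alpha = 1.f, beta = 0.f;
    // The same call shape would serve double or __half once those
    // specializations exist; only the template argument changes.
    cublasGemm<float>(handle, CUBLAS_OP_N, CUBLAS_OP_N, 2, 2, 2,
                      &alpha, dA, 2, dB, 2, &beta, dC, 2);

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    std::printf("C = [%g %g; %g %g]\n", hC[0], hC[2], hC[1], hC[3]);  // [23 31; 34 46]

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
    return 0;
}
```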
@@ -23,6 +23,7 @@
 #include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
 #include "aidge/backend/cuda/utils/CudaContext.hpp"
+#include "aidge/backend/cuda/utils/CudaUtils.hpp"

 void Aidge::FCImpl_cuda::forward() {
     assert(mOp.getRawInput(0) && "missing input #0");
@@ -35,8 +36,9 @@ void Aidge::FCImpl_cuda::forward() {
     const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));

     const auto& fcOp = static_cast<const FC_Op&>(mOp);
-    std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
     bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
+    std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());

     if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
         forward_<double>(input0, input1, input2, noBias, outChannels);
     }
@@ -48,13 +50,61 @@ void Aidge::FCImpl_cuda::forward() {
 template<class T>
 void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
 {
-    Aidge::fc_forward_cuda<T>(
-        input0.dims()[0],
-        input0.size() / input0.dims()[0],
-        outChannels,
-        noBias,
-        input0.getImpl()->rawPtr(),
-        input1.getImpl()->rawPtr(),
-        input2.getImpl()->rawPtr(),
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+    const T* input = static_cast<const T*>(input0.getImpl()->rawPtr());
+    const T* weights = static_cast<const T*>(input1.getImpl()->rawPtr());
+    T* output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+
+    // Row-major output (m x n) = input (m x k) * weights^T, expressed through
+    // cuBLAS's column-major convention by transposing the weights operand.
+    int n = outChannels;   // output features
+    int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size() / n;  // batch size
+    int k = input0.size() / m;  // input features
+    int lda = k;   // leading dimension of the weights (A operand)
+    int ldb = k;   // leading dimension of the input (B operand)
+    int ldc = n;   // leading dimension of the output
+    const T alpha = T(1.0);
+    const T beta = T(0.0);
+    CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
+                                   CUBLAS_OP_T,
+                                   CUBLAS_OP_N,
+                                   n,
+                                   m,
+                                   k,
+                                   &alpha,
+                                   weights,
+                                   lda,
+                                   input,
+                                   ldb,
+                                   &beta,
+                                   output,
+                                   ldc));
+
+    if (!noBias) {
+        // Broadcast the bias over the batch with a rank-1 GEMM:
+        // output += biases (n x 1) * onesVector (1 x m).
+        T* onesVector;
+        CHECK_CUDA_STATUS(cudaMalloc(reinterpret_cast<void**>(&onesVector), m * sizeof(T)));
+        // Fill the vector with ones (m elements, one per batch row)
+        std::vector<T> onesVec(m, T(1.0));
+        CHECK_CUDA_STATUS(cudaMemcpy(onesVector,
+                                     onesVec.data(),
+                                     m * sizeof(T),
+                                     cudaMemcpyHostToDevice));
+
+        const T* biases = static_cast<const T*>(input2.getImpl()->rawPtr());
+        CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
+                                       CUBLAS_OP_N,
+                                       CUBLAS_OP_N,
+                                       n,
+                                       m,
+                                       1,
+                                       &alpha,
+                                       biases,
+                                       n,
+                                       onesVector,
+                                       1,
+                                       &alpha,  // beta = 1: accumulate onto the GEMM result (alpha already holds 1)
+                                       output,
+                                       n));
+        CHECK_CUDA_STATUS(cudaFree(onesVector));
+    }
 }
\ No newline at end of file
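The subtle part of this change is the mapping between Aidge's row-major tensors and cuBLAS's column-major convention. The sketch below reproduces both GEMM calls with made-up sizes (m=2 batch rows, k=3 input features, n=2 output features) and a plain float path instead of the templated wrapper, then checks the result against the FC definition on the host:

```cpp
// Hypothetical standalone check of the two GEMM calls used in forward_().
// Row-major FC: Y (m x n) = X (m x k) * W^T + bias broadcast over the batch.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int m = 2, k = 3, n = 2;
    // Row-major data, as Aidge stores it.
    const std::vector<float> X = {1, 2, 3,  4, 5, 6};   // m x k
    const std::vector<float> W = {0.1f, 0.2f, 0.3f,     // n x k (one row per output)
                                  0.4f, 0.5f, 0.6f};
    const std::vector<float> b = {10.f, 20.f};          // n
    const std::vector<float> ones(m, 1.f);

    float *dX, *dW, *dB, *dOnes, *dY;
    cudaMalloc(&dX, X.size() * sizeof(float));
    cudaMalloc(&dW, W.size() * sizeof(float));
    cudaMalloc(&dB, b.size() * sizeof(float));
    cudaMalloc(&dOnes, ones.size() * sizeof(float));
    cudaMalloc(&dY, m * n * sizeof(float));
    cudaMemcpy(dX, X.data(), X.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dW, W.data(), W.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dOnes, ones.data(), ones.size() * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float one = 1.f, zero = 0.f;
    // A row-major m x k buffer reads as a column-major k x m matrix, so
    // row-major Y = X * W^T becomes column-major Y (n x m) = W^T (n x k) * X (k x m).
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, m, k,
                &one, dW, k, dX, k, &zero, dY, n);
    // Bias broadcast as a rank-1 update: Y += b (n x 1) * ones (1 x m), beta = 1.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1,
                &one, dB, n, dOnes, 1, &one, dY, n);

    std::vector<float> Y(m * n);
    cudaMemcpy(Y.data(), dY, Y.size() * sizeof(float), cudaMemcpyDeviceToHost);

    // Host reference, straight from the FC definition.
    for (int row = 0; row < m; ++row)
        for (int out = 0; out < n; ++out) {
            float ref = b[out];
            for (int in = 0; in < k; ++in)
                ref += X[row * k + in] * W[out * k + in];
            std::printf("Y[%d][%d] = %g (ref %g) %s\n", row, out, Y[row * n + out], ref,
                        std::fabs(Y[row * n + out] - ref) < 1e-5f ? "ok" : "MISMATCH");
        }

    cublasDestroy(handle);
    cudaFree(dX); cudaFree(dW); cudaFree(dB); cudaFree(dOnes); cudaFree(dY);
    return 0;
}
```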
@@ -12,37 +12,65 @@
 #include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"

-template<class T>
-__global__
-void fc_forward_cuda_kernel(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const T* input, const T* weights, const T* bias, T *output)
-{
-    const std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-    for(std::size_t batch=idx; batch<nbInputs; ++batch)
-    {
-        for (std::size_t out = 0; out < outChannels; ++out) {
-            T sum = 0;
-            for (std::size_t in = 0; in < inChannels; ++in) {
-                sum += input[batch * inChannels + in] * weights[out * inChannels + in];
-            }
-            output[batch * outChannels + out] = sum + (noBias ? 0 : bias[out]);
-        }
-    }
-}
-
-namespace Aidge{
-template<class T>
-void fc_forward_cuda(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const void* input_, const void* weights_, const void* bias_, void* output_)
-{
-    const T* input = static_cast<const T*>(input_);
-    const T* weights = static_cast<const T*>(weights_);
-    const T* bias = static_cast<const T*>(bias_);
-    T * output = static_cast<T*>(output_);
-
-    const dim3 blocksPerGrid = {(static_cast<unsigned int>(inChannels) + 255) / 256, 1, static_cast<unsigned int>(outChannels)};
-    const dim3 threadsPerBlocks = {256, 1, 1};
-    fc_forward_cuda_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbInputs, inChannels, outChannels, noBias, input, weights, bias, output);
-    CHECK_CUDA_STATUS(cudaPeekAtLastError());
-}
-}
+namespace Aidge{
+
+template <>
+cublasStatus_t cublasGemm<__half>(cublasHandle_t handle,
+                                  cublasOperation_t transa, cublasOperation_t transb,
+                                  int m, int n, int k,
+                                  const __half *alpha,
+                                  const __half *A, int lda,
+                                  const __half *B, int ldb,
+                                  const __half *beta,
+                                  __half *C, int ldc)
+{
+    return cublasHgemm(handle,
+                       transa, transb,
+                       m, n, k,
+                       alpha,
+                       A, lda,
+                       B, ldb,
+                       beta,
+                       C, ldc);
+}
+
+template <>
+cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
+                                 cublasOperation_t transa, cublasOperation_t transb,
+                                 int m, int n, int k,
+                                 const float *alpha,
+                                 const float *A, int lda,
+                                 const float *B, int ldb,
+                                 const float *beta,
+                                 float *C, int ldc)
+{
+    return cublasSgemm(handle,
+                       transa, transb,
+                       m, n, k,
+                       alpha,
+                       A, lda,
+                       B, ldb,
+                       beta,
+                       C, ldc);
+}
+
+template <>
+cublasStatus_t cublasGemm<double>(cublasHandle_t handle,
+                                  cublasOperation_t transa, cublasOperation_t transb,
+                                  int m, int n, int k,
+                                  const double *alpha,
+                                  const double *A, int lda,
+                                  const double *B, int ldb,
+                                  const double *beta,
+                                  double *C, int ldc)
+{
+    return cublasDgemm(handle,
+                       transa, transb,
+                       m, n, k,
+                       alpha,
+                       A, lda,
+                       B, ldb,
+                       beta,
+                       C, ldc);
+}
+}
\ No newline at end of file
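These specializations forward directly to cublasHgemm, cublasSgemm and cublasDgemm, so precision dispatch is fully resolved at compile time. The half-precision path is the one with extra friction: host code needs cuda_fp16.h (hence the new include in the header) and explicit conversions at the boundary. A hypothetical standalone exercise of that path, with illustrative data, calling cublasHgemm directly rather than through the wrapper:

```cpp
// Hypothetical example of the half-precision GEMM path: convert float data to
// __half on the host, multiply on the GPU, convert back for printing.
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    const int dim = 2;  // 2x2 matrices, column-major
    std::vector<__half> hA(dim * dim), hB(dim * dim), hC(dim * dim);
    for (int i = 0; i < dim * dim; ++i) {
        hA[i] = __float2half(float(i + 1));  // A = [1 3; 2 4]
        hB[i] = __float2half(1.0f);          // B = all ones
    }

    __half *dA, *dB, *dC;
    cudaMalloc(&dA, hA.size() * sizeof(__half));
    cudaMalloc(&dB, hB.size() * sizeof(__half));
    cudaMalloc(&dC, hC.size() * sizeof(__half));
    cudaMemcpy(dA, hA.data(), hA.size() * sizeof(__half), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB.data(), hB.size() * sizeof(__half), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    const __half one = __float2half(1.0f), zero = __float2half(0.0f);
    // Through the wrapper above this would read cublasGemm<__half>(...).
    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, dim, dim, dim,
                &one, dA, dim, dB, dim, &zero, dC, dim);

    cudaMemcpy(hC.data(), dC, hC.size() * sizeof(__half), cudaMemcpyDeviceToHost);
    for (int i = 0; i < dim * dim; ++i)
        std::printf("C[%d] = %g\n", i, __half2float(hC[i]));

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}
```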
@@ -76,7 +76,6 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
     for(int i = 0; i < myOutput->size(); i++){
         const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
-        std::cout << "targetOutput " << targetOutput << ", out " << computedOutput[i]<<std::endl;
         REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
     }
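One caveat the retained assertion leaves implicit: fabs(...) < 1e-6 is an absolute tolerance, which can become brittle for float32 GEMM outputs of larger magnitude, since cuBLAS may accumulate in a different order than the reference values. A relative check is a common alternative; the approxEqual helper below is a sketch, not part of the test suite:

```cpp
#include <cmath>

// Hypothetical helper: compare with a relative tolerance so the bound
// scales with the magnitude of the accumulated dot products.
bool approxEqual(float computed, float target,
                 float relTol = 1e-5f, float absTol = 1e-6f)
{
    return std::fabs(computed - target) <= absTol + relTol * std::fabs(target);
}

// Usage in the test: REQUIRE(approxEqual(computedOutput[i], targetOutput));
```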