Commit 12c56d5e authored by Houssem ROUIS
use cublas for gemm operator

parent 435b6c51
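This change drops the hand-written fully-connected forward kernel and routes the computation through cuBLAS: the batched FC forward pass output = input * weights^T (+ bias) is a single GEMM, and cuBLAS provides tuned GEMM routines per data type. Below is a minimal sketch, not part of the commit, of how a row-major FC forward maps onto a column-major cublasSgemm call with the same transpose and leading-dimension choices the new FCImpl code uses; the helper name, signature, and variable names are illustrative only.

// Sketch only: FC forward Y = X * W^T as one column-major cuBLAS GEMM.
// X: [batch, inChannels], W: [outChannels, inChannels], Y: [batch, outChannels],
// all row-major device buffers.
#include <cublas_v2.h>

cublasStatus_t fcForwardSgemm(cublasHandle_t handle,
                              int batch, int inChannels, int outChannels,
                              const float* X, const float* W, float* Y)
{
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    // cuBLAS is column-major, so compute Y^T = W * X^T:
    //   A = W, read as a column-major inChannels x outChannels matrix and transposed (lda = inChannels)
    //   B = X, read as a column-major inChannels x batch matrix                       (ldb = inChannels)
    //   C = Y, read as a column-major outChannels x batch matrix                      (ldc = outChannels)
    return cublasSgemm(handle,
                       CUBLAS_OP_T, CUBLAS_OP_N,
                       outChannels, batch, inChannels,
                       &alpha, W, inChannels,
                       X, inChannels,
                       &beta, Y, outChannels);
}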
@@ -50,6 +50,7 @@ target_link_libraries(${module_name}
PUBLIC
_aidge_core # _ is added because we link the target not the project
CUDA::cudart
CUDA::cublas
cudnn
)
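The CUDA::cublas imported target comes from the same find_package(CUDAToolkit) lookup that already supplies CUDA::cudart, so linking it should not require an additional package call, assuming the project already uses the CUDAToolkit module.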
@@ -16,20 +16,21 @@
#include <cfloat>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>
#include "aidge/data/Data.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
namespace Aidge {
// Removed: the hand-written FC forward entry point and its registrars.
template<class T>
void fc_forward_cuda(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const void *input, const void *weights, const void *bias, void *output);
namespace {
static Registrar<FCImplForward_cuda> registrarFCImpl2DForward_cuda_Float32({DataType::Float32}, Aidge::fc_forward_cuda<float>);
static Registrar<FCImplForward_cuda> registrarFCImpl2DForward_cuda_Float64({DataType::Float64}, Aidge::fc_forward_cuda<double>);
} // namespace

// Added: a typed wrapper around cuBLAS GEMM, specialized per data type in the .cu file.
template <class T>
cublasStatus_t cublasGemm(cublasHandle_t handle,
                          cublasOperation_t transa, cublasOperation_t transb,
                          int m, int n, int k,
                          const T *alpha,
                          const T *A, int lda,
                          const T *B, int ldb,
                          const T *beta,
                          T *C, int ldc);
} // namespace Aidge
#endif /* AIDGE_CUDA_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
\ No newline at end of file
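The header only declares the templated cublasGemm wrapper; the .cu file below provides explicit specializations for __half, float, and double, so a call site never names the cuBLAS routine directly: the element type of the buffers selects it through template argument deduction. A hypothetical call site is sketched here, assuming handle, the device pointers dWeights/dInput/dOutput, and the dimensions n/m/k already exist:

float alpha = 1.0f, beta = 0.0f;
// Resolves to the cublasSgemm specialization because the buffers are float*.
CHECK_CUBLAS_STATUS(Aidge::cublasGemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                      n, m, k, &alpha, dWeights, k,
                                      dInput, k, &beta, dOutput, n));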
@@ -23,6 +23,7 @@
#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
void Aidge::FCImpl_cuda::forward() {
assert(mOp.getRawInput(0) && "missing input #0");
@@ -35,8 +36,9 @@ void Aidge::FCImpl_cuda::forward() {
const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const auto& fcOp = static_cast<const FC_Op&>(mOp);
bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
forward_<double>(input0, input1, input2, noBias, outChannels);
}
@@ -48,13 +50,61 @@ void Aidge::FCImpl_cuda::forward() {
template<class T>
void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
{
// Removed: dispatch to the hand-written fc_forward_cuda kernel.
Aidge::fc_forward_cuda<T>(
    input0.dims()[0],
    input0.size() / input0.dims()[0],
    outChannels,
    noBias,
    input0.getImpl()->rawPtr(),
    input1.getImpl()->rawPtr(),
    input2.getImpl()->rawPtr(),
    std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
// Added: run the fully-connected forward pass as a single cuBLAS GEMM.
const T * input = static_cast<const T*>(input0.getImpl()->rawPtr());
const T * weights = static_cast<const T*>(input1.getImpl()->rawPtr());
T * output = static_cast<T*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
int n = outChannels; // output features
int m = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->size() / n; // batch size
int k = input0.size() / m; // input features
// Leading dimensions of the row-major buffers as the column-major cuBLAS routines see them.
int lda = k; // weights: outChannels x inChannels
int ldb = k; // input:   batch x inChannels
int ldc = n; // output:  batch x outChannels
const T alpha = 1.0;
const T beta = 0.0;
// Column-major view: output^T = weights * input^T, i.e. output = input * weights^T in row-major terms.
CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
                               CUBLAS_OP_T,
                               CUBLAS_OP_N,
                               n,
                               m,
                               k,
                               &alpha,
                               weights,
                               lda,
                               input,
                               ldb,
                               &beta,
                               output,
                               ldc));
if (!noBias) {
    // Broadcast the bias over the batch with a rank-1 GEMM: output += bias * ones^T.
    T* onesVector;
    CHECK_CUDA_STATUS(cudaMalloc((void**)&onesVector, m * sizeof(T))); // one entry per batch element
    // Fill the vector with ones
    std::vector<T> onesVec(m, T(1.0));
    CHECK_CUDA_STATUS(cudaMemcpy(onesVector,
                                 &onesVec[0],
                                 m * sizeof(T),
                                 cudaMemcpyHostToDevice));
    const T * biases = static_cast<const T*>(input2.getImpl()->rawPtr());
    CHECK_CUBLAS_STATUS(cublasGemm(CudaContext::cublasHandle(),
                                   CUBLAS_OP_N,
                                   CUBLAS_OP_N,
                                   n,
                                   m,
                                   1,
                                   &alpha,
                                   biases,
                                   n,
                                   onesVector,
                                   1,
                                   &alpha, // beta = 1: accumulate onto the GEMM result
                                   output,
                                   n));
    CHECK_CUDA_STATUS(cudaFree(onesVector));
}
}
\ No newline at end of file
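The bias path above allocates and uploads a ones vector and issues a second GEMM on every call. A hypothetical alternative, not part of this commit, is a small broadcast kernel that avoids the extra allocation and host-to-device copy; a sketch follows, using batch and outChannels with the same meaning as in forward_ and a kernel name chosen here for illustration:

// Sketch only: add bias[outChannel] to every row of the [batch, outChannels] output.
template <class T>
__global__ void addBiasKernel(std::size_t batch, std::size_t outChannels, const T* bias, T* output)
{
    const std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < batch * outChannels) {
        output[i] += bias[i % outChannels];
    }
}

// Hypothetical launch: one thread per output element.
// addBiasKernel<<<(batch * outChannels + 255) / 256, 256>>>(batch, outChannels, biases, output);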
@@ -12,37 +12,65 @@
#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
// Removed: the hand-written FC forward kernel and its launcher.
template<class T>
__global__
void fc_forward_cuda_kernel(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const T* input, const T* weights, const T* bias, T *output)
{
    const std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (std::size_t batch = idx; batch < nbInputs; ++batch)
    {
        for (std::size_t out = 0; out < outChannels; ++out) {
            T sum = 0;
            for (std::size_t in = 0; in < inChannels; ++in) {
                sum += input[batch * inChannels + in] * weights[out * inChannels + in];
            }
            output[batch * outChannels + out] = sum + (noBias ? 0 : bias[out]);
        }
    }
}

namespace Aidge{
template<class T>
void fc_forward_cuda(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const void* input_, const void* weights_, const void* bias_, void* output_)
{
    const T* input = static_cast<const T*>(input_);
    const T* weights = static_cast<const T*>(weights_);
    const T* bias = static_cast<const T*>(bias_);
    T * output = static_cast<T*>(output_);
    const dim3 blocksPerGrid = {(static_cast<unsigned int>(inChannels) + 255) / 256, 1, static_cast<unsigned int>(outChannels)};
    const dim3 threadsPerBlocks = {256, 1, 1};
    fc_forward_cuda_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbInputs, inChannels, outChannels, noBias, input, weights, bias, output);
    CHECK_CUDA_STATUS(cudaPeekAtLastError());
}
}

// Added: explicit cublasGemm specializations dispatching to the typed cuBLAS routines.
namespace Aidge{

template <>
cublasStatus_t cublasGemm<__half>(cublasHandle_t handle,
                                  cublasOperation_t transa, cublasOperation_t transb,
                                  int m, int n, int k,
                                  const __half *alpha,
                                  const __half *A, int lda,
                                  const __half *B, int ldb,
                                  const __half *beta,
                                  __half *C, int ldc)
{
    return cublasHgemm(handle,
                       transa, transb,
                       m, n, k,
                       alpha,
                       A, lda,
                       B, ldb,
                       beta,
                       C, ldc);
}

template <>
cublasStatus_t cublasGemm<float>(cublasHandle_t handle,
                                 cublasOperation_t transa, cublasOperation_t transb,
                                 int m, int n, int k,
                                 const float *alpha,
                                 const float *A, int lda,
                                 const float *B, int ldb,
                                 const float *beta,
                                 float *C, int ldc)
{
    return cublasSgemm(handle,
                       transa, transb,
                       m, n, k,
                       alpha,
                       A, lda,
                       B, ldb,
                       beta,
                       C, ldc);
}

template <>
cublasStatus_t cublasGemm<double>(cublasHandle_t handle,
                                  cublasOperation_t transa, cublasOperation_t transb,
                                  int m, int n, int k,
                                  const double *alpha,
                                  const double *A, int lda,
                                  const double *B, int ldb,
                                  const double *beta,
                                  double *C, int ldc)
{
    return cublasDgemm(handle,
                       transa, transb,
                       m, n, k,
                       alpha,
                       A, lda,
                       B, ldb,
                       beta,
                       C, ldc);
}
}
\ No newline at end of file
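Keeping the dispatch in explicit specializations mirrors the cuBLAS API itself, which exposes a separate entry point per precision (cublasHgemm, cublasSgemm, cublasDgemm); the FC implementation stays type-agnostic while the per-type mapping lives in one place.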
@@ -76,7 +76,6 @@ TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
for(int i = 0; i < myOutput->size(); i++){
const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
std::cout << "targetOutput " << targetOutput << ", out " << computedOutput[i]<<std::endl;
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}