Skip to content
Snippets Groups Projects
Commit 94ac83e4 authored by Houssem ROUIS's avatar Houssem ROUIS
Browse files

changed argument to std::size_t for fc kernel

parent f4454951
No related branches found
No related tags found
No related merge requests found
This commit is part of merge request !10. Comments created here will be created in the context of that merge request.
......@@ -29,7 +29,7 @@
namespace Aidge {
class FCImplForward_cuda : public Registrable<FCImplForward_cuda,
std::tuple<DataType>,
void(unsigned int , unsigned int , unsigned int, bool, const void* , const void* , const void* , void*)> {};
void(std::size_t , std::size_t, std::size_t, bool, const void* , const void* , const void* , void*)> {};
class FCImpl_cuda : public OperatorImpl {
private:
// CuDNN specific variables
......@@ -47,7 +47,7 @@ public:
// ~FCImpl_cuda();
private:
template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, DimSize_t outChannels);
template <class T> void forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels);
};
namespace {
......
......@@ -24,7 +24,7 @@
namespace Aidge {
template<class T>
void fc_forward_cuda(DimSize_t nbInputs, DimSize_t inChannels, DimSize_t outChannels, bool noBias, const void *input, const void *weights, const void *bias, void *output);
void fc_forward_cuda(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const void *input, const void *weights, const void *bias, void *output);
namespace {
static Registrar<FCImplForward_cuda> registrarFCImpl2DForward_cuda_Float32({DataType::Float32}, Aidge::fc_forward_cuda<float>);
......
......@@ -14,13 +14,13 @@
#include <numeric> // std::accumulate
#include <thread> // std::this_thread::sleep_for
#include <vector>
#include <iostream>
#include "aidge/utils/Types.h"
#include "aidge/operator/FC.hpp"
#include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/operator/FCImpl.hpp"
#include "aidge/backend/cuda/operator/FCImpl_CUDA_kernels.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
......@@ -34,9 +34,9 @@ void Aidge::FCImpl_cuda::forward() {
const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const FC_Op& fcOp = static_cast<const FC_Op&>(mOp);
const auto& fcOp = static_cast<const FC_Op&>(mOp);
std::size_t outChannels = static_cast<std::size_t>(fcOp.template getAttr<FCAttr::OutChannels>());
bool noBias = fcOp.template getAttr<FCAttr::NoBias>();
DimSize_t outChannels = fcOp.template getAttr<FCAttr::OutChannels>();
if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
forward_<double>(input0, input1, input2, noBias, outChannels);
}
......@@ -46,7 +46,7 @@ void Aidge::FCImpl_cuda::forward() {
}
template<class T>
void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, DimSize_t outChannels)
void Aidge::FCImpl_cuda::forward_(const Tensor& input0, const Tensor& input1, const Tensor& input2, bool noBias, std::size_t outChannels)
{
Aidge::fc_forward_cuda<T>(
input0.dims()[0],
......
......@@ -14,7 +14,7 @@
template<class T>
__global__
void fc_forward_cuda_kernel(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias,const T* input, const T* weights, const T* bias, T *output)
void fc_forward_cuda_kernel(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const T* input, const T* weights, const T* bias, T *output)
{
const std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -32,17 +32,19 @@ void fc_forward_cuda_kernel(std::size_t nbInputs, std::size_t inChannels, std::s
namespace Aidge{
template<class T>
void fc_forward_cuda(DimSize_t nbInputs, DimSize_t inChannels, DimSize_t outChannels, bool noBias, const void* input_, const void* weights_, const void* bias_, void* output_)
void fc_forward_cuda(std::size_t nbInputs, std::size_t inChannels, std::size_t outChannels, bool noBias, const void* input_, const void* weights_, const void* bias_, void* output_)
{
const T* input = static_cast<const T*>(input_);
const T* weights = static_cast<const T*>(weights_);
const T* bias = static_cast<const T*>(bias_);
T * output = static_cast<T*>(output_);
const dim3 blocksPerGrid = {(static_cast<unsigned int>(inChannels) + 255) / 256, 1, static_cast<unsigned int>(outChannels)};
const dim3 threadsPerBlocks = {256, 1, 1};
fc_forward_cuda_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbInputs, inChannels, outChannels, noBias, input, weights, bias, output);
// const dim3 blocksPerGrid = {(static_cast<unsigned int>(inChannels) + 255) / 256, 1, static_cast<unsigned int>(outChannels)};
// const dim3 threadsPerBlocks = {256, 1, 1};
// fc_forward_cuda_kernel<<<blocksPerGrid, threadsPerBlocks>>>(nbInputs, inChannels, outChannels, noBias, input, weights, bias, output);
const int blockSize = 256;
const int numBlocks = (nbInputs + blockSize - 1) / blockSize;
fc_forward_cuda_kernel<<<numBlocks, blockSize>>>(nbInputs, inChannels, outChannels, noBias, input, weights, bias, output);
CHECK_CUDA_STATUS(cudaPeekAtLastError());
}
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
// /********************************************************************************
// * Copyright (c) 2023 CEA-List
// *
// * This program and the accompanying materials are made available under the
// * terms of the Eclipse Public License 2.0 which is available at
// * http://www.eclipse.org/legal/epl-2.0.
// *
// * SPDX-License-Identifier: EPL-2.0
// *
// ********************************************************************************/
#include <array>
// #include <array>
#include <catch2/catch_test_macros.hpp>
// #include <catch2/catch_test_macros.hpp>
#include "Test_cuda.hpp"
// #include "Test_cuda.hpp"
#include "aidge/data/Tensor.hpp"
// #include "aidge/data/Tensor.hpp"
#include "aidge/backend/cpu.hpp"
#include "aidge/backend/cuda.hpp"
// #include "aidge/backend/cpu.hpp"
// #include "aidge/backend/cuda.hpp"
using namespace Aidge;
// using namespace Aidge;
TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{
{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{
{{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
myWeights->setBackend("cuda");
myBias->setBackend("cuda");
std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
op -> associateInput(1, myWeights);
op -> associateInput(2, myBias);
SECTION("2D input") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{
{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
{75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
myInput->setBackend("cuda");
op->associateInput(0, myInput);
op -> setDataType(DataType::Float32);
op -> setBackend("cuda");
op->computeOutputDims();
myFC->forward();
// TEST_CASE("[gpu/operator] FC(forward)", "[FC][GPU]") {
// std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 5, 75>{
// {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
// 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
// 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
// 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
// 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
// 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
// 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
// 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
// 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
// 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
// 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
// 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
// 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
// {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
// 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
// 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
// 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
// std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 5>{{1, 2, 3, 4, 5}});
// std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 2, 5>{
// {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
// myWeights->setBackend("cuda");
// myBias->setBackend("cuda");
// std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
// auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
// op -> associateInput(1, myWeights);
// op -> associateInput(2, myBias);
// SECTION("2D input") {
// std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 2, 75>{
// {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
// 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
// 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
// 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74},
// {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
// 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
// 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
// 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
// 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
// myInput->setBackend("cuda");
// op->associateInput(0, myInput);
// op -> setDataType(DataType::Float32);
// op -> setBackend("cuda");
// op->computeOutputDims();
// myFC->forward();
float* computedOutput = new float[myOutput->size()]();
cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
// float* computedOutput = new float[myOutput->size()]();
// cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
for(int i = 0; i < myOutput->size(); i++){
const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
std::cout << "targetOutput " << targetOutput << ", out " << computedOutput[i]<<std::endl;
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}
// for(int i = 0; i < myOutput->size(); i++){
// const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
// std::cout << "targetOutput " << targetOutput << ", out " << computedOutput[i]<<std::endl;
// REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
// }
delete[] computedOutput;
}
SECTION("4D input") {
std::shared_ptr<Tensor> myInput =
std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
{5, 6, 7, 8, 9},
{10, 11, 12, 13, 14},
{15, 16, 17, 18, 19},
{20, 21, 22, 23, 24}},
{{25, 26, 27, 28, 29},
{30, 31, 32, 33, 34},
{35, 36, 37, 38, 39},
{40, 41, 42, 43, 44},
{45, 46, 47, 48, 49}},
{{50, 51, 52, 53, 54},
{55, 56, 57, 58, 59},
{60, 61, 62, 63, 64},
{65, 66, 67, 68, 69},
{70, 71, 72, 73, 74}}},
{{{75, 76, 77, 78, 79},
{80, 81, 82, 83, 84},
{85, 86, 87, 88, 89},
{90, 91, 92, 93, 94},
{95, 96, 97, 98, 99}},
{{100, 101, 102, 103, 104},
{105, 106, 107, 108, 109},
{110, 111, 112, 113, 114},
{115, 116, 117, 118, 119},
{120, 121, 122, 123, 124}},
{{125, 126, 127, 128, 129},
{130, 131, 132, 133, 134},
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}}}});
myInput->setBackend("cuda");
op->associateInput(0, myInput);
op -> setDataType(DataType::Float32);
op -> setBackend("cuda");
op->computeOutputDims();
myFC->forward();
// delete[] computedOutput;
// }
// SECTION("4D input") {
// std::shared_ptr<Tensor> myInput =
// std::make_shared<Tensor>(Array4D<float, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4},
// {5, 6, 7, 8, 9},
// {10, 11, 12, 13, 14},
// {15, 16, 17, 18, 19},
// {20, 21, 22, 23, 24}},
// {{25, 26, 27, 28, 29},
// {30, 31, 32, 33, 34},
// {35, 36, 37, 38, 39},
// {40, 41, 42, 43, 44},
// {45, 46, 47, 48, 49}},
// {{50, 51, 52, 53, 54},
// {55, 56, 57, 58, 59},
// {60, 61, 62, 63, 64},
// {65, 66, 67, 68, 69},
// {70, 71, 72, 73, 74}}},
// {{{75, 76, 77, 78, 79},
// {80, 81, 82, 83, 84},
// {85, 86, 87, 88, 89},
// {90, 91, 92, 93, 94},
// {95, 96, 97, 98, 99}},
// {{100, 101, 102, 103, 104},
// {105, 106, 107, 108, 109},
// {110, 111, 112, 113, 114},
// {115, 116, 117, 118, 119},
// {120, 121, 122, 123, 124}},
// {{125, 126, 127, 128, 129},
// {130, 131, 132, 133, 134},
// {135, 136, 137, 138, 139},
// {140, 141, 142, 143, 144},
// {145, 146, 147, 148, 149}}}}});
// myInput->setBackend("cuda");
// op->associateInput(0, myInput);
// op -> setDataType(DataType::Float32);
// op -> setBackend("cuda");
// op->computeOutputDims();
// myFC->forward();
float* computedOutput = new float[myOutput->size()]();
cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
// float* computedOutput = new float[myOutput->size()]();
// cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
for(int i = 0; i < myOutput->size(); i++){
const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}
// for(int i = 0; i < myOutput->size(); i++){
// const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
// REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
// }
delete[] computedOutput;
}
}
\ No newline at end of file
// delete[] computedOutput;
// }
// }
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment