Skip to content
Snippets Groups Projects
Commit 406b110b authored by Olivier BICHLER's avatar Olivier BICHLER
Browse files

WIP: Conv computation gives wrong result (transposed kernel)

parent 4b2e0d5a
No related branches found
No related tags found
No related merge requests found
Pipeline #31882 failed
...@@ -86,8 +86,8 @@ endif() ...@@ -86,8 +86,8 @@ endif()
target_compile_features(${module_name} PRIVATE cxx_std_14) target_compile_features(${module_name} PRIVATE cxx_std_14)
target_compile_options(${module_name} PRIVATE target_compile_options(${module_name} PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>: $<$<COMPILE_LANGUAGE:CPP>:$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>) -Wall -Wextra -Wold-style-cast -Winline -pedantic -Werror=narrowing -Wshadow $<$<BOOL:${WERROR}>:-Werror>>>)
target_compile_options(${module_name} PRIVATE target_compile_options(${module_name} PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>: $<$<COMPILE_LANGUAGE:CUDA>:
-Wall>) -Wall>)
......
...@@ -13,5 +13,6 @@ ...@@ -13,5 +13,6 @@
#define AIDGE_BACKEND_CUDA_IMPORTS_H_ #define AIDGE_BACKEND_CUDA_IMPORTS_H_
#include "aidge/backend/cuda/data/TensorImpl.hpp" #include "aidge/backend/cuda/data/TensorImpl.hpp"
#include "aidge/backend/cuda/operator/ConvImpl.hpp"
#endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */ #endif /* AIDGE_BACKEND_CUDA_IMPORTS_H_ */
\ No newline at end of file
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_ #define AIDGE_BACKEND_CUDA_DATA_TENSORIMPL_H_
#include <thrust/equal.h> #include <thrust/equal.h>
#include <thrust/execution_policy.h> #include <thrust/device_ptr.h>
#include "aidge/backend/TensorImpl.hpp" #include "aidge/backend/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp" #include "aidge/data/Tensor.hpp"
...@@ -31,14 +31,7 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ { ...@@ -31,14 +31,7 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {} TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
bool operator==(const TensorImpl &otherImpl) const override final { bool operator==(const TensorImpl &otherImpl) const override final;
const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
if (mTensor.size() != otherImplCuda.mTensor.size())
return false;
return thrust::equal(mData, mData + mTensor.size(), otherImplCuda.mData);
}
static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) { static std::unique_ptr<TensorImpl_cuda> create(const Tensor &tensor) {
return std::make_unique<TensorImpl_cuda<T>>(tensor); return std::make_unique<TensorImpl_cuda<T>>(tensor);
...@@ -87,9 +80,6 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ { ...@@ -87,9 +80,6 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
stride *= mTensor.dims()[dim]; stride *= mTensor.dims()[dim];
} }
std::reverse(dims.begin(), dims.end());
std::reverse(strides.begin(), strides.end());
CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor, CHECK_CUDNN_STATUS(cudnnSetTensorNdDescriptor(mCudnnTensor,
CudaContext::data_type<T>::value, CudaContext::data_type<T>::value,
dims.size(), dims.size(),
......
...@@ -46,6 +46,7 @@ private: ...@@ -46,6 +46,7 @@ private:
public: public:
ConvImpl_cuda(const Conv_Op<DIM> &op) : mOp(op), mNbConsumedData({0, 0, 0}), mNbProducedData({0}) { ConvImpl_cuda(const Conv_Op<DIM> &op) : mOp(op), mNbConsumedData({0, 0, 0}), mNbProducedData({0}) {
CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc));
CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
} }
static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) { static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<2> &op) {
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cuda/data/TensorImpl.hpp"
template <class T>
bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
    // Caller guarantees otherImpl is a cuda implementation of the same element type,
    // so the downcast is safe here.
    const auto& other = static_cast<const TensorImpl_cuda<T>&>(otherImpl);

    // Tensors of different element counts can never be equal; bail out early
    // to avoid launching a device-side comparison.
    if (mTensor.size() != other.mTensor.size())
        return false;

    // Wrap the raw device pointers so that thrust::equal dispatches the
    // element-wise comparison on the GPU.
    const thrust::device_ptr<T> lhs(mData);
    const thrust::device_ptr<T> rhs(other.mData);
    return thrust::equal(lhs, lhs + mTensor.size(), rhs);
}
...@@ -78,9 +78,9 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { ...@@ -78,9 +78,9 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
assert(mOp.getInput(0) && "missing input #0"); assert(mOp.getInput(0) && "missing input #0");
assert(mOp.getInput(1) && "missing input #1"); assert(mOp.getInput(1) && "missing input #1");
const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().rbegin(), mOp.template get<ConvParam::StrideDims>().rend()); const std::vector<int> strides(mOp.template get<ConvParam::StrideDims>().begin(), mOp.template get<ConvParam::StrideDims>().end());
const std::vector<int> paddings(DIM, 0); const std::vector<int> paddings(DIM, 0);
const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().rbegin(), mOp.template get<ConvParam::DilationDims>().rend()); const std::vector<int> upscales(mOp.template get<ConvParam::DilationDims>().begin(), mOp.template get<ConvParam::DilationDims>().end());
CHECK_CUDNN_STATUS( CHECK_CUDNN_STATUS(
cudnnSetConvolutionNdDescriptor(mConvDesc, cudnnSetConvolutionNdDescriptor(mConvDesc,
...@@ -89,17 +89,15 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { ...@@ -89,17 +89,15 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
&strides[0], &strides[0],
&upscales[0], &upscales[0],
CUDNN_CROSS_CORRELATION, CUDNN_CROSS_CORRELATION,
DataTypeToCudnn(mOp.getInput(2)->dataType()))); DataTypeToCudnn(mOp.getOutput(0)->dataType())));
const std::vector<int> cudaKernelDims(mOp.getInput(1)->dims().rbegin(), const std::vector<int> kernels(mOp.getInput(1)->dims().begin(), mOp.getInput(1)->dims().end());
mOp.getInput(1)->dims().rend());
CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc, CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
DataTypeToCudnn(mOp.getInput(1)->dataType()), DataTypeToCudnn(mOp.getInput(1)->dataType()),
CUDNN_TENSOR_NCHW, CUDNN_TENSOR_NCHW,
cudaKernelDims.size(), kernels.size(),
&cudaKernelDims[0])); &kernels[0]));
int maxAlgoIterations = 0; int maxAlgoIterations = 0;
cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(), cudnnGetConvolutionForwardAlgorithmMaxCount(CudaContext::cudnnHandle(),
...@@ -221,10 +219,20 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() { ...@@ -221,10 +219,20 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(), dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
mOp.getOutput(0)->getImpl()->rawPtr())); mOp.getOutput(0)->getImpl()->rawPtr()));
if (mOp.getInput(2) != nullptr) { if (mOp.getInput(2) && mOp.getInput(2)->size() > 0) {
// Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor()
std::vector<DimSize_t> biasDims(DIM+2, 1);
biasDims[1] = mOp.getInput(2)->size();
// Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
Tensor bias(mOp.getInput(2)->dataType());
bias.setBackend("cuda");
bias.resize(biasDims);
// TODO: find a more elegant solution
CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(), CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
&alpha, &alpha,
dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(2)->getImpl().get())->getCudnnTensorDesc(), dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
mOp.getInput(2)->getImpl()->rawPtr(), mOp.getInput(2)->getImpl()->rawPtr(),
&alpha, &alpha,
dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(), dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
...@@ -234,7 +242,12 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() { ...@@ -234,7 +242,12 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
template <Aidge::DimIdx_t DIM> template <Aidge::DimIdx_t DIM>
Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() { Aidge::ConvImpl_cuda<DIM>::~ConvImpl_cuda() {
cudnnDestroyConvolutionDescriptor(mConvDesc);
cudnnDestroyFilterDescriptor(mFilterDesc);
if (mWorkspace != nullptr) {
cudaFree(mWorkspace);
}
} }
template <Aidge::DimIdx_t DIM> template <Aidge::DimIdx_t DIM>
......
...@@ -23,11 +23,61 @@ ...@@ -23,11 +23,61 @@
using namespace Aidge; using namespace Aidge;
TEST_CASE("[gpu/operator] Conv(forward)") { TEST_CASE("[gpu/operator] Conv(forward)") {
SECTION("Simple Conv no bias") {
std::shared_ptr<Node> myConv = Conv(1,1,{3,3}, "myconv");
myConv->getOperator()->setDatatype(DataType::Float32);
myConv->getOperator()->setBackend("cuda");
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
{
{
{{ 0, 1, 2},
{ 3, 4, 5},
{ 6, 7, 8}}
}
}
});
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,1,1,3,3> { //NCHW
{
{
{{ 0, 1, 2},
{ 3, 4, 5},
{ 6, 7, 8}}
}
}
});
const float myOutput = 0*0+1*1+2*2+3*3+4*4+5*5+6*6+7*7+8*8;
myInput->setBackend("cuda");
myWeights->setBackend("cuda");
myConv->getOperator()->associateInput(0,myInput);
myConv->getOperator()->associateInput(1,myWeights);
myConv->getOperator()->computeOutputDims();
myConv->forward();
REQUIRE(myConv->getOperator()->getOutput(0)->size() == 1);
std::array<float, 9> kernel;
cudaMemcpy(&kernel[0], myWeights->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
std::array<float, 9> input;
cudaMemcpy(&input[0], myInput->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 9; ++i) {
REQUIRE(kernel[i] == i);
REQUIRE(input[i] == i);
}
float computedOutput;
cudaMemcpy(&computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost);
REQUIRE(fabs(computedOutput - myOutput) < 1e-6);
}
SECTION("Classic Conv") { SECTION("Classic Conv") {
std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv"); std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv");
myConv->getOperator()->setDatatype(DataType::Float32); myConv->getOperator()->setDatatype(DataType::Float32);
myConv->getOperator()->setBackend("cuda"); myConv->getOperator()->setBackend("cuda");
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> { std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> {
{ {
{ {
{{ 0, 1, 2}, {{ 0, 1, 2},
...@@ -75,8 +125,8 @@ TEST_CASE("[gpu/operator] Conv(forward)") { ...@@ -75,8 +125,8 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
} }
} }
}); });
std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}}); std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}});
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,5,5> { //NCHW
{ {
{ {
{{ 0, 1, 2, 3, 4}, {{ 0, 1, 2, 3, 4},
...@@ -118,7 +168,7 @@ TEST_CASE("[gpu/operator] Conv(forward)") { ...@@ -118,7 +168,7 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
} }
} }
}); });
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> { std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> {
{ {
{ {
{{ 15226, 15577, 15928}, {{ 15226, 15577, 15928},
...@@ -150,12 +200,26 @@ TEST_CASE("[gpu/operator] Conv(forward)") { ...@@ -150,12 +200,26 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
} }
} }
}); });
myInput->setBackend("cuda");
myWeights->setBackend("cuda");
myBias->setBackend("cuda");
myConv->getOperator()->associateInput(0,myInput); myConv->getOperator()->associateInput(0,myInput);
myConv->getOperator()->associateInput(1,myWeights); myConv->getOperator()->associateInput(1,myWeights);
myConv->getOperator()->associateInput(2,myBias); myConv->getOperator()->associateInput(2,myBias);
myConv->getOperator()->computeOutputDims(); myConv->getOperator()->computeOutputDims();
myConv->forward(); myConv->forward();
// myConv->getOperator()->getOutput(0)->print(); // myConv->getOperator()->getOutput(0)->print();
REQUIRE(*(myConv->getOperator()->getOutput(0)) == *myOutput);
float* computedOutput = new float[myOutput->size()]();
cudaMemcpy(computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
for(int i = 0; i < myOutput->size(); i++){
const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6);
}
delete[] computedOutput;
} }
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment