Commit 828e4c1e authored by Olivier BICHLER

Convert initial commit

parent cf66707e
Merge request !4: Add Convert operator (a.k.a. Transmitter)
Pipeline #35258 failed
@@ -5,11 +5,16 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
+#include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/backend/cuda/utils/CudaUtils.hpp"
 #include "aidge/backend/cuda/utils/CudaContext.hpp"

 namespace Aidge {

+template <typename SRC_T, typename DST_T>
+void thrust_copy(SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/);
+
 /**
  * @brief Abstract class for the TensorImpl_cuda class template.
  * @details Its purpose is to provide access to base methods that are specific
@@ -51,14 +56,90 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
     std::size_t scalarSize() const override { return sizeof(T); }

+    void setDevice(int device) override {
+        mDevice = device;
+    }
+
     void copy(const void *src, NbElts_t length) override {
+        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
+    }
+
+    void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
+        if (srcDt == DataType::Float64) {
+            thrust_copy(static_cast<const double*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::Float32) {
+            thrust_copy(static_cast<const float*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::Int64) {
+            thrust_copy(static_cast<const int64_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::UInt64) {
+            thrust_copy(static_cast<const uint64_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::Int32) {
+            thrust_copy(static_cast<const int32_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::UInt32) {
+            thrust_copy(static_cast<const uint32_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::Int16) {
+            thrust_copy(static_cast<const int16_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::UInt16) {
+            thrust_copy(static_cast<const uint16_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::Int8) {
+            thrust_copy(static_cast<const int8_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else if (srcDt == DataType::UInt8) {
+            thrust_copy(static_cast<const uint8_t*>(src),
+                        static_cast<T*>(rawPtr()),
+                        length);
+        }
+        else {
+            AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
+        }
+    }
+
+    void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, int>& device) override {
+        CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
+    }
+
+    void copyFromHost(const void *src, NbElts_t length) override {
         CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
     }

+    void copyToHost(void *dst, NbElts_t length) override {
+        CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(), length * sizeof(T), cudaMemcpyDeviceToHost));
+    }
+
     void *rawPtr() override {
         lazyInit(reinterpret_cast<void**>(&mData));
         return mData;
-    }
+    };
+
+    void *hostPtr() override {
+        return nullptr;
+    };

     void* getRaw(std::size_t idx) {
         return static_cast<void*>(static_cast<T*>(rawPtr()) + idx);
......
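
Side note (not part of the commit): a minimal sketch of how the cast-copy path added above could be exercised end to end, assuming the Tensor/TensorImpl API shown in this diff (setBackend, resize, copyFromHost, copyCast, copyToHost); names and sizes are illustrative only.

    // Hedged sketch: fill a Float64 CUDA tensor from the host, cast-copy it into a
    // Float32 CUDA tensor via the new copyCast(), then read the result back.
    Aidge::Tensor src(Aidge::DataType::Float64);
    src.setBackend("cuda");
    src.resize({4});
    const double hostData[4] = {1.0, 2.0, 3.0, 4.0};
    src.getImpl()->copyFromHost(hostData, 4);

    Aidge::Tensor dst(Aidge::DataType::Float32);
    dst.setBackend("cuda");
    dst.resize({4});
    dst.getImpl()->copyCast(src.getImpl()->rawPtr(), 4, Aidge::DataType::Float64);

    float hostResult[4];
    dst.getImpl()->copyToHost(hostResult, 4);  // hostResult now holds {1.f, 2.f, 3.f, 4.f}
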
@@ -14,6 +14,14 @@
 #include <thrust/equal.h>
 #include <thrust/device_ptr.h>

+template <typename SRC_T, typename DST_T>
+void Aidge::thrust_copy(SRC_T* srcData, DST_T* dstData, size_t size)
+{
+    thrust::device_ptr<SRC_T> thrustSrcPtr(srcData);
+    thrust::device_ptr<DST_T> thrustDstPtr(dstData);
+    thrust::copy(thrustSrcPtr, thrustSrcPtr + size, thrustDstPtr);
+}
+
 template <class T>
 bool Aidge::TensorImpl_cuda<T>::operator==(const TensorImpl &otherImpl) const {
     const auto& otherImplCuda = static_cast<const TensorImpl_cuda<T>&>(otherImpl);
......
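
For reference, a self-contained sketch (not from the repository) of the Thrust pattern that thrust_copy relies on: wrapping raw device pointers in thrust::device_ptr lets thrust::copy perform the element-wise type conversion directly on the GPU. Since the template is defined in this .cu file but only declared in the header, the file presumably also provides explicit instantiations for each (SRC_T, DST_T) pair reachable from copyCast().

    // Standalone illustration: cast-copy four floats to doubles on the device.
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <thrust/copy.h>
    #include <thrust/device_ptr.h>

    int main() {
        const float hostSrc[4] = {1.5f, 2.5f, 3.5f, 4.5f};
        float* devSrc = nullptr;
        double* devDst = nullptr;
        cudaMalloc(&devSrc, 4 * sizeof(float));
        cudaMalloc(&devDst, 4 * sizeof(double));
        cudaMemcpy(devSrc, hostSrc, 4 * sizeof(float), cudaMemcpyHostToDevice);

        // Equivalent to thrust_copy<float, double>(devSrc, devDst, 4).
        thrust::device_ptr<float> srcPtr(devSrc);
        thrust::device_ptr<double> dstPtr(devDst);
        thrust::copy(srcPtr, srcPtr + 4, dstPtr);

        double hostDst[4];
        cudaMemcpy(hostDst, devDst, 4 * sizeof(double), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 4; ++i)
            std::printf("%f\n", hostDst[i]);

        cudaFree(devSrc);
        cudaFree(devDst);
        return 0;
    }
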
@@ -25,8 +25,8 @@
 template <Aidge::DimIdx_t DIM>
 void Aidge::ConvImpl_cuda<DIM>::forward() {
     // FIXME: uncomment the following code once memory handling will work
-    assert(mOp.getInput(0) && "missing input #0");
-    assert(mOp.getInput(1) && "missing input #1");
+    assert(mOp.getRawInput(0) && "missing input #0");
+    assert(mOp.getRawInput(1) && "missing input #1");

     // Lazy-initialize CuDNN convolution descriptor
     if (mConvDesc == nullptr) {
@@ -43,16 +43,16 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
             &strides[0],
             &upscales[0],
             CUDNN_CROSS_CORRELATION,
-            DataTypeToCudnn(mOp.getOutput(0)->dataType())));
+            DataTypeToCudnn(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType())));
     }

     // Lazy-initialize CuDNN filter descriptor
     if (mFilterDesc == nullptr) {
-        const std::vector<int> kernels(mOp.getInput(1)->dims().begin(), mOp.getInput(1)->dims().end());
+        const std::vector<int> kernels(std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims().begin(), std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims().end());

         CHECK_CUDNN_STATUS(cudnnCreateFilterDescriptor(&mFilterDesc));
         CHECK_CUDNN_STATUS(cudnnSetFilterNdDescriptor(mFilterDesc,
-            DataTypeToCudnn(mOp.getInput(1)->dataType()),
+            DataTypeToCudnn(std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType()),
             CUDNN_TENSOR_NCHW,
             kernels.size(),
             &kernels[0]));
@@ -72,10 +72,10 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         CHECK_CUDNN_STATUS(cudnnFindConvolutionForwardAlgorithm(
             CudaContext::cudnnHandle(),
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(0)->getImpl().get())->getCudnnTensorDesc(),
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl().get())->getCudnnTensorDesc(),
             mFilterDesc,
             mConvDesc,
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
             maxAlgoIterations,
             &returnAlgoCounts,
             &returnFwdAlgo[0]));
@@ -86,10 +86,10 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
         CHECK_CUDNN_STATUS(cudnnGetConvolutionForwardWorkspaceSize(
             CudaContext::cudnnHandle(),
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(0)->getImpl().get())->getCudnnTensorDesc(),
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl().get())->getCudnnTensorDesc(),
             mFilterDesc,
             mConvDesc,
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
             mFwdAlgo,
             &workspaceSize));
@@ -100,7 +100,7 @@ void Aidge::ConvImpl_cuda<DIM>::forward() {
     // Do the actual forward computation
     // Template is only for scaling parameters, which are always in float
     // excepted when the convolution is performed in double precision.
-    if (mOp.getOutput(0)->dataType() == DataType::Float64) {
+    if (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType() == DataType::Float64) {
         forward_<double>();
     }
     else {
@@ -117,26 +117,26 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
     CHECK_CUDNN_STATUS(
         cudnnConvolutionForward(CudaContext::cudnnHandle(),
             &alpha,
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getInput(0)->getImpl().get())->getCudnnTensorDesc(),
-            mOp.getInput(0)->getImpl()->rawPtr(),
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl().get())->getCudnnTensorDesc(),
+            std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(),
             mFilterDesc,
-            mOp.getInput(1)->getImpl()->rawPtr(),
+            std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->getImpl()->rawPtr(),
             mConvDesc,
             mFwdAlgo,
             mWorkspace,
             mWorkspaceSize,
             &beta,
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
-            mOp.getOutput(0)->getImpl()->rawPtr()));
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
+            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()));

     // Add bias (if there is any)
-    if (mOp.getInput(2) && mOp.getInput(2)->size() > 0) {
+    if (mOp.getRawInput(2) && std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->size() > 0) {
         // Bias tensor needs to have the same number of dims than output tensor for cudnnAddTensor()
         std::vector<DimSize_t> biasDims(DIM+2, 1);
-        biasDims[1] = mOp.getInput(2)->size();
+        biasDims[1] = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->size();

         // Create a dummy tensor with the right dims in order to get a CuDNN tensor descriptor (with getCudnnTensorDesc())
-        Tensor bias(mOp.getInput(2)->dataType());
+        Tensor bias(std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType());
         bias.setBackend("cuda");
         bias.resize(biasDims);
         // TODO: find a more elegant solution(?)
@@ -144,10 +144,10 @@ void Aidge::ConvImpl_cuda<DIM>::forward_() {
         CHECK_CUDNN_STATUS(cudnnAddTensor(CudaContext::cudnnHandle(),
             &alpha,
             dynamic_cast<TensorImpl_cuda_*>(bias.getImpl().get())->getCudnnTensorDesc(),
-            mOp.getInput(2)->getImpl()->rawPtr(),
+            std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->getImpl()->rawPtr(),
             &alpha,
-            dynamic_cast<TensorImpl_cuda_*>(mOp.getOutput(0)->getImpl().get())->getCudnnTensorDesc(),
-            mOp.getOutput(0)->getImpl()->rawPtr()));
+            dynamic_cast<TensorImpl_cuda_*>(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl().get())->getCudnnTensorDesc(),
+            std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr()));
     }
 }
......
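
The repeated std::static_pointer_cast<Tensor>(mOp.getRawInput(i)) / getRawOutput(i) calls above follow from the new generic accessors, which appear to return std::shared_ptr<Data> rather than std::shared_ptr<Tensor>. A hypothetical helper (not in this commit) that names the pattern, assuming that return type:

    #include <memory>
    #include "aidge/data/Tensor.hpp"

    // Hypothetical convenience wrapper around the cast used throughout forward()/forward_().
    static inline std::shared_ptr<Aidge::Tensor> asTensor(const std::shared_ptr<Aidge::Data>& data) {
        return std::static_pointer_cast<Aidge::Tensor>(data);
    }

    // Usage would then read, for example:
    //   asTensor(mOp.getRawInput(0))->getImpl()->rawPtr();
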
@@ -25,8 +25,9 @@ using namespace Aidge;

 TEST_CASE("[gpu/operator] Conv(forward)") {
     SECTION("Simple Conv no bias") {
         std::shared_ptr<Node> myConv = Conv(1,1,{3,3}, "myconv");
-        myConv->getOperator()->setDatatype(DataType::Float32);
-        myConv->getOperator()->setBackend("cuda");
+        auto op = std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");

         std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
             {
                 {
@@ -50,12 +51,12 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
         myInput->setBackend("cuda");
         myWeights->setBackend("cuda");

-        myConv->getOperator()->associateInput(0,myInput);
-        myConv->getOperator()->associateInput(1,myWeights);
-        myConv->getOperator()->computeOutputDims();
+        op->associateInput(0,myInput);
+        op->associateInput(1,myWeights);
+        op->computeOutputDims();
         myConv->forward();

-        REQUIRE(myConv->getOperator()->getOutput(0)->size() == 1);
+        REQUIRE(op->getOutput(0)->size() == 1);

         std::array<float, 9> kernel;
         cudaMemcpy(&kernel[0], myWeights->getImpl()->rawPtr(), 9 * sizeof(float), cudaMemcpyDeviceToHost);
@@ -68,15 +69,16 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
         }

         float computedOutput;
-        cudaMemcpy(&computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float), cudaMemcpyDeviceToHost);

         REQUIRE(fabs(computedOutput - myOutput) < 1e-6);
     }

     SECTION("Classic Conv") {
         std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv");
-        myConv->getOperator()->setDatatype(DataType::Float32);
-        myConv->getOperator()->setBackend("cuda");
+        auto op = std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");

         std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> {
             {
                 {
@@ -205,15 +207,15 @@ TEST_CASE("[gpu/operator] Conv(forward)") {
         myWeights->setBackend("cuda");
         myBias->setBackend("cuda");

-        myConv->getOperator()->associateInput(0,myInput);
-        myConv->getOperator()->associateInput(1,myWeights);
-        myConv->getOperator()->associateInput(2,myBias);
-        myConv->getOperator()->computeOutputDims();
+        op->associateInput(0,myInput);
+        op->associateInput(1,myWeights);
+        op->associateInput(2,myBias);
+        op->computeOutputDims();
         myConv->forward();
-        // myConv->getOperator()->getOutput(0)->print();
+        // op->getOutput(0)->print();
         float* computedOutput = new float[myOutput->size()]();
-        cudaMemcpy(computedOutput, myConv->getOperator()->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);
+        cudaMemcpy(computedOutput, op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost);

         for(int i = 0; i < myOutput->size(); i++){
             const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
......
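
As a closing aside (not part of the commit), the same device-to-host readback used in the "Classic Conv" check, sketched with std::vector so the buffer is released automatically instead of relying on the raw new[] allocation; it reuses the op and myOutput names from the test above and assumes <vector> and <cmath> are available.

    std::vector<float> computed(myOutput->size());
    cudaMemcpy(computed.data(), op->getOutput(0)->getImpl()->rawPtr(),
               sizeof(float) * computed.size(), cudaMemcpyDeviceToHost);
    for (std::size_t i = 0; i < computed.size(); ++i) {
        const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i);
        REQUIRE(std::fabs(computed[i] - targetOutput) < 1e-6);
    }
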