diff --git a/include/aidge/backend/cuda/operator/ConvImpl.hpp b/include/aidge/backend/cuda/operator/ConvImpl.hpp index 8c591927ce0e52daeff447726c114ce3ae4d0103..0722048f7cf021104a9694a621b1c0dad00ce423 100644 --- a/include/aidge/backend/cuda/operator/ConvImpl.hpp +++ b/include/aidge/backend/cuda/operator/ConvImpl.hpp @@ -21,11 +21,13 @@ #include "aidge/backend/OperatorImpl.hpp" #include "aidge/operator/Conv.hpp" +#include "aidge/operator/ConvDepthWise.hpp" #include "aidge/utils/Registrar.hpp" #include "aidge/utils/Types.h" #include "aidge/backend/cuda/utils/CudaUtils.hpp" + namespace Aidge { template <DimIdx_t DIM> class ConvImpl_cuda : public OperatorImpl { @@ -42,14 +44,19 @@ private: std::shared_ptr<Tensor> mInput0Fallback; std::shared_ptr<Tensor> mInput1Fallback; std::shared_ptr<Tensor> mInput2Fallback; + bool mDepthWise = false; public: - ConvImpl_cuda(const Conv_Op<DIM> &op) : OperatorImpl(op, "cuda") {} + ConvImpl_cuda(const Operator&op, bool depthWise = false) : OperatorImpl(op, "cuda"), mDepthWise(depthWise) {} static std::unique_ptr<ConvImpl_cuda> create(const Conv_Op<DIM> &op) { return std::make_unique<ConvImpl_cuda>(op); } + static std::unique_ptr<ConvImpl_cuda> createDW(const ConvDepthWise_Op<DIM> &op) { + return std::make_unique<ConvImpl_cuda>(op, true); + } + public: void forward(); void backward(); @@ -61,8 +68,8 @@ private: }; namespace { -// add cuda backend to Conv_Op<2> implementation registry static Registrar<Conv_Op<2>> registrarConvImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::create); +static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl_cuda("cuda", Aidge::ConvImpl_cuda<2>::createDW); } // namespace } // namespace Aidge diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp index 096ee9485a03b736326f46e9a569c6b3c9b5a631..78c3c3276ba24b9ea9d5cb497369ca38ebaf947f 100644 --- a/src/operator/ConvImpl.cpp +++ b/src/operator/ConvImpl.cpp @@ -16,6 +16,7 @@ #include "aidge/backend/cuda/operator/ConvImpl.hpp" #include 
"aidge/backend/cuda/utils/CudaUtils.hpp" #include "aidge/operator/Conv.hpp" +#include "aidge/operator/ConvDepthWise.hpp" #include "aidge/utils/Types.h" template <Aidge::DimIdx_t DIM> @@ -33,19 +34,27 @@ void Aidge::ConvImpl_cuda<DIM>::forward() { // Lazy-initialize CuDNN convolution descriptor if (mConvDesc == nullptr) { - const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp); - const std::vector<int> strides(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end()); const std::vector<int> paddings(DIM, 0); - const std::vector<int> upscales(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end()); - - CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); - CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, - DIM, - &paddings[0], - &strides[0], - &upscales[0], - CUDNN_CROSS_CORRELATION, - DataTypeToCudnn(op.getOutput(0)->dataType()))); + std::vector<int> strides, upscales; + if (mDepthWise) { + const ConvDepthWise_Op<DIM>& convDWOp = static_cast<const ConvDepthWise_Op<DIM>&>(mOp); + strides = std::vector<int>(convDWOp.template getAttr<ConvDepthWiseAttr::StrideDims>().begin(), convDWOp.template getAttr<ConvDepthWiseAttr::StrideDims>().end()); + upscales = std::vector<int>(convDWOp.template getAttr<ConvDepthWiseAttr::DilationDims>().begin(), convDWOp.template getAttr<ConvDepthWiseAttr::DilationDims>().end()); + } + else { + const Conv_Op<DIM>& convOp = static_cast<const Conv_Op<DIM>&>(mOp); + strides = std::vector<int>(convOp.template getAttr<ConvAttr::StrideDims>().begin(), convOp.template getAttr<ConvAttr::StrideDims>().end()); + upscales = std::vector<int>(convOp.template getAttr<ConvAttr::DilationDims>().begin(), convOp.template getAttr<ConvAttr::DilationDims>().end()); + } + + CHECK_CUDNN_STATUS(cudnnCreateConvolutionDescriptor(&mConvDesc)); + CHECK_CUDNN_STATUS(cudnnSetConvolutionNdDescriptor(mConvDesc, + DIM, + 
&paddings[0], + &strides[0], + &upscales[0], + CUDNN_CROSS_CORRELATION, + DataTypeToCudnn(op.getOutput(0)->dataType()))); } // Lazy-initialize CuDNN filter descriptor diff --git a/unit_tests/Test_ConvDepthWiseImpl.cpp b/unit_tests/Test_ConvDepthWiseImpl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1fad201a2c775a8d802010def44ff44c1d5d4a3f --- /dev/null +++ b/unit_tests/Test_ConvDepthWiseImpl.cpp @@ -0,0 +1,167 @@ +/******************************************************************************** + * Copyright (c) 2023 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + +#include <array> + +#include <catch2/catch_test_macros.hpp> + +#include "Test_cuda.hpp" + +#include "aidge/data/Tensor.hpp" + +#include "aidge/backend/cpu.hpp" +#include "aidge/backend/cuda.hpp" + +using namespace Aidge; + +TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") { + std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw"); + auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator()); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,1,3,3> { + { + {{ + { 0, 1, 2}, + { 3, 4, 5}, + { 6, 7, 8} + + }}, + {{ + { 27, 28, 29}, + { 30, 31, 32}, + { 33, 34, 35} + + }}, + {{ + { 54, 55, 56}, + { 57, 58, 59}, + { 60, 61, 62} + }}, + {{ + { 81, 82, 83}, + { 84, 85, 86}, + { 87, 88, 89} + }} + } + }); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{7,0,9,0}}); + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,4,5,5> { //NCHW + { + { + {{ 0, 1, 2, 3, 4}, + { 5, 6, 7, 8, 9}, + { 10, 11, 12, 13, 14}, + { 15, 16, 17, 18, 19}, + { 20, 21, 22, 23, 24}}, + + {{ 25, 26, 27, 28, 
29}, + { 30, 31, 32, 33, 34}, + { 35, 36, 37, 38, 39}, + { 40, 41, 42, 43, 44}, + { 45, 46, 47, 48, 49}}, + + {{ 50, 51, 52, 53, 54}, + { 55, 56, 57, 58, 59}, + { 60, 61, 62, 63, 64}, + { 65, 66, 67, 68, 69}, + { 70, 71, 72, 73, 74}}, + + {{ 75, 76, 77, 78, 79}, + { 80, 81, 82, 83, 84}, + { 85, 86, 87, 88, 89}, + { 90, 91, 92, 93, 94}, + { 95, 96, 97, 98, 99}} + }, + { + {{100, 101, 102, 103, 104}, + {105, 106, 107, 108, 109}, + {110, 111, 112, 113, 114}, + {115, 116, 117, 118, 119}, + {120, 121, 122, 123, 124}}, + + {{125, 126, 127, 128, 129}, + {130, 131, 132, 133, 134}, + {135, 136, 137, 138, 139}, + {140, 141, 142, 143, 144}, + {145, 146, 147, 148, 149}}, + + {{150, 151, 152, 153, 154}, + {155, 156, 157, 158, 159}, + {160, 161, 162, 163, 164}, + {165, 166, 167, 168, 169}, + {170, 171, 172, 173, 174}}, + + {{175, 176, 177, 178, 179}, + {180, 181, 182, 183, 184}, + {185, 186, 187, 188, 189}, + {190, 191, 192, 193, 194}, + {195, 196, 197, 198, 199}} + } + } + }); + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,3,3> { + { + { + {{ 319, 355, 391}, + { 499, 535, 571}, + { 679, 715, 751}}, + + {{ 8745, 9024, 9303}, + { 10140, 10419, 10698}, + { 11535, 11814, 12093}}, + + {{ 29337, 29859, 30381}, + { 31947, 32469, 32991}, + { 34557, 35079, 35601}}, + + {{ 62061, 62826, 63591}, + { 65886, 66651, 67416}, + { 69711, 70476, 71241}} + }, + { + {{ 3919, 3955, 3991}, + { 4099, 4135, 4171}, + { 4279, 4315, 4351}}, + + {{ 36645, 36924, 37203}, + { 38040, 38319, 38598}, + { 39435, 39714, 39993}}, + + {{ 81537, 82059, 82581}, + { 84147, 84669, 85191}, + { 86757, 87279, 87801}}, + + {{138561, 139326, 140091}, + {142386, 143151, 143916}, + {146211, 146976, 147741}} + } + } + }); + myInput->setBackend("cuda"); + myWeights->setBackend("cuda"); + myBias->setBackend("cuda"); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op->setDataType(DataType::Float32); + op->setBackend("cuda"); + op -> 
computeOutputDims(); + myCDW -> forward(); + + // Host-side copy of the device output. std::vector (RAII) instead of new[]/delete[]: + // Catch2's REQUIRE throws on failure, which would leak a raw buffer before delete[]. + std::vector<float> computedOutput(myOutput->size()); + cudaMemcpy(computedOutput.data(), op->getOutput(0)->getImpl()->rawPtr(), sizeof(float) * myOutput->size(), cudaMemcpyDeviceToHost); + + // size() is unsigned, so index with std::size_t to avoid a signed/unsigned comparison. + for(std::size_t i = 0; i < myOutput->size(); i++){ + const float targetOutput = *(static_cast<float*>(myOutput->getImpl()->rawPtr()) + i); + REQUIRE(fabs(computedOutput[i] - targetOutput) < 1e-6); + } +}