From e2a71d4f81b45f8a2aa77192a70c88ca75175827 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Tue, 13 Feb 2024 14:56:40 +0000
Subject: [PATCH] Rework MatMul kernel

- [function] computation of input dimensions now happens in
  'MatMulImpl_cpu::forward()'. It handles the indices and sends the right
  pointers to its computation kernel
- [function] 'getCPUPtr()' new parameter: offset
- [function] reduce the MatMul kernel function to a simple matrix
  multiplication
---
 include/aidge/backend/cpu/data/GetCPUPtr.h    |   4 +-
 .../aidge/backend/cpu/operator/MatMulImpl.hpp |   4 +-
 .../operator/MatMulImpl_forward_kernels.hpp   |  65 ++--------
 src/operator/MatMulImpl.cpp                   | 101 ++++++++++++++++--
 4 files changed, 104 insertions(+), 70 deletions(-)

diff --git a/include/aidge/backend/cpu/data/GetCPUPtr.h b/include/aidge/backend/cpu/data/GetCPUPtr.h
index 47e3b07e..335be9ca 100644
--- a/include/aidge/backend/cpu/data/GetCPUPtr.h
+++ b/include/aidge/backend/cpu/data/GetCPUPtr.h
@@ -15,9 +15,9 @@
 #include "aidge/data/Tensor.hpp"
 
 namespace Aidge {
-inline void *getCPUPtr(std::shared_ptr<Aidge::Data> const &data) {
+inline void *getCPUPtr(std::shared_ptr<Aidge::Data> const &data, const std::size_t offset = 0) {
   const auto tensor = std::static_pointer_cast<Tensor>(data);
-  return tensor->getImpl()->hostPtr(tensor->getImplOffset());
+  return tensor->getImpl()->hostPtr(tensor->getImplOffset() + offset);
 }
 } // namespace Aidge
 
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl.hpp b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
index ef517065..437ba404 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl.hpp
@@ -23,12 +23,10 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
-// class MatMul_Op;
 
-// compute kernel registry for forward and backward
 class MatMulImplForward_cpu
     : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType>,
-                         void(const std::vector<DimSize_t>&, const std::vector<DimSize_t>&,
+                         void(const std::size_t, const std::size_t, const std::size_t,
                               const void *, const void *, void *)> {};
 class MatMulImplBackward_cpu
     : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType>,
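
Note on the new 'getCPUPtr()' parameter: the offset is counted in elements, not
bytes, since it is added to the tensor's implementation offset before
'hostPtr()' resolves the address; 'MatMulImpl_cpu::forward()' below passes
whole-matrix element counts through it. A minimal sketch of the intended use
(the tensor 't' and the sizes 'n' and 'k' are illustrative, not part of the
patch):

    // 't' is assumed to be a CPU-backed float Tensor holding two stacked
    // n x k matrices; the offset selects the second one.
    float* first  = static_cast<float*>(getCPUPtr(t));
    float* second = static_cast<float*>(getCPUPtr(t, n * k)); // element offset
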
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
index a63e9e12..5045580f 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp
@@ -12,70 +12,27 @@
 #ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_
 #define AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_
 
-#include "aidge/utils/Registrar.hpp"
-#include <algorithm>
-
-// #include <omp.h>
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
 
 namespace Aidge {
 
 template <class I, class O>
-void MatMulImpl_cpu_forward_kernel(const std::vector<DimSize_t>& input1Dims,const std::vector<DimSize_t>& input2Dims,
-                                   const void* input1_, const void* input2_, void* output_) {
+void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
+                                   const void* input1_, const void* input2_, void* output_) {
     // FIXME: missing MatMul parameters as arguments
     const I* input1 = static_cast<const I*>(input1_);
     const I* input2 = static_cast<const I*>(input2_);
    O* output = static_cast<O*>(output_);
 
-    size_t secondToLastIdx2 = input2Dims.size() > 1 ? input2Dims.size() - 2 : 0;
-    // Checking if matrix dimensions are compatible for multiplication
-    assert(input1Dims[input1Dims.size()-1] == input2Dims[secondToLastIdx2] &&
-           "Matrix dimensions are not compatible for multiplication");
-
-    std::size_t innerMulAxis = input1Dims[input1Dims.size()-1];
-    std::size_t rows1 = input1Dims[input1Dims.size()-2];
-    std::size_t cols2 = input2Dims[input2Dims.size()-1];
-    std::size_t nbMat1 = 1, nbMat2 = 1;
-    if (input1Dims.size()>2)
-    {
-        for (std::size_t i = 0; i < input1Dims.size()-2; i++)
-        {
-            nbMat1 *= input1Dims[i];
-        }
-
-    }
-    if (input2Dims.size()>2)
-    {
-        for (std::size_t i = 0; i < input2Dims.size()-2; i++)
-        {
-            nbMat2 *= input2Dims[i];
-        }
-
-    }
-    std::size_t mat1Size = rows1 * innerMulAxis;
-    std::size_t mat2Size = innerMulAxis * cols2;
-    std::size_t matSize = rows1 * cols2;
-    std::size_t nbMat = nbMat1 > nbMat2 ? nbMat1 : nbMat2;
-
-    for (std::size_t i = 0; i < nbMat; i++) {
-// #pragma omp parallel for num_threads(8)
-        for (std::size_t m = 0; m < rows1; m++)
-        {
-
-            for (size_t k = 0; k < innerMulAxis; k++)
-            {
-                for (std::size_t n = 0; n < cols2; n++)
-                {
-                    if (k==0) {
-                        output[i * matSize + m * cols2 + n] = 0;
-                    }
-
-                    output[i * matSize + m * cols2 + n] += input1[(i%nbMat1) * mat1Size + m *innerMulAxis + k] * input2[(i%nbMat2)*mat2Size + k * cols2 + n];
-                }
-            }
-        }
-    }
+    for (std::size_t i = 0; i < n; ++i) {
+        for (std::size_t j = 0; j < m; ++j) {
+            O sum = O(0);
+            for (std::size_t l = 0; l < k; ++l) {
+                sum += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
+            }
+            output[i*m + j] = sum;
+        }
+    }
 }
 
 namespace {
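
The reworked kernel now performs a single row-major (n x k) by (k x m) product
per call; all batching and broadcasting moved to 'MatMulImpl_cpu::forward()'.
A standalone sketch of a direct call, assuming the kernels header is included
(the values are illustrative):

    const float a[2*3] = {1.f, 2.f, 3.f,
                          4.f, 5.f, 6.f};   // A: 2x3, row-major
    const float b[3*2] = {1.f, 0.f,
                          0.f, 1.f,
                          1.f, 1.f};        // B: 3x2, row-major
    float c[2*2];                           // C = A*B: 2x2
    Aidge::MatMulImpl_cpu_forward_kernel<float, float>(2, 3, 2, a, b, c);
    // c == {4, 5, 10, 11}, i.e. c[i*m + j] = sum over l of a[i*k + l] * b[l*m + j]
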
diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp
index 5818ac64..488af176 100644
--- a/src/operator/MatMulImpl.cpp
+++ b/src/operator/MatMulImpl.cpp
@@ -9,15 +9,15 @@
  *
  ********************************************************************************/
 
-#include <cassert>
-#include <chrono>  // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread>  // std::this_thread::sleep_for
+#include <cstddef>    // std::size_t
+#include <cstdint>    // std::int32_t
+#include <functional> // std::multiplies
+#include <numeric>    // std::accumulate
 #include <vector>
 
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/operator/MatMul.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp"
 
@@ -32,10 +32,89 @@ void Aidge::MatMulImpl_cpu::forward()
     {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
      std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
 
-    kernelFunc(
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
-        std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims(),
-        getCPUPtr(mOp.getRawInput(0)),
-        getCPUPtr(mOp.getRawInput(1)),
-        getCPUPtr(mOp.getRawOutput(0)));
+    // Compute compatible input dimensions
+    std::vector<std::size_t> dims0 = static_cast<const MatMul_Op&>(mOp).getInput(0)->dims();
+    std::vector<std::size_t> dims1 = static_cast<const MatMul_Op&>(mOp).getInput(1)->dims();
+
+    // keep second-to-last dimension of dims0
+    const std::size_t keepDim0 = (dims0.size() > 1) ? 1 : 0;
+    // keep last dimension of dims1
+    const std::size_t keepDim1 = (dims1.size() > 1) ? 1 : 0;
+
+    if (dims0.size() == 1) {
+        dims0.insert(dims0.cbegin(), 1);
+    }
+    if (dims1.size() == 1) {
+        dims1.push_back(1);
+    }
+
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    // at this point, dims0.size() == dims1.size()
+    const std::size_t nbDims = dims0.size();
+
+    // initialize strides to iterate through data because of broadcasting
+    std::size_t *stride_post0 = nullptr;
+    std::size_t *stride_post1 = nullptr;
+    std::int32_t *stride_step0 = nullptr;
+    std::int32_t *stride_step1 = nullptr;
+    if (nbDims > 2) {
+        stride_post0 = new std::size_t[nbDims-2];
+        stride_post0[nbDims - 3] = 1;
+        stride_post1 = new std::size_t[nbDims-2];
+        stride_post1[nbDims - 3] = 1;
+        for (std::size_t i = nbDims-4; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*dims0[i+1];
+            stride_post1[i] = stride_post1[i+1]*dims1[i+1];
+        }
+        stride_step0 = new std::int32_t[nbDims-2];
+        stride_step1 = new std::int32_t[nbDims-2];
+        for (std::size_t i = 0; i != nbDims-2; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post0[i]) : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post1[i]) : 1;
+        }
+    }
+
+    const std::vector<std::size_t>& outDims = static_cast<const MatMul_Op&>(mOp).getOutput(0)->dims();
+    const std::size_t nbMatrices = std::accumulate(outDims.cbegin(), outDims.cend() - keepDim0 - keepDim1, std::size_t(1), std::multiplies<std::size_t>());
+    std::size_t dim = outDims.size() - 1 - keepDim0 - keepDim1;
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+    const std::size_t n = dims0[nbDims - 2];
+    const std::size_t k = dims0[nbDims - 1];
+    const std::size_t m = dims1[nbDims - 1];
+    const std::size_t matrix0Size = n*k;
+    const std::size_t matrix1Size = k*m;
+    const std::size_t matrixOutSize = n*m;
+    for (std::size_t stack = 0; stack < nbMatrices;) {
+        kernelFunc(n, k, m,
+                   getCPUPtr(mOp.getRawInput(0), offsetIn0*matrix0Size),
+                   getCPUPtr(mOp.getRawInput(1), offsetIn1*matrix1Size),
+                   getCPUPtr(mOp.getRawOutput(0), offsetOut*matrixOutSize));
+        if (++stack < nbMatrices) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outDims[dim] == 0) {
+                tmp_stack /= outDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = outDims.size() - 1 - keepDim0 - keepDim1;
+        }
+    }
+    if (nbDims > 2) {
+        delete[] stride_post0;
+        delete[] stride_post1;
+        delete[] stride_step0;
+        delete[] stride_step1;
+    }
 }
-- 
GitLab
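
A worked example of the broadcasting bookkeeping in 'forward()' (the shapes are
illustrative): with input 0 of dims (2, 1, 3, 4) and input 1 of dims
(2, 5, 4, 2), the output has dims (2, 5, 3, 2), so n = 3, k = 4, m = 2 and
nbMatrices = 2*5 = 10. The strides come out as stride_post0 = {1, 1} and
stride_post1 = {5, 1}; because dims0[1] == 1, stride_step0 = {1, 0} while
stride_step1 = {1, 1}. The single 3x4 matrix of input 0 is therefore reused
across the five iterations along the broadcast axis (step 0), while input 1
advances to a fresh 4x2 matrix on every iteration; when 'stack' reaches a
multiple of 5, 'dim' rolls over to axis 0 and input 0 moves on to its second
matrix.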