diff --git a/include/aidge/backend/cpu/operator/MatMulImpl.hpp b/include/aidge/backend/cpu/operator/MatMulImpl.hpp index e8654c6e9cc8fab9080bbb5ed57ea78ee0b7978c..437ba404b1cc39973448f3c5567aec2fe35994e3 100644 --- a/include/aidge/backend/cpu/operator/MatMulImpl.hpp +++ b/include/aidge/backend/cpu/operator/MatMulImpl.hpp @@ -23,16 +23,14 @@ #include "aidge/backend/cpu/data/GetCPUPtr.h" namespace Aidge { -// class MatMul_Op; -// compute kernel registry for forward and backward class MatMulImplForward_cpu - : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType, DataType>, - void(const MatMul_Op::Attrs &, const DimSize_t, const DimSize_t, + : public Registrable<MatMulImplForward_cpu, std::tuple<DataType, DataType>, + void(const std::size_t, const std::size_t, const std::size_t, const void *, const void *, void *)> {}; class MatMulImplBackward_cpu - : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, - void(const MatMul_Op::Attrs &, const DimSize_t, const DimSize_t, + : public Registrable<MatMulImplBackward_cpu, std::tuple<DataType, DataType>, + void(const std::vector<DimSize_t>&, const std::vector<DimSize_t>&, const void *, const void *, void *)> {}; class MatMulImpl_cpu : public OperatorImpl { diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp index bc52779eff274379a853ea84fb839c9486652433..5045580fa599aac64f2c1414bfdf2b87ea57e313 100644 --- a/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp @@ -12,45 +12,39 @@ #ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_ #define AIDGE_CPU_OPERATOR_MATMULIMPL_FORWARD_KERNEL_H_ -#include "aidge/utils/Registrar.hpp" -#include <algorithm> - #include "aidge/backend/cpu/operator/MatMulImpl.hpp" namespace Aidge { -template <class I, class W, class O> -void MatMulImpl_cpu_forward_kernel(const MatMul_Op::Attrs& attrs, const DimSize_t batchSize, const DimSize_t oneInputSize, - const void* input_, const void* weights_, void* output_) { +template <class I, class O> +void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m, + const void* input1_, const void* input2_, void* output_) { // FIXME: missing MatMul parameters as arguments - const I* input = static_cast<const I*>(input_); - const W* weights = static_cast<const W*>(weights_); + const I* input1 = static_cast<const I*>(input1_); + const I* input2 = static_cast<const I*>(input2_); O* output = static_cast<O*>(output_); - - std::fill(output, output+(batchSize*std::get<0>(attrs)), O(0)); - - for (std::size_t batch = 0; batch < batchSize; ++batch) { - for (std::size_t out = 0; out < std::get<0>(attrs); ++out) { - output[out + batch*std::get<0>(attrs)] = std::inner_product(input + batch*oneInputSize, - input + (batch + 1)*oneInputSize, - weights + out*oneInputSize, - output[out + batch*std::get<0>(attrs)]); + for (std::size_t i = 0; i < n; ++i) { + for (std::size_t j = 0; j < m; ++j) { + O sum = O(0); + for (std::size_t l = 0; l < k; ++l) { + sum += static_cast<O>(input1[i*k + l] * input2[l*m + j]); + } + output[i*m + j] = sum; } } } - namespace { static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Float32( - {DataType::Float32, DataType::Float32, DataType::Float32}, - Aidge::MatMulImpl_cpu_forward_kernel<float, float, float>); + {DataType::Float32, DataType::Float32}, + Aidge::MatMulImpl_cpu_forward_kernel<float, float>); static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Int32( - {DataType::Int32, DataType::Int32, DataType::Int32}, - Aidge::MatMulImpl_cpu_forward_kernel<int, int, int>); + {DataType::Int32, DataType::Int32}, + Aidge::MatMulImpl_cpu_forward_kernel<int, int>); static Registrar<MatMulImplForward_cpu> registrarMatMulImpl2DForward_cpu_Float64( - {DataType::Float64, DataType::Float64, DataType::Float64}, - Aidge::MatMulImpl_cpu_forward_kernel<double, double, double>); + {DataType::Float64, DataType::Float64}, + Aidge::MatMulImpl_cpu_forward_kernel<double, double>); } // namespace } // namespace Aidge diff --git a/src/operator/MatMulImpl.cpp b/src/operator/MatMulImpl.cpp index f02effb3172e2c0624c6c7532513a2b794ee3a89..488af17617d556ad7a9d9b73909324d67a672459 100644 --- a/src/operator/MatMulImpl.cpp +++ b/src/operator/MatMulImpl.cpp @@ -9,15 +9,14 @@ * ********************************************************************************/ -#include <cassert> -#include <chrono> // std::chrono::milliseconds -#include <numeric> // std::accumulate -#include <thread> // std::this_thread::sleep_for +#include <cstddef> // std::size_t +#include <cstdint> // std::int32_t +#include <numeric> // std::accumulate #include <vector> +#include "aidge/backend/cpu/data/GetCPUPtr.h" #include "aidge/operator/MatMul.hpp" #include "aidge/utils/Types.h" -#include "aidge/backend/cpu/data/GetCPUPtr.h" #include "aidge/backend/cpu/operator/MatMulImpl.hpp" #include "aidge/backend/cpu/operator/MatMulImpl_forward_kernels.hpp" @@ -30,27 +29,110 @@ void Aidge::MatMulImpl_cpu::forward() // Find the correct kernel type auto kernelFunc = Registrar<MatMulImplForward_cpu>::create( {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(), std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()}); - // Call kernel - // if (mOp.getInput(0)->nbDims() == 4) { - // kernelFunc( - // mOp.getStaticAttributes(), - // std::static_pointer_cast<Tensor>(mOp.getInput(0))->template dims<4>(), - // mOp.getInput(0))->getImpl()->rawPtr(), - // mOp.mInputs[1]->getImpl()->rawPtr(), - // mOp.mInputs[2]->getImpl()->rawPtr(), - // getCPUPtr(mOp.getRawOutput(0)); - // } - // else - kernelFunc( - dynamic_cast<const MatMul_Op&>(mOp).getStaticAttributes(), - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0], - std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size() / std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0], - getCPUPtr(mOp.getRawInput(0)), - getCPUPtr(mOp.getRawInput(1)), - getCPUPtr(mOp.getRawOutput(0))); + // Compute compatible input dimensions + std::vector<std::size_t> dims0 = static_cast<const MatMul_Op&>(mOp).getInput(0)->dims(); + std::vector<std::size_t> dims1 = static_cast<const MatMul_Op&>(mOp).getInput(1)->dims(); + + // keep second-to-last dimension of dims0 + const std::size_t keepDim0 = (dims0.size() > 1) ? 1 : 0; + // keep last dimension of dims1 + const std::size_t keepDim1 = (dims1.size() > 1) ? 1 : 0; + + if (dims0.size() == 1) { + dims0.insert(dims0.cbegin(), 1); + } + if (dims1.size() == 1) { + dims1.push_back(1); + } + + if (dims0.size() > dims1.size()) { + dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1)); + } + else if (dims1.size() > dims0.size()) { + dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1)); + } + // const std::size_t dims_size = std::max(dims0.size(), dims1.size()); + // at this point, dims0.size() == dims1.size() + const std::size_t nbDims = dims0.size(); + // initialize strides to iterate through data because of broadcasting + std::size_t *stride_post0; + std::size_t *stride_post1; + std::int32_t *stride_step0; + std::int32_t *stride_step1; + if (nbDims > 2) { + stride_post0 = new std::size_t[nbDims-2]; + stride_post0[nbDims - 3] = 1; + stride_post1 = new std::size_t[nbDims-2]; + stride_post1[nbDims - 3] = 1; + for (std::size_t i = nbDims-4; i != static_cast<std::size_t>(-1); --i) { + stride_post0[i] = stride_post0[i+1]*dims0[i+1]; + stride_post1[i] = stride_post1[i+1]*dims1[i+1]; + } + stride_step0 = new std::int32_t[nbDims-2]; + stride_step1 = new std::int32_t[nbDims-2]; + for (std::size_t i = 0; i != nbDims-2; ++i) { + stride_step0[i] = (dims0[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post0[i]) : 1; + stride_step1[i] = (dims1[i] == 1) ? 1 - static_cast<std::int32_t>(stride_post1[i]) : 1; + } + } + + const std::vector<std::size_t>& outDims = static_cast<const MatMul_Op&>(mOp).getOutput(0)->dims(); + const std::size_t nbMatrices = std::accumulate(outDims.cbegin(), outDims.cend() - keepDim0 - keepDim1, 1, std::multiplies<std::size_t>()); + std::size_t dim = outDims.size() - 1 - keepDim0 - keepDim1; + + // variables for arrays offsets + std::size_t offsetIn0 = 0; + std::size_t offsetIn1 = 0; + std::size_t offsetOut = 0; + const std::size_t n = dims0[nbDims - 2]; + const std::size_t k = dims0[nbDims - 1]; + const std::size_t m = dims1[nbDims - 1]; + const std::size_t matrix0Size = n*k; + const std::size_t matrix1Size = k*m; + const std::size_t matrixOutSize = n*m; + for (std::size_t stack = 0; stack < nbMatrices;) { + kernelFunc(n, k, m, + getCPUPtr(mOp.getRawInput(0), offsetIn0*matrix0Size), + getCPUPtr(mOp.getRawInput(1), offsetIn1*matrix1Size), + getCPUPtr(mOp.getRawOutput(0), offsetOut*matrixOutSize)); + if (++stack < nbMatrices) { + std::size_t tmp_stack = stack; + while(tmp_stack % outDims[dim] == 0) { + tmp_stack /= outDims[dim]; + dim--; + } + offsetIn0 += stride_step0[dim]; + offsetIn1 += stride_step1[dim]; + ++offsetOut; + dim = outDims.size() - 1 - keepDim0 - keepDim1; + } + } + if (nbDims > 2) { + delete[] stride_post0; + delete[] stride_post1; + delete[] stride_step0; + delete[] stride_step1; + } } + +// void Aidge::MatMulImpl_cpu::forward() +// { +// assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(0)) && "missing input #0"); +// assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(1)) && "missing input #1"); + +// // Find the correct kernel type +// auto kernelFunc = Registrar<MatMulImplForward_cpu>::create( +// {std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(), +// std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()}); + +// kernelFunc( +// std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(), +// std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims(), +// getCPUPtr(mOp.getRawInput(0)), +// getCPUPtr(mOp.getRawInput(1)), +// getCPUPtr(mOp.getRawOutput(0))); +// } diff --git a/unit_tests/operator/Test_MatMulImpl.cpp b/unit_tests/operator/Test_MatMulImpl.cpp index 1edb915fb78e3e056f455ddecb8e704eee068cd9..5df0528b5d24be04b324cd05d1f964a57c35b3ea 100644 --- a/unit_tests/operator/Test_MatMulImpl.cpp +++ b/unit_tests/operator/Test_MatMulImpl.cpp @@ -10,102 +10,281 @@ ********************************************************************************/ #include <catch2/catch_test_macros.hpp> +#include <cstddef> // std::size_t +#include <cstdint> // std::uint16_t +#include <chrono> +#include <iostream> #include <memory> +#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution #include "aidge/data/Tensor.hpp" #include "aidge/operator/MatMul.hpp" +#include "aidge/operator/OperatorTensor.hpp" +#include "aidge/utils/TensorUtils.hpp" #include "aidge/backend/cpu/operator/MatMulImpl.hpp" -using namespace Aidge; +namespace Aidge { TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") { - // Test MatMul forward with batch size = 2 and feature size = 75 - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<int, 5, 75>{ - {{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<int, 2, 5>{ - {{23600, 23600, 23600, 23600, 23600}, {68600, 68600, 68600, 68600, 68600}}}); - - std::shared_ptr<Node> myMatMul = MatMul(75, 5, "mymatmul"); + const std::uint16_t NBTRIALS = 10; + // Create a random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(0.0, 1.0); // Random float distribution between 0 and 1 + std::uniform_int_distribution<std::size_t> distDims(10, 100); + std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5); + + // Create MatMul Operator + std::shared_ptr<Node> myMatMul = MatMul(); auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator()); - op->associateInput(1, myWeights); - - SECTION("2D input") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<int, 2, 75>{ - {{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, - 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, - {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, - 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, - 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, - 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}}); - op->associateInput(0, myInput); - op->setDataType(DataType::Int32); - op->setBackend("cpu"); - op->computeOutputDims(); - myMatMul->forward(); - REQUIRE(*(op->getOutput(0)) == *myOutput); + + // To measure execution time of 'MatMul_Op::forward()' member function call + std::chrono::time_point<std::chrono::system_clock> start; + std::chrono::time_point<std::chrono::system_clock> end; + std::chrono::duration<double, std::micro> duration; + + SECTION("2-D Tensors") { + std::size_t totalComputation = 0; + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + const std::size_t dim2 = distDims(gen); + totalComputation += dim0*dim1*dim2; + + // Create and populate the array with random float values + float bigArray1[dim0][dim1]; + for (int i = 0; i < dim0; ++i) { + for (int j = 0; j < dim1; ++j) { + bigArray1[i][j] = dis(gen); // Generate random float value + } + } + float bigArray2[dim1][dim2]; + for (int i = 0; i < dim1; ++i) { + for (int j = 0; j < dim2; ++j) { + bigArray2[i][j] = dis(gen); // Generate random float value + } + } + float res[dim0][dim2]; + for (int i = 0; i < dim0; ++i) { + for (int j = 0; j < dim2; ++j) { + float sum = 0.0; + for (int k = 0; k < dim1; ++k) { + sum += bigArray1[i][k] * bigArray2[k][j]; + } + res[i][j] = sum; + } + } + + + // Convert bigArray1 to Tensor + std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32); + T1 -> resize({dim0,dim1}); + T1 -> setBackend("cpu"); + T1 -> getImpl() -> setRawPtr(&bigArray1[0][0], dim0*dim1); + // Convert bigArray2 to Tensor + std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32); + T2 -> resize({dim1,dim2}); + T2 -> setBackend("cpu"); + T2 -> getImpl() -> setRawPtr(&bigArray2[0][0], dim1*dim2); + // convert res to Tensor + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dim0,dim2}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(&res[0][0], dim0*dim2); + + op->associateInput(0, T1); + op->associateInput(1, T2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->computeOutputDims(); + start = std::chrono::system_clock::now(); + myMatMul->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; } - SECTION("4D input") { - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array4D<int, 2, 3, 5, 5>{{{{{0, 1, 2, 3, 4}, - {5, 6, 7, 8, 9}, - {10, 11, 12, 13, 14}, - {15, 16, 17, 18, 19}, - {20, 21, 22, 23, 24}}, - {{25, 26, 27, 28, 29}, - {30, 31, 32, 33, 34}, - {35, 36, 37, 38, 39}, - {40, 41, 42, 43, 44}, - {45, 46, 47, 48, 49}}, - {{50, 51, 52, 53, 54}, - {55, 56, 57, 58, 59}, - {60, 61, 62, 63, 64}, - {65, 66, 67, 68, 69}, - {70, 71, 72, 73, 74}}}, - {{{75, 76, 77, 78, 79}, - {80, 81, 82, 83, 84}, - {85, 86, 87, 88, 89}, - {90, 91, 92, 93, 94}, - {95, 96, 97, 98, 99}}, - {{100, 101, 102, 103, 104}, - {105, 106, 107, 108, 109}, - {110, 111, 112, 113, 114}, - {115, 116, 117, 118, 119}, - {120, 121, 122, 123, 124}}, - {{125, 126, 127, 128, 129}, - {130, 131, 132, 133, 134}, - {135, 136, 137, 138, 139}, - {140, 141, 142, 143, 144}, - {145, 146, 147, 148, 149}}}}}); - op->associateInput(0, myInput); - op->setDataType(DataType::Int32); + + SECTION("3-D Tensors") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dimNb = distNbMatrix(gen); + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + const std::size_t dim2 = distDims(gen); + totalComputation += dim0*dim1*dim2*dimNb; + + // Create and populate the array with random float values + float bigArray1[dimNb][dim0][dim1]; + for (std::size_t n = 0; n < dimNb; ++n) { + for (std::size_t i = 0; i < dim0; ++i) { + for (std::size_t j = 0; j < dim1; ++j) { + bigArray1[n][i][j] = dis(gen); // Generate random float value + } + } + } + float bigArray2[dimNb][dim1][dim2]; + for (std::size_t n = 0; n < dimNb; ++n) { + for (int i = 0; i < dim1; ++i) { + for (int j = 0; j < dim2; ++j) { + bigArray2[n][i][j] = dis(gen); // Generate random float value + } + } + } + float res[dimNb][dim0][dim2]; + for (std::size_t n = 0; n < dimNb; ++n) { + for (int i = 0; i < dim0; ++i) { + for (int j = 0; j < dim2; ++j) { + float sum = 0.0; + for (int k = 0; k < dim1; ++k) { + sum += bigArray1[n][i][k] * bigArray2[n][k][j]; + } + res[n][i][j] = sum; + } + } + } + // Convert bigArray1 to Tensor + std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32); + T1 -> resize({dimNb,dim0,dim1}); + T1 -> setBackend("cpu"); + T1 -> getImpl() -> setRawPtr(&bigArray1[0][0], dimNb*dim0*dim1); + // Convert bigArray2 to Tensor + std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32); + T2 -> resize({dimNb,dim1,dim2}); + T2 -> setBackend("cpu"); + T2 -> getImpl() -> setRawPtr(&bigArray2[0][0], dimNb*dim1*dim2); + // convert res to Tensor + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dimNb,dim0,dim2}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(&res[0][0], dimNb*dim0*dim2); + + op->associateInput(0, T1); + op->associateInput(1, T2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->computeOutputDims(); + start = std::chrono::system_clock::now(); + myMatMul->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + + SECTION("4-D Tensors") { + std::size_t totalComputation = 0; + duration = std::chrono::duration<double, std::micro>::zero(); + for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { + // generate Tensors dimensions + const std::size_t dimNb1 = distNbMatrix(gen); + const std::size_t dimNb2 = distNbMatrix(gen); + const std::size_t dim0 = distDims(gen); + const std::size_t dim1 = distDims(gen); + const std::size_t dim2 = distDims(gen); + totalComputation += dim0*dim1*dim2*dimNb1*dimNb2; + + // Create and populate the array with random float values + float bigArray1[dimNb1][dimNb2][dim0][dim1]; + for (std::size_t n1 = 0; n1 < dimNb1; ++n1) { + for (std::size_t n2 = 0; n2 < dimNb2; ++n2) { + for (std::size_t i = 0; i < dim0; ++i) { + for (std::size_t j = 0; j < dim1; ++j) { + bigArray1[n1][n2][i][j] = dis(gen); // Generate random float value + } + } + } + } + float bigArray2[dimNb1][dimNb2][dim1][dim2]; + for (std::size_t n1 = 0; n1 < dimNb1; ++n1) { + for (std::size_t n2 = 0; n2 < dimNb2; ++n2) { + for (std::size_t i = 0; i < dim1; ++i) { + for (std::size_t j = 0; j < dim2; ++j) { + bigArray2[n1][n2][i][j] = dis(gen); // Generate random float value + } + } + } + } + float res[dimNb1][dimNb2][dim0][dim2]; + for (std::size_t n1 = 0; n1 < dimNb1; ++n1) { + for (std::size_t n2 = 0; n2 < dimNb2; ++n2) { + for (int i = 0; i < dim0; ++i) { + for (int j = 0; j < dim2; ++j) { + float sum = 0.0; + for (int k = 0; k < dim1; ++k) { + sum += bigArray1[n1][n2][i][k] * bigArray2[n1][n2][k][j]; + } + res[n1][n2][i][j] = sum; + } + } + } + } + // Convert bigArray1 to Tensor + std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32); + T1 -> resize({dimNb1,dimNb2,dim0,dim1}); + T1 -> setBackend("cpu"); + T1 -> getImpl() -> setRawPtr(&bigArray1[0][0], dimNb1*dimNb2*dim0*dim1); + // Convert bigArray2 to Tensor + std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32); + T2 -> resize({dimNb1,dimNb2,dim1,dim2}); + T2 -> setBackend("cpu"); + T2 -> getImpl() -> setRawPtr(&bigArray2[0][0], dimNb1*dimNb2*dim1*dim2); + // convert res to Tensor + std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); + Tres -> resize({dimNb1,dimNb2,dim0,dim2}); + Tres -> setBackend("cpu"); + Tres -> getImpl() -> setRawPtr(&res[0][0], dimNb1*dimNb2*dim0*dim2); + + op->associateInput(0, T1); + op->associateInput(1, T2); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->computeOutputDims(); + start = std::chrono::system_clock::now(); + myMatMul->forward(); + end = std::chrono::system_clock::now(); + duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); + REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); + } + std::cout << "multiplications over time spent: " << totalComputation/duration.count() << std::endl; + std::cout << "total time: " << duration.count() << std::endl; + } + + SECTION("+2-D / 1-D") { + // allows to test both computation with a 1-D Tensor and broadcasting + // input_0 + std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); + op->associateInput(0,T0); + const std::size_t dim0 = distNbMatrix(gen); + const std::size_t dim1 = distNbMatrix(gen) + 1; + const std::size_t dim2 = distNbMatrix(gen); + const std::size_t dim3 = distNbMatrix(gen); + T0->resize({dim0,dim1,dim2,dim3}); + T0->setDataType(DataType::Float32); + T0->setBackend("cpu"); + + // input_1 + std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(); + op -> associateInput(1,T1); + T1->resize({dim3}); + T1->setDataType(DataType::Float32); + T1->setBackend("cpu"); + + op->setDataType(DataType::Float32); op->setBackend("cpu"); op->computeOutputDims(); myMatMul->forward(); - REQUIRE(*(op->getOutput(0)) == *myOutput); - } - // std::cout << static_cast<Tensor>((*myMatMul->getOperator())["weight"])[0][0][0][0] << std::endl; -} \ No newline at end of file + } +} +} // namespace Aidge \ No newline at end of file