/******************************************************************************** * Copyright (c) 2023 CEA-List * * This program and the accompanying materials are made available under the * terms of the Eclipse Public License 2.0 which is available at * http://www.eclipse.org/legal/epl-2.0. * * SPDX-License-Identifier: EPL-2.0 * ********************************************************************************/ #include <chrono> // std::micro, std::chrono::time_point, // std::chrono::system_clock, std::chrono::duration #include <cstddef> // std::size_t #include <cstdint> // std::uint16_t #include <memory> #include <random> // std::random_device, std::mt19937 // std::uniform_int_distribution, std::uniform_real_distribution #include <vector> #include <catch2/catch_test_macros.hpp> #include <fmt/core.h> #include "aidge/backend/cpu/data/TensorImpl.hpp" #include "aidge/backend/cpu/operator/MatMulImpl.hpp" #include "aidge/data/Data.hpp" #include "aidge/data/Tensor.hpp" #include "aidge/operator/MatMul.hpp" #include "aidge/operator/OperatorTensor.hpp" #include "aidge/utils/TensorUtils.hpp" namespace Aidge { TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") { const std::uint16_t NBTRIALS = 10; // Create a random number generator std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> dis(0.0, 1.0); // Random float distribution between 0 and 1 std::uniform_int_distribution<std::size_t> distDims(10, 100); std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5); // Create MatMul Operator std::shared_ptr<Node> myMatMul = MatMul(); auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator()); // To measure execution time of 'MatMul_Op::forward()' member function call std::chrono::time_point<std::chrono::system_clock> start; std::chrono::time_point<std::chrono::system_clock> end; std::chrono::duration<double, std::micro> duration; SECTION("2-D Tensors") { std::size_t totalComputation = 0; for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate Tensors dimensions const std::size_t dim0 = distDims(gen); const std::size_t dim1 = distDims(gen); const std::size_t dim2 = distDims(gen); totalComputation += dim0*dim1*dim2; // Create and populate the array with random float values float* bigArray1 = new float[dim0*dim1]; for (int i = 0; i < dim0*dim1; ++i) { bigArray1[i] = dis(gen); // Generate random float value } float* bigArray2 = new float[dim1*dim2]; for (int i = 0; i < dim1*dim2; ++i) { bigArray2[i] = dis(gen); // Generate random float value } float* res = new float[dim0*dim2]; for (int i = 0; i < dim0; ++i) { for (int j = 0; j < dim2; ++j) { float sum = 0.0; for (int k = 0; k < dim1; ++k) { sum += bigArray1[i*dim1+k] * bigArray2[k*dim2+j]; } res[i*dim2+j] = sum; } } // Convert bigArray1 to Tensor std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32); T1 -> resize({dim0,dim1}); T1 -> setBackend("cpu"); T1 -> getImpl() -> setRawPtr(bigArray1, dim0*dim1); // Convert bigArray2 to Tensor std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32); T2 -> resize({dim1,dim2}); T2 -> setBackend("cpu"); T2 -> getImpl() -> setRawPtr(bigArray2, dim1*dim2); // convert res to Tensor std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); Tres -> resize({dim0,dim2}); Tres -> setBackend("cpu"); Tres -> getImpl() -> setRawPtr(res, dim0*dim2); op->associateInput(0, T1); op->associateInput(1, T2); op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(); start = std::chrono::system_clock::now(); myMatMul->forward(); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); delete[] bigArray1; delete[] bigArray2; delete[] res; } Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count())); Log::info("total time: {} μs\n", duration.count()); } SECTION("3-D Tensors") { std::size_t totalComputation = 0; duration = std::chrono::duration<double, std::micro>::zero(); for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate Tensors dimensions const std::size_t dimNb = distNbMatrix(gen); const std::size_t dim0 = distDims(gen); const std::size_t dim1 = distDims(gen); const std::size_t dim2 = distDims(gen); totalComputation += dim0*dim1*dim2*dimNb; // Create and populate the array with random float values float* bigArray1 = new float[dimNb*dim0*dim1]; for (std::size_t i = 0; i < dimNb*dim0*dim1; ++i) { bigArray1[i] = dis(gen); // Generate random float value } float* bigArray2 = new float[dimNb*dim1*dim2]; for (int i = 0; i < dimNb*dim1*dim2; ++i) { bigArray2[i] = dis(gen); // Generate random float value } float* res = new float[dimNb*dim0*dim2]; for (std::size_t n = 0; n < dimNb; ++n) { for (int i = 0; i < dim0; ++i) { for (int j = 0; j < dim2; ++j) { float sum = 0.0; for (int k = 0; k < dim1; ++k) { sum += bigArray1[n*dim0*dim1 + i*dim1 + k] * bigArray2[n*dim2*dim1+k*dim2+j]; } res[n*dim0*dim2+i*dim2+j] = sum; } } } // Convert bigArray1 to Tensor std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32); T1 -> resize({dimNb,dim0,dim1}); T1 -> setBackend("cpu"); T1 -> getImpl() -> setRawPtr(bigArray1, dimNb*dim0*dim1); // Convert bigArray2 to Tensor std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32); T2 -> resize({dimNb,dim1,dim2}); T2 -> setBackend("cpu"); T2 -> getImpl() -> setRawPtr(bigArray2, dimNb*dim1*dim2); // convert res to Tensor std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); Tres -> resize({dimNb,dim0,dim2}); Tres -> setBackend("cpu"); Tres -> getImpl() -> setRawPtr(res, dimNb*dim0*dim2); op->associateInput(0, T1); op->associateInput(1, T2); op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(); start = std::chrono::system_clock::now(); myMatMul->forward(); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); delete[] bigArray1; delete[] bigArray2; delete[] res; } Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count())); Log::info("total time: {} μs\n", duration.count()); } SECTION("4-D Tensors") { std::size_t totalComputation = 0; duration = std::chrono::duration<double, std::micro>::zero(); for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { // generate Tensors dimensions const std::size_t dimNb1 = distNbMatrix(gen); const std::size_t dimNb2 = distNbMatrix(gen); const std::size_t dim0 = distDims(gen); const std::size_t dim1 = distDims(gen); const std::size_t dim2 = distDims(gen); totalComputation += dim0*dim1*dim2*dimNb1*dimNb2; // Create and populate the array with random float values float* bigArray1 = new float[dimNb1*dimNb2*dim0*dim1]; for (std::size_t i = 0; i < dimNb1*dimNb2*dim0*dim1; ++i) { bigArray1[i] = dis(gen); // Generate random float value } float* bigArray2 = new float[dimNb1*dimNb2*dim1*dim2]; for (std::size_t i = 0; i < dimNb1*dimNb2*dim1*dim2; ++i) { bigArray2[i] = dis(gen); // Generate random float value } float* res = new float[dimNb1*dimNb2*dim0*dim2]; for (std::size_t n1 = 0; n1 < dimNb1; ++n1) { for (std::size_t n2 = 0; n2 < dimNb2; ++n2) { for (int i = 0; i < dim0; ++i) { for (int j = 0; j < dim2; ++j) { float sum = 0.0; for (int k = 0; k < dim1; ++k) { sum += bigArray1[n1*dimNb2*dim0*dim1+n2*dim0*dim1+i*dim1+k] * bigArray2[n1*dimNb2*dim1*dim2+n2*dim1*dim2+k*dim2+j]; } res[n1*dimNb2*dim0*dim2+n2*dim0*dim2+i*dim2+j] = sum; } } } } // Convert bigArray1 to Tensor std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32); T1 -> resize({dimNb1,dimNb2,dim0,dim1}); T1 -> setBackend("cpu"); T1 -> getImpl() -> setRawPtr(bigArray1, dimNb1*dimNb2*dim0*dim1); // Convert bigArray2 to Tensor std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32); T2 -> resize({dimNb1,dimNb2,dim1,dim2}); T2 -> setBackend("cpu"); T2 -> getImpl() -> setRawPtr(bigArray2, dimNb1*dimNb2*dim1*dim2); // convert res to Tensor std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32); Tres -> resize({dimNb1,dimNb2,dim0,dim2}); Tres -> setBackend("cpu"); Tres -> getImpl() -> setRawPtr(res, dimNb1*dimNb2*dim0*dim2); op->associateInput(0, T1); op->associateInput(1, T2); op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(); start = std::chrono::system_clock::now(); myMatMul->forward(); end = std::chrono::system_clock::now(); duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start); REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres)); delete[] bigArray1; delete[] bigArray2; delete[] res; } Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count())); Log::info("total time: {} μs\n", duration.count()); } SECTION("+2-D / 1-D") { // allows to test both computation with a 1-D Tensor and broadcasting // input_0 std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>(); op->associateInput(0,T0); const std::size_t dim0 = distNbMatrix(gen); const std::size_t dim1 = distNbMatrix(gen) + 1; const std::size_t dim2 = distNbMatrix(gen); const std::size_t dim3 = distNbMatrix(gen); T0->resize({dim0,dim1,dim2,dim3}); T0->setDataType(DataType::Float32); T0->setBackend("cpu"); // input_1 std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(); op -> associateInput(1,T1); T1->resize({dim3}); T1->setDataType(DataType::Float32); T1->setBackend("cpu"); op->setDataType(DataType::Float32); op->setBackend("cpu"); op->forwardDims(); myMatMul->forward(); } } } // namespace Aidge