From f4a5d905f8793d03517269cb770603576d9b9e5e Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Thu, 6 Feb 2025 10:56:31 +0100 Subject: [PATCH 1/8] add alpha and beta attr to FC impl --- include/aidge/backend/cpu/operator/FCImpl.hpp | 4 ++ .../backend/cpu/operator/FCImpl_kernels.hpp | 47 +++++++++----- src/operator/FCImpl.cpp | 4 ++ unit_tests/operator/Test_FCImpl.cpp | 63 +++++++++++++++++++ 4 files changed, 102 insertions(+), 16 deletions(-) diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp index e82352d9..4daa522f 100644 --- a/include/aidge/backend/cpu/operator/FCImpl.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl.hpp @@ -27,6 +27,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, void(const DimSize_t, const DimSize_t, const DimSize_t, + const float, + const float, const void *, const void *, const void *, @@ -34,6 +36,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, void(const DimSize_t, const DimSize_t, const DimSize_t, + const float, + const float, const void *, const void *, const void *, diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index c57f86e6..a1624bb5 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -86,6 +86,8 @@ template <class I, class W, class B, class O> void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, + const float alpha_, + const float beta_, const void* input_, const void* weights_, const void* biases_, @@ -96,21 +98,29 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const B* biases = static_cast<const B*>(biases_); O* output = static_cast<O*>(output_); + const O alpha = static_cast<O>(alpha_); + const O beta = static_cast<O>(beta_); + if (biases == nullptr) { - std::fill(output, 
output+(batchSize*outputFeatureSize), B(0)); - } - else { + std::fill(output, output + (batchSize * outputFeatureSize), O(0)); + } else { // Initialize output with bias * beta for (std::size_t batch = 0; batch < batchSize; ++batch) { - std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize)); + std::transform( + biases, biases + outputFeatureSize, output + batch * outputFeatureSize, + [beta](const B& bias) { return beta * static_cast<O>(bias); } + ); } } + // Perform matrix-vector multiplication with alpha scaling for (std::size_t batch = 0; batch < batchSize; ++batch) { for (std::size_t out = 0; out < outputFeatureSize; ++out) { - output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize, - input + (batch + 1)*inputFeatureSize, - weights + out*inputFeatureSize, - output[out + batch*outputFeatureSize]); + output[out + batch * outputFeatureSize] += alpha * std::inner_product( + input + batch * inputFeatureSize, + input + (batch + 1) * inputFeatureSize, + weights + out * inputFeatureSize, + O(0) // Initialize accumulator to zero + ); } } } @@ -119,6 +129,8 @@ template <class I, class O, class W, class B> void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, + const float alpha_, + const float beta_, const void* input_, const void* originalInput_, const void* weight_, @@ -134,17 +146,20 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, W* weightGrad = static_cast<W*>(weightGrad_); B* biasesGrad = static_cast<B*>(biasesGrad_); + // Coefficients + const O alpha = static_cast<O>(alpha_); + const O beta = static_cast<O>(beta_); // bias grad if (biasesGrad == nullptr) { // no bias std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0)); } else { - for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs + for (std::size_t o = 0; o < outputFeatureSize; ++o) { B sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { - sum += 
input[b*outputFeatureSize + o]; + sum += input[b * outputFeatureSize + o]; } - biasesGrad[o] = sum; + biasesGrad[o] = beta * biasesGrad[o] + alpha * sum; } } @@ -153,20 +168,20 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t c = 0; c < inputFeatureSize; ++c) { W sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { - sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o]; + sum += originalInput[b * inputFeatureSize + c] * input[b * outputFeatureSize + o]; } - weightGrad[o*inputFeatureSize + c] = sum; + weightGrad[o * inputFeatureSize + c] = beta * weightGrad[o * inputFeatureSize + c] + alpha * sum; } } - // input grad + // Input gradient (output) for (std::size_t b = 0; b < batchSize; ++b) { for (std::size_t c = 0; c < inputFeatureSize; ++c) { O sum{0}; for (std::size_t o = 0; o < outputFeatureSize; ++o) { - sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o]; + sum += weight[o * inputFeatureSize + c] * input[b * outputFeatureSize + o]; } - output[b*inputFeatureSize + c] = sum; + output[b * inputFeatureSize + c] = alpha * sum; // Apply alpha (no accumulation needed) } } } diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 35945271..144cb1cd 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -46,6 +46,8 @@ void Aidge::FCImpl_cpu::forward() impl.forward(batchSize, input1.dims()[1], // nb input features input1.dims()[0], // nb output features + op_.alpha(), + op_.beta(), input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), (op_.getInput(2)) ? 
input2.getImpl()->rawPtr() : nullptr, @@ -77,6 +79,8 @@ void Aidge::FCImpl_cpu::backward() impl.backward(batchSize, input1grad.dims()[1], // nb input features input1grad.dims()[0], // nb output features + op_.alpha(), + op_.beta(), getCPUPtr(fc_grad), getCPUPtr(op_.getInput(0)), getCPUPtr(mOp.getRawInput(1)), diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp index 8ac0afc3..41f95abe 100644 --- a/unit_tests/operator/Test_FCImpl.cpp +++ b/unit_tests/operator/Test_FCImpl.cpp @@ -10,6 +10,7 @@ ********************************************************************************/ #include <memory> +#include <iostream> #include <catch2/catch_test_macros.hpp> @@ -108,4 +109,66 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { } // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl; +} + + +TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { + + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {0.55043954, -0.080161572, 0.18495631}, + {-0.82497174, -0.95155114, 0.25449812}, + {1.6508394, 0.2518357, -0.49999624}, + {0.82770473, 0.28659272, -0.11644308} + }}); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{0.044322353, 1.9578923, -1.96035}, + {-1.1458585, -0.8235659, 0.24195994}}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742, 0.90154403}}); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + {{1.0376441, 0.38158852}, + {-0.86573052, 2.6920884}, + {3.0791781, -1.3184667}, + {2.3588469, -0.31109101}}}); + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); + auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op -> setDataType(DataType::Float32); + op -> setBackend("cpu"); + myFC->forward(); + 
op->getOutput(0)->print(); + REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + // Backward + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array2D<float, 4, 2>{ + { + {1.373911, -1.2312084}, + {0.24750818, -0.71446633}, + {-1.5132738, -0.23136522}, + {0.20452768, -1.2200259} + }}); + std::shared_ptr<Tensor> expectedInputGrad = + std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {1.4716856, 3.7039511, -2.9912496}, + {0.82964748, 1.0730045, -0.65807492}, + {0.19803995, -2.7722826, 2.9105654}, + {1.4070423, 1.4052149, -0.69614327} + }}); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{-1.7768159, -0.66813177, 1.0499192}, + {-1.4800593, 0.37063029, -0.15180479}}}); + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }}); + + op->getOutput(0)->setGrad(myOutputGrad); + myFC->backward(); + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); } \ No newline at end of file -- GitLab From a52c842f120e9b7c19de89ef9147d1ac0afea36d Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Mon, 10 Feb 2025 15:27:36 +0100 Subject: [PATCH 2/8] support 2D Bias for FC operator --- include/aidge/backend/cpu/operator/FCImpl.hpp | 2 + .../backend/cpu/operator/FCImpl_kernels.hpp | 101 +++++------------- src/operator/FCImpl.cpp | 6 +- unit_tests/operator/Test_FCImpl.cpp | 67 +++++++++++- 4 files changed, 99 insertions(+), 77 deletions(-) diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp index 4daa522f..b6e0d099 100644 --- a/include/aidge/backend/cpu/operator/FCImpl.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl.hpp @@ -27,6 +27,7 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, void(const 
DimSize_t, const DimSize_t, const DimSize_t, + const bool, const float, const float, const void *, @@ -36,6 +37,7 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, void(const DimSize_t, const DimSize_t, const DimSize_t, + const bool, const float, const float, const void *, diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index a1624bb5..7635fca3 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -18,74 +18,11 @@ #include "aidge/utils/Registrar.hpp" namespace Aidge { -// template <class I, class W, class B, class O> -// void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 4>& dims, -// const void* input_, const void* weights_, const void* biases_, void* output_) { -// // FIXME: missing FC attributes as arguments -// const I* input = static_cast<const I*>(input_); -// const W* weights = static_cast<const W*>(weights_); -// const B* biases = static_cast<const B*>(biases_); -// O* output = static_cast<O*>(output_); - -// for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) { -// std::size_t oIndex = outIdx * dims[3]; -// const B bias = std::get<0>(attrs) ? 
B(0) : biases[outIdx]; -// for (std::size_t batch = 0; batch < dims[3]; ++batch) { -// output[oIndex + batch] = bias; -// } -// } - -// for (std::size_t ix = 0; ix < dims[0]; ++ix) { -// for (std::size_t iy = 0; iy < dims[1]; ++iy) { -// for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) { -// const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix)); -// for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) { -// const std::size_t oIndex = dims[3] * outCh; -// const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * outputFeatureSize + -// outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3]; -// for (std::size_t batch = 0; batch < dims[3]; ++batch) { -// output[oIndex + batch] += weights[wIndex] * input[iIndex + batch]; -// } -// } -// } -// } -// } -// } - -// template <class I, class W, class B, class O> -// void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 2>& dims, -// const void* input_, const void* weights_, const void* biases_, void* output_) { -// // FIXME: missing FC attributes as arguments -// const I* input = static_cast<const I*>(input_); -// const W* weights = static_cast<const W*>(weights_); -// const B* biases = static_cast<const B*>(biases_); -// O* output = static_cast<O*>(output_); - -// // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N] - -// for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) { -// std::size_t oIndex = outIdx * dims[0]; -// const B bias = std::get<0>(attrs) ? 
B(0) : biases[outIdx]; -// for (std::size_t batch = 0; batch < dims[0]; ++batch) { -// output[oIndex + batch] = bias; -// } -// } - -// for (std::size_t batch = 0; batch < dims[0]; ++batch) { -// const std::size_t oIndex = dims[1] * batch; -// for (std::size_t i = 0; i < dims[1]; ++i) { -// for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) { -// std::size_t wIndex = i * outputFeatureSize + outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3]; -// output[oIndex + outCh] += weights[wIndex] * input[i + batch]; -// } -// } -// } -// } - template <class I, class W, class B, class O> void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, + const bool isBiasBatched, const float alpha_, const float beta_, const void* input_, @@ -104,12 +41,20 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, if (biases == nullptr) { std::fill(output, output + (batchSize * outputFeatureSize), O(0)); } else { // Initialize output with bias * beta - for (std::size_t batch = 0; batch < batchSize; ++batch) { + if (isBiasBatched) { std::transform( - biases, biases + outputFeatureSize, output + batch * outputFeatureSize, + biases, biases + batchSize * outputFeatureSize, output, [beta](const B& bias) { return beta * static_cast<O>(bias); } ); } + else { // Bias 1D + for (std::size_t batch = 0; batch < batchSize; ++batch) { + std::transform( + biases, biases + outputFeatureSize, output + batch * outputFeatureSize, + [beta](const B& bias) { return beta * static_cast<O>(bias); } + ); + } + } } // Perform matrix-vector multiplication with alpha scaling @@ -129,6 +74,7 @@ template <class I, class O, class W, class B> void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, + const bool isBiasBatched, const float alpha_, const float beta_, const void* input_, @@ -150,16 +96,23 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, 
const O alpha = static_cast<O>(alpha_); const O beta = static_cast<O>(beta_); + const DimSize_t biasSize = isBiasBatched ? batchSize * outputFeatureSize : outputFeatureSize; // bias grad if (biasesGrad == nullptr) { // no bias - std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0)); + std::fill(biasesGrad, biasesGrad + biasSize, B(0)); } else { - for (std::size_t o = 0; o < outputFeatureSize; ++o) { - B sum{0}; - for (std::size_t b = 0; b < batchSize; ++b) { - sum += input[b * outputFeatureSize + o]; + if (isBiasBatched) { + for (std::size_t o = 0; o < biasSize; ++o) { + biasesGrad[o] = beta * input[o]; + } + } else { // BiasGrad 1D + for (std::size_t o = 0; o < outputFeatureSize; ++o) { + B sum{0}; + for (std::size_t b = 0; b < batchSize; ++b) { + sum += input[b * outputFeatureSize + o]; + } + biasesGrad[o] = beta * sum; } - biasesGrad[o] = beta * biasesGrad[o] + alpha * sum; } } @@ -170,7 +123,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t b = 0; b < batchSize; ++b) { sum += originalInput[b * inputFeatureSize + c] * input[b * outputFeatureSize + o]; } - weightGrad[o * inputFeatureSize + c] = beta * weightGrad[o * inputFeatureSize + c] + alpha * sum; + weightGrad[o * inputFeatureSize + c] = alpha * sum; } } @@ -181,7 +134,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t o = 0; o < outputFeatureSize; ++o) { sum += weight[o * inputFeatureSize + c] * input[b * outputFeatureSize + o]; } - output[b * inputFeatureSize + c] = alpha * sum; // Apply alpha (no accumulation needed) + output[b * inputFeatureSize + c] = alpha * sum; } } } diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 144cb1cd..4c1b6861 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -40,12 +40,13 @@ void Aidge::FCImpl_cpu::forward() const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0))); const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, 
*(op_.getOutput(0))); const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor(); - + const bool isBiasBatched = input2.nbDims() == 2; // Call kernel const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1; impl.forward(batchSize, input1.dims()[1], // nb input features input1.dims()[0], // nb output features + isBiasBatched, op_.alpha(), op_.beta(), input0.getImpl()->rawPtr(), @@ -73,12 +74,13 @@ void Aidge::FCImpl_cpu::backward() const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0))); const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0))); const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor(); - + const bool isBiasBatched = input2grad.nbDims() == 2; // Call kernel const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1; impl.backward(batchSize, input1grad.dims()[1], // nb input features input1grad.dims()[0], // nb output features + isBiasBatched, op_.alpha(), op_.beta(), getCPUPtr(fc_grad), diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp index 41f95abe..324a8a18 100644 --- a/unit_tests/operator/Test_FCImpl.cpp +++ b/unit_tests/operator/Test_FCImpl.cpp @@ -113,7 +113,7 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { - + SECTION("2D Input 1D Bias"){ std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 4, 3>{ { @@ -171,4 +171,69 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + } + SECTION("2D Input 2D Bias"){ + std::shared_ptr<Tensor> 
myInput = + std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {1.2137502, -0.73264742, -0.64570695}, + {0.85416597, -1.361271, -0.48093504}, + {-1.2204953, -1.400124, 0.14018863}, + {-1.1383502, -0.47965094, -0.64090657} + }}); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{0.12352414, -0.036496852, -0.15418212}, + {-1.0330718, 0.011371522, -0.60658616}}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + {{-0.7607078, 0.54936022}, + {-0.31278628, 0.68560582}, + {0.78312093, 0.96373892}, + {-1.2183768, -0.4587383}}}); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + {{0.17209265, -1.4664109}, + {0.30229449, -0.86952901}, + {0.14901026, 2.8016717}, + {-0.65777171, 2.8892474}}}); + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f); + auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op -> setDataType(DataType::Float32); + op -> setBackend("cpu"); + myFC->forward(); + op->getOutput(0)->print(); + REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + // Backward + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + { + {0.19126743, -0.85291833}, + {0.94577849, -1.0063207}, + {-0.60322332, 0.65167785}, + {-0.038923461, 0.94386256} + }}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {1.8095039, -0.033359278, 0.97575688}, + {2.312856, -0.091922671, 0.92919618}, + {-1.4954853, 0.058852643, -0.60458499}, + {-1.9597715, 0.024307476, -1.1330653} + }}); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{3.6410849, -1.1286706, -1.275959}, + {-7.5292215, 1.2592185, 1.0422806}}}); + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 
2>{{{0.095633715, -0.42645916}, + {0.47288924, -0.50316036}, + {-0.30161166, 0.32583892}, + {-0.01946173, 0.47193128 }}}); + + op->getOutput(0)->setGrad(myOutputGrad); + myFC->backward(); + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + } } \ No newline at end of file -- GitLab From a59730f1c82228864ff013c0f24b29621ccba54d Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Tue, 11 Feb 2025 18:27:37 +0100 Subject: [PATCH 3/8] add transA and transB for FC --- include/aidge/backend/cpu/operator/FCImpl.hpp | 4 + .../backend/cpu/operator/FCImpl_kernels.hpp | 89 +++-- src/operator/FCImpl.cpp | 18 +- unit_tests/operator/Test_FCImpl.cpp | 362 ++++++++++++------ 4 files changed, 304 insertions(+), 169 deletions(-) diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp index b6e0d099..b01c220f 100644 --- a/include/aidge/backend/cpu/operator/FCImpl.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl.hpp @@ -28,6 +28,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, const DimSize_t, const DimSize_t, const bool, + const bool, + const bool, const float, const float, const void *, @@ -38,6 +40,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, const DimSize_t, const DimSize_t, const bool, + const bool, + const bool, const float, const float, const void *, diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index 7635fca3..e928e9ea 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -23,13 +23,14 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, const bool isBiasBatched, + const bool transA, + const bool 
transB, const float alpha_, const float beta_, const void* input_, const void* weights_, const void* biases_, void* output_) { - // FIXME: missing FC attributes as arguments const I* input = static_cast<const I*>(input_); const W* weights = static_cast<const W*>(weights_); const B* biases = static_cast<const B*>(biases_); @@ -40,14 +41,13 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, if (biases == nullptr) { std::fill(output, output + (batchSize * outputFeatureSize), O(0)); - } else { // Initialize output with bias * beta - if (isBiasBatched) { + } else { + if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize) std::transform( biases, biases + batchSize * outputFeatureSize, output, [beta](const B& bias) { return beta * static_cast<O>(bias); } ); - } - else { // Bias 1D + } else { // Bias is 1D (outputFeatureSize) for (std::size_t batch = 0; batch < batchSize; ++batch) { std::transform( biases, biases + outputFeatureSize, output + batch * outputFeatureSize, @@ -57,17 +57,18 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, } } - // Perform matrix-vector multiplication with alpha scaling for (std::size_t batch = 0; batch < batchSize; ++batch) { for (std::size_t out = 0; out < outputFeatureSize; ++out) { - output[out + batch * outputFeatureSize] += alpha * std::inner_product( - input + batch * inputFeatureSize, - input + (batch + 1) * inputFeatureSize, - weights + out * inputFeatureSize, - O(0) // Initialize accumulator to zero - ); + O sum = O(0); + for (std::size_t i = 0; i < inputFeatureSize; ++i) { + std::size_t inputIdx = transA ? (i * batchSize + batch) : (batch * inputFeatureSize + i); + std::size_t weightIdx = transB ? 
(i * outputFeatureSize + out) : (out * inputFeatureSize + i); + sum += static_cast<O>(input[inputIdx]) * static_cast<O>(weights[weightIdx]); + } + output[batch * outputFeatureSize + out] += alpha * sum; } } + } template <class I, class O, class W, class B> @@ -75,6 +76,8 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, const bool isBiasBatched, + const bool transA, + const bool transB, const float alpha_, const float beta_, const void* input_, @@ -85,58 +88,64 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, void* biasesGrad_) { // FIXME: missing FC attributes as arguments - const I* input = static_cast<const I*>(input_); - const I* originalInput = static_cast<const I*>(originalInput_); - const W* weight = static_cast<const W*>(weight_); - O* output = static_cast<O*>(output_); - W* weightGrad = static_cast<W*>(weightGrad_); - B* biasesGrad = static_cast<B*>(biasesGrad_); - - // Coefficients + const I* outputGrad = static_cast<const I*>(input_); // dY + const I* originalInput = static_cast<const I*>(originalInput_); // X (Input in forward pass) + const W* weight = static_cast<const W*>(weight_); // W + O* inputGrad = static_cast<O*>(output_); // dX + W* weightGrad = static_cast<W*>(weightGrad_); // dW + B* biasesGrad = static_cast<B*>(biasesGrad_); // dB + const O alpha = static_cast<O>(alpha_); - const O beta = static_cast<O>(beta_); - - const DimSize_t biasSize = isBiasBatched ? 
batchSize * outputFeatureSize : outputFeatureSize; - // bias grad - if (biasesGrad == nullptr) { // no bias - std::fill(biasesGrad, biasesGrad + biasSize, B(0)); - } else { - if (isBiasBatched) { - for (std::size_t o = 0; o < biasSize; ++o) { - biasesGrad[o] = beta * input[o]; + const O beta = static_cast<O>(beta_); + + // Compute bias gradient: dB = beta * dB + alpha * dY + if (biasesGrad != nullptr) { + if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize) + for (std::size_t b = 0; b < batchSize; ++b) { + for (std::size_t o = 0; o < outputFeatureSize; ++o) { + biasesGrad[b * outputFeatureSize + o] = beta * outputGrad[b * outputFeatureSize + o]; + } } - } else { // BiasGrad 1D + } else { // Bias is 1D (outputFeatureSize) for (std::size_t o = 0; o < outputFeatureSize; ++o) { - B sum{0}; + O sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { - sum += input[b * outputFeatureSize + o]; + sum += outputGrad[b * outputFeatureSize + o]; } biasesGrad[o] = beta * sum; } } } - // weight grad + // Compute weight gradient: dW = dY^T * X for (std::size_t o = 0; o < outputFeatureSize; ++o) { for (std::size_t c = 0; c < inputFeatureSize; ++c) { - W sum{0}; + O sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { - sum += originalInput[b * inputFeatureSize + c] * input[b * outputFeatureSize + o]; + std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c); + std::size_t outputIdx = b * outputFeatureSize + o; + sum += originalInput[inputIdx] * outputGrad[outputIdx]; } - weightGrad[o * inputFeatureSize + c] = alpha * sum; + std::size_t weightIdx = transB ? 
(c * outputFeatureSize + o) : (o * inputFeatureSize + c); + weightGrad[weightIdx] = alpha * sum; } } - // Input gradient (output) + + // Compute input gradient: dX = dY * W^T for (std::size_t b = 0; b < batchSize; ++b) { for (std::size_t c = 0; c < inputFeatureSize; ++c) { O sum{0}; for (std::size_t o = 0; o < outputFeatureSize; ++o) { - sum += weight[o * inputFeatureSize + c] * input[b * outputFeatureSize + o]; + std::size_t weightIdx = transB ? (c * outputFeatureSize + o) : (o * inputFeatureSize + c); + std::size_t outputIdx = b * outputFeatureSize + o; + sum += weight[weightIdx] * outputGrad[outputIdx]; } - output[b * inputFeatureSize + c] = alpha * sum; + std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c); + inputGrad[inputIdx] = alpha * sum; } } + } // Kernels registration to implementation entry point diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 4c1b6861..2c56c97d 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -42,11 +42,14 @@ void Aidge::FCImpl_cpu::forward() const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor(); const bool isBiasBatched = input2.nbDims() == 2; // Call kernel - const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1; + const DimSize_t nbInFeat = op_.transB()? input1.dims()[0]:input1.dims()[1]; + const auto batchSize = input0.size() /nbInFeat; impl.forward(batchSize, - input1.dims()[1], // nb input features - input1.dims()[0], // nb output features + op_.transB()? input1.dims()[0]:input1.dims()[1], // nb input features + op_.transB()?input1.dims()[1]: input1.dims()[0], // nb output features isBiasBatched, + op_.transA(), + op_.transB(), op_.alpha(), op_.beta(), input0.getImpl()->rawPtr(), @@ -76,11 +79,14 @@ void Aidge::FCImpl_cpu::backward() const auto& input2grad = (op_.getInput(2)) ? 
op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor(); const bool isBiasBatched = input2grad.nbDims() == 2; // Call kernel - const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1; + const DimSize_t nbInFeat = op_.transB()? input1grad.dims()[0]:input1grad.dims()[1]; + const auto batchSize = input0grad.size() /nbInFeat; impl.backward(batchSize, - input1grad.dims()[1], // nb input features - input1grad.dims()[0], // nb output features + nbInFeat, // nb input features + op_.transB()?input1grad.dims()[1]: input1grad.dims()[0], // nb output features isBiasBatched, + op_.transA(), + op_.transB(), op_.alpha(), op_.beta(), getCPUPtr(fc_grad), diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp index 324a8a18..0c6cfa00 100644 --- a/unit_tests/operator/Test_FCImpl.cpp +++ b/unit_tests/operator/Test_FCImpl.cpp @@ -10,7 +10,6 @@ ********************************************************************************/ #include <memory> -#include <iostream> #include <catch2/catch_test_macros.hpp> @@ -108,132 +107,249 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { REQUIRE(*(op->getOutput(0)) == myOutput); } + SECTION("transA and transB") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 3, 4>{{ + {1.2137502, 0.85416597, -1.2204953, -1.1383502}, + {-0.73264742, -1.361271, -1.400124, -0.47965094}, + {-0.64570695, -0.48093504, 0.14018863, -0.64090657} + }}); // Transposed Input (4x3 → 3x4) + + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 3, 2>{{ + {0.12352414, -1.0330718}, + {-0.036496852, 0.011371522}, + {-0.15418212, -0.60658616} + }}); // Transposed Weights (2x3 → 3x2) + + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {-0.7607078, 0.54936022}, + {-0.31278628, 0.68560582}, + {0.78312093, 0.96373892}, + {-1.2183768, -0.4587383} + }}); // Bias remains the same (4x2) + + 
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {0.17209265, -1.4664109}, + {0.30229449, -0.86952901}, + {0.14901026, 2.8016717}, + {-0.65777171, 2.8892474} + }}); // Expected Output + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); // transA = true, transB = true + + auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + myFC->forward(); + op->getOutput(0)->print(); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + } // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl; } TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { - SECTION("2D Input 1D Bias"){ - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array2D<float, 4, 3>{ - { - {0.55043954, -0.080161572, 0.18495631}, - {-0.82497174, -0.95155114, 0.25449812}, - {1.6508394, 0.2518357, -0.49999624}, - {0.82770473, 0.28659272, -0.11644308} - }}); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{0.044322353, 1.9578923, -1.96035}, - {-1.1458585, -0.8235659, 0.24195994}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742, 0.90154403}}); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - {{1.0376441, 0.38158852}, - {-0.86573052, 2.6920884}, - {3.0791781, -1.3184667}, - {2.3588469, -0.31109101}}}); - - std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); - op -> associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - op -> setDataType(DataType::Float32); - op -> setBackend("cpu"); - myFC->forward(); - op->getOutput(0)->print(); - 
REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); - - // Backward - std::shared_ptr<Tensor> myOutputGrad = - std::make_shared<Tensor>(Array2D<float, 4, 2>{ - { - {1.373911, -1.2312084}, - {0.24750818, -0.71446633}, - {-1.5132738, -0.23136522}, - {0.20452768, -1.2200259} - }}); - std::shared_ptr<Tensor> expectedInputGrad = - std::make_shared<Tensor>(Array2D<float, 4, 3>{ - { - {1.4716856, 3.7039511, -2.9912496}, - {0.82964748, 1.0730045, -0.65807492}, - {0.19803995, -2.7722826, 2.9105654}, - {1.4070423, 1.4052149, -0.69614327} - }}); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{-1.7768159, -0.66813177, 1.0499192}, - {-1.4800593, 0.37063029, -0.15180479}}}); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }}); - - op->getOutput(0)->setGrad(myOutputGrad); - myFC->backward(); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); - REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); - } - SECTION("2D Input 2D Bias"){ - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array2D<float, 4, 3>{ - { - {1.2137502, -0.73264742, -0.64570695}, - {0.85416597, -1.361271, -0.48093504}, - {-1.2204953, -1.400124, 0.14018863}, - {-1.1383502, -0.47965094, -0.64090657} + SECTION("2D Input 1D Bias"){ + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {0.55043954, -0.080161572, 0.18495631}, + {-0.82497174, -0.95155114, 0.25449812}, + {1.6508394, 0.2518357, -0.49999624}, + {0.82770473, 0.28659272, -0.11644308} + }}); + std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{0.044322353, 1.9578923, -1.96035}, + { -1.1458585, -0.8235659, 0.24195994}}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742, 0.90154403}}); + + 
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + {{1.0376441, 0.38158852}, + {-0.86573052, 2.6920884}, + {3.0791781, -1.3184667}, + {2.3588469, -0.31109101}}}); + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); + auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op -> setDataType(DataType::Float32); + op -> setBackend("cpu"); + myFC->forward(); + op->getOutput(0)->print(); + REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + // Backward + std::shared_ptr<Tensor> myOutputGrad = + std::make_shared<Tensor>(Array2D<float, 4, 2>{ + { + {1.373911, -1.2312084}, + {0.24750818, -0.71446633}, + {-1.5132738, -0.23136522}, + {0.20452768, -1.2200259} + }}); + std::shared_ptr<Tensor> expectedInputGrad = + std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {1.4716856, 3.7039511, -2.9912496}, + {0.82964748, 1.0730045, -0.65807492}, + {0.19803995, -2.7722826, 2.9105654}, + {1.4070423, 1.4052149, -0.69614327} + }}); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{-1.7768159, -0.66813177, 1.0499192}, + {-1.4800593, 0.37063029, -0.15180479}}}); + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }}); + + op->getOutput(0)->setGrad(myOutputGrad); + myFC->backward(); + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + } + SECTION("2D Input 2D Bias"){ + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {1.2137502, -0.73264742, -0.64570695}, + {0.85416597, -1.361271, -0.48093504}, + {-1.2204953, -1.400124, 0.14018863}, + {-1.1383502, -0.47965094, -0.64090657} + }}); + 
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{0.12352414, -0.036496852, -0.15418212}, + {-1.0330718, 0.011371522, -0.60658616}}}); + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + {{-0.7607078, 0.54936022}, + {-0.31278628, 0.68560582}, + {0.78312093, 0.96373892}, + {-1.2183768, -0.4587383}}}); + + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + {{0.17209265, -1.4664109}, + {0.30229449, -0.86952901}, + {0.14901026, 2.8016717}, + {-0.65777171, 2.8892474}}}); + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f); + auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); + op -> associateInput(0, myInput); + op -> associateInput(1, myWeights); + op -> associateInput(2, myBias); + op -> setDataType(DataType::Float32); + op -> setBackend("cpu"); + myFC->forward(); + op->getOutput(0)->print(); + REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + // Backward + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + { + {0.19126743, -0.85291833}, + {0.94577849, -1.0063207}, + {-0.60322332, 0.65167785}, + {-0.038923461, 0.94386256} + }}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{ + { + {1.8095039, -0.033359278, 0.97575688}, + {2.312856, -0.091922671, 0.92919618}, + {-1.4954853, 0.058852643, -0.60458499}, + {-1.9597715, 0.024307476, -1.1330653} + }}); + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + {{3.6410849, -1.1286706, -1.275959}, + {-7.5292215, 1.2592185, 1.0422806}}}); + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916}, + {0.47288924, -0.50316036}, + {-0.30161166, 0.32583892}, + {-0.01946173, 0.47193128 }}}); + + op->getOutput(0)->setGrad(myOutputGrad); + myFC->backward(); + 
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + } + SECTION("transA and transB") { + std::shared_ptr<Tensor> myInput = + std::make_shared<Tensor>(Array2D<float, 3, 4>{{ + {0.51234567, -1.23456789, 0.67891234, -0.43219876}, + {1.87654321, -0.98765432, 1.34567890, -1.23456789}, + {-0.67891234, 0.43219876, -1.87654321, 0.98765432} }}); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{0.12352414, -0.036496852, -0.15418212}, - {-1.0330718, 0.011371522, -0.60658616}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - {{-0.7607078, 0.54936022}, - {-0.31278628, 0.68560582}, - {0.78312093, 0.96373892}, - {-1.2183768, -0.4587383}}}); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - {{0.17209265, -1.4664109}, - {0.30229449, -0.86952901}, - {0.14901026, 2.8016717}, - {-0.65777171, 2.8892474}}}); - - std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); - op -> associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - op -> setDataType(DataType::Float32); - op -> setBackend("cpu"); - myFC->forward(); - op->getOutput(0)->print(); - REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); - - // Backward - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - { - {0.19126743, -0.85291833}, - {0.94577849, -1.0063207}, - {-0.60322332, 0.65167785}, - {-0.038923461, 0.94386256} - }}); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{ - { - {1.8095039, -0.033359278, 0.97575688}, - {2.312856, -0.091922671, 0.92919618}, - {-1.4954853, 0.058852643, -0.60458499}, - {-1.9597715, 0.024307476, 
-1.1330653} - }}); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{3.6410849, -1.1286706, -1.275959}, - {-7.5292215, 1.2592185, 1.0422806}}}); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916}, - {0.47288924, -0.50316036}, - {-0.30161166, 0.32583892}, - {-0.01946173, 0.47193128 }}}); - - op->getOutput(0)->setGrad(myOutputGrad); - myFC->backward(); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); - REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); - } + + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array2D<float, 3, 2>{{ + {0.12345678, -1.34567890}, + {-0.87654321, 0.56789012}, + {0.23456789, -0.45678901} + }}); + + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {0.65432109, -0.54321098}, + {1.23456789, -1.09876543}, + {-0.32109876, 0.98765432}, + {-0.87654321, 0.76543210} + }}); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {-3.1545789, 1.101069}, + {2.2466557, 1.2566758}, + {-3.2323616, 1.9093952}, + {2.0826612, -0.75857949} + }}); + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); + auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + + // Forward pass + myFC->forward(); + op->getOutput(0)->print(); + REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + // Backward Pass + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {0.19876543, -0.65432109}, + {0.76543210, -1.23456789}, + {-0.43210987, 1.09876543}, + {-0.98765432, 0.87654321} + }}); + + 
std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{ + {1.8100901, 3.5116596, -3.0638645, -2.6029568}, + {-1.0916179, -2.7440662, 2.0054817, 2.7270038}, + {0.69102132, 1.4869658, -1.206526, -1.2641346} + }}); + + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{ + {-1.4192863, 3.1120875}, + {0.50970948, 0.77579576}, + {0.062572584, -2.571022} + }}); + + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {0.099382713, -0.32716054}, + {0.38271606, -0.61728394}, + {-0.21605493, 0.54938269}, + {-0.49382716, 0.43827161} + }}); + op->getOutput(0)->setGrad(myOutputGrad); + myFC->backward(); + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + } } \ No newline at end of file -- GitLab From ef53239c7d5cc0a6e2fcc9a30437d478057bb0ee Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Tue, 18 Feb 2025 15:51:13 +0100 Subject: [PATCH 4/8] support only bias of size outChannels for FC --- include/aidge/backend/cpu/operator/FCImpl.hpp | 2 - .../backend/cpu/operator/FCImpl_kernels.hpp | 31 +-- src/operator/FCImpl.cpp | 4 - unit_tests/operator/Test_FCImpl.cpp | 207 ++++++------------ 4 files changed, 74 insertions(+), 170 deletions(-) diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp index b01c220f..9249ba77 100644 --- a/include/aidge/backend/cpu/operator/FCImpl.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl.hpp @@ -29,7 +29,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, const DimSize_t, const bool, const bool, - const bool, const float, const float, const void *, @@ -41,7 +40,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, const DimSize_t, const bool, const bool, - const bool, const float, const 
float, const void *, diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index e928e9ea..aa4ffa2e 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -22,7 +22,6 @@ template <class I, class W, class B, class O> void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, - const bool isBiasBatched, const bool transA, const bool transB, const float alpha_, @@ -41,19 +40,12 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, if (biases == nullptr) { std::fill(output, output + (batchSize * outputFeatureSize), O(0)); - } else { - if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize) + } else { + for (std::size_t batch = 0; batch < batchSize; ++batch) { std::transform( - biases, biases + batchSize * outputFeatureSize, output, + biases, biases + outputFeatureSize, output + batch * outputFeatureSize, [beta](const B& bias) { return beta * static_cast<O>(bias); } ); - } else { // Bias is 1D (outputFeatureSize) - for (std::size_t batch = 0; batch < batchSize; ++batch) { - std::transform( - biases, biases + outputFeatureSize, output + batch * outputFeatureSize, - [beta](const B& bias) { return beta * static_cast<O>(bias); } - ); - } } } @@ -75,7 +67,6 @@ template <class I, class O, class W, class B> void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, - const bool isBiasBatched, const bool transA, const bool transB, const float alpha_, @@ -100,20 +91,12 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, // Compute bias gradient: dB = beta * dB + alpha * dY if (biasesGrad != nullptr) { - if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize) + for (std::size_t o = 0; o < outputFeatureSize; ++o) { + O sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { - 
for (std::size_t o = 0; o < outputFeatureSize; ++o) { - biasesGrad[b * outputFeatureSize + o] = beta * outputGrad[b * outputFeatureSize + o]; - } - } - } else { // Bias is 1D (outputFeatureSize) - for (std::size_t o = 0; o < outputFeatureSize; ++o) { - O sum{0}; - for (std::size_t b = 0; b < batchSize; ++b) { - sum += outputGrad[b * outputFeatureSize + o]; - } - biasesGrad[o] = beta * sum; + sum += outputGrad[b * outputFeatureSize + o]; } + biasesGrad[o] = beta * sum; } } diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 2c56c97d..2a06803c 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -40,14 +40,12 @@ void Aidge::FCImpl_cpu::forward() const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0))); const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0))); const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor(); - const bool isBiasBatched = input2.nbDims() == 2; // Call kernel const DimSize_t nbInFeat = op_.transB()? input1.dims()[0]:input1.dims()[1]; const auto batchSize = input0.size() /nbInFeat; impl.forward(batchSize, op_.transB()? input1.dims()[0]:input1.dims()[1], // nb input features op_.transB()?input1.dims()[1]: input1.dims()[0], // nb output features - isBiasBatched, op_.transA(), op_.transB(), op_.alpha(), @@ -77,14 +75,12 @@ void Aidge::FCImpl_cpu::backward() const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0))); const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0))); const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor(); - const bool isBiasBatched = input2grad.nbDims() == 2; // Call kernel const DimSize_t nbInFeat = op_.transB()? 
input1grad.dims()[0]:input1grad.dims()[1]; const auto batchSize = input0grad.size() /nbInFeat; impl.backward(batchSize, nbInFeat, // nb input features op_.transB()?input1grad.dims()[1]: input1grad.dims()[0], // nb output features - isBiasBatched, op_.transA(), op_.transB(), op_.alpha(), diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp index 0c6cfa00..ea464f41 100644 --- a/unit_tests/operator/Test_FCImpl.cpp +++ b/unit_tests/operator/Test_FCImpl.cpp @@ -120,19 +120,16 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { {-0.15418212, -0.60658616} }}); // Transposed Weights (2x3 → 3x2) - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {-0.7607078, 0.54936022}, - {-0.31278628, 0.68560582}, - {0.78312093, 0.96373892}, - {-1.2183768, -0.4587383} - }}); // Bias remains the same (4x2) + std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{ + -0.31278628, 0.68560582 + }}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {0.17209265, -1.4664109}, - {0.30229449, -0.86952901}, - {0.14901026, 2.8016717}, - {-0.65777171, 2.8892474} - }}); // Expected Output + std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {0.39605334, -1.3982881}, + {0.30229449, -0.86952901}, + {-0.39894336, 2.6626053}, + {-0.20497644, 3.4614196} + }}); std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); // transA = true, transB = true @@ -212,144 +209,74 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); } - SECTION("2D Input 2D Bias"){ + SECTION("transA and transB") { std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array2D<float, 4, 3>{ - { - {1.2137502, -0.73264742, -0.64570695}, - {0.85416597, -1.361271, -0.48093504}, - {-1.2204953, 
-1.400124, 0.14018863}, - {-1.1383502, -0.47965094, -0.64090657} - }}); - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{0.12352414, -0.036496852, -0.15418212}, - {-1.0330718, 0.011371522, -0.60658616}}}); - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - {{-0.7607078, 0.54936022}, - {-0.31278628, 0.68560582}, - {0.78312093, 0.96373892}, - {-1.2183768, -0.4587383}}}); - - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - {{0.17209265, -1.4664109}, - {0.30229449, -0.86952901}, - {0.14901026, 2.8016717}, - {-0.65777171, 2.8892474}}}); - - std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f); - auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); - op -> associateInput(0, myInput); - op -> associateInput(1, myWeights); - op -> associateInput(2, myBias); - op -> setDataType(DataType::Float32); - op -> setBackend("cpu"); + std::make_shared<Tensor>(Array2D<float, 3, 4>{{ + {0.51234567, -1.23456789, 0.67891234, -0.43219876}, + {1.87654321, -0.98765432, 1.34567890, -1.23456789}, + {-0.67891234, 0.43219876, -1.87654321, 0.98765432} + }}); + + std::shared_ptr<Tensor> myWeights = + std::make_shared<Tensor>(Array2D<float, 3, 2>{{ + {0.12345678, -1.34567890}, + {-0.87654321, 0.56789012}, + {0.23456789, -0.45678901} + }}); + + std::shared_ptr<Tensor> myBias = + std::make_shared<Tensor>(Array1D<float, 2>{ + {0.65432109, -0.54321098} + }); + + std::shared_ptr<Tensor> myOutput = + std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {-3.1545789, 1.101069}, + {1.9565322, 1.534453}, + {-2.7446516, 1.1439626}, + {2.8480933, -1.412901} + }}); + + std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); + auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); + op->associateInput(0, myInput); + op->associateInput(1, myWeights); + op->associateInput(2, myBias); + op->setDataType(DataType::Float32); + 
op->setBackend("cpu"); + + // Forward pass myFC->forward(); op->getOutput(0)->print(); REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); - // Backward - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{ - { - {0.19126743, -0.85291833}, - {0.94577849, -1.0063207}, - {-0.60322332, 0.65167785}, - {-0.038923461, 0.94386256} - }}); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{ - { - {1.8095039, -0.033359278, 0.97575688}, - {2.312856, -0.091922671, 0.92919618}, - {-1.4954853, 0.058852643, -0.60458499}, - {-1.9597715, 0.024307476, -1.1330653} - }}); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ - {{3.6410849, -1.1286706, -1.275959}, - {-7.5292215, 1.2592185, 1.0422806}}}); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916}, - {0.47288924, -0.50316036}, - {-0.30161166, 0.32583892}, - {-0.01946173, 0.47193128 }}}); + // Backward Pass + std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + {0.19876543, -0.65432109}, + {0.76543210, -1.23456789}, + {-0.43210987, 1.09876543}, + {-0.98765432, 0.87654321} + }}); + std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{ + {1.8100901, 3.5116596, -3.0638645, -2.6029568}, + {-1.0916179, -2.7440662, 2.0054817, 2.7270038}, + {0.69102132, 1.4869658, -1.206526, -1.2641346} + }}); + + std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{ + {-1.4192863, 3.1120875}, + {0.50970948, 0.77579576}, + {0.062572584, -2.571022} + }}); + + std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{ + {-0.22778332, 0.04320982} + }); op->getOutput(0)->setGrad(myOutputGrad); myFC->backward(); REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); 
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); } - SECTION("transA and transB") { - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array2D<float, 3, 4>{{ - {0.51234567, -1.23456789, 0.67891234, -0.43219876}, - {1.87654321, -0.98765432, 1.34567890, -1.23456789}, - {-0.67891234, 0.43219876, -1.87654321, 0.98765432} - }}); - - std::shared_ptr<Tensor> myWeights = - std::make_shared<Tensor>(Array2D<float, 3, 2>{{ - {0.12345678, -1.34567890}, - {-0.87654321, 0.56789012}, - {0.23456789, -0.45678901} - }}); - - std::shared_ptr<Tensor> myBias = - std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {0.65432109, -0.54321098}, - {1.23456789, -1.09876543}, - {-0.32109876, 0.98765432}, - {-0.87654321, 0.76543210} - }}); - - std::shared_ptr<Tensor> myOutput = - std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {-3.1545789, 1.101069}, - {2.2466557, 1.2566758}, - {-3.2323616, 1.9093952}, - {2.0826612, -0.75857949} - }}); - - std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); - auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); - op->associateInput(0, myInput); - op->associateInput(1, myWeights); - op->associateInput(2, myBias); - op->setDataType(DataType::Float32); - op->setBackend("cpu"); - - // Forward pass - myFC->forward(); - op->getOutput(0)->print(); - REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); - - // Backward Pass - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {0.19876543, -0.65432109}, - {0.76543210, -1.23456789}, - {-0.43210987, 1.09876543}, - {-0.98765432, 0.87654321} - }}); - - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{ - {1.8100901, 3.5116596, -3.0638645, -2.6029568}, - {-1.0916179, -2.7440662, 2.0054817, 2.7270038}, - {0.69102132, 1.4869658, -1.206526, -1.2641346} - }}); - - std::shared_ptr<Tensor> 
expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{ - {-1.4192863, 3.1120875}, - {0.50970948, 0.77579576}, - {0.062572584, -2.571022} - }}); - - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {0.099382713, -0.32716054}, - {0.38271606, -0.61728394}, - {-0.21605493, 0.54938269}, - {-0.49382716, 0.43827161} - }}); - op->getOutput(0)->setGrad(myOutputGrad); - myFC->backward(); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); - REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); - } } \ No newline at end of file -- GitLab From 74899817bf12f4d2ffb72e9e3d60543492b1101a Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Fri, 21 Feb 2025 16:27:13 +0100 Subject: [PATCH 5/8] update FC factory call --- unit_tests/operator/Test_FCImpl.cpp | 52 +++++++++++------------ unit_tests/operator/Test_MetaOperator.cpp | 4 +- unit_tests/scheduler/Test_CastMove.cpp | 4 +- unit_tests/scheduler/Test_Scheduler.cpp | 6 +-- 4 files changed, 32 insertions(+), 34 deletions(-) diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp index ea464f41..ce41ec1e 100644 --- a/unit_tests/operator/Test_FCImpl.cpp +++ b/unit_tests/operator/Test_FCImpl.cpp @@ -19,6 +19,7 @@ #include "aidge/data/Tensor.hpp" #include "aidge/operator/FC.hpp" #include "aidge/utils/ArrayHelpers.hpp" +#include "aidge/utils/TensorUtils.hpp" using namespace Aidge; @@ -48,7 +49,7 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { Tensor myOutput = Array2D<int, 2, 5>{ {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}}; - std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc"); + std::shared_ptr<Node> myFC = FC(75, 5, 1.0f, 1.0f, false, false, false, "myfc"); auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator()); op -> setDataType(DataType::Int32); op 
-> setBackend("cpu"); @@ -124,14 +125,14 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { -0.31278628, 0.68560582 }}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + Tensor myOutput = Tensor(Array2D<float, 4, 2>{{ {0.39605334, -1.3982881}, {0.30229449, -0.86952901}, {-0.39894336, 2.6626053}, {-0.20497644, 3.4614196} }}); - std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); // transA = true, transB = true + std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc"); // transA = true, transB = true auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); op->associateInput(0, myInput); @@ -141,9 +142,8 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { op->setBackend("cpu"); myFC->forward(); - op->getOutput(0)->print(); - REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput)); } // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl; } @@ -164,13 +164,13 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { { -1.1458585, -0.8235659, 0.24195994}}}); std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742, 0.90154403}}); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{ + Tensor myOutput = Tensor(Array2D<float, 4, 2>{ {{1.0376441, 0.38158852}, {-0.86573052, 2.6920884}, {3.0791781, -1.3184667}, {2.3588469, -0.31109101}}}); - std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc"); + std::shared_ptr<Node> myFC = FC(3, 2, 1.0f, 1.0f, false, false, false, "myfc"); auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); op -> associateInput(0, myInput); op -> associateInput(1, myWeights); @@ -178,8 +178,8 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { op -> setDataType(DataType::Float32); op -> setBackend("cpu"); myFC->forward(); - op->getOutput(0)->print(); - 
REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput)); // Backward std::shared_ptr<Tensor> myOutputGrad = @@ -190,24 +190,23 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { {-1.5132738, -0.23136522}, {0.20452768, -1.2200259} }}); - std::shared_ptr<Tensor> expectedInputGrad = - std::make_shared<Tensor>(Array2D<float, 4, 3>{ + Tensor expectedInputGrad = Tensor(Array2D<float, 4, 3>{ { {1.4716856, 3.7039511, -2.9912496}, {0.82964748, 1.0730045, -0.65807492}, {0.19803995, -2.7722826, 2.9105654}, {1.4070423, 1.4052149, -0.69614327} }}); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{ + Tensor expectedWeightsGrad = Tensor(Array2D<float, 2, 3>{ {{-1.7768159, -0.66813177, 1.0499192}, {-1.4800593, 0.37063029, -0.15180479}}}); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }}); + Tensor expectedBiasGrad = Tensor(Array1D<float, 2>{{0.31267303, -3.397066 }}); op->getOutput(0)->setGrad(myOutputGrad); myFC->backward(); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); - REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad)); } SECTION("transA and transB") { std::shared_ptr<Tensor> myInput = @@ -229,15 +228,14 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { {0.65432109, -0.54321098} }); - std::shared_ptr<Tensor> myOutput = - std::make_shared<Tensor>(Array2D<float, 4, 2>{{ + Tensor myOutput = Tensor(Array2D<float, 4, 2>{{ {-3.1545789, 1.101069}, {1.9565322, 1.534453}, {-2.7446516, 1.1439626}, {2.8480933, -1.412901} }}); - std::shared_ptr<Node> myFC 
= FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); + std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc"); auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); op->associateInput(0, myInput); op->associateInput(1, myWeights); @@ -247,8 +245,8 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { // Forward pass myFC->forward(); - op->getOutput(0)->print(); - REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput)); + + REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput)); // Backward Pass std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ @@ -258,25 +256,25 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { {-0.98765432, 0.87654321} }}); - std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{ + Tensor expectedInputGrad = Tensor(Array2D<float, 3, 4>{{ {1.8100901, 3.5116596, -3.0638645, -2.6029568}, {-1.0916179, -2.7440662, 2.0054817, 2.7270038}, {0.69102132, 1.4869658, -1.206526, -1.2641346} }}); - std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{ + Tensor expectedWeightsGrad = Tensor(Array2D<float, 3, 2>{{ {-1.4192863, 3.1120875}, {0.50970948, 0.77579576}, {0.062572584, -2.571022} }}); - std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{ + Tensor expectedBiasGrad = Tensor(Array1D<float, 2>{ {-0.22778332, 0.04320982} }); op->getOutput(0)->setGrad(myOutputGrad); myFC->backward(); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad)); - REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad)); + REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedInputGrad)); + REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad)); + REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad)); } } \ No newline at end of file 
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp index 7b0b80d8..925a264a 100644 --- a/unit_tests/operator/Test_MetaOperator.cpp +++ b/unit_tests/operator/Test_MetaOperator.cpp @@ -702,8 +702,8 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") { auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{}); uniformFiller<float>(init, 0.0, 0.0); - auto fc1 = FC(inChannels, outChannels, true, "myfc"); - auto fc2 = FC(outChannels, inChannels, true, "fc2"); + auto fc1 = FC(inChannels, outChannels, 1.0f, 1.0f, true, false, false, "myfc"); + auto fc2 = FC(outChannels, inChannels, 1.0f, 1.0f, true, false, false, "fc2"); // NOTE: Account for init step by adding 1 to the max timestep // parameter. auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky"); diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp index b78e864f..fdeb2d79 100644 --- a/unit_tests/scheduler/Test_CastMove.cpp +++ b/unit_tests/scheduler/Test_CastMove.cpp @@ -56,7 +56,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); @@ -158,7 +158,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp index be87e8ac..e264f114 100644 --- a/unit_tests/scheduler/Test_Scheduler.cpp +++ b/unit_tests/scheduler/Test_Scheduler.cpp @@ 
-71,7 +71,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); @@ -173,7 +173,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { Conv(3, 3, {1, 1}, "conv1.3")}), Add("add2"), Conv(3, 2, {1, 1}, "conv2"), - FC(18, 5, false, "out")}); + FC(18, 5, 1.0f, 1.0f, false, false, false, "out")}); g->getNode("inputConv")->getOperator()->setInput(0, inputTensor); g->getNode("inputConv")->getOperator()->setInput(1, weight1); @@ -321,7 +321,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); // g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); -- GitLab From 0eb2f2b47cdbdb0ef9037a0a0a33c709927bb668 Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Mon, 24 Feb 2025 11:22:26 +0100 Subject: [PATCH 6/8] add hints to support batched bias in FC --- .../backend/cpu/operator/FCImpl_kernels.hpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index aa4ffa2e..3542c8cf 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -41,6 +41,14 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, if (biases == nullptr) { std::fill(output, output + (batchSize * outputFeatureSize), O(0)); } else { + /* TODO: If we want to support batched Biases: + for (std::size_t batch = 0; batch < batchSize; 
++batch) { + std::transform( + biases, biases + outputFeatureSize, output + batch * outputFeatureSize, + [beta](const B& bias) { return beta * static_cast<O>(bias); } + ); + + */ for (std::size_t batch = 0; batch < batchSize; ++batch) { std::transform( biases, biases + outputFeatureSize, output + batch * outputFeatureSize, @@ -90,7 +98,14 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, const O beta = static_cast<O>(beta_); // Compute bias gradient: dB = beta * dB + alpha * dY - if (biasesGrad != nullptr) { + if (biasesGrad != nullptr) { + /* TODO: If we want to support batched Biases: + for (std::size_t b = 0; b < batchSize; ++b) { + for (std::size_t o = 0; o < outputFeatureSize; ++o) { + biasesGrad[b * outputFeatureSize + o] = beta * outputGrad[b * outputFeatureSize + o]; + } + } + */ for (std::size_t o = 0; o < outputFeatureSize; ++o) { O sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { -- GitLab From 5c9c92b73d6c3384ebc08def25d9798e1af05a58 Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Tue, 11 Mar 2025 12:02:39 +0100 Subject: [PATCH 7/8] add Sum operator --- include/aidge/backend/cpu.hpp | 1 + .../aidge/backend/cpu/operator/SumImpl.hpp | 36 ++++ .../backend/cpu/operator/SumImpl_kernels.hpp | 59 ++++++ src/operator/SumImpl.cpp | 71 +++++++ unit_tests/operator/Test_SumImpl.cpp | 176 ++++++++++++++++++ 5 files changed, 343 insertions(+) create mode 100644 include/aidge/backend/cpu/operator/SumImpl.hpp create mode 100644 include/aidge/backend/cpu/operator/SumImpl_kernels.hpp create mode 100644 src/operator/SumImpl.cpp create mode 100644 unit_tests/operator/Test_SumImpl.cpp diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp index 5c1f9b11..ecf111a7 100644 --- a/include/aidge/backend/cpu.hpp +++ b/include/aidge/backend/cpu.hpp @@ -58,6 +58,7 @@ #include "aidge/backend/cpu/operator/SliceImpl.hpp" #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp" #include 
"aidge/backend/cpu/operator/SubImpl.hpp" +#include "aidge/backend/cpu/operator/SumImpl.hpp" #include "aidge/backend/cpu/operator/TanhImpl.hpp" #include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp" diff --git a/include/aidge/backend/cpu/operator/SumImpl.hpp b/include/aidge/backend/cpu/operator/SumImpl.hpp new file mode 100644 index 00000000..54d68979 --- /dev/null +++ b/include/aidge/backend/cpu/operator/SumImpl.hpp @@ -0,0 +1,36 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + + #ifndef AIDGE_CPU_OPERATOR_SUMIMPL_H_ + #define AIDGE_CPU_OPERATOR_SUMIMPL_H_ + + #include <cstddef> // std::size_t + #include <memory> // std::unique_ptr, std::make_unique + #include <string> + #include <vector> + + #include "aidge/backend/cpu/operator/OperatorImpl.hpp" + #include "aidge/operator/Sum.hpp" + #include "aidge/utils/Registrar.hpp" + #include "aidge/utils/Types.h" + + namespace Aidge { + // Operator implementation entry point for the backend + using SumImpl_cpu = OperatorImpl_cpu<Sum_Op, + void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>; + + + // Implementation entry point registration to Operator + REGISTRAR(Sum_Op, "cpu", Aidge::SumImpl_cpu::create); + } // namespace Aidge + + #endif /* AIDGE_CPU_OPERATOR_SUMIMPL_H_ */ + \ No newline at end of file diff --git a/include/aidge/backend/cpu/operator/SumImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SumImpl_kernels.hpp new file mode 100644 index 00000000..0c5e137e --- /dev/null +++ b/include/aidge/backend/cpu/operator/SumImpl_kernels.hpp 
@@ -0,0 +1,59 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + + #ifndef AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_ + #define AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_ + + #include "aidge/utils/Registrar.hpp" + + #include <cstdint> // std::int32_t, std::int64_t + + #include "aidge/backend/cpu/data/Broadcasting.hpp" + #include "aidge/backend/cpu/operator/SumImpl.hpp" + + namespace Aidge { + + template <class I, class O> + void SumImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) { + std::vector<const I*> inputs; + for (const auto& input_ : inputs_) { + inputs.push_back(static_cast<const I*>(input_)); + } + O* output = static_cast<O*>(output_); + + for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex) + { + output[oIndex] = 0; + std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex); + for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) { + std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes); + output[oIndex] += inputs[iIndex][idx]; + } + } + } + + // Kernels registration to implementation entry point + REGISTRAR(SumImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}}, + {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<float, float>, nullptr}); + REGISTRAR(SumImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}}, + {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<double, double>, nullptr}); + REGISTRAR(SumImpl_cpu, + 
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}}, + {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr}); + REGISTRAR(SumImpl_cpu, + {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}}, + {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr}); + } // namespace Aidge + + #endif /* AIDGE_CPU_OPERATOR_SUBIMPL_KERNELS_H_ */ + \ No newline at end of file diff --git a/src/operator/SumImpl.cpp b/src/operator/SumImpl.cpp new file mode 100644 index 00000000..436fd78c --- /dev/null +++ b/src/operator/SumImpl.cpp @@ -0,0 +1,71 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. + * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + + #include "aidge/backend/cpu/operator/SumImpl.hpp" + + #include <cassert> + #include <vector> + + #include "aidge/backend/cpu/data/GetCPUPtr.h" + #include "aidge/backend/cpu/operator/SumImpl_kernels.hpp" + #include "aidge/data/Data.hpp" + #include "aidge/data/Tensor.hpp" + #include "aidge/utils/Types.h" + #include "aidge/utils/ErrorHandling.hpp" + +template <> +void Aidge::SumImpl_cpu::forward() { + const Sum_Op& op = static_cast<const Sum_Op&>(mOp); + // Check inputs + AIDGE_ASSERT(op.getInput(0), "missing input in Sum operator"); + AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Sum forward because input#0 has no implementation."); + DataType datatypeFirstInput = op.getInput(0)->dataType(); + for (IOIndex_t i = 1; i < op.nbInputs(); ++i) { + AIDGE_ASSERT(op.getInput(i), "missing input in Sum operator"); + AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Sum forward because the input#{} has 
no implementation.", i); + AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot sum inputs with two differents data type."); + } + + // Find the correct kernel type + const auto impl = Registrar<SumImpl_cpu>::create(getBestMatch(getRequiredSpec())); + + // Convert input data (no overhead if not needed!) + // TODO: right now, if needed, memory will be allocated/deallocated at each + // call to forward(). We might put the following shared_ptr as members of + // this class to avoid that. + const std::size_t nbDims = op.getOutput(0)->nbDims(); + std::vector<std::vector<std::size_t>> inputsDims; + std::vector<const void*> opInputs; + std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs()); + for (IOIndex_t i = 0; i < op.nbInputs(); ++i) { + std::vector<std::size_t> inputDims(nbDims, 1); + auto dims = op.getInput(i)->dims(); + for(std::size_t j=dims.size()-1; j+1>0; --j) + { + std::size_t idx = nbDims - (dims.size()-j); + inputDims[idx] = dims[j]; + } + inputsDims.push_back(inputDims); + const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0)); + opInputs.push_back(input.getImpl()->rawPtr()); + } + + impl.forward(opInputs, + inputsDims, + op.getOutput(0)->size(), + op.getOutput(0)->dims(), + getCPUPtr(op.getRawOutput(0))); +} + + +template <> +void Aidge::SumImpl_cpu::backward() { +} \ No newline at end of file diff --git a/unit_tests/operator/Test_SumImpl.cpp b/unit_tests/operator/Test_SumImpl.cpp new file mode 100644 index 00000000..c3e81a90 --- /dev/null +++ b/unit_tests/operator/Test_SumImpl.cpp @@ -0,0 +1,176 @@ +/******************************************************************************** + * Copyright (c) 2025 CEA-List + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Public License 2.0 which is available at + * http://www.eclipse.org/legal/epl-2.0. 
+ * + * SPDX-License-Identifier: EPL-2.0 + * + ********************************************************************************/ + + #include <catch2/catch_test_macros.hpp> + + #include "aidge/data/Tensor.hpp" + #include "aidge/operator/Sum.hpp" + + #include "aidge/backend/cpu.hpp" + + using namespace Aidge; + + TEST_CASE("[cpu/operator] Sum(forward)", "[Sum][CPU]") { + std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { // + { // + {{20, 47},{21, 48},{22, 49}}, // + {{23, 50},{24, 51},{25, 52}}, // + {{26, 53},{27, 54},{28, 55}} // + }, // + { // + {{29, 56},{30, 57},{31, 58}}, // + {{32, 59},{33, 60},{34, 61}}, // + {{35, 62},{36, 63},{37, 64}} // + }, // + { // + {{38, 65},{39, 66},{40, 67}}, // + {{41, 68},{42, 69},{43, 70}}, // + {{44, 71},{45, 72},{46, 73}} // + } // + } // + }); // + + SECTION("One input") { + std::shared_ptr<Node> mySum = Sum(1); + auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator()); + op->associateInput(0, input1); + op->setBackend("cpu"); + op->setDataType(DataType::Int32); + mySum->forward(); + + REQUIRE(*(op->getOutput(0)) == *input1); + } + + SECTION("Two inputs") { + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { + { + {{40, 94},{42, 96},{44, 98}}, + {{46, 100},{48, 102},{50, 104}}, + {{52, 106},{54, 108},{56, 110}} + }, + { + {{58, 112},{60, 114},{62, 116}}, + {{64, 118},{66, 120},{68, 122}}, + {{70, 124},{72, 126},{74, 128}} + }, + { + {{76, 130},{78, 132},{80, 134}}, + {{82, 136},{84, 138},{86, 140}}, + {{88, 142},{90, 144},{92, 146}} + } + } + }); + + std::shared_ptr<Node> mySum = Sum(2); + auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator()); + op->associateInput(0, input1); + op->associateInput(1, input1); + op->setBackend("cpu"); + op->setDataType(DataType::Int32); + mySum->forward(); + + REQUIRE(*(op->getOutput(0)) == *expectedOutput); + } + + SECTION("Three inputs") { + std::shared_ptr<Tensor> 
expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { + { + {{ 60, 141},{ 63, 144},{ 66, 147}}, + {{ 69, 150},{ 72, 153},{ 75, 156}}, + {{ 78, 159},{ 81, 162},{ 84, 165}} + }, + { + {{ 87, 168},{ 90, 171},{ 93, 174}}, + {{ 96, 177},{ 99, 180},{102, 183}}, + {{105, 186},{108, 189},{111, 192}} + }, + { + {{114, 195},{117, 198},{120, 201}}, + {{123, 204},{126, 207},{129, 210}}, + {{132, 213},{135, 216},{138, 219}} + } + } + }); + + std::shared_ptr<Node> mySum = Sum(3); + auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator()); + op->associateInput(0, input1); + op->associateInput(1, input1); + op->associateInput(2, input1); + op->setDataType(DataType::Int32); + op->setBackend("cpu"); + mySum->forward(); + + REQUIRE(*op->getOutput(0) == *expectedOutput); + } + + SECTION("Broadcasting") { + std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<int,3,1,3,2> { + { // + { // + {{0, 1},{2, 3},{4, 5}} // + }, // + { // + {{6, 7},{8, 9},{10, 11}} // + }, // + { // + {{12, 13},{14, 15},{16, 17}} // + } // + } // + }); // + std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> { + { // + { // + {{20, 21},{22, 23},{24, 25}}, // + {{26, 27},{28, 29},{30, 31}}, // + {{32, 33},{34, 35},{36, 37}} // + } // + } // + }); // + + std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}}); + std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> { + { // + { // + {{ 120, 222},{ 124, 226},{ 128, 230}}, // + {{ 126, 228},{ 130, 232},{ 134, 236}}, // + {{ 132, 234},{ 136, 238},{ 140, 242}} // + }, // + { // + {{ 126, 228},{ 130, 232},{ 134, 236}}, // + {{ 132, 234},{ 136, 238},{ 140, 242}}, // + {{ 138, 240},{ 142, 244},{ 146, 248}} // + }, // + { // + {{ 132, 234},{ 136, 238},{140, 242}}, // + {{ 138, 240},{ 142, 244},{146, 248}}, // + {{ 144, 246},{ 148, 250},{152, 254}} // + } // + } // + }); // + + std::shared_ptr<Node> mySum = Sum(3); + auto op = 
std::static_pointer_cast<OperatorTensor>(mySum -> getOperator()); + op->associateInput(0, input_0); + op->associateInput(1, input_1); + op->associateInput(2, input_2); + op->setDataType(DataType::Int32); + op->setBackend("cpu"); + mySum->forward(); + op->getOutput(0)->print(); + expectedOutput->print(); + REQUIRE(*op->getOutput(0) == *expectedOutput); + } + } + \ No newline at end of file -- GitLab From 32c64230a062063744bac653ae473d8ab2e4e933 Mon Sep 17 00:00:00 2001 From: hrouis <houssemeddine.rouis92@gmail.com> Date: Mon, 24 Mar 2025 13:49:45 +0100 Subject: [PATCH 8/8] remove transA and transB attr from FC to be handled in the metaop TransposeFC --- include/aidge/backend/cpu/operator/FCImpl.hpp | 4 - .../backend/cpu/operator/FCImpl_kernels.hpp | 16 +-- src/operator/FCImpl.cpp | 15 +-- unit_tests/operator/Test_FCImpl.cpp | 112 +----------------- unit_tests/operator/Test_MetaOperator.cpp | 4 +- unit_tests/scheduler/Test_CastMove.cpp | 4 +- unit_tests/scheduler/Test_Scheduler.cpp | 6 +- 7 files changed, 20 insertions(+), 141 deletions(-) diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp index 9249ba77..4daa522f 100644 --- a/include/aidge/backend/cpu/operator/FCImpl.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl.hpp @@ -27,8 +27,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, void(const DimSize_t, const DimSize_t, const DimSize_t, - const bool, - const bool, const float, const float, const void *, @@ -38,8 +36,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op, void(const DimSize_t, const DimSize_t, const DimSize_t, - const bool, - const bool, const float, const float, const void *, diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp index 3542c8cf..873830d3 100644 --- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp @@ -22,8 +22,6 @@ template <class I, class W, class B, 
class O> void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, - const bool transA, - const bool transB, const float alpha_, const float beta_, const void* input_, @@ -61,8 +59,8 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, for (std::size_t out = 0; out < outputFeatureSize; ++out) { O sum = O(0); for (std::size_t i = 0; i < inputFeatureSize; ++i) { - std::size_t inputIdx = transA ? (i * batchSize + batch) : (batch * inputFeatureSize + i); - std::size_t weightIdx = transB ? (i * outputFeatureSize + out) : (out * inputFeatureSize + i); + std::size_t inputIdx = batch * inputFeatureSize + i; + std::size_t weightIdx = out * inputFeatureSize + i; sum += static_cast<O>(input[inputIdx]) * static_cast<O>(weights[weightIdx]); } output[batch * outputFeatureSize + out] += alpha * sum; @@ -75,8 +73,6 @@ template <class I, class O, class W, class B> void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, const DimSize_t inputFeatureSize, const DimSize_t outputFeatureSize, - const bool transA, - const bool transB, const float alpha_, const float beta_, const void* input_, @@ -120,11 +116,11 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t c = 0; c < inputFeatureSize; ++c) { O sum{0}; for (std::size_t b = 0; b < batchSize; ++b) { - std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c); + std::size_t inputIdx = b * inputFeatureSize + c; std::size_t outputIdx = b * outputFeatureSize + o; sum += originalInput[inputIdx] * outputGrad[outputIdx]; } - std::size_t weightIdx = transB ? 
(c * outputFeatureSize + o) : (o * inputFeatureSize + c); + std::size_t weightIdx = o * inputFeatureSize + c; weightGrad[weightIdx] = alpha * sum; } } @@ -135,11 +131,11 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize, for (std::size_t c = 0; c < inputFeatureSize; ++c) { O sum{0}; for (std::size_t o = 0; o < outputFeatureSize; ++o) { - std::size_t weightIdx = transB ? (c * outputFeatureSize + o) : (o * inputFeatureSize + c); + std::size_t weightIdx = o * inputFeatureSize + c; std::size_t outputIdx = b * outputFeatureSize + o; sum += weight[weightIdx] * outputGrad[outputIdx]; } - std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c); + std::size_t inputIdx = b * inputFeatureSize + c; inputGrad[inputIdx] = alpha * sum; } } diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp index 2a06803c..821f673d 100644 --- a/src/operator/FCImpl.cpp +++ b/src/operator/FCImpl.cpp @@ -41,13 +41,10 @@ void Aidge::FCImpl_cpu::forward() const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0))); const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor(); // Call kernel - const DimSize_t nbInFeat = op_.transB()? input1.dims()[0]:input1.dims()[1]; - const auto batchSize = input0.size() /nbInFeat; + const auto batchSize = input0.size() /input1.dims()[1]; impl.forward(batchSize, - op_.transB()? input1.dims()[0]:input1.dims()[1], // nb input features - op_.transB()?input1.dims()[1]: input1.dims()[0], // nb output features - op_.transA(), - op_.transB(), + input1.dims()[1], // nb input features + input1.dims()[0], // nb output features op_.alpha(), op_.beta(), input0.getImpl()->rawPtr(), @@ -76,13 +73,11 @@ void Aidge::FCImpl_cpu::backward() const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0))); const auto& input2grad = (op_.getInput(2)) ? 
op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor(); // Call kernel - const DimSize_t nbInFeat = op_.transB()? input1grad.dims()[0]:input1grad.dims()[1]; + const DimSize_t nbInFeat = input1grad.dims()[1]; const auto batchSize = input0grad.size() /nbInFeat; impl.backward(batchSize, nbInFeat, // nb input features - op_.transB()?input1grad.dims()[1]: input1grad.dims()[0], // nb output features - op_.transA(), - op_.transB(), + input1grad.dims()[0], // nb output features op_.alpha(), op_.beta(), getCPUPtr(fc_grad), diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp index ce41ec1e..a84ab63a 100644 --- a/unit_tests/operator/Test_FCImpl.cpp +++ b/unit_tests/operator/Test_FCImpl.cpp @@ -49,7 +49,7 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { Tensor myOutput = Array2D<int, 2, 5>{ {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}}; - std::shared_ptr<Node> myFC = FC(75, 5, 1.0f, 1.0f, false, false, false, "myfc"); + std::shared_ptr<Node> myFC = FC(75, 5, 1.0f, 1.0f, false, "myfc"); auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator()); op -> setDataType(DataType::Int32); op -> setBackend("cpu"); @@ -107,45 +107,6 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") { myFC->forward(); REQUIRE(*(op->getOutput(0)) == myOutput); } - - SECTION("transA and transB") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 3, 4>{{ - {1.2137502, 0.85416597, -1.2204953, -1.1383502}, - {-0.73264742, -1.361271, -1.400124, -0.47965094}, - {-0.64570695, -0.48093504, 0.14018863, -0.64090657} - }}); // Transposed Input (4x3 → 3x4) - - std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 3, 2>{{ - {0.12352414, -1.0330718}, - {-0.036496852, 0.011371522}, - {-0.15418212, -0.60658616} - }}); // Transposed Weights (2x3 → 3x2) - - std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{ - -0.31278628, 
0.68560582 - }}); - - Tensor myOutput = Tensor(Array2D<float, 4, 2>{{ - {0.39605334, -1.3982881}, - {0.30229449, -0.86952901}, - {-0.39894336, 2.6626053}, - {-0.20497644, 3.4614196} - }}); - - std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc"); // transA = true, transB = true - - auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); - op->associateInput(0, myInput); - op->associateInput(1, myWeights); - op->associateInput(2, myBias); - op->setDataType(DataType::Float32); - op->setBackend("cpu"); - - myFC->forward(); - - REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput)); - } - // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl; } @@ -170,7 +131,7 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { {3.0791781, -1.3184667}, {2.3588469, -0.31109101}}}); - std::shared_ptr<Node> myFC = FC(3, 2, 1.0f, 1.0f, false, false, false, "myfc"); + std::shared_ptr<Node> myFC = FC(3, 2, 1.0f, 1.0f, false, "myfc"); auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator()); op -> associateInput(0, myInput); op -> associateInput(1, myWeights); @@ -208,73 +169,4 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") { REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad)); REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad)); } - SECTION("transA and transB") { - std::shared_ptr<Tensor> myInput = - std::make_shared<Tensor>(Array2D<float, 3, 4>{{ - {0.51234567, -1.23456789, 0.67891234, -0.43219876}, - {1.87654321, -0.98765432, 1.34567890, -1.23456789}, - {-0.67891234, 0.43219876, -1.87654321, 0.98765432} - }}); - - std::shared_ptr<Tensor> myWeights = - std::make_shared<Tensor>(Array2D<float, 3, 2>{{ - {0.12345678, -1.34567890}, - {-0.87654321, 0.56789012}, - {0.23456789, -0.45678901} - }}); - - std::shared_ptr<Tensor> myBias = - std::make_shared<Tensor>(Array1D<float, 2>{ - {0.65432109, -0.54321098} - }); - - Tensor myOutput = 
Tensor(Array2D<float, 4, 2>{{ - {-3.1545789, 1.101069}, - {1.9565322, 1.534453}, - {-2.7446516, 1.1439626}, - {2.8480933, -1.412901} - }}); - - std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc"); - auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator()); - op->associateInput(0, myInput); - op->associateInput(1, myWeights); - op->associateInput(2, myBias); - op->setDataType(DataType::Float32); - op->setBackend("cpu"); - - // Forward pass - myFC->forward(); - - REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput)); - - // Backward Pass - std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{ - {0.19876543, -0.65432109}, - {0.76543210, -1.23456789}, - {-0.43210987, 1.09876543}, - {-0.98765432, 0.87654321} - }}); - - Tensor expectedInputGrad = Tensor(Array2D<float, 3, 4>{{ - {1.8100901, 3.5116596, -3.0638645, -2.6029568}, - {-1.0916179, -2.7440662, 2.0054817, 2.7270038}, - {0.69102132, 1.4869658, -1.206526, -1.2641346} - }}); - - Tensor expectedWeightsGrad = Tensor(Array2D<float, 3, 2>{{ - {-1.4192863, 3.1120875}, - {0.50970948, 0.77579576}, - {0.062572584, -2.571022} - }}); - - Tensor expectedBiasGrad = Tensor(Array1D<float, 2>{ - {-0.22778332, 0.04320982} - }); - op->getOutput(0)->setGrad(myOutputGrad); - myFC->backward(); - REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedInputGrad)); - REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad)); - REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad)); - } } \ No newline at end of file diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp index 925a264a..7c1718dd 100644 --- a/unit_tests/operator/Test_MetaOperator.cpp +++ b/unit_tests/operator/Test_MetaOperator.cpp @@ -702,8 +702,8 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") { auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{}); uniformFiller<float>(init, 0.0, 0.0); - auto 
fc1 = FC(inChannels, outChannels, 1.0f, 1.0f, true, false, false, "myfc"); - auto fc2 = FC(outChannels, inChannels, 1.0f, 1.0f, true, false, false, "fc2"); + auto fc1 = FC(inChannels, outChannels, 1.0f, 1.0f, true, "myfc"); + auto fc2 = FC(outChannels, inChannels, 1.0f, 1.0f, true, "fc2"); // NOTE: Account for init step by adding 1 to the max timestep // parameter. auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky"); diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp index fdeb2d79..3f1538a4 100644 --- a/unit_tests/scheduler/Test_CastMove.cpp +++ b/unit_tests/scheduler/Test_CastMove.cpp @@ -56,7 +56,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, "fc")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); @@ -158,7 +158,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, "fc")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp index e264f114..1361b8a2 100644 --- a/unit_tests/scheduler/Test_Scheduler.cpp +++ b/unit_tests/scheduler/Test_Scheduler.cpp @@ -71,7 +71,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, "fc")}); g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, 
weight1); @@ -173,7 +173,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { Conv(3, 3, {1, 1}, "conv1.3")}), Add("add2"), Conv(3, 2, {1, 1}, "conv2"), - FC(18, 5, 1.0f, 1.0f, false, false, false, "out")}); + FC(18, 5, 1.0f, 1.0f, false, "out")}); g->getNode("inputConv")->getOperator()->setInput(0, inputTensor); g->getNode("inputConv")->getOperator()->setInput(1, weight1); @@ -321,7 +321,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") { Conv(1, 3, {3, 3}, "conv1"), Conv(3, 4, {1, 1}, "conv2"), Conv(4, 3, {1, 1}, "conv3"), - FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")}); + FC(27, 5, 1.0f, 1.0f, false, "fc")}); // g->getNode("conv1")->getOperator()->setInput(0, inputTensor); g->getNode("conv1")->getOperator()->setInput(1, weight1); -- GitLab