// Post-patch contents of the kernel from
// include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp

/**
 * @brief Dense matrix product kernel: output = input1 x input2.
 *
 * All three buffers are indexed as contiguous row-major 2-D arrays:
 *   - input1 is read as an (n x k) array of I,
 *   - input2 is read as a  (k x m) array of I,
 *   - output is written as an (n x m) array of O.
 *
 * Each product `input1[i][l] * input2[l][j]` is computed in type I, converted
 * to O, and accumulated into `output[i][j]`.
 *
 * @tparam I Element type of both inputs.
 * @tparam O Element type of the output.
 * @param n        Number of rows of input1 and of output.
 * @param k        Number of columns of input1 / rows of input2.
 * @param m        Number of columns of input2 and of output.
 * @param input1_  Type-erased pointer to the first operand (n*k elements of I).
 * @param input2_  Type-erased pointer to the second operand (k*m elements of I).
 * @param output_  Type-erased pointer to the result (n*m elements of O). It is
 *                 declared `__restrict__`, so it must not overlap either input.
 */
template <class I, class O>
void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
                                   const void* input1_, const void* input2_, void* __restrict__ output_) {
    // FIXME: missing MatMul parameters as arguments
    const I* input1 = static_cast<const I*>(input1_);
    const I* input2 = static_cast<const I*>(input2_);
    O* __restrict__ output = static_cast<O*>(output_);

    for (std::size_t i = 0; i < n; ++i) {
        O* __restrict__ outRow = output + i * m;

        // Typed zero-initialisation of the row. A byte-wise memset would only be
        // correct for types whose O(0) is the all-zero bit pattern.
        for (std::size_t j = 0; j < m; ++j) {
            outRow[j] = O(0);
        }

        // i-l-j loop order: the innermost loop walks input2 and the output row
        // contiguously instead of striding through a column of input2.
        for (std::size_t l = 0; l < k; ++l) {
            const I lhs = input1[i * k + l];
            const I* rhsRow = input2 + l * m;
            for (std::size_t j = 0; j < m; ++j) {
                outRow[j] += static_cast<O>(lhs * rhsRow[j]);
            }
        }
    }
}