Commit aa30d4b3 authored by Olivier BICHLER

Merge branch 'feat/mul-backward' into 'dev'

Add backward pass CPU implementation for Mul operator

See merge request !85
parents ddcd9dbc 1cd0d071
Pipeline #54403 passed
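For reference, the gradient rules this merge request implements follow directly from the product rule; for a broadcast input, the gradient is additionally summed over the broadcast axes. A sketch of the math (not part of the diff):

\[
Y = A \odot B,\qquad
\frac{\partial L}{\partial A} = \frac{\partial L}{\partial Y} \odot B,\qquad
\frac{\partial L}{\partial B} = \sum_{\text{broadcast axes of } B} \left( \frac{\partial L}{\partial Y} \odot A \right)
\]

where \(\odot\) is elementwise multiplication and \(B\) is broadcast to the shape of \(A\) when needed. The accumulation loops in the backward kernel below implement this sum.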
@@ -25,11 +25,25 @@ namespace Aidge {
// compute kernel registry for forward and backward
class MulImplForward_cpu
    : public Registrable<MulImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&,
                                                                                            const std::vector<std::size_t>&,
                                                                                            const std::vector<std::size_t>&,
                                                                                            const void*,
                                                                                            const void*,
                                                                                            void*)> {};

class MulImplBackward_cpu
    : public Registrable<MulImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t,
                                                                                             const std::size_t,
                                                                                             const std::size_t,
                                                                                             const std::vector<std::size_t>,
                                                                                             const std::vector<std::size_t>,
                                                                                             const void*,
                                                                                             const void*,
                                                                                             const void*,
                                                                                             void*,
                                                                                             void*)> {};

class MulImpl_cpu : public OperatorImpl {
public:
@@ -40,7 +54,9 @@ public:
    }

    Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;

    void forward() override;
    void backward() override;
};

namespace {
...
MulImpl_backward_kernels.hpp (new file):

#ifndef AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_

#include "aidge/utils/Registrar.hpp"

#include <algorithm>
#include <cstdint>  // std::int32_t, std::int64_t

#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"

namespace Aidge {
template <class I1, class I2, class O>
void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
                                 const std::size_t input1Length,
                                 const std::size_t grad0Length,
                                 const std::vector<std::size_t> input0Dims,
                                 const std::vector<std::size_t> input1Dims,
                                 const void* input0_,
                                 const void* input1_,
                                 const void* grad_output_,
                                 void* gradientInput0,
                                 void* gradientInput1)
{
    const auto* input0 = static_cast<const I1*>(input0_);
    const auto* input1 = static_cast<const I2*>(input1_);
    const auto* grad_output = static_cast<const O*>(grad_output_);

    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
    auto* grad_input_1 = static_cast<I2*>(gradientInput1);

    if(input0Dims.size() >= input1Dims.size())
    {
        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");

        // input0 and the output gradient share the same shape: each element of
        // grad_input_0 is the output gradient scaled by the broadcast input1 value.
        for(std::size_t i = 0; i < input0Length; ++i)
        {
            const auto indices = getMultiDimIndices(input1Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);

            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
        }

        // input1 was broadcast over input0: its gradient accumulates the contribution
        // of every output position it fed (assumes grad_input_1 is zero-initialized).
        for(std::size_t i = 0; i < grad0Length; ++i)
        {
            const auto indices = getMultiDimIndices(input1Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);

            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
        }
    } else {
        // Symmetric case: input1 is the larger tensor.
        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");

        for(std::size_t i = 0; i < input1Length; ++i)
        {
            const auto indices = getMultiDimIndices(input0Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);

            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
        }

        for(std::size_t i = 0; i < grad0Length; ++i)
        {
            const auto indices = getMultiDimIndices(input0Dims, i);
            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);

            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
        }
    }
}

namespace {
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float32(
    {DataType::Float32, DataType::Float32, DataType::Float32},
    Aidge::MulImpl_cpu_backward_kernel<float, float, float>);
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Float64(
    {DataType::Float64, DataType::Float64, DataType::Float64},
    Aidge::MulImpl_cpu_backward_kernel<double, double, double>);
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int32(
    {DataType::Int32, DataType::Int32, DataType::Int32},
    Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>);
static Registrar<MulImplBackward_cpu> registrarMulImplBackward_cpu_Int64(
    {DataType::Int64, DataType::Int64, DataType::Int64},
    Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>);
}  // namespace
}  // namespace Aidge

#endif /* AIDGE_CPU_OPERATOR_MULIMPL_BACKWARD_KERNEL_H_ */
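A side note on the index arithmetic: the kernel calls getMultiDimIndices with a flat index that ranges over the *larger* tensor, so the modulo decomposition effectively wraps that index onto the smaller tensor's trailing-aligned shape. A minimal standalone sketch of that mapping, assuming trailing-dimension broadcasting (broadcastIndex is a hypothetical helper written for illustration, not an Aidge API):

#include <cstddef>
#include <iostream>
#include <vector>

// Wrap a flat index of the large (output-shaped) tensor onto the smaller
// tensor's dims, mirroring what getMultiDimIndices + getFlattenedIndex
// produce for the broadcast cases exercised by the tests below.
std::size_t broadcastIndex(const std::vector<std::size_t>& smallDims, std::size_t flatIdx) {
    std::vector<std::size_t> indices(smallDims.size());
    for (std::size_t i = smallDims.size(); i > 0; --i) {  // decompose, wrapping modulo each dim
        indices[i - 1] = flatIdx % smallDims[i - 1];
        flatIdx /= smallDims[i - 1];
    }
    std::size_t flat = 0;                                 // re-flatten within smallDims
    for (std::size_t i = 0; i < smallDims.size(); ++i) {
        flat = flat * smallDims[i] + indices[i];
    }
    return flat;
}

int main() {
    // Large tensor [2,3] against small tensor [3]: output index i maps to i % 3,
    // so the gradient of the small input sums down the columns.
    for (std::size_t i = 0; i < 6; ++i) {
        std::cout << i << " -> " << broadcastIndex({3}, i) << '\n';  // prints 0 1 2 0 1 2
    }
    return 0;
}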
@@ -22,6 +22,7 @@
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl_forward_kernels.hpp"
#include "aidge/backend/cpu/operator/MulImpl_backward_kernels.hpp"

Aidge::Elts_t Aidge::MulImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
    // this implementation can be in-place
@@ -40,6 +41,7 @@ void Aidge::MulImpl_cpu::forward() {
    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());

    // Call kernel
    kernelFunc(inputDims0,
               inputDims1,
@@ -48,3 +50,32 @@ void Aidge::MulImpl_cpu::forward() {
               getCPUPtr(mOp.getRawInput(1)),
               getCPUPtr(mOp.getRawOutput(0)));
}

void Aidge::MulImpl_cpu::backward() {
    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);

    auto in0 = op_.getInput(0);
    auto in1 = op_.getInput(1);
    auto in0grad = op_.getInput(0)->grad();
    auto in1grad = op_.getInput(1)->grad();
    auto out0grad = op_.getOutput(0)->grad();

    // Find kernel function
    auto kernelFunc = Registrar<MulImplBackward_cpu>::create({
        out0grad->dataType(),
        in0grad->dataType(),
        in1grad->dataType()});

    kernelFunc(/* input0Length */ in0grad->size(),
               /* input1Length */ in1grad->size(),
               /* grad0Length  */ out0grad->size(),
               /* input0Dims   */ in0->dims(),
               /* input1Dims   */ in1->dims(),
               getCPUPtr(in0),
               getCPUPtr(in1),
               getCPUPtr(out0grad),
               getCPUPtr(in0grad),
               getCPUPtr(in1grad));
}
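For readers unfamiliar with Aidge's registry pattern: the {output, input0, input1} data-type tuple passed to Registrar<...>::create() selects one of the kernel instantiations registered in the backward kernels header. A minimal sketch of that lookup, mirroring the call above (fragment for illustration only):

// Resolves the kernel registered as registrarMulImplBackward_cpu_Float32.
auto kernel = Registrar<MulImplBackward_cpu>::create(
    {DataType::Float32, DataType::Float32, DataType::Float32});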
@@ -24,6 +24,337 @@
namespace Aidge {
TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
{
std::shared_ptr<Node> myMul = Mul();
auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cpu");
SECTION("Case 1: 2D and 1D tensors") {
const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
{
{
{1,2,3},{4,5,6}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
{0.1,0.2,0.3}
));
T0->setDataType(DataType::Float32);
T0->setBackend("cpu");
T1->setDataType(DataType::Float32);
T1->setBackend("cpu");
op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
op->associateInput(0,T0);
op->associateInput(1,T1);
op->forwardDims();
myMul->forward();
myMul->backward();
auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
}
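To see where these expected values come from: with the output gradient filled with ones,

\[
\frac{\partial L}{\partial T_0} = \mathbf{1} \odot T_1 = \begin{bmatrix} 0.1 & 0.2 & 0.3 \\ 0.1 & 0.2 & 0.3 \end{bmatrix},
\qquad
\frac{\partial L}{\partial T_1} = \sum_{\text{rows}} \mathbf{1} \odot T_0 = \begin{bmatrix} 1{+}4 & 2{+}5 & 3{+}6 \end{bmatrix} = \begin{bmatrix} 5 & 7 & 9 \end{bmatrix}.
\]

The remaining sections follow the same pattern on higher-rank tensors.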
SECTION("Case 2: 3D and 1D tensors") {
const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
{
{
{
{1.0, 2.0, 3.0},
{4.0, 5.0, 6.0}
},
{
{7.0, 8.0, 9.0},
{10.0, 11.0, 12.0}
}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
{
{
{
{1, 1, 1},
{1, 1, 1}
},
{
{1, 1, 1},
{1, 1, 1}
}
}
}
));
const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
{
{
{
{0.3, 0.2, 0.1},
{0.3, 0.2, 0.1}
},
{
{0.3, 0.2, 0.1},
{0.3, 0.2, 0.1}
}
}
}
));
const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
{22.0, 26.0, 30.0}
));
for(auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
{
T->setBackend("cpu");
T->setDataType(DataType::Float32);
}
op->associateInput(0, T0);
op->associateInput(1, T1);
op->getOutput(0)->setGrad(newGrad);
op->forwardDims();
myMul->backward();
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
}
SECTION("Case 3: 4D and 2D tensors") {
const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
{
{
{
{
{1.0, 2.0, 3.0},
{4.0, 5.0, 6.0},
{7.0, 8.0, 9.0}
},
{
{10.0, 11.0, 12.0},
{13.0, 14.0, 15.0},
{16.0, 17.0, 18.0}
}
},
{
{
{19.0, 20.0, 21.0},
{22.0, 23.0, 24.0},
{25.0, 26.0, 27.0}
},
{
{28.0, 29.0, 30.0},
{31.0, 32.0, 33.0},
{34.0, 35.0, 36.0}
}
}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
{
{
{0.5,0.3,0.1},
{0.4,0.2,0.6},
{0.7,0.8,0.9}
}
}
));
const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
{
{
{
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
},
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
}
},
{
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
},
{
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0},
{1.0, 1.0, 1.0}
}
}
}
}
));
const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
{
{
{
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
},
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
}
},
{
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
},
{
{0.5, 0.3, 0.1},
{0.4, 0.2, 0.6},
{0.7, 0.8, 0.9}
}
}
}
}
));
const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
{
{
{58.0, 62.0, 66.0},
{70.0, 74.0, 78.0},
{82.0, 86.0, 90.0}
}
}
));
for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
{
T->setBackend("cpu");
T->setDataType(DataType::Float32);
}
op->associateInput(0, T0);
op->associateInput(1, T1);
op->getOutput(0)->setGrad(newGrad);
op->forwardDims();
myMul->backward();
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
}
SECTION("Case 4: 3D and 2D tensors") {
const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
{
{
{
{1.0, 2.0, 3.0, 4.0},
{5.0, 6.0, 7.0, 8.0},
{9.0, 10.0, 11.0, 12.0},
},
{
{13.0, 14.0, 15.0, 16.0},
{17.0, 18.0, 19.0, 20.0},
{21.0, 22.0, 23.0, 24.0},
}
}
}
));
const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
{
{
{0.1, 0.2, 0.3, 0.4},
{0.5, 0.6, 0.7, 0.8},
{0.9, 1.0, 1.1, 1.2}
}
}
));
const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
{
{
{
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
},
{
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
{1.0, 1.0, 1.0, 1.0},
}
}
}
));
const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
{
{
{
{0.1, 0.2, 0.3, 0.4},
{0.5, 0.6, 0.7, 0.8},
{0.9, 1.0, 1.1, 1.2}
},
{
{0.1, 0.2, 0.3, 0.4},
{0.5, 0.6, 0.7, 0.8},
{0.9, 1.0, 1.1, 1.2}
}
}
}
));
const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
{
{
{14.0, 16.0, 18.0, 20.0},
{22.0, 24.0, 26.0, 28.0},
{30.0, 32.0, 34.0, 36.0}
}
}
));
for(const auto T: {T0, T1, newGrad, expectedGrad0, expectedGrad1})
{
T->setBackend("cpu");
T->setDataType(DataType::Float32);
}
op->associateInput(0, T0);
op->associateInput(1, T1);
op->getOutput(0)->setGrad(newGrad);
op->forwardDims();
myMul->backward();
REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
}
}
TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
constexpr std::uint16_t NBTRIALS = 10; constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator // Create a random number generator
...@@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -31,7 +362,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
std::mt19937 gen(rd()); std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1 std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10)); std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5)); std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
std::uniform_int_distribution<int> boolDist(0,1); std::uniform_int_distribution<int> boolDist(0,1);
// Create MatMul Operator // Create MatMul Operator
...@@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -60,6 +391,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
std::chrono::time_point<std::chrono::system_clock> end; std::chrono::time_point<std::chrono::system_clock> end;
std::chrono::duration<double, std::micro> duration{}; std::chrono::duration<double, std::micro> duration{};
SECTION("MulImpl_cpu::forward()") { SECTION("MulImpl_cpu::forward()") {
SECTION("Scalar / Scalar") { SECTION("Scalar / Scalar") {
...@@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -68,16 +400,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
} }
SECTION("+1-D Tensor / +1-D Tensor - same dimensions") { SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
std::size_t number_of_operation = 0; std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors // generate 2 random Tensors
const std::size_t nbDims = nbDimsDist(gen); const auto nbDims = nbDimsDist(gen);
std::vector<std::size_t> dims; auto dims = std::vector<std::size_t>{};
for (std::size_t i = 0; i < nbDims; ++i) { for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen)); dims.push_back(dimSizeDist(gen));
} }
const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements; number_of_operation += nb_elements;
// without broadcasting // without broadcasting
...@@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") { ...@@ -114,67 +450,101 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
delete[] array0; delete[] array0;
delete[] array1; delete[] array1;
delete[] result; delete[] result;
// with broadcasting
} }
std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl; std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
std::cout << "total time: " << duration.count() << "μs" << std::endl; std::cout << "total time: " << duration.count() << "μs" << std::endl;
} }
SECTION("+1-D Tensor / +1-D Tensor - broadcasting") { SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
std::size_t number_of_operation = 0; std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) { for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors // generate 2 random Tensors
// handle dimensions, replace some dimensions with '1' to get broadcasting // handle dimensions, replace some dimensions with '1' to get broadcasting
constexpr std::size_t nbDims = 4; constexpr std::size_t nbDims = 4;
std::vector<std::size_t> dims; std::vector<std::size_t> dimensions;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen)); for (std::size_t i = 0; i < nbDims; ++i)
{
dimensions.push_back(dimSizeDist(gen));
} }
std::vector<std::size_t> dims0 = dims;
std::vector<std::size_t> dims1 = dims; auto dims0 = dimensions;
std::vector<std::size_t> dimsOut = dims; auto dims1 = dimensions;
for (std::size_t i = 0; i < nbDims; ++i) { auto dimsOut = dimensions;
if (boolDist(gen)) {
for (std::size_t i = 0; i < nbDims; ++i)
{
if (boolDist(gen))
{
dims0[i] = 1; dims0[i] = 1;
} }
if (boolDist(gen)) {
if (boolDist(gen))
{
dims1[i] = 1; dims1[i] = 1;
} }
dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i]; dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
} }
for(auto dim : dims0)
{
Log::info("Dimension of input 0 : {}", dim);
}
for(auto dim : dims1)
{
Log::info("Dimension of input 1 : {}", dim);
}
// create arrays and fill them with random values // create arrays and fill them with random values
float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]]; float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]]; float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]]; float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
{
array0[i] = valueDist(gen); array0[i] = valueDist(gen);
} }
for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
{
array1[i] = valueDist(gen); array1[i] = valueDist(gen);
} }
// compute true result // compute true result
const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1}; const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1}; const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
for (std::size_t a = 0; a < dimsOut[0]; ++a) {
for (std::size_t b = 0; b < dimsOut[1]; ++b) { for (std::size_t a = 0; a < dimsOut[0]; ++a)
{
for (std::size_t b = 0; b < dimsOut[1]; ++b)
{
const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0) const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+ strides0[1] * ((dims0[1] > 1) ? b : 0); + strides0[1] * ((dims0[1] > 1) ? b : 0);
const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0) const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+ strides1[1] * ((dims1[1] > 1) ? b : 0); + strides1[1] * ((dims1[1] > 1) ? b : 0);
for (std::size_t c = 0; c < dimsOut[2]; ++c) {
for (std::size_t c = 0; c < dimsOut[2]; ++c)
{
const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a)); const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
for (std::size_t d = 0; d < dimsOut[3]; ++d) {
for (std::size_t d = 0; d < dimsOut[3]; ++d)
{
std::size_t idx0 = idx0_0 std::size_t idx0 = idx0_0
+ strides0[2] * ((dims0[2] > 1) ? c : 0) + strides0[2] * ((dims0[2] > 1) ? c : 0)
+ ((dims0[3] > 1) ? d : 0); + ((dims0[3] > 1) ? d : 0);
std::size_t idx1 = idx1_0 std::size_t idx1 = idx1_0
+ strides1[2] * ((dims1[2] > 1) ? c : 0) + strides1[2] * ((dims1[2] > 1) ? c : 0)
+ ((dims1[3] > 1) ? d : 0); + ((dims1[3] > 1) ? d : 0);
result[idx_out + d] = array0[idx0] * array1[idx1]; result[idx_out + d] = array0[idx0] * array1[idx1];
// std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl; // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
} }
......