Commit ae5f86da authored by Maxence Naud

perf: update 'And' forward kernel

parent 4c8412cc
This commit is part of merge request !40.
@@ -23,7 +23,7 @@
namespace Aidge {
// Operator implementation entry point for the backend
using AndImpl_cpu = OperatorImpl_cpu<And_Op,
-void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
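// Note: the first two dimension vectors are now taken by value because the kernel pads the
// shorter one with leading 1s before broadcasting.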
// Implementation entry point registration to Operator
REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
......
@@ -12,52 +12,153 @@
#ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/AndImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
-template <class I1, class I2, class O>
-void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-const std::vector<std::size_t>& input2Dims,
namespace {
// assumes values are contiguous in memory
template <class I, class O>
void equal_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
}
}
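// Illustration only: equal_contiguous_arrays<float, float>(3, 1, 3, in0, in1, out) with
// in0 = {1, 2, 3} and in1 = {2} writes out = {0, 1, 0}; the size-1 input is read at index 0
// for every output element, which is how a fully-broadcast (scalar) chunk is handled.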
}
template <class I, class O>
void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
-const void* input2_,
void* output_) {
-const I1* input_1 = static_cast<const I1*>(input1_);
-const I2* input_2 = static_cast<const I2*>(input2_);
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
-size_t totalElements = 1;
-for (size_t dimSize : outputDims) {
-totalElements *= dimSize;
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
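// Worked trace of the example above (values follow directly from the code below):
//   dims0 = [5,2,1,7], dims1 = [2,6,7] -> dims1 is padded to [1,2,6,7]
//   scanning from the last dimension: 7 == 7 matches, then 1 != 6 -> contiguousIdx = 3
//   contiguous chunk sizes: input0 = 7, input1 = 7, output ([5,2,6,7]) = 7
//   number of chunks to process: nbStacks = 5*2*6 = 60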
// special case: when both inputs have identical dimensions, compare the whole contiguous arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] == input_1[i]);
}
return;
}
-for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-{
-std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
-std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
const std::size_t nbDims = dims0.size();
-output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims - 1)) { // the last dimensions of one of the input Tensors are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
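// From here on, dims0[i] == dims1[i] for every i >= contiguousIdx, except in the trailing
// size-1 case handled above; in that case one input contributes a contiguous chunk of size 1,
// which the size-1 test in equal_contiguous_arrays() broadcasts over the whole output chunk.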
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
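// For the [5,2,1,7] & [1,2,6,7] example: stride_post0 = [2,1,1] and stride_post1 = [12,6,1],
// hence stride_step0 = [1,1,0] and stride_step1 = [-11,1,1]. In the loop below, a step of 1
// advances that input by one chunk, 0 re-reads the same chunk (broadcast along that dim), and
// a negative step rewinds the accumulated inner advances when an outer size-1 dimension of
// that input moves on.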
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
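// Illustration only (not part of the operator): a minimal sketch of calling the kernel
// directly, assuming float inputs and a float output buffer; the second input is broadcast
// over the first dimension.
inline void equal_kernel_usage_sketch() {
    const float in0[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // shape {2, 3}
    const float in1[3] = {1.f, 0.f, 3.f};                  // shape {3}
    float out[6] = {};                                     // shape {2, 3}
    EqualImpl_cpu_forward_kernel<float, float>({2, 3}, {3}, {2, 3}, in0, in1, out);
    // out now holds {1, 0, 1, 0, 0, 0}
}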
// Kernels registration to implementation entry point
REGISTRAR(AndImpl_cpu,
{DataType::Float32},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Float64},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int32},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
REGISTRAR(AndImpl_cpu,
{DataType::Int64},
-{ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
{ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
@@ -25,22 +25,34 @@
template <>
void Aidge::AndImpl_cpu::forward() {
-const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
const And_Op& op = static_cast<const And_Op&>(mOp);
// Check inputs
AIDGE_ASSERT(op.getInput(0), "missing input in And operator");
AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation.");
AIDGE_ASSERT(op.getInput(1), "missing input in And operator");
AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run And forward because the 1st input has no implementation.");
AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot And inputs with two differents data type.");
// Find the correct kernel type
const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec()));
// Call kernel
-impl.forward(inputDims0,
-inputDims1,
-std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-getCPUPtr(mOp.getRawInput(0)),
-getCPUPtr(mOp.getRawInput(1)),
-getCPUPtr(mOp.getRawOutput(0)));
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::shared_ptr<Tensor> input0Fallback, input1Fallback;
const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
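// refCastFrom() is expected to return the input tensor itself when it already has the
// requested data type, and otherwise a converted copy stored in the matching *Fallback tensor.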
impl.forward(op.getInput(0)->dims(),
op.getInput(1)->dims(),
op.getOutput(0)->dims(),
input0.getImpl()->rawPtr(),
input1.getImpl()->rawPtr(),
getCPUPtr(op.getRawOutput(0)));
}
template <>
......