Skip to content
Snippets Groups Projects
Commit 58c5ec69 authored by Thibault Allenet's avatar Thibault Allenet Committed by Maxence Naud
Browse files

Add WeightInterleaving Operator CPU Implementation

parent 729b412a
No related branches found
No related tags found
1 merge request!132[UPD] version 0.4.1 -> 0.5.0
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
#include "aidge/backend/cpu/operator/SubImpl.hpp"
#include "aidge/backend/cpu/operator/TanhImpl.hpp"
#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
#include "aidge/backend/cpu/data/TensorImpl.hpp"
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_
#include <array>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/WeightInterleaving.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend.
// Forward kernel signature (see the .cpp caller for the argument values):
//   (input_interleaving  : number of consecutive elements packed together,
//    nb_interleaving     : number of independent segments to pack,
//    output_interleaving : compacted size (in bytes) of one segment,
//    input pointer, output pointer)
using WeightInterleavingImpl_cpu = OperatorImpl_cpu<WeightInterleaving_Op,
    void(const DimSize_t,
        const DimSize_t,
        const DimSize_t,
        const void *,
        void *)>;
// Implementation entry point registration to Operator
REGISTRAR(WeightInterleaving_Op, "cpu", Aidge::WeightInterleavingImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_H_ */
#ifndef AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_
#include <algorithm>
#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
/**
 * @brief Compacts 8-bit data into a smaller bit-width representation.
 *
 * This function takes an array of 8-bit data and packs it into smaller chunks
 * based on the specified bit-width `nb_bits`. Each element of `compactData`
 * stores several `nb_bits`-wide segments extracted from `data`, the first
 * input element landing in the most-significant slot of each output byte.
 *
 * Fixes vs. the previous version:
 *  - marked `inline`: this is a non-template function defined in a header, so
 *    without `inline` every translation unit including it would emit its own
 *    definition (ODR violation at link time);
 *  - the packing accumulator is now `std::uint8_t`: left-shifting a signed
 *    value whose sign bit is set is undefined behaviour before C++20, and the
 *    previous `std::int8_t` accumulator could reach that state. The produced
 *    bit patterns are unchanged.
 *
 * @param data The input array of 8-bit values to be compacted.
 * @param dataSize The size of the input `data` array.
 * @param compactData The output array storing the compacted data.
 * @param nb_bits The number of bits to extract from each `data` element (1 to 4).
 */
inline void compact_data(const std::int8_t* data, std::size_t dataSize, std::int8_t* compactData, std::uint8_t nb_bits) {
    AIDGE_ASSERT(nb_bits > 0 && nb_bits < 5, "Cannot compact with the given nb_bits"); // Ensure valid bit width

    // Mask to extract `nb_bits` from each data element
    const unsigned int mask = (1U << nb_bits) - 1;

    // Number of `nb_bits` segments that fit into an 8-bit compacted value:
    // 2 slots for nb_bits=3 or 4, 4 slots for nb_bits=2, 8 slots for nb_bits=1.
    const unsigned int nbSlot = 8 / nb_bits;

    // Distance (in bits) between consecutive slots inside one output byte:
    // 4 for nb_bits=3 or 4 (a padding bit separates 3-bit slots), 2 for
    // nb_bits=2, 1 for nb_bits=1.
    const std::uint8_t shift = 8 / nbSlot;

    const unsigned int nbFullCompactbytes = dataSize / nbSlot;

    // Main loop: pack full groups of `nbSlot` input elements per output byte.
    for (std::size_t i = 0; i < nbFullCompactbytes; ++i) {
        std::uint8_t compact = 0;
        for (unsigned int j = 0; j < nbSlot; ++j) {
            compact |= static_cast<std::uint8_t>(data[i * nbSlot + j] & mask); // Keep `nb_bits` only
            // Shift only if not on the last slot, to make room for the next `nb_bits`
            if (j < nbSlot - 1) {
                compact = static_cast<std::uint8_t>(compact << shift);
            }
        }
        // Store the compacted value in the output array
        compactData[i] = static_cast<std::int8_t>(compact);
    }

    // Handle any remaining elements (dataSize not a multiple of nbSlot):
    // they occupy the high-order slots of one extra byte, low slots are zero.
    const std::size_t remaining = dataSize % nbSlot;
    if (remaining != 0) {
        std::uint8_t compact = 0;
        for (std::size_t j = 0; j < remaining; ++j) {
            compact |= static_cast<std::uint8_t>(data[nbFullCompactbytes * nbSlot + j] & mask);
            if (j < remaining - 1) {
                compact = static_cast<std::uint8_t>(compact << shift);
            }
        }
        // Left-align the partial group into the unused low slots.
        compact = static_cast<std::uint8_t>(compact << (shift * (nbSlot - remaining)));
        // Store the last compacted value
        compactData[nbFullCompactbytes] = static_cast<std::int8_t>(compact);
    }
}
/**
 * @brief Forward kernel: interleaves (packs) low-bit weights segment by segment.
 *
 * The input buffer is viewed as `nb_interleaving` consecutive segments of
 * `input_interleaving` elements each; every segment is compacted independently
 * into `output_interleaving` bytes of the output buffer via compact_data().
 *
 * @tparam I input element type (int8_t storage in the registered kernels)
 * @tparam O output element type (int8_t storage in the registered kernels)
 * @tparam nb_bits number of significant bits per input element
 */
template <class I, class O, int nb_bits>
void WeightInterleavingImpl_cpu_forward_kernel(const DimSize_t input_interleaving,
                                               const DimSize_t nb_interleaving,
                                               const DimSize_t output_interleaving,
                                               const void* input_,
                                               void* output_) {
    const I* src = static_cast<const I*>(input_);
    O* dst = static_cast<O*>(output_);
    // Advance both cursors one segment at a time.
    for (std::size_t segment = 0; segment < nb_interleaving; ++segment) {
        compact_data(src, input_interleaving, dst, static_cast<std::uint8_t>(nb_bits));
        src += input_interleaving;
        dst += output_interleaving;
    }
}
// Kernel registrations: one packing kernel per supported low-precision input
// DataType (Int4, Int3, Int2), all stored as int8_t and expecting NHWC data
// so that the packed (last) dimension is contiguous in memory.
// The trailing nullptr is the backward kernel slot — backward is unsupported.
REGISTRAR(WeightInterleavingImpl_cpu,
    {ImplSpec::IOSpec{DataType::Int4, DataFormat::NHWC}},
    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 4>, nullptr});
REGISTRAR(WeightInterleavingImpl_cpu,
    {ImplSpec::IOSpec{DataType::Int3, DataFormat::NHWC}},
    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 3>, nullptr});
REGISTRAR(WeightInterleavingImpl_cpu,
    {ImplSpec::IOSpec{DataType::Int2, DataFormat::NHWC}},
    {ProdConso::defaultModel, Aidge::WeightInterleavingImpl_cpu_forward_kernel<int8_t, int8_t, 2>, nullptr});
}
#endif /* AIDGE_CPU_OPERATOR_WEIGHTINTERLEAVINGIMPL_KERNELS_H_ */
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/operator/WeightInterleavingImpl.hpp"

#include <cstddef>     // std::size_t
#include <functional>  // std::multiplies
#include <memory>
#include <numeric>     // std::accumulate
#include <tuple>

#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/WeightInterleavingImpl_kernels.hpp"
#include "aidge/operator/WeightInterleaving.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Types.h"
/**
 * @brief Runs the CPU weight-interleaving kernel on input #0.
 *
 * Casts the input to the output's type/backend if needed, derives the packing
 * geometry from the tensor shapes, then dispatches to the best-matching
 * registered kernel.
 */
template <>
void Aidge::WeightInterleavingImpl_cpu::forward()
{
    const WeightInterleaving_Op& op_ = dynamic_cast<const WeightInterleaving_Op&>(mOp);
    AIDGE_ASSERT(op_.getInput(0), "missing input #0");

    // Pick the kernel best matching the required implementation spec.
    const auto impl = Registrar<WeightInterleavingImpl_cpu>::create(getBestMatch(getRequiredSpec()));

    // Convert input data (no overhead if not needed!)
    // TODO: right now, if needed, memory will be allocated/deallocated at each
    // call to forward(). We might put the following shared_ptr as members of
    // this class to avoid that.
    std::shared_ptr<Tensor> input0Fallback;
    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));

    const auto& inDims = input0.dims();

    // Number of consecutive input elements compacted together: the innermost
    // dimension (cf STM32 low bit kernels).
    const std::size_t inputInterleaving = inDims.back();
    // Compacted size of that dimension; forwardDims() already resized the output.
    const std::size_t outputInterleaving = op_.getOutput(0)->dims().back();

    // Number of independent segments to compact: product of every leading
    // dimension, which is 1 when the weight tensor has a single dimension.
    std::size_t nbInterleaving = 1;
    for (std::size_t d = 0; d + 1 < inDims.size(); ++d) {
        nbInterleaving *= inDims[d];
    }

    impl.forward(inputInterleaving,
        nbInterleaving,
        outputInterleaving,
        input0.getImpl()->rawPtr(),
        getCPUPtr(mOp.getRawOutput(0)));
}
// Weight interleaving is a data-layout transformation (used for low-bit
// inference/export); no gradient is defined for it, so backward always throws.
template <>
void Aidge::WeightInterleavingImpl_cpu::backward() {
    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for WeightInterleaving_Op on backend cpu");
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/WeightInterleaving.hpp"
#include "aidge/backend/cpu.hpp"
#include <memory>
using namespace Aidge;
TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
    // Covers both the static size computation (compactDataSize) and the actual
    // bit-packing performed by the CPU forward kernel for 4/3/2-bit weights.
    std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
    auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
    SECTION("CompactDataSize - Single element cases") {
        REQUIRE(opWeightInterleaving->compactDataSize(1, 1) == 1); // 1 bit, needs 1 byte
        REQUIRE(opWeightInterleaving->compactDataSize(1, 7) == 1); // 7 bits, needs 1 byte
    }
    SECTION("CompactDataSize - Boundary cases for different nb_bits values") {
        REQUIRE(opWeightInterleaving->compactDataSize(8, 1) == 1); // 8 elements at 1 bit each, fits in 1 byte
        REQUIRE(opWeightInterleaving->compactDataSize(8, 2) == 2); // 8 elements at 2 bits each, needs 2 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(8, 3) == 4); // 8 elements at 3 bits each (2 per byte), needs 4 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(8, 4) == 4); // 8 elements at 4 bits each, needs 4 bytes
    }
    SECTION("CompactDataSize - Larger dataSize values") {
        REQUIRE(opWeightInterleaving->compactDataSize(16, 1) == 2); // 16 elements at 1 bit each, fits in 2 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(16, 2) == 4); // 16 elements at 2 bits each, needs 4 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(16, 3) == 8); // 16 elements at 3 bits each (2 per byte), needs 8 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(16, 4) == 8); // 16 elements at 4 bits each, needs 8 bytes
    }
    SECTION("CompactDataSize - Odd dataSize values with varying nb_bits") {
        REQUIRE(opWeightInterleaving->compactDataSize(7, 1) == 1); // 7 elements at 1 bit each, fits in 1 byte
        REQUIRE(opWeightInterleaving->compactDataSize(7, 2) == 2); // 7 elements at 2 bits each, needs 2 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(7, 3) == 4); // 7 elements at 3 bits each, needs 4 bytes
        REQUIRE(opWeightInterleaving->compactDataSize(7, 4) == 4); // 7 elements at 4 bits each, needs 4 bytes
    }
    SECTION("CompactDataSize - Minimum and maximum values for nb_bits") {
        REQUIRE(opWeightInterleaving->compactDataSize(5, 1) == 1); // 5 elements at 1 bit each, fits in 1 byte
    }
    SECTION("CompactDataSize - Edge Case - dataSize of 0 should result in 0 required size") {
        REQUIRE(opWeightInterleaving->compactDataSize(0, 1) == 0); // No data elements
    }
    SECTION("CompactData - 4-bit compaction") {
        // Two 4-bit nibbles per output byte: {0x_F, 0x_5, 0x_3, 0x_C} -> {0xF5, 0x3C}
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
            {static_cast<std::int8_t>(0x0F),
             static_cast<std::int8_t>(0xF5),
             static_cast<std::int8_t>(0xB3),
             static_cast<std::int8_t>(0x9C)}
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
            {static_cast<int8_t>(0xF5),
             static_cast<int8_t>(0x3C)}
        });
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int4);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
    SECTION("CompactData - 3-bit compaction") {
        // Two 3-bit values per output byte, each aligned on a 4-bit slot
        // (one padding bit per slot): low 3 bits of each input are kept.
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
            {static_cast<int8_t>(0x0F),
             static_cast<int8_t>(0x05),
             static_cast<int8_t>(0x04),
             static_cast<int8_t>(0xD3)}
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int3);
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
            {static_cast<int8_t>(0x75),
             static_cast<int8_t>(0x43)}
        });
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int3);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int3);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
    SECTION("CompactData - 2-bit compaction") {
        // Four 2-bit values per output byte: {3,2,1,0} -> 0b11'10'01'00 = 0xE4
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
            {static_cast<std::int8_t>(0x03),
             static_cast<std::int8_t>(0x02),
             static_cast<std::int8_t>(0x01),
             static_cast<std::int8_t>(0x00)}
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int2);
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
            {static_cast<int8_t>(0xE4)}
        });
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int2);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int2);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
    SECTION("CompactData - Edge Cases - Single element data") {
        // A lone value is left-aligned into the high nibble, low nibble zeroed.
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
            {static_cast<int8_t>(0x0F)}
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
            {static_cast<int8_t>(0xF0)}
        });
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int4);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") {
        // Trailing element occupies the high slot of the final byte.
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
            {static_cast<int8_t>(0x0F),
             static_cast<int8_t>(0xA5),
             static_cast<int8_t>(0x34)}
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
            {static_cast<int8_t>(0xF5),
             static_cast<int8_t>(0x40)}
        });
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int4);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
    SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") {
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
            {static_cast<int8_t>(0x0F),
             static_cast<int8_t>(0x05),
             static_cast<int8_t>(0x04)}
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int3);
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
            {static_cast<int8_t>(0x75),
             static_cast<int8_t>(0x40)}
        });
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int3);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int3);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
    SECTION("Forward Op - Convolution weight interleaving") {
        // End-to-end check on a realistic NHWC convolution weight tensor: the
        // innermost (channel) dimension of 4 int4 values packs into 2 bytes.
        // Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
        std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
            {
                {
                    {
                        {-6, 0, 5, -8}, // 'A' '0' '5' '8' in hexadecimal format
                        { 5, 5, 4, -5}, // '5' '5' '4' 'B' in hexadecimal format
                        {-7, -1, 4, -7} // '9' 'F' '4' '9' in hexadecimal format
                    },
                    {
                        { 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
                        { 1, 3, 1, -1}, // '1' '3' '1' 'F' in hexadecimal format
                        { 7, -3, -1, 4} // '7' 'D' 'F' '4' in hexadecimal format
                    },
                    {
                        {-1, 3, 5, 6}, // 'F' '3' '5' '6' in hexadecimal format
                        {-8, 4, 7, 1}, // '8' '4' '7' '1' in hexadecimal format
                        {-5, 0, -1, -2} // 'B' '0' 'F' 'E' in hexadecimal format
                    }
                },
                {
                    {
                        { 2, -7, 7, -4}, // '2' '9' '7' 'C' in hexadecimal format
                        {-7, 3, 0, 2}, // '9' '3' '0' '2' in hexadecimal format
                        { 1, -1, 2, 3} // '1' 'F' '2' '3' in hexadecimal format
                    },
                    {
                        {-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
                        {-8, 3, 5, -1}, // '8' '3' '5' 'F' in hexadecimal format
                        {-7, -4, -6, -1} // '9' 'C' 'A' 'F' in hexadecimal format
                    },
                    {
                        { 1, 7, 5, -1}, // '1' '7' '5' 'F' in hexadecimal format
                        { 1, -8, 1, 2}, // '1' '8' '1' '2' in hexadecimal format
                        {-1, -6, -3, 0} // 'F' 'A' 'D' '0' in hexadecimal format
                    }
                }
            }
        });
        std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
            {
                {
                    {
                        {static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
                        {static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
                        {static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)} // '9' 'F' '4' '9' in hexadecimal format
                    },
                    {
                        {static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
                        {static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
                        {static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)} // '7' 'D' 'F' '4' in hexadecimal format
                    },
                    {
                        {static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
                        {static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
                        {static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)} // 'B' '0' 'F' 'E' in hexadecimal format
                    }
                },
                {
                    {
                        {static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
                        {static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
                        {static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)} // '1' 'F' '2' '3' in hexadecimal format
                    },
                    {
                        {static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
                        {static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
                        {static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)} // '9' 'C' 'A' 'F' in hexadecimal format
                    },
                    {
                        {static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
                        {static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
                        {static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)} // 'F' 'A' 'D' '0' in hexadecimal format
                    }
                }
            }
        });
        weight->setDataFormat(Aidge::DataFormat::NHWC);
        weight->setDataType(Aidge::DataType::Int4);
        expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
        expectedWeightInterleaving->setDataType(Aidge::DataType::Int4);
        std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
        auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
        op->associateInput(0,weight);
        op->setDataType(DataType::Int4);
        op->setDataFormat(DataFormat::NHWC);
        op->setBackend("cpu");
        myWeightInterleavingNode->forward();
        REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment