Add Convert operator (a.k.a. Transmitter)

Merged Olivier BICHLER requested to merge convert into main
11 files  +584  −99
@@ -5,11 +5,23 @@
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/future_std/span.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
namespace Aidge {
template <typename SRC_T, typename DST_T>
void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/);
template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr>
void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size);
template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr>
void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size);
template <>
void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size);
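
Note (commentary, not part of the diff): the thrust_copy overloads above are declarations only; their definitions presumably live in a .cu translation unit added by this MR. A minimal sketch of what the generic overload could look like, assuming raw device pointers and Thrust's device_ptr/copy for the element-wise cast (the dedicated half_float::half overloads declared above would need their own kernels and are not covered by this form):

#include <thrust/copy.h>
#include <thrust/device_ptr.h>

template <typename SRC_T, typename DST_T>
void Aidge::thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size) {
    // Wrap the raw CUDA pointers so Thrust dispatches to the device backend,
    // then let the copy perform the element-wise SRC_T -> DST_T conversion.
    const thrust::device_ptr<const SRC_T> src(srcData);
    const thrust::device_ptr<DST_T> dst(dstData);
    thrust::copy(src, src + size, dst);
}
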
/**
* @brief Abstract class for the TensorImpl_cuda class template.
* @details Its purpose is to provide access to base methods that are specific
@@ -29,16 +41,31 @@ public:
template <class T>
class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
private:
private:
static T* cudaAlloc(NbElts_t length) {
T* data;
CHECK_CUDA_STATUS(cudaMalloc(reinterpret_cast<void**>(&data), length * sizeof(T)));
return data;
}
static void cudaDelete(T* data) {
// Per the standard, std::unique_ptr never calls its deleter with a null pointer, so no check is needed here
cudaFree(data);
}
private:
const Tensor &mTensor; // Impl needs to access Tensor information, but is not
                       // supposed to change it!
T* mData = nullptr;
/// Pointer to the data and its capacity
future_std::span<T> mData;
/// If this instance own the data, std::unique_ptr manages it
std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner;
mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;
public:
public:
static constexpr const char *Backend = "cuda";
TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor), mDataOwner(nullptr, cudaDelete) {}
bool operator==(const TensorImpl &otherImpl) const override final;
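
Aside (commentary, not part of the diff): the storage change above swaps a raw owning T* for a non-owning future_std::span<T> view plus a std::unique_ptr whose custom deleter frees the device allocation. A self-contained sketch of the same ownership pattern, with hypothetical names (DeviceBuffer, grow, adopt) that are not Aidge API:

#include <cuda_runtime.h>
#include <cstddef>
#include <memory>

template <typename T>
struct DeviceBuffer {
    static T* alloc(std::size_t n) {
        T* p = nullptr;
        cudaMalloc(reinterpret_cast<void**>(&p), n * sizeof(T));
        return p;
    }
    static void release(T* p) { cudaFree(p); }  // cudaFree(nullptr) is a no-op

    std::unique_ptr<T, decltype(&release)> owner{nullptr, &release};
    T* view = nullptr;        // non-owning view; may alias memory owned elsewhere
    std::size_t capacity = 0;

    // Mirrors lazyInit(): (re)allocate only when the requested size outgrows capacity.
    void grow(std::size_t n) {
        if (capacity < n) {
            owner.reset(alloc(n));
            view = owner.get();
            capacity = n;
        }
    }

    // Mirrors setRawPtr(): point at external memory without taking ownership.
    void adopt(T* external, std::size_t n) {
        owner.reset();
        view = external;
        capacity = n;
    }
};
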
@@ -47,23 +74,111 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
}
// native interface
const T* data() const { return mData; }
const future_std::span<T>& data() const { return mData; }
std::size_t size() const override { return mData.size(); }
std::size_t scalarSize() const override { return sizeof(T); }
void copy(const void *src, NbElts_t length) override {
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
void setDevice(DeviceIdx_t device) override {
mDevice = device;
}
void *rawPtr() override {
lazyInit(reinterpret_cast<void**>(&mData));
return mData;
void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
void* dst = static_cast<void*>(static_cast<T*>(rawPtr()) + offset);
CHECK_CUDA_STATUS(cudaMemcpy(dst, src, length * sizeof(T), cudaMemcpyDeviceToDevice));
}
void* getRaw(std::size_t idx) {
return static_cast<void*>(static_cast<T*>(rawPtr()) + idx);
void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
if (length == 0) {
return;
}
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
if (srcDt == DataType::Float64) {
thrust_copy(static_cast<const double*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Float32) {
thrust_copy(static_cast<const float*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Float16) {
thrust_copy(static_cast<const half_float::half*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int64) {
thrust_copy(static_cast<const int64_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt64) {
thrust_copy(static_cast<const uint64_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int32) {
thrust_copy(static_cast<const int32_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt32) {
thrust_copy(static_cast<const uint32_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int16) {
thrust_copy(static_cast<const int16_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt16) {
thrust_copy(static_cast<const uint16_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int8) {
thrust_copy(static_cast<const int8_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt8) {
thrust_copy(static_cast<const uint8_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
}
}
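
Aside (commentary, not part of the diff): in copyCast() the source buffer is expected to be device-resident memory of a possibly different element type, so each DataType branch only recovers a typed pointer and lets thrust_copy do the cast on the GPU. A stand-alone illustration of what the Float32 branch performs for a double-typed implementation; buffer names and sizes below are hypothetical, and thrust_copy is only forward-declared as in the header above:

#include <cuda_runtime.h>
#include <vector>

namespace Aidge {
template <typename SRC_T, typename DST_T>
void thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size);
}

void exampleFloat32ToFloat64(const std::vector<float>& host, double* dstDevice) {
    // Stage the source on the device, since the cast itself runs on the GPU.
    float* srcDevice = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&srcDevice), host.size() * sizeof(float));
    cudaMemcpy(srcDevice, host.data(), host.size() * sizeof(float), cudaMemcpyHostToDevice);
    // Same call the Float32 branch makes, here with T = double:
    Aidge::thrust_copy(srcDevice, dstDevice, host.size());
    cudaFree(srcDevice);
}
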
void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
}
void copyFromHost(const void *src, NbElts_t length) override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
}
void copyToHost(void *dst, NbElts_t length) const override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(), length * sizeof(T), cudaMemcpyDeviceToHost));
}
void *rawPtr(NbElts_t offset = 0) override {
lazyInit();
return (mData.data() + offset);
};
const void *rawPtr(NbElts_t offset = 0) const override {
AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr");
return (mData.data() + offset);
};
const cudnnTensorDescriptor_t& getCudnnTensorDesc() const override {
if (mCudnnTensor == nullptr) {
CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor));
@@ -97,22 +212,25 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
return mCudnnTensor;
}
virtual ~TensorImpl_cuda() {
if (mData != nullptr)
cudaFree(mData);
void setRawPtr(void *ptr, NbElts_t length) override final {
AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity");
mData = future_std::span<T>(static_cast<T *>(ptr), length);
mDataOwner.reset();
};
virtual ~TensorImpl_cuda() {
if (mCudnnTensor != nullptr)
cudnnDestroyTensorDescriptor(mCudnnTensor);
}
void setRawPtr(void* /*ptr*/) override final {
printf("Not implemented yet.");
};
private:
void lazyInit(void** data) {
if (*data == nullptr)
CHECK_CUDA_STATUS(cudaMalloc(data, mTensor.size() * sizeof(T)));
private:
void lazyInit() {
if (mData.size() < mTensor.size()) {
// Need more data, a re-allocation will occur
AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
mDataOwner.reset(cudaAlloc(mTensor.size()));
mData = future_std::span<T>(mDataOwner.get(), mTensor.size());
}
}
};
@@ -121,6 +239,8 @@ static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
{"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Float32(
{"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Float16(
{"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Int32(
{"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int>::create);
} // namespace
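
Aside (commentary, not part of the diff): the new Float16 registrar makes the half-precision implementation reachable under the {"cuda", DataType::Float16} key. A hypothetical lookup sketch, assuming Registrar<Tensor>::create() returns the factory stored under a {backend, data type} key (the mechanism Tensor::setBackend goes through); makeFp16CudaImpl is an illustrative helper, not Aidge API:

#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"

static auto makeFp16CudaImpl(const Aidge::Tensor& tensor) {
    // Resolve the factory registered above and build an implementation bound to 'tensor'.
    auto factory = Aidge::Registrar<Aidge::Tensor>::create({"cuda", Aidge::DataType::Float16});
    return factory(tensor);  // expected to yield a TensorImpl_cuda<half_float::half>
}
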