Add Convert operator (a.k.a. Transmitter)

Merged Olivier BICHLER requested to merge convert into main
11 files  +584  −99
@@ -5,11 +5,23 @@
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/future_std/span.hpp"
#include "aidge/backend/cuda/utils/CudaUtils.hpp"
#include "aidge/backend/cuda/utils/CudaContext.hpp"
namespace Aidge {
template <typename SRC_T, typename DST_T>
void thrust_copy(const SRC_T* /*srcData*/, DST_T* /*dstData*/, size_t /*size*/);
template <typename SRC_T, typename std::enable_if<!std::is_same<half_float::half, SRC_T>::value>::type* = nullptr>
void thrust_copy(const SRC_T* srcData, half_float::half* dstData, size_t size);
template <typename DST_T, typename std::enable_if<!std::is_same<half_float::half, DST_T>::value>::type* = nullptr>
void thrust_copy(const half_float::half* srcData, DST_T* dstData, size_t size);
template <>
void thrust_copy(const half_float::half* srcData, half_float::half* dstData, size_t size);
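
Note (commentary, not part of the diff): the thrust_copy overloads above are declarations only; their definitions presumably live in a .cu translation unit added by this MR. A minimal sketch of what the generic overload could look like, assuming raw device pointers and Thrust's device_ptr/copy for the element-wise cast (the dedicated half_float::half overloads declared above would need their own kernels and are not covered by this form):

#include <thrust/copy.h>
#include <thrust/device_ptr.h>

template <typename SRC_T, typename DST_T>
void Aidge::thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size) {
    // Wrap the raw CUDA pointers so Thrust dispatches to the device backend,
    // then let the copy perform the element-wise SRC_T -> DST_T conversion.
    const thrust::device_ptr<const SRC_T> src(srcData);
    const thrust::device_ptr<DST_T> dst(dstData);
    thrust::copy(src, src + size, dst);
}
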
/**
* @brief Abstract class for the TensorImpl_cuda class template.
* @details Its purpose is to provide access to base methods that are specific
@@ -29,16 +41,31 @@ public:
template <class T>
class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
private:
private:
static T* cudaAlloc(NbElts_t length) {
T* data;
CHECK_CUDA_STATUS(cudaMalloc(reinterpret_cast<void**>(&data), length * sizeof(T)));
return data;
}
static void cudaDelete(T* data) {
// Per the standard, std::unique_ptr never calls its deleter with a null pointer, so no check is needed here
cudaFree(data);
}
private:
const Tensor &mTensor; // Impl needs to access Tensor information, but is not
                       // supposed to change it!
T* mData = nullptr;
/// Pointer to the data and its capacity
future_std::span<T> mData;
/// If this instance own the data, std::unique_ptr manages it
std::unique_ptr<T, decltype(&cudaDelete)> mDataOwner;
mutable cudnnTensorDescriptor_t mCudnnTensor = nullptr;
public:
public:
static constexpr const char *Backend = "cuda";
TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
TensorImpl_cuda(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor), mDataOwner(nullptr, cudaDelete) {}
bool operator==(const TensorImpl &otherImpl) const override final;
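
Aside (commentary, not part of the diff): the storage change above swaps a raw owning T* for a non-owning future_std::span<T> view plus a std::unique_ptr whose custom deleter frees the device allocation. A self-contained sketch of the same ownership pattern, with hypothetical names (DeviceBuffer, grow, adopt) that are not Aidge API:

#include <cuda_runtime.h>
#include <cstddef>
#include <memory>

template <typename T>
struct DeviceBuffer {
    static T* alloc(std::size_t n) {
        T* p = nullptr;
        cudaMalloc(reinterpret_cast<void**>(&p), n * sizeof(T));
        return p;
    }
    static void release(T* p) { cudaFree(p); }  // cudaFree(nullptr) is a no-op

    std::unique_ptr<T, decltype(&release)> owner{nullptr, &release};
    T* view = nullptr;        // non-owning view; may alias memory owned elsewhere
    std::size_t capacity = 0;

    // Mirrors lazyInit(): (re)allocate only when the requested size outgrows capacity.
    void grow(std::size_t n) {
        if (capacity < n) {
            owner.reset(alloc(n));
            view = owner.get();
            capacity = n;
        }
    }

    // Mirrors setRawPtr(): point at external memory without taking ownership.
    void adopt(T* external, std::size_t n) {
        owner.reset();
        view = external;
        capacity = n;
    }
};
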
@@ -47,23 +74,111 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
}
// native interface
const T* data() const { return mData; }
const future_std::span<T>& data() const { return mData; }
std::size_t size() const override { return mData.size(); }
std::size_t scalarSize() const override { return sizeof(T); }
void copy(const void *src, NbElts_t length) override {
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
void setDevice(DeviceIdx_t device) override {
mDevice = device;
}
void *rawPtr() override {
lazyInit(reinterpret_cast<void**>(&mData));
return mData;
void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
void* dst = static_cast<void*>(static_cast<T*>(rawPtr()) + offset);
CHECK_CUDA_STATUS(cudaMemcpy(dst, src, length * sizeof(T), cudaMemcpyDeviceToDevice));
}
void* getRaw(std::size_t idx) {
return static_cast<void*>(static_cast<T*>(rawPtr()) + idx);
void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
if (length == 0) {
return;
}
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
if (srcDt == DataType::Float64) {
thrust_copy(static_cast<const double*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Float32) {
thrust_copy(static_cast<const float*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Float16) {
thrust_copy(static_cast<const half_float::half*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int64) {
thrust_copy(static_cast<const int64_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt64) {
thrust_copy(static_cast<const uint64_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int32) {
thrust_copy(static_cast<const int32_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt32) {
thrust_copy(static_cast<const uint32_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int16) {
thrust_copy(static_cast<const int16_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt16) {
thrust_copy(static_cast<const uint16_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::Int8) {
thrust_copy(static_cast<const int8_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else if (srcDt == DataType::UInt8) {
thrust_copy(static_cast<const uint8_t*>(src),
static_cast<T*>(rawPtr()),
length);
}
else {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
}
}
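
Aside (commentary, not part of the diff): in copyCast() the source buffer is expected to be device-resident memory of a possibly different element type, so each DataType branch only recovers a typed pointer and lets thrust_copy do the cast on the GPU. A stand-alone illustration of what the Float32 branch performs for a double-typed implementation; buffer names and sizes below are hypothetical, and thrust_copy is only forward-declared as in the header above:

#include <cuda_runtime.h>
#include <vector>

namespace Aidge {
template <typename SRC_T, typename DST_T>
void thrust_copy(const SRC_T* srcData, DST_T* dstData, size_t size);
}

void exampleFloat32ToFloat64(const std::vector<float>& host, double* dstDevice) {
    // Stage the source on the device, since the cast itself runs on the GPU.
    float* srcDevice = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&srcDevice), host.size() * sizeof(float));
    cudaMemcpy(srcDevice, host.data(), host.size() * sizeof(float), cudaMemcpyHostToDevice);
    // Same call the Float32 branch makes, here with T = double:
    Aidge::thrust_copy(srcDevice, dstDevice, host.size());
    cudaFree(srcDevice);
}
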
void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyDeviceToDevice));
}
void copyFromHost(const void *src, NbElts_t length) override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
CHECK_CUDA_STATUS(cudaMemcpy(rawPtr(), src, length * sizeof(T), cudaMemcpyHostToDevice));
}
void copyToHost(void *dst, NbElts_t length) const override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
CHECK_CUDA_STATUS(cudaMemcpy(dst, rawPtr(), length * sizeof(T), cudaMemcpyDeviceToHost));
}
void *rawPtr(NbElts_t offset = 0) override {
lazyInit();
return (mData.data() + offset);
};
const void *rawPtr(NbElts_t offset = 0) const override {
AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr");
return (mData.data() + offset);
};
const cudnnTensorDescriptor_t& getCudnnTensorDesc() const override {
if (mCudnnTensor == nullptr) {
CHECK_CUDNN_STATUS(cudnnCreateTensorDescriptor(&mCudnnTensor));
@@ -97,22 +212,25 @@ class TensorImpl_cuda : public TensorImpl, public TensorImpl_cuda_ {
return mCudnnTensor;
}
virtual ~TensorImpl_cuda() {
if (mData != nullptr)
cudaFree(mData);
void setRawPtr(void *ptr, NbElts_t length) override final {
AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity");
mData = future_std::span<T>(static_cast<T *>(ptr), length);
mDataOwner.reset();
};
virtual ~TensorImpl_cuda() {
if (mCudnnTensor != nullptr)
cudnnDestroyTensorDescriptor(mCudnnTensor);
}
void setRawPtr(void* /*ptr*/) override final {
printf("Not implemented yet.");
};
private:
void lazyInit(void** data) {
if (*data == nullptr)
CHECK_CUDA_STATUS(cudaMalloc(data, mTensor.size() * sizeof(T)));
private:
void lazyInit() {
if (mData.size() < mTensor.size()) {
// Need more data, a re-allocation will occur
AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
mDataOwner.reset(cudaAlloc(mTensor.size()));
mData = future_std::span<T>(mDataOwner.get(), mTensor.size());
}
}
};
@@ -121,6 +239,8 @@ static Registrar<Tensor> registrarTensorImpl_cuda_Float64(
{"cuda", DataType::Float64}, Aidge::TensorImpl_cuda<double>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Float32(
{"cuda", DataType::Float32}, Aidge::TensorImpl_cuda<float>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Float16(
{"cuda", DataType::Float16}, Aidge::TensorImpl_cuda<half_float::half>::create);
static Registrar<Tensor> registrarTensorImpl_cuda_Int32(
{"cuda", DataType::Int32}, Aidge::TensorImpl_cuda<int>::create);
} // namespace
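
Aside (commentary, not part of the diff): the new Float16 registrar makes the half-precision implementation reachable under the {"cuda", DataType::Float16} key. A hypothetical lookup sketch, assuming Registrar<Tensor>::create() returns the factory stored under a {backend, data type} key (the mechanism Tensor::setBackend goes through); makeFp16CudaImpl is an illustrative helper, not Aidge API:

#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"

static auto makeFp16CudaImpl(const Aidge::Tensor& tensor) {
    // Resolve the factory registered above and build an implementation bound to 'tensor'.
    auto factory = Aidge::Registrar<Aidge::Tensor>::create({"cuda", Aidge::DataType::Float16});
    return factory(tensor);  // expected to yield a TensorImpl_cuda<half_float::half>
}
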