Skip to content
Snippets Groups Projects
Commit 45e65853 authored by Olivier BICHLER's avatar Olivier BICHLER
Browse files

Replaced cudaMalloc with thrust::device_vector

parent 2ddf435e
No related branches found
No related tags found
2 merge requests: !79 "0.6.1", !76 "Implement BitShift in CUDA"
......@@ -9,6 +9,8 @@
*
********************************************************************************/
#include <thrust/device_vector.h>
#include "aidge/backend/cuda/operator/DivImpl_CUDA_kernels.hpp"
......@@ -28,8 +30,8 @@ __device__ half div<half>(half a, half b) {
template <class T>
__global__ void divKernel(const T* input1, T* output, const T* input2,
int* input1_shape, int* input2_shape, int* output_shape,
int* input1_strides, int* input2_strides, int* output_strides,
const int* input1_shape, const int* input2_shape, const int* output_shape,
const int* input1_strides, const int* input2_strides, const int* output_strides,
int num_dims, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
......@@ -51,37 +53,22 @@ void Aidge::divForward(const T* input1, T* output, const T* input2,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
int outSize)
{
    // Broadcasting metadata (shapes & strides) must live in device memory for
    // the kernel to read. thrust::device_vector performs the allocation and the
    // host-to-device copy in its constructor and frees the storage on scope
    // exit (RAII), replacing the former manual cudaMalloc/cudaMemcpy/cudaFree
    // chain (which also sized every copy with input1Dims.size()).
    const thrust::device_vector<int> d_input1_shape = input1Dims;
    const thrust::device_vector<int> d_input2_shape = input2Dims;
    const thrust::device_vector<int> d_output_shape = outputDims;
    const thrust::device_vector<int> d_input1_strides = input1Strides;
    const thrust::device_vector<int> d_input2_strides = input2Strides;
    const thrust::device_vector<int> d_output_strides = outputStrides;

    // One thread per output element; ceil-division covers the tail block.
    const int blockSize = 256;
    const int numBlocks = (outSize + blockSize - 1) / blockSize;

    const int num_dims = input1Dims.size();

    // Launch the element-wise (broadcasted) division kernel.
    divKernel<<<numBlocks, blockSize>>>(input1, output, input2,
        thrust::raw_pointer_cast(d_input1_shape.data()),
        thrust::raw_pointer_cast(d_input2_shape.data()),
        thrust::raw_pointer_cast(d_output_shape.data()),
        thrust::raw_pointer_cast(d_input1_strides.data()),
        thrust::raw_pointer_cast(d_input2_strides.data()),
        thrust::raw_pointer_cast(d_output_strides.data()),
        num_dims, outSize);
    // Kernel launches are asynchronous: surface invalid-configuration errors now.
    CHECK_CUDA_STATUS(cudaGetLastError());
    // NOTE(review): the device_vectors are destroyed (cudaFree) on return while
    // the kernel may still be in flight; cudaFree synchronizes with pending
    // work on the device, but confirm this if non-default streams are ever used.
}
template void Aidge::divForward<double>(const double* input1, double* output, const double* input2,
......
......@@ -10,6 +10,7 @@
********************************************************************************/
#include <cuda_fp16.h>
#include <thrust/device_vector.h>
#include "aidge/backend/cuda/operator/EqualImpl_CUDA_kernels.hpp"
......@@ -25,8 +26,8 @@ __device__ bool compareE<half>(half a, half b) {
template <typename T>
__global__ void equal_cuda_Kernel(const T* input1, const T* input2, T* output,
int* input1_shape, int* input2_shape,
int* input1_strides, int* input2_strides, int* output_strides,
const int* input1_shape, const int* input2_shape,
const int* input1_strides, const int* input2_strides, const int* output_strides,
int num_dims, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
......@@ -49,34 +50,21 @@ void Aidge::EqualForward(const T* input1, const T* input2, T* output,
const std::vector<int>& input1Strides, const std::vector<int>& input2Strides,const std::vector<int>& outputStrides,
int outSize)
{
    // Broadcasting metadata (shapes & strides) must live in device memory for
    // the kernel to read. thrust::device_vector allocates, copies host-to-device
    // in its constructor, and frees on scope exit (RAII), replacing the former
    // manual cudaMalloc/cudaMemcpy/cudaFree chain (which also sized every copy
    // with input1Dims.size()). Note: this op passes no output shape, only strides.
    const thrust::device_vector<int> d_input1_shape = input1Dims;
    const thrust::device_vector<int> d_input2_shape = input2Dims;
    const thrust::device_vector<int> d_input1_strides = input1Strides;
    const thrust::device_vector<int> d_input2_strides = input2Strides;
    const thrust::device_vector<int> d_output_strides = outputStrides;

    // One thread per output element; ceil-division covers the tail block.
    const int blockSize = 256;
    const int numBlocks = (outSize + blockSize - 1) / blockSize;

    const int num_dims = input1Dims.size();

    // Launch the element-wise (broadcasted) equality-comparison kernel.
    equal_cuda_Kernel<<<numBlocks, blockSize>>>(input1, input2, output,
        thrust::raw_pointer_cast(d_input1_shape.data()),
        thrust::raw_pointer_cast(d_input2_shape.data()),
        thrust::raw_pointer_cast(d_input1_strides.data()),
        thrust::raw_pointer_cast(d_input2_strides.data()),
        thrust::raw_pointer_cast(d_output_strides.data()),
        num_dims, outSize);
    // Kernel launches are asynchronous: surface invalid-configuration errors now.
    CHECK_CUDA_STATUS(cudaGetLastError());
    // NOTE(review): the device_vectors are destroyed (cudaFree) on return while
    // the kernel may still be in flight; cudaFree synchronizes with pending
    // work on the device, but confirm this if non-default streams are ever used.
}
template void Aidge::EqualForward(const double* input1, const double* input2, double* output,
......
......@@ -10,6 +10,7 @@
********************************************************************************/
#include <cuda_fp16.h>
#include <thrust/device_vector.h>
#include "aidge/backend/cuda/operator/PowImpl_CUDA_kernels.hpp"
......@@ -25,8 +26,8 @@ __device__ half pow<half>(half x, half exponent) {
template <class T>
__global__ void pow_kernel(const T* input, T* output, const T* exponent,
int* input_shape, int* exponent_shape, int* output_shape,
int* input_strides, int* exponent_strides, int* output_strides,
const int* input_shape, const int* exponent_shape, const int* output_shape,
const int* input_strides, const int* exponent_strides, const int* output_strides,
int num_dims, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= size) return;
......@@ -50,37 +51,22 @@ void Aidge::powForward<T>(const T* input, T* output, const T* exponent,
const std::vector<int>& inputStrides, const std::vector<int>& exponentStrides,const std::vector<int>& outputStrides,
int outSize)
{
    // Broadcasting metadata (shapes & strides) must live in device memory for
    // the kernel to read. thrust::device_vector allocates, copies host-to-device
    // in its constructor, and frees on scope exit (RAII), replacing the former
    // manual cudaMalloc/cudaMemcpy/cudaFree chain (which also sized every copy
    // with inputDims.size()).
    const thrust::device_vector<int> d_input_shape = inputDims;
    const thrust::device_vector<int> d_exponent_shape = exponentDims;
    const thrust::device_vector<int> d_output_shape = outputDims;
    const thrust::device_vector<int> d_input_strides = inputStrides;
    const thrust::device_vector<int> d_exponent_strides = exponentStrides;
    const thrust::device_vector<int> d_output_strides = outputStrides;

    // One thread per output element; ceil-division covers the tail block.
    const int blockSize = 256;
    const int numBlocks = (outSize + blockSize - 1) / blockSize;

    const int num_dims = inputDims.size();

    // Launch the element-wise (broadcasted) power kernel.
    pow_kernel<<<numBlocks, blockSize>>>(input, output, exponent,
        thrust::raw_pointer_cast(d_input_shape.data()),
        thrust::raw_pointer_cast(d_exponent_shape.data()),
        thrust::raw_pointer_cast(d_output_shape.data()),
        thrust::raw_pointer_cast(d_input_strides.data()),
        thrust::raw_pointer_cast(d_exponent_strides.data()),
        thrust::raw_pointer_cast(d_output_strides.data()),
        num_dims, outSize);
    // Kernel launches are asynchronous: surface invalid-configuration errors now.
    CHECK_CUDA_STATUS(cudaGetLastError());
    // NOTE(review): the device_vectors are destroyed (cudaFree) on return while
    // the kernel may still be in flight; cudaFree synchronizes with pending
    // work on the device, but confirm this if non-default streams are ever used.
}
template void Aidge::powForward<double>(const double* input, double* output, const double* exponent,
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment