Commit 16ffa25d authored by Houssem ROUIS

force casting float and double to int32 for BitShift

parent d5362453
Branch: bitshift_op
Pipeline #65875 passed
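
Why the cast is needed: the built-in C++/CUDA shift operators are defined only for integral operands, so a BitShift node whose tensors are Float32 or Float64 cannot be shifted directly. This commit stages such tensors as Int32 (truncating toward zero), runs the integer shift kernel, and casts the result back to the original float type. A minimal standalone sketch of that truncate-shift-cast-back semantics (plain C++ written for illustration, not the Aidge API):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Float operands are truncated toward zero to int32, shifted, then
    // converted back to float -- mirroring the commit's staging scheme.
    const std::vector<float> values = {3.0f, 12.5f, 7.0f};
    const std::vector<float> shifts = {2.0f, 1.0f, 3.0f};
    for (std::size_t i = 0; i < values.size(); ++i) {
        const std::int32_t v = static_cast<std::int32_t>(values[i]); // 12.5f -> 12
        const std::int32_t s = static_cast<std::int32_t>(shifts[i]);
        std::cout << static_cast<float>(v << s) << '\n'; // prints 12, 24, 56
    }
    return 0;
}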
@@ -39,8 +39,14 @@
     std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
     std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides
     for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
-        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+        // TODO: remove the forced cast to int64
+        const auto dt = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+        if(dt == DataType::Float32 || dt == DataType::Float64 ) {
+            inputs[i] = op.getInput(i)->refCast(inputFallbacks[i], DataType::Int32);
+        }
+        else {
+            inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+        }
         // Get tensor dims and broadcast them
         std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
         dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
@@ -59,7 +65,7 @@
         strides[i] = tensorStrides;
     }
     bool left = op.direction() == BitShift_Op::BitShiftDirection::left;
-    switch(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+    switch(inputs[0].dataType()) {
     case DataType::Int64:
         forward_<int64_t>(inputs, dims, strides, left);
         break;
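
The dispatch change above follows from the input cast: once float inputs have been staged as Int32, the output tensor can still report Float32/Float64, so selecting the kernel from the output dtype would route to a floating-point instantiation that cannot shift. Keying the switch on inputs[0].dataType() picks the integral kernel that matches the staged data. A compressed sketch of that routing (illustrative names, not Aidge's actual dispatch table):

#include <cstdint>

enum class DataType { Int32, Int64, Float32, Float64 };

// Stand-in for the templated forward_<T> kernels; only integral
// instantiations make sense for a bit shift.
template <typename T> void forwardShift() { /* elementwise shift in T */ }

void dispatch(DataType stagedInputDt) {
    switch (stagedInputDt) { // staged input dtype, not the output dtype
    case DataType::Int64: forwardShift<std::int64_t>(); break;
    case DataType::Int32: forwardShift<std::int32_t>(); break;
    default: break; // floats never reach here: they were cast to Int32 upstream
    }
}

int main() { dispatch(DataType::Int32); }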
@@ -78,21 +84,36 @@
     // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
     const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
     const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
-    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+    // T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+    std::shared_ptr<Tensor> outputFallback;
+    const auto dt = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+    Tensor outputCasted;
+    if(dt == DataType::Float32 || dt == DataType::Float64 ) {
+        outputCasted = op.getOutput(0)->refCastFrom(outputFallback, DataType::Int32, "cuda", op.getOutput(0)->device());
+    } else {
+        outputCasted = op.getOutput(0)->refCastFrom(outputFallback, *op.getOutput(0));
+    }
     std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
     if(op.getOutput(0)->nbDims()>1) {
         for (int i = op.getOutput(0)->nbDims()-2; i >= 0; i--) {
             outputStrides[i] = outputStrides[i+1] * op.getOutput(0)->dims()[i+1];
         }
     }
     std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(),std::size_t(4)), 1);
     for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) {
         outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]);
     }
-    Aidge::bitShiftForward<T>(input1Ptr, input2Ptr, outputPtr,
+    Aidge::bitShiftForward<T>(input1Ptr, input2Ptr, static_cast<T*>(outputCasted.getImpl()->rawPtr()),
         inputsDims[0], inputsDims[1], outDims,
         inputsStrides[0], inputsStrides[1], outputStrides,
         static_cast<int>(op.getOutput(0)->size()), left);
+    if(dt == DataType::Float32 || dt == DataType::Float64 ) {
+        op.getOutput(0)->getImpl()->copyCast(outputCasted.getImpl()->rawPtr(), DataType::Int32, outputCasted.size());
+    } else {
+        // op.getOutput(0)->getImpl()->copy(outputCasted.getImpl()->rawPtr(), outputCasted.size());
+        CHECK_CUDA_STATUS(cudaMemcpy(op.getOutput(0)->getImpl()->rawPtr(), outputCasted.getImpl()->rawPtr(), outputCasted.size() * sizeof(int), cudaMemcpyDeviceToDevice));
+    }
 }
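
Output handling mirrors the inputs: when the output dtype is Float32/Float64, the kernel writes into an Int32 staging tensor (outputCasted) and copyCast then converts the integer results back into the float output buffer; otherwise a plain device-to-device cudaMemcpy suffices. A CPU-side analogue of that staging round trip (assumed semantics for illustration, not the backend code):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::vector<float> in0 = {5.0f, 9.0f, -6.0f};
    const std::vector<float> in1 = {1.0f, 2.0f,  1.0f};
    const bool left = false; // right shift

    // Plays the role of outputCasted: the shift runs entirely in int32.
    std::vector<std::int32_t> staged(in0.size());
    for (std::size_t i = 0; i < in0.size(); ++i) {
        const auto a = static_cast<std::int32_t>(in0[i]);
        const auto s = static_cast<std::int32_t>(in1[i]);
        // >> on negatives is arithmetic on mainstream targets
        // (implementation-defined before C++20).
        staged[i] = left ? (a << s) : (a >> s);
    }

    // Plays the role of copyCast: convert back into the float output.
    std::vector<float> out(staged.size());
    for (std::size_t i = 0; i < staged.size(); ++i)
        out[i] = static_cast<float>(staged[i]);

    for (float v : out) std::cout << v << ' '; // prints 2 2 -3
    std::cout << '\n';
    return 0;
}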
@@ -38,6 +38,7 @@ using namespace Aidge;
 TEST_CASE("[gpu/operator] BitShift(forward)", "[BitShift][GPU]")
 {
+    SECTION("Int") {
     constexpr std::uint16_t NBTRIALS = 15;
     // Create a random number generator
     std::random_device rd;
@@ -149,6 +150,121 @@ TEST_CASE("[gpu/operator] BitShift(forward)", "[BitShift][GPU]")
         cudaFree(array1_d);
+    }
+    Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+    Log::info("total time: {}μs\n", duration.count());
+    }
+    SECTION("Float cast") {
+    constexpr std::uint16_t NBTRIALS = 15;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> valueDist(-15, 15);
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
+    std::uniform_int_distribution<int> boolDist(0,1);
+    BitShift_Op::BitShiftDirection direction = BitShift_Op::BitShiftDirection::left;
+    if(valueDist(gen) % 2 == 0)
+    {
+        direction = BitShift_Op::BitShiftDirection::right;
+    }
+    // Create BitShift Operator
+    std::shared_ptr<Node> myBitShift = BitShift(direction);
+    auto op = std::static_pointer_cast<OperatorTensor>(myBitShift-> getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cuda");
+    // Create 2 input Tensors
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0,T0);
+    T0->setDataType(DataType::Float32);
+    T0->setBackend("cuda");
+    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+    op -> associateInput(1,T1);
+    T1->setDataType(DataType::Float32);
+    T1->setBackend("cuda");
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Float32);
+    Tres->setBackend("cpu");
+    // To measure execution time of 'BitShift_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+    std::size_t number_of_operation = 0;
+    for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+        // generate 2 random Tensors
+        const std::size_t nbDims = nbDimsDist(gen);
+        std::vector<std::size_t> dims;
+        for (std::size_t i = 0; i < nbDims; ++i) {
+            dims.push_back(dimSizeDist(gen));
+        }
+        const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        number_of_operation += nb_elements;
+        // without broadcasting
+        float* array0 = new float[nb_elements];
+        float* array1 = new float[nb_elements];
+        float* result = new float[nb_elements];
+        for (std::size_t i = 0; i < nb_elements; ++i) {
+            array0[i] = float(valueDist(gen));
+            array1[i] = float(std::abs(valueDist(gen))); // bitshift is impossible with negative value
+            if(direction == BitShift_Op::BitShiftDirection::left)
+            {
+                result[i] = float(int(array0[i]) << int(array1[i]));
+            }
+            else
+            {
+                result[i] = float(int(array0[i]) >> int(array1[i]));
+            }
+        }
+        float *array0_d, *array1_d;
+        // input0
+        T0->resize(dims);
+        cudaMalloc(reinterpret_cast<void **>(&array0_d), sizeof(float) * nb_elements);
+        cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+        T0->getImpl()->setRawPtr(array0_d, nb_elements);
+        // input1
+        T1->resize(dims);
+        cudaMalloc(reinterpret_cast<void **>(&array1_d), sizeof(float) * nb_elements);
+        cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+        T1->getImpl()->setRawPtr(array1_d, nb_elements);
+        // results
+        Tres->resize(dims);
+        Tres -> getImpl() -> setRawPtr(result, nb_elements);
+        op->forwardDims();
+        start = std::chrono::system_clock::now();
+        myBitShift->forward();
+        end = std::chrono::system_clock::now();
+        duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+        std::shared_ptr<Tensor> outputFallback;
+        const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, *Tres);
+        REQUIRE(approxEq<float>(cudaOutput, *(Tres)));
+        delete[] array0;
+        delete[] array1;
+        delete[] result;
+        cudaFree(array0_d);
+        cudaFree(array1_d);
     }
     Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
     Log::info("total time: {}μs\n", duration.count());