aidge_backend_cuda · commit 16ffa25d
authored 3 months ago by Houssem ROUIS

force casting float and double to int32 for BitShift

parent d5362453
Branch: bitshift_op
No related tags found
No related merge requests found
Pipeline #65875 passed 3 months ago · stages: static_analysis, build, test, coverage
Showing 2 changed files with 158 additions and 21 deletions:

  src/operator/BitShitImpl.cpp       +42 −21
  unit_tests/Test_BitShiftImpl.cpp   +116 −0
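C++ only defines the shift operators for integral operands, so a BitShift kernel that receives Float32/Float64 tensors has to round-trip through an integer type; that is the behaviour this commit wires in. A minimal sketch of the intended semantics, in plain C++ with no Aidge types (bitshift_via_int32 is an illustrative name, not part of the codebase):

#include <cstdint>
#include <iostream>

// Illustrative helper: shift a floating-point value by truncating to int32,
// shifting, and casting back -- the semantics this commit gives BitShift
// for Float32/Float64 tensors.
float bitshift_via_int32(float value, int amount, bool left) {
    const std::int32_t v = static_cast<std::int32_t>(value);
    const std::int32_t r = left ? (v << amount) : (v >> amount);
    return static_cast<float>(r);
}

int main() {
    std::cout << bitshift_via_int32(3.0f, 2, true)   << '\n'; // 12 (3 << 2)
    std::cout << bitshift_via_int32(12.0f, 2, false) << '\n'; // 3 (12 >> 2)
}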
src/operator/BitShitImpl.cpp (+42 −21)
@@ -39,8 +39,14 @@
     std::vector<std::vector<int>> dims(op.nbInputs()); // For broadcasted dims
     std::vector<std::vector<int>> strides(op.nbInputs()); // For the cooresponding strides
     for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
-        inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+        // TODO: remove the forced cast to int64
+        const auto dt = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+        if (dt == DataType::Float32 || dt == DataType::Float64) {
+            inputs[i] = op.getInput(i)->refCast(inputFallbacks[i], DataType::Int32);
+        }
+        else {
+            inputs[i] = op.getInput(i)->refCastFrom(inputFallbacks[i], *op.getOutput(0));
+        }
         // Get tensor dims and broadcast them
         std::copy(inputs[i].dims().begin(), inputs[i].dims().end(), std::back_inserter(dims[i]));
         dims[i].insert(dims[i].cbegin(), op.getOutput(0)->nbDims() - dims[i].size(), int(1));
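The hunk above changes only the input side: when the output tensor is Float32 or Float64, every input is forced to Int32 via refCast; otherwise inputs keep following the output's data type through refCastFrom, as before. A sketch of that dispatch under hypothetical stand-ins (Tensor, castTo, and prepareInputs below are illustrative, not the Aidge API):

#include <iostream>
#include <vector>

enum class DataType { Float32, Float64, Int32, Int64 };

// Illustrative stand-ins for the Aidge types and casts used in the diff.
struct Tensor { DataType dt; };
Tensor castTo(const Tensor&, DataType target) { return Tensor{target}; }

// Mirrors the per-input dispatch added above: a floating-point output forces
// an Int32 cast of every input; any other output type is followed as before.
std::vector<Tensor> prepareInputs(const std::vector<Tensor>& inputs, DataType outputType) {
    const bool floatOut = (outputType == DataType::Float32 || outputType == DataType::Float64);
    std::vector<Tensor> prepared;
    prepared.reserve(inputs.size());
    for (const Tensor& in : inputs) {
        prepared.push_back(castTo(in, floatOut ? DataType::Int32 : outputType));
    }
    return prepared;
}

int main() {
    const auto prepared = prepareInputs({{DataType::Float32}, {DataType::Float32}}, DataType::Float32);
    std::cout << (prepared[0].dt == DataType::Int32) << '\n'; // 1: inputs were forced to Int32
}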
@@ -59,7 +65,7 @@
         strides[i] = tensorStrides;
     }
     bool left = op.direction() == BitShift_Op::BitShiftDirection::left;
-    switch (std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()) {
+    switch (inputs[0].dataType()) {
     case DataType::Int64:
         forward_<int64_t>(inputs, dims, strides, left);
         break;
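Switching on inputs[0].dataType() rather than on the output's type is what makes the forced cast effective: after the cast, a Float32 graph presents Int32 inputs, so the switch routes it to an integer kernel instead of falling through on a float case. A compressed sketch of the idea (dispatch and the kernel stubs are illustrative, not the backend's real entry points):

#include <iostream>

enum class DataType { Float32, Int32, Int64 };

void forwardInt32() { std::cout << "int32 kernel\n"; }
void forwardInt64() { std::cout << "int64 kernel\n"; }

// Keying the dispatch on the (possibly already cast) input type means a
// Float32 output still reaches an integer kernel after the forced cast.
void dispatch(DataType inputType) {
    switch (inputType) {
    case DataType::Int64:
        forwardInt64();
        break;
    default:
        forwardInt32();
        break;
    }
}

int main() {
    dispatch(DataType::Int32); // what a Float32 graph becomes after the cast
}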
@@ -78,21 +84,36 @@
     // const typename Cuda::cudnn_scaling_type<T>::type beta = 0.0f;
     const T * input1Ptr = static_cast<const T*>(inputs[0].getImpl()->rawPtr());
     const T * input2Ptr = static_cast<const T*>(inputs[1].getImpl()->rawPtr());
-    T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+    // T * outputPtr = static_cast<T*>(op.getOutput(0)->getImpl()->rawPtr());
+    std::shared_ptr<Tensor> outputFallback;
+    const auto dt = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
+    Tensor outputCasted;
+    if (dt == DataType::Float32 || dt == DataType::Float64) {
+        outputCasted = op.getOutput(0)->refCastFrom(outputFallback, DataType::Int32, "cuda", op.getOutput(0)->device());
+    }
+    else {
+        outputCasted = op.getOutput(0)->refCastFrom(outputFallback, *op.getOutput(0));
+    }
     std::vector<int> outputStrides(op.getOutput(0)->nbDims(), 1);
     if (op.getOutput(0)->nbDims() > 1) {
         for (int i = op.getOutput(0)->nbDims() - 2; i >= 0; i--) {
             outputStrides[i] = outputStrides[i + 1] * op.getOutput(0)->dims()[i + 1];
         }
     }
     std::vector<int> outDims(std::max(op.getOutput(0)->nbDims(), std::size_t(4)), 1);
     for (std::size_t i = 0; i < op.getOutput(0)->nbDims(); i++) {
         outDims[i] = static_cast<int>(op.getOutput(0)->dims()[i]);
     }
-    Aidge::bitShiftForward<T>(input1Ptr, input2Ptr, outputPtr,
+    Aidge::bitShiftForward<T>(input1Ptr, input2Ptr, static_cast<T*>(outputCasted.getImpl()->rawPtr()),
                               inputsDims[0], inputsDims[1], outDims,
                               inputsStrides[0], inputsStrides[1], outputStrides,
                               static_cast<int>(op.getOutput(0)->size()), left);
+    if (dt == DataType::Float32 || dt == DataType::Float64) {
+        op.getOutput(0)->getImpl()->copyCast(outputCasted.getImpl()->rawPtr(), DataType::Int32, outputCasted.size());
+    }
+    else {
+        // op.getOutput(0)->getImpl()->copy(outputCasted.getImpl()->rawPtr(),outputCasted.size());
+        CHECK_CUDA_STATUS(cudaMemcpy(op.getOutput(0)->getImpl()->rawPtr(), outputCasted.getImpl()->rawPtr(), outputCasted.size() * sizeof(int), cudaMemcpyDeviceToDevice));
+    }
 }
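On the output side, the kernel now writes into a staging tensor (outputCasted): an Int32 tensor on the "cuda" backend when the output is floating point, or a reference to the output itself otherwise. After the kernel, the result is either cast back into the output with copyCast or moved with a raw device-to-device copy. A standalone CUDA sketch of that copy-back decision (castInt32ToFloat and copyBack are illustrative; the diff goes through Aidge's copyCast):

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical cast kernel: for a Float32 output, the copy-back amounts to an
// element-wise int32 -> float cast on the device.
__global__ void castInt32ToFloat(const int* src, float* dst, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = static_cast<float>(src[i]);
}

// Mirrors the decision at the end of the hunk: cast when the output tensor is
// floating point, plain device-to-device copy otherwise.
void copyBack(void* output, const int* staging, int n, bool outputIsFloat) {
    if (outputIsFloat) {
        castInt32ToFloat<<<(n + 255) / 256, 256>>>(staging, static_cast<float*>(output), n);
    } else {
        cudaMemcpy(output, staging, n * sizeof(int), cudaMemcpyDeviceToDevice);
    }
}

int main() {
    const int n = 4;
    const int host[n] = {1, 2, 4, 8};
    int* staging;
    float* output;
    cudaMalloc(&staging, n * sizeof(int));
    cudaMalloc(&output, n * sizeof(float));
    cudaMemcpy(staging, host, n * sizeof(int), cudaMemcpyHostToDevice);
    copyBack(output, staging, n, /*outputIsFloat=*/true);
    float check[n];
    cudaMemcpy(check, output, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (float v : check) std::printf("%g\n", v); // 1 2 4 8, now as floats
    cudaFree(staging);
    cudaFree(output);
}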
unit_tests/Test_BitShiftImpl.cpp (+116 −0)
@@ -38,6 +38,7 @@ using namespace Aidge;
 TEST_CASE("[gpu/operator] BitShift(forward)", "[BitShift][GPU]")
 {
+    SECTION("Int") {
     constexpr std::uint16_t NBTRIALS = 15;
     // Create a random number generator
     std::random_device rd;
@@ -149,6 +150,121 @@ TEST_CASE("[gpu/operator] BitShift(forward)", "[BitShift][GPU]")
             cudaFree(array1_d);
         }
+        Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+        Log::info("total time: {}μs\n", duration.count());
+    }
+    SECTION("Float cast")
+    {
+        constexpr std::uint16_t NBTRIALS = 15;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_int_distribution<int> valueDist(-15, 15);
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
+        std::uniform_int_distribution<int> boolDist(0, 1);
+        BitShift_Op::BitShiftDirection direction = BitShift_Op::BitShiftDirection::left;
+        if (valueDist(gen) % 2 == 0) {
+            direction = BitShift_Op::BitShiftDirection::right;
+        }
+        // Create BitShift Operator
+        std::shared_ptr<Node> myBitShift = BitShift(direction);
+        auto op = std::static_pointer_cast<OperatorTensor>(myBitShift->getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cuda");
+        // Create 2 input Tensors
+        std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+        op->associateInput(0, T0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cuda");
+        std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+        op->associateInput(1, T1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cuda");
+        // Create results Tensor
+        std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+        Tres->setDataType(DataType::Float32);
+        Tres->setBackend("cpu");
+        // To measure execution time of 'BitShift_Op::forward()' member function call
+        std::chrono::time_point<std::chrono::system_clock> start;
+        std::chrono::time_point<std::chrono::system_clock> end;
+        std::chrono::duration<double, std::micro> duration{};
+        std::size_t number_of_operation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            // generate 2 random Tensors
+            const std::size_t nbDims = nbDimsDist(gen);
+            std::vector<std::size_t> dims;
+            for (std::size_t i = 0; i < nbDims; ++i) {
+                dims.push_back(dimSizeDist(gen));
+            }
+            const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+            number_of_operation += nb_elements;
+            // without broadcasting
+            float* array0 = new float[nb_elements];
+            float* array1 = new float[nb_elements];
+            float* result = new float[nb_elements];
+            for (std::size_t i = 0; i < nb_elements; ++i) {
+                array0[i] = float(valueDist(gen));
+                array1[i] = float(std::abs(valueDist(gen))); // bitshift is impossible with negative value
+                if (direction == BitShift_Op::BitShiftDirection::left) {
+                    result[i] = float(int(array0[i]) << int(array1[i]));
+                } else {
+                    result[i] = float(int(array0[i]) >> int(array1[i]));
+                }
+            }
+            float *array0_d, *array1_d;
+            // input0
+            T0->resize(dims);
+            cudaMalloc(reinterpret_cast<void**>(&array0_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array0_d, array0, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T0->getImpl()->setRawPtr(array0_d, nb_elements);
+            // input1
+            T1->resize(dims);
+            cudaMalloc(reinterpret_cast<void**>(&array1_d), sizeof(float) * nb_elements);
+            cudaMemcpy(array1_d, array1, sizeof(float) * nb_elements, cudaMemcpyHostToDevice);
+            T1->getImpl()->setRawPtr(array1_d, nb_elements);
+            // results
+            Tres->resize(dims);
+            Tres->getImpl()->setRawPtr(result, nb_elements);
+            op->forwardDims();
+            start = std::chrono::system_clock::now();
+            myBitShift->forward();
+            end = std::chrono::system_clock::now();
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+            std::shared_ptr<Tensor> outputFallback;
+            const auto& cudaOutput = op->getOutput(0)->refCastFrom(outputFallback, *Tres);
+            REQUIRE(approxEq<float>(cudaOutput, *(Tres)));
+            delete[] array0;
+            delete[] array1;
+            delete[] result;
+            cudaFree(array0_d);
+            cudaFree(array1_d);
+        }
+    }
     Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
     Log::info("total time: {}μs\n", duration.count());
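The new "Float cast" section drives the whole path end to end: Float32 tensors on both inputs, and a CPU reference computed by truncating each operand to int, shifting, and storing the result back as a float. Stripped of the Aidge and CUDA scaffolding, the reference semantics it checks are just the following (bitShiftRef is an illustrative name):

#include <cassert>

// CPU reference used by the new section: operands are truncated to int,
// shifted, and the result is stored back as a float.
float bitShiftRef(float a, float b, bool left) {
    return left ? float(int(a) << int(b)) : float(int(a) >> int(b));
}

int main() {
    assert(bitShiftRef(5.0f, 1.0f, true) == 10.0f);  // 5 << 1
    assert(bitShiftRef(12.0f, 2.0f, false) == 3.0f); // 12 >> 2
    return 0;
}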