Commit 4fa3e9a7 authored by Benjamin Halimi

add LSQ calibration CUDA support + hotfix the workspace malloc

parent f8886972
3 merge requests: !28 v0.3.0, !20 v0.4.0, !18 CUDA support for the Quantization routines
@@ -19,6 +19,7 @@
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/scheduler/Scheduler.hpp"
#include "aidge/graph/Matching.hpp"
#include "aidge/recipes/QuantRecipes.hpp"
namespace Aidge {
@@ -35,7 +36,8 @@ void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbB
// INPUT QUANTIZERS INSERTION
- auto inputQuantizerName = linearNode->name() + "_lsq_i"; // TODO : double check this, and use createUniqueName()
+ // TODO : double check this, and use createUniqueName()
+ auto inputQuantizerName = makeUniqueName(linearNode->name() + "_lsq_i", graphView);
auto inputQuantizerNode = LSQ(signedRange, inputQuantizerName);
// Set the step size
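The quantizer naming just above now goes through makeUniqueName, declared in the newly included aidge/recipes/QuantRecipes.hpp, which presumably derives a collision-free node name from a base name and the graph. A minimal, self-contained sketch of such a helper (the real implementation may differ):

#include <set>
#include <string>

// Sketch of a unique-name helper in the spirit of makeUniqueName: given the
// names already present in the graph, append a numeric suffix to the base
// name until the candidate is free. The real helper takes the GraphView
// itself; here the existing names are passed explicitly to stay standalone.
std::string makeUniqueNameSketch(const std::string& baseName,
                                 const std::set<std::string>& existingNames)
{
    if (existingNames.count(baseName) == 0)
        return baseName;

    int suffix = 1;
    std::string candidate = baseName + "_" + std::to_string(suffix);
    while (existingNames.count(candidate) != 0)
        candidate = baseName + "_" + std::to_string(++suffix);
    return candidate;
}

In insertQuantizers the base name is linearNode->name() + "_lsq_i", and the existing names would come from iterating graphView->getNodes().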
@@ -46,7 +48,9 @@ void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbB
// Absorb the ReLU when possible ...
- bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]); // XXX is this safe ???
+ // XXX is this safe ???
+ bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]);
+ // bool nodeHasParent = (linearNode->getParents().size() != 0);
if (nodeHasParent) {
auto parentNode = linearNode->getParents()[0];
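The "is this safe ???" remark concerns indexing getParents()[0] on a node that may have no parents, or whose first input slot may be disconnected (a null entry). A hedged, generic sketch of the defensive check, combining the commented-out size test with a null test:

#include <memory>
#include <vector>

// Guard against both an empty parent list and a disconnected slot 0 before
// treating the node as having a parent. NodePtr stands for whatever pointer
// type the graph hands back (e.g. std::shared_ptr<Node>).
template <typename NodePtr>
bool hasConnectedFirstParent(const std::vector<NodePtr>& parents)
{
    return !parents.empty() && parents[0] != nullptr;
}

In this function it would be used as hasConnectedFirstParent(linearNode->getParents()) in place of the static_cast<bool> on element 0.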
@@ -68,7 +72,8 @@ void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbB
// PARAM QUANTIZERS INSERTION
- auto paramQuantizerName = linearNode->name() + "_lsq_p"; // TODO : double check this, and use createUniqueName()
+ // TODO : double check this, and use createUniqueName()
+ auto paramQuantizerName = makeUniqueName(linearNode->name() + "_lsq_p", graphView);
auto paramQuantizerNode = LSQ(signedRange, paramQuantizerName);
graphView->insertParent(linearNode, paramQuantizerNode, 1, 0, 0);
@@ -91,7 +96,7 @@ static float getTensorAbsMean(std::shared_ptr<Tensor> tensor)
return acc;
}
- static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> calibrationData)
+ static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> calibrationData, bool useCuda)
{
// Propagate the calibration tensor
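The statistics collected here are per-tensor absolute means (getTensorAbsMean, whose body is only partially visible in this hunk); they are later used to seed the LSQ step sizes. A hedged sketch of such a computation over a raw float buffer:

#include <cmath>
#include <cstddef>

// Sketch of an absolute-mean computation in the spirit of getTensorAbsMean:
// accumulate |x_i| over a contiguous float buffer and divide by the length.
// The float element type is an assumption.
float absMeanSketch(const float* data, std::size_t length)
{
    if (length == 0)
        return 0.0f;

    double acc = 0.0;
    for (std::size_t i = 0; i < length; ++i)
        acc += std::fabs(static_cast<double>(data[i]));
    return static_cast<float>(acc / static_cast<double>(length));
}

The buffer would come from tensor->getImpl()->rawPtr(), which is only meaningful once the tensor sits on the CPU backend; that is exactly what the new useCuda parameter arranges below.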
@@ -101,6 +106,9 @@ static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView>
// Store the input tensor statistics
if (useCuda)
graphView->setBackend("cpu");
std::map<std::string, float> inputStats;
for (auto node : graphView->getNodes())
{
@@ -113,11 +121,17 @@ static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView>
}
}
if (useCuda)
graphView->setBackend("cuda");
return inputStats;
}
- static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView> graphView)
+ static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView> graphView, bool useCuda)
{
if (useCuda)
graphView->setBackend("cpu");
std::map<std::string, float> paramStats;
for (auto node : graphView->getNodes())
{
@@ -130,6 +144,9 @@ static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView>
}
}
if (useCuda)
graphView->setBackend("cuda");
return paramStats;
}
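Both collectInputStats and collectParamStats now bracket their CPU-side reads with the same backend round-trip: move the graph to "cpu", read the tensors through rawPtr(), then restore "cuda". A hedged sketch of that pattern wrapped in a small scope guard (not part of the commit; GraphT only needs the single-argument setBackend call used in the diff):

#include <string>

// RAII-style version of the useCuda round-trip: switch to "cpu" on entry
// and back to "cuda" on scope exit, so every early return restores the
// original backend.
template <typename GraphT>
class CpuReadbackGuard {
public:
    CpuReadbackGuard(GraphT& graph, bool useCuda) : mGraph(graph), mUseCuda(useCuda) {
        if (mUseCuda)
            mGraph.setBackend("cpu");
    }
    ~CpuReadbackGuard() {
        if (mUseCuda)
            mGraph.setBackend("cuda");
    }
    CpuReadbackGuard(const CpuReadbackGuard&) = delete;
    CpuReadbackGuard& operator=(const CpuReadbackGuard&) = delete;
private:
    GraphT& mGraph;
    bool    mUseCuda;
};

Inside collectParamStats this would read CpuReadbackGuard<GraphView> guard(*graphView, useCuda); at the top of the function, replacing the two explicit setBackend calls.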
@@ -171,9 +188,11 @@ static void adjustQuantizersStepSizes(std::shared_ptr<GraphView> graphView, std:
void QuantLSQ::insertAndInitQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, std::shared_ptr<Tensor> calibrationData)
{
bool useCuda = (calibrationData->backend() == "cuda");
// Collect the tensor statistics
- auto inputStats = collectInputStats(graphView, calibrationData);
- auto paramStats = collectParamStats(graphView);
+ auto inputStats = collectInputStats(graphView, calibrationData, useCuda);
+ auto paramStats = collectParamStats(graphView, useCuda);
// Insert the quantizers
insertQuantizers(graphView, nbBits, 1.0);
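With this change the calibration path is selected from the calibration tensor itself: if calibrationData lives on the "cuda" backend, the statistics pass runs on GPU and only hops to "cpu" for the rawPtr() reads. A hedged usage sketch; the QuantLSQ header path is an assumption, the call signature is the one visible in this hunk:

#include <memory>

#include "aidge/data/Tensor.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/quantization/QAT/QAT_LSQ.hpp"  // assumed header for QuantLSQ

// Run the whole LSQ calibration on GPU: putting the calibration tensor on
// the "cuda" backend is what makes useCuda evaluate to true inside
// insertAndInitQuantizers.
void calibrateOnGpu(std::shared_ptr<Aidge::GraphView> graphView,
                    std::shared_ptr<Aidge::Tensor> calibrationData)
{
    graphView->setBackend("cuda");
    calibrationData->setBackend("cuda");
    Aidge::QuantLSQ::insertAndInitQuantizers(graphView, /*nbBits=*/8, calibrationData);
}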
@@ -52,18 +52,28 @@ void Aidge::LSQImpl_cuda::backward() {
std::shared_ptr<Tensor> gra_int1 = op_.getInput(1)->grad();
std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad();
// XXX
/*
size_t tmp;
cudaDeviceSetLimit(cudaLimitStackSize, 2048);
cudaDeviceGetLimit(&tmp, cudaLimitStackSize );
printf(" stack limit = %ld \n", tmp);
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 100000000);
cudaDeviceGetLimit(&tmp, cudaLimitMallocHeapSize);
printf(" heap limit = %ld \n", tmp);
*/
if (gra_int0->size() > mWorkspaceSize) {
// std::cout << " reallocation " << sizeof(gra_int0) << " " << gra_int0->size() << std::endl;
if (mWorkspace != nullptr) {
cudaFree(mWorkspace);
}
- CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, gra_int0->size()));
+ CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, 8 * gra_int0->size())); // XXX This must be changed !!!
mWorkspaceSize = gra_int0->size();
}
// XXX
//cudaFree(mWorkspace);
//cudaMalloc(&mWorkspace, 4000000);
// Find the correct kernel type
auto impl = Registrar<LSQImpl_cuda>::create(getBestMatch(getRequiredSpec()));
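The "8 *" factor above is explicitly flagged as temporary. The likely issue is that gra_int0->size() is an element count while cudaMalloc expects a size in bytes, and the workspace stores one gradient-typed value per input element. A hedged sketch of a byte-accurate (re)allocation; the gradient element size is an assumption, not read from the committed code:

#include <cstddef>
#include <cuda_runtime.h>

// Reallocate the workspace only when it is too small, sizing it as
// elementCount * elementSize bytes instead of guessing a multiplier.
// elementSize would be sizeof the gradient type used by the kernel.
cudaError_t ensureWorkspace(void** workspace, std::size_t* workspaceBytes,
                            std::size_t elementCount, std::size_t elementSize)
{
    const std::size_t requiredBytes = elementCount * elementSize;
    if (requiredBytes <= *workspaceBytes)
        return cudaSuccess;

    if (*workspace != nullptr)
        cudaFree(*workspace);

    cudaError_t status = cudaMalloc(workspace, requiredBytes);
    if (status != cudaSuccess)
        *workspace = nullptr;
    *workspaceBytes = (status == cudaSuccess) ? requiredBytes : 0;
    return status;
}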
@@ -77,6 +87,12 @@ void Aidge::LSQImpl_cuda::backward() {
gra_int0->getImpl()->rawPtr(),
gra_int1->getImpl()->rawPtr(),
mWorkspace);
/*
gra_int1->setBackend("cpu");
float *castedTensor = static_cast<float *> (gra_int1->getImpl()->rawPtr());
std::cout << castedTensor[0] << std::endl;
gra_int1->setBackend("cuda");
*/
}
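The commented-out block above reads back gra_int1 for debugging by moving the whole tensor to the CPU backend and back. A hedged sketch of a lighter alternative: copy just the first element with cudaMemcpy (the float element type and the rawPtr() source are assumptions):

#include <cstdio>
#include <cuda_runtime.h>

// Copy a single gradient value back to the host for inspection. devicePtr
// stands for gra_int1->getImpl()->rawPtr().
void printFirstGradValue(const void* devicePtr)
{
    float value = 0.0f;
    cudaMemcpy(&value, devicePtr, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("step size grad[0] = %f\n", value);
}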
Aidge::LSQImpl_cuda::~LSQImpl_cuda() {
@@ -49,7 +49,7 @@ void Aidge::LSQImpl_cuda_forward_kernel(std::size_t inputLength,
const I* stepSize = static_cast<const I*>(stepSize_);
O* output = static_cast<O*>(output_);
- int blockSize = 32;
+ int blockSize = 256;
int numBlocks = (inputLength + blockSize - 1) / blockSize;
LSQImpl_cuda_forward_kernel_<<<numBlocks, blockSize>>>(
@@ -124,7 +124,7 @@ void Aidge::LSQImpl_cuda_backward_kernel(const std::size_t inputLength,
const GI gradScaleFactor = static_cast<GI>(1.0f / std::sqrt(inputLength * range.second));
- int blockSize = 32;
+ int blockSize = 256;
int numBlocks = (inputLength + blockSize - 1) / blockSize;
LSQImpl_cuda_backward_kernel_<<<numBlocks, blockSize>>>(
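Both kernel launches move from 32 to 256 threads per block, keeping the usual ceil-division for the grid size. A self-contained sketch of an alternative that asks the runtime for an occupancy-friendly block size instead of hard-coding it (toy kernel, not the LSQ kernels themselves):

#include <cstddef>
#include <cuda_runtime.h>

// Toy elementwise kernel standing in for the LSQ forward/backward kernels.
__global__ void scaleKernel(const float* in, float* out, float s, std::size_t n)
{
    const std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i] * s;
}

// Let the runtime suggest a block size, then keep the same ceil-division
// grid computation used in the diff.
void launchScale(const float* in, float* out, float s, std::size_t n)
{
    int minGridSize = 0;
    int blockSize = 256;  // fallback, matching the value chosen in this commit
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, scaleKernel, 0, 0);

    const int numBlocks = static_cast<int>((n + blockSize - 1) / blockSize);
    scaleKernel<<<numBlocks, blockSize>>>(in, out, s, n);
}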
@@ -144,5 +144,7 @@ void Aidge::LSQImpl_cuda_backward_kernel(const std::size_t inputLength,
thrust::device_ptr<GI> grad_stepSizePtr(grad_stepSize);
grad_stepSizePtr[0] = thrust::reduce(grad_workspacePtr, grad_workspacePtr + inputLength, GI(0.0));
//printf(" step grad = %f \n", (float) grad_stepSizePtr[0]);
CHECK_CUDA_STATUS(cudaPeekAtLastError());
}
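The backward kernel leaves one per-element contribution to the step-size gradient in the workspace; the thrust::reduce call then collapses them into the single scalar grad_stepSize[0], and the new cudaPeekAtLastError check surfaces any launch failure. A self-contained sketch of that reduction step (float types assumed):

#include <cstddef>
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

// Sum the per-element step-size gradient contributions held in the device
// workspace and write the result into grad_stepSize[0], also on the device.
void reduceStepSizeGrad(const float* workspace, float* grad_stepSize, std::size_t inputLength)
{
    thrust::device_ptr<const float> contributions(workspace);
    thrust::device_ptr<float> result(grad_stepSize);
    result[0] = thrust::reduce(contributions, contributions + inputLength, 0.0f);
}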