Commit 4fa3e9a7 authored by Benjamin Halimi

add LSQ calibration CUDA support + hotfix the workspace malloc

parent f8886972
3 merge requests: !28 v0.3.0, !20 v0.4.0, !18 CUDA support for the Quantization routines
@@ -19,6 +19,7 @@
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/scheduler/Scheduler.hpp"
#include "aidge/graph/Matching.hpp"
#include "aidge/recipes/QuantRecipes.hpp"
namespace Aidge {
@@ -35,7 +36,8 @@ void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbB
// INPUT QUANTIZERS INSERTION
- auto inputQuantizerName = linearNode->name() + "_lsq_i"; // TODO : double check this, and use createUniqueName()
+ // TODO : double check this, and use createUniqueName()
+ auto inputQuantizerName = makeUniqueName(linearNode->name() + "_lsq_i", graphView);
auto inputQuantizerNode = LSQ(signedRange, inputQuantizerName);
// Set the step size
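The quantizer naming just above now goes through makeUniqueName, declared in the newly included aidge/recipes/QuantRecipes.hpp, which presumably derives a collision-free node name from a base name and the graph. A minimal, self-contained sketch of such a helper (the real implementation may differ):

#include <set>
#include <string>

// Sketch of a unique-name helper in the spirit of makeUniqueName: given the
// names already present in the graph, append a numeric suffix to the base
// name until the candidate is free. The real helper takes the GraphView
// itself; here the existing names are passed explicitly to stay standalone.
std::string makeUniqueNameSketch(const std::string& baseName,
                                 const std::set<std::string>& existingNames)
{
    if (existingNames.count(baseName) == 0)
        return baseName;

    int suffix = 1;
    std::string candidate = baseName + "_" + std::to_string(suffix);
    while (existingNames.count(candidate) != 0)
        candidate = baseName + "_" + std::to_string(++suffix);
    return candidate;
}

In insertQuantizers the base name is linearNode->name() + "_lsq_i", and the existing names would come from iterating graphView->getNodes().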
@@ -46,7 +48,9 @@ void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbB
// Absorb the ReLU when possible ...
- bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]); // XXX is this safe ???
+ // XXX is this safe ???
+ bool nodeHasParent = static_cast<bool> (linearNode->getParents()[0]);
+ // bool nodeHasParent = (linearNode->getParents().size() != 0);
if (nodeHasParent) {
auto parentNode = linearNode->getParents()[0];
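The "is this safe ???" remark concerns indexing getParents()[0] on a node that may have no parents, or whose first input slot may be disconnected (a null entry). A hedged, generic sketch of the defensive check, combining the commented-out size test with a null test:

#include <memory>
#include <vector>

// Guard against both an empty parent list and a disconnected slot 0 before
// treating the node as having a parent. NodePtr stands for whatever pointer
// type the graph hands back (e.g. std::shared_ptr<Node>).
template <typename NodePtr>
bool hasConnectedFirstParent(const std::vector<NodePtr>& parents)
{
    return !parents.empty() && parents[0] != nullptr;
}

In this function it would be used as hasConnectedFirstParent(linearNode->getParents()) in place of the static_cast<bool> on element 0.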
@@ -68,7 +72,8 @@ void QuantLSQ::insertQuantizers(std::shared_ptr<GraphView> graphView, size_t nbB
// PARAM QUANTIZERS INSERTION
- auto paramQuantizerName = linearNode->name() + "_lsq_p"; // TODO : double check this, and use createUniqueName()
+ // TODO : double check this, and use createUniqueName()
+ auto paramQuantizerName = makeUniqueName(linearNode->name() + "_lsq_p", graphView);
auto paramQuantizerNode = LSQ(signedRange, paramQuantizerName);
graphView->insertParent(linearNode, paramQuantizerNode, 1, 0, 0);
@@ -91,7 +96,7 @@ static float getTensorAbsMean(std::shared_ptr<Tensor> tensor)
return acc;
}
- static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> calibrationData)
+ static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView> graphView, std::shared_ptr<Tensor> calibrationData, bool useCuda)
{
// Propagate the calibration tensor
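The statistics collected here are per-tensor absolute means (getTensorAbsMean, whose body is only partially visible in this hunk); they are later used to seed the LSQ step sizes. A hedged sketch of such a computation over a raw float buffer:

#include <cmath>
#include <cstddef>

// Sketch of an absolute-mean computation in the spirit of getTensorAbsMean:
// accumulate |x_i| over a contiguous float buffer and divide by the length.
// The float element type is an assumption.
float absMeanSketch(const float* data, std::size_t length)
{
    if (length == 0)
        return 0.0f;

    double acc = 0.0;
    for (std::size_t i = 0; i < length; ++i)
        acc += std::fabs(static_cast<double>(data[i]));
    return static_cast<float>(acc / static_cast<double>(length));
}

The buffer would come from tensor->getImpl()->rawPtr(), which is only meaningful once the tensor sits on the CPU backend; that is exactly what the new useCuda parameter arranges below.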
@@ -101,6 +106,9 @@ static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView>
// Store the input tensor statistics
if (useCuda)
graphView->setBackend("cpu");
std::map<std::string, float> inputStats;
for (auto node : graphView->getNodes())
{
@@ -113,11 +121,17 @@ static std::map<std::string, float> collectInputStats(std::shared_ptr<GraphView>
}
}
if (useCuda)
graphView->setBackend("cuda");
return inputStats;
}
- static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView> graphView)
+ static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView> graphView, bool useCuda)
{
if (useCuda)
graphView->setBackend("cpu");
std::map<std::string, float> paramStats;
for (auto node : graphView->getNodes())
{
@@ -130,6 +144,9 @@ static std::map<std::string, float> collectParamStats(std::shared_ptr<GraphView>
}
}
if (useCuda)
graphView->setBackend("cuda");
return paramStats;
}
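Both collectInputStats and collectParamStats now bracket their CPU-side reads with the same backend round-trip: move the graph to "cpu", read the tensors through rawPtr(), then restore "cuda". A hedged sketch of that pattern wrapped in a small scope guard (not part of the commit; GraphT only needs the single-argument setBackend call used in the diff):

#include <string>

// RAII-style version of the useCuda round-trip: switch to "cpu" on entry
// and back to "cuda" on scope exit, so every early return restores the
// original backend.
template <typename GraphT>
class CpuReadbackGuard {
public:
    CpuReadbackGuard(GraphT& graph, bool useCuda) : mGraph(graph), mUseCuda(useCuda) {
        if (mUseCuda)
            mGraph.setBackend("cpu");
    }
    ~CpuReadbackGuard() {
        if (mUseCuda)
            mGraph.setBackend("cuda");
    }
    CpuReadbackGuard(const CpuReadbackGuard&) = delete;
    CpuReadbackGuard& operator=(const CpuReadbackGuard&) = delete;
private:
    GraphT& mGraph;
    bool    mUseCuda;
};

Inside collectParamStats this would read CpuReadbackGuard<GraphView> guard(*graphView, useCuda); at the top of the function, replacing the two explicit setBackend calls.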
@@ -171,9 +188,11 @@ static void adjustQuantizersStepSizes(std::shared_ptr<GraphView> graphView, std:
void QuantLSQ::insertAndInitQuantizers(std::shared_ptr<GraphView> graphView, size_t nbBits, std::shared_ptr<Tensor> calibrationData)
{
bool useCuda = (calibrationData->backend() == "cuda");
// Collect the tensor statistics
- auto inputStats = collectInputStats(graphView, calibrationData);
- auto paramStats = collectParamStats(graphView);
+ auto inputStats = collectInputStats(graphView, calibrationData, useCuda);
+ auto paramStats = collectParamStats(graphView, useCuda);
// Insert the quantizers
insertQuantizers(graphView, nbBits, 1.0);
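With this change the calibration path is selected from the calibration tensor itself: if calibrationData lives on the "cuda" backend, the statistics pass runs on GPU and only hops to "cpu" for the rawPtr() reads. A hedged usage sketch; the QuantLSQ header path is an assumption, the call signature is the one visible in this hunk:

#include <memory>

#include "aidge/data/Tensor.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/quantization/QAT/QAT_LSQ.hpp"  // assumed header for QuantLSQ

// Run the whole LSQ calibration on GPU: putting the calibration tensor on
// the "cuda" backend is what makes useCuda evaluate to true inside
// insertAndInitQuantizers.
void calibrateOnGpu(std::shared_ptr<Aidge::GraphView> graphView,
                    std::shared_ptr<Aidge::Tensor> calibrationData)
{
    graphView->setBackend("cuda");
    calibrationData->setBackend("cuda");
    Aidge::QuantLSQ::insertAndInitQuantizers(graphView, /*nbBits=*/8, calibrationData);
}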
@@ -52,18 +52,28 @@ void Aidge::LSQImpl_cuda::backward() {
std::shared_ptr<Tensor> gra_int1 = op_.getInput(1)->grad();
std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad();
// XXX
/*
size_t tmp;
cudaDeviceSetLimit(cudaLimitStackSize, 2048);
cudaDeviceGetLimit(&tmp, cudaLimitStackSize );
printf(" stack limit = %ld \n", tmp);
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 100000000);
cudaDeviceGetLimit(&tmp, cudaLimitMallocHeapSize);
printf(" heap limit = %ld \n", tmp);
*/
if (gra_int0->size() > mWorkspaceSize) {
// std::cout << " reallocation " << sizeof(gra_int0) << " " << gra_int0->size() << std::endl;
if (mWorkspace != nullptr) {
cudaFree(mWorkspace);
}
- CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, gra_int0->size()));
+ CHECK_CUDA_STATUS(cudaMalloc(&mWorkspace, 8 * gra_int0->size())); // XXX This must be changed !!!
mWorkspaceSize = gra_int0->size();
}
// XXX
//cudaFree(mWorkspace);
//cudaMalloc(&mWorkspace, 4000000);
// Find the correct kernel type
auto impl = Registrar<LSQImpl_cuda>::create(getBestMatch(getRequiredSpec()));
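The "8 *" factor above is explicitly flagged as temporary. The likely issue is that gra_int0->size() is an element count while cudaMalloc expects a size in bytes, and the workspace stores one gradient-typed value per input element. A hedged sketch of a byte-accurate (re)allocation; the gradient element size is an assumption, not read from the committed code:

#include <cstddef>
#include <cuda_runtime.h>

// Reallocate the workspace only when it is too small, sizing it as
// elementCount * elementSize bytes instead of guessing a multiplier.
// elementSize would be sizeof the gradient type used by the kernel.
cudaError_t ensureWorkspace(void** workspace, std::size_t* workspaceBytes,
                            std::size_t elementCount, std::size_t elementSize)
{
    const std::size_t requiredBytes = elementCount * elementSize;
    if (requiredBytes <= *workspaceBytes)
        return cudaSuccess;

    if (*workspace != nullptr)
        cudaFree(*workspace);

    cudaError_t status = cudaMalloc(workspace, requiredBytes);
    if (status != cudaSuccess)
        *workspace = nullptr;
    *workspaceBytes = (status == cudaSuccess) ? requiredBytes : 0;
    return status;
}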
@@ -77,6 +87,12 @@ void Aidge::LSQImpl_cuda::backward() {
gra_int0->getImpl()->rawPtr(),
gra_int1->getImpl()->rawPtr(),
mWorkspace);
/*
gra_int1->setBackend("cpu");
float *castedTensor = static_cast<float *> (gra_int1->getImpl()->rawPtr());
std::cout << castedTensor[0] << std::endl;
gra_int1->setBackend("cuda");
*/
}
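The commented-out block above reads back gra_int1 for debugging by moving the whole tensor to the CPU backend and back. A hedged sketch of a lighter alternative: copy just the first element with cudaMemcpy (the float element type and the rawPtr() source are assumptions):

#include <cstdio>
#include <cuda_runtime.h>

// Copy a single gradient value back to the host for inspection. devicePtr
// stands for gra_int1->getImpl()->rawPtr().
void printFirstGradValue(const void* devicePtr)
{
    float value = 0.0f;
    cudaMemcpy(&value, devicePtr, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("step size grad[0] = %f\n", value);
}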
Aidge::LSQImpl_cuda::~LSQImpl_cuda() {
@@ -49,7 +49,7 @@ void Aidge::LSQImpl_cuda_forward_kernel(std::size_t inputLength,
const I* stepSize = static_cast<const I*>(stepSize_);
O* output = static_cast<O*>(output_);
- int blockSize = 32;
+ int blockSize = 256;
int numBlocks = (inputLength + blockSize - 1) / blockSize;
LSQImpl_cuda_forward_kernel_<<<numBlocks, blockSize>>>(
@@ -124,7 +124,7 @@ void Aidge::LSQImpl_cuda_backward_kernel(const std::size_t inputLength,
const GI gradScaleFactor = static_cast<GI>(1.0f / std::sqrt(inputLength * range.second));
- int blockSize = 32;
+ int blockSize = 256;
int numBlocks = (inputLength + blockSize - 1) / blockSize;
LSQImpl_cuda_backward_kernel_<<<numBlocks, blockSize>>>(
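Both kernel launches move from 32 to 256 threads per block, keeping the usual ceil-division for the grid size. A self-contained sketch of an alternative that asks the runtime for an occupancy-friendly block size instead of hard-coding it (toy kernel, not the LSQ kernels themselves):

#include <cstddef>
#include <cuda_runtime.h>

// Toy elementwise kernel standing in for the LSQ forward/backward kernels.
__global__ void scaleKernel(const float* in, float* out, float s, std::size_t n)
{
    const std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i] * s;
}

// Let the runtime suggest a block size, then keep the same ceil-division
// grid computation used in the diff.
void launchScale(const float* in, float* out, float s, std::size_t n)
{
    int minGridSize = 0;
    int blockSize = 256;  // fallback, matching the value chosen in this commit
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, scaleKernel, 0, 0);

    const int numBlocks = static_cast<int>((n + blockSize - 1) / blockSize);
    scaleKernel<<<numBlocks, blockSize>>>(in, out, s, n);
}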
@@ -144,5 +144,7 @@ void Aidge::LSQImpl_cuda_backward_kernel(const std::size_t inputLength,
thrust::device_ptr<GI> grad_stepSizePtr(grad_stepSize);
grad_stepSizePtr[0] = thrust::reduce(grad_workspacePtr, grad_workspacePtr + inputLength, GI(0.0));
//printf(" step grad = %f \n", (float) grad_stepSizePtr[0]);
CHECK_CUDA_STATUS(cudaPeekAtLastError());
}
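The backward kernel leaves one per-element contribution to the step-size gradient in the workspace; the thrust::reduce call then collapses them into the single scalar grad_stepSize[0], and the new cudaPeekAtLastError check surfaces any launch failure. A self-contained sketch of that reduction step (float types assumed):

#include <cstddef>
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

// Sum the per-element step-size gradient contributions held in the device
// workspace and write the result into grad_stepSize[0], also on the device.
void reduceStepSizeGrad(const float* workspace, float* grad_stepSize, std::size_t inputLength)
{
    thrust::device_ptr<const float> contributions(workspace);
    thrust::device_ptr<float> result(grad_stepSize);
    result[0] = thrust::reduce(contributions, contributions + inputLength, 0.0f);
}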