diff --git a/unit_tests/optimizer/Test_Adam.cpp b/unit_tests/optimizer/Test_Adam.cpp
index bd297903d47b90b755ff59ace0e052aa62c309d7..2e9eb8be1cd1a4acbe60f55d1b1c28da412d8a80 100644
--- a/unit_tests/optimizer/Test_Adam.cpp
+++ b/unit_tests/optimizer/Test_Adam.cpp
@@ -30,6 +30,15 @@
 #include "aidge/backend/cpu/operator/SqrtImpl.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
+#if USE_AIDGE_BACKEND_CUDA
+#include "aidge/backend/cuda/data/TensorImpl.hpp"
+#include "aidge/backend/cuda/operator/AddImpl.hpp"
+#include "aidge/backend/cuda/operator/MulImpl.hpp"
+#include "aidge/backend/cuda/operator/SubImpl.hpp"
+#include "aidge/backend/cuda/operator/DivImpl.hpp"
+#include "aidge/backend/cuda/operator/SqrtImpl.hpp"
+#endif
+
 namespace Aidge {
 TEST_CASE("[learning/Adam] update", "[Optimizer][Adam]") {
     constexpr std::uint16_t NBTRIALS = 10;
@@ -41,105 +50,242 @@ TEST_CASE("[learning/Adam] update", "[Optimizer][Adam]") {
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
     std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
 
+    SECTION("CPU") {
+        for (std::size_t trial = 0; trial < NBTRIALS; ++trial) {
+            // create a random number of Tensor with random dims and random values
+            // Create random Tensor, Random Gradient and random
+            const std::size_t nb_tensors = dimSizeDist(gen);
+            std::vector<std::size_t> size_tensors(nb_tensors, 1);
-    for (std::size_t trial = 0; trial < NBTRIALS; ++trial) {
-        // create a random number of Tensor with random dims and random values
-        // Create random Tensor, Random Gradient and random
-        const std::size_t nb_tensors = dimSizeDist(gen);
-        std::vector<std::size_t> size_tensors(nb_tensors, 1);
 
+            std::vector<std::shared_ptr<Tensor>> tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_tensors(nb_tensors);
-        std::vector<std::shared_ptr<Tensor>> tensors(nb_tensors);
-        std::vector<std::unique_ptr<float[]>> val_tensors(nb_tensors);
 
+            std::vector<std::shared_ptr<Tensor>> optim_tensors(nb_tensors);
-        std::vector<std::shared_ptr<Tensor>> optim_tensors(nb_tensors);
 
+            std::vector<std::shared_ptr<Tensor>> grad_tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_grad_tensors(nb_tensors);
-        std::vector<std::shared_ptr<Tensor>> grad_tensors(nb_tensors);
-        std::vector<std::unique_ptr<float[]>> val_grad_tensors(nb_tensors);
 
+            std::vector<std::shared_ptr<Tensor>> momentum_tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_momentum1_tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_momentum2_tensors(nb_tensors);
-        std::vector<std::shared_ptr<Tensor>> momentum_tensors(nb_tensors);
-        std::vector<std::unique_ptr<float[]>> val_momentum1_tensors(nb_tensors);
-        std::vector<std::unique_ptr<float[]>> val_momentum2_tensors(nb_tensors);
 
+            for (std::size_t i = 0; i < nb_tensors; ++i) {
+                std::vector<std::size_t> dims(nbDimsDist(gen));
+                for (std::size_t d = 0; d < dims.size(); ++d) {
+                    dims[d] = dimSizeDist(gen);
+                    size_tensors[i] *= dims[d];
+                }
-        for (std::size_t i = 0; i < nb_tensors; ++i) {
-            std::vector<std::size_t> dims(nbDimsDist(gen));
-            for (std::size_t d = 0; d < dims.size(); ++d) {
-                dims[d] = dimSizeDist(gen);
-                size_tensors[i] *= dims[d];
-            }
 
+                val_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                val_grad_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                val_momentum1_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                val_momentum2_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                for (std::size_t j = 0; j < size_tensors[i]; ++j) {
+                    val_tensors[i][j] = valueDist(gen);
+                    val_grad_tensors[i][j] = valueDist(gen);
+                    val_momentum1_tensors[i][j] = 0.0f;
+                    val_momentum2_tensors[i][j] = 0.0f;
+                }
+                tensors[i] = std::make_shared<Tensor>(dims);
+                tensors[i]->setBackend("cpu");
+                tensors[i]->getImpl()->setRawPtr(val_tensors[i].get(), size_tensors[i]);
+                optim_tensors[i] = std::make_shared<Tensor>(dims);
+                optim_tensors[i]->setBackend("cpu");
+                optim_tensors[i]->getImpl()->copy(val_tensors[i].get(), size_tensors[i]);
+                // optim_tensors[i]->initGrad();
-            val_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
-            val_grad_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
-            val_momentum1_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
-            val_momentum2_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
-            for (std::size_t j = 0; j < size_tensors[i]; ++j) {
-                val_tensors[i][j] = valueDist(gen);
-                val_grad_tensors[i][j] = valueDist(gen);
-                val_momentum1_tensors[i][j] = 0.0f;
-                val_momentum2_tensors[i][j] = 0.0f;
-            }
-            tensors[i] = std::make_shared<Tensor>(dims);
-            tensors[i]->setBackend("cpu");
-            tensors[i]->getImpl()->setRawPtr(val_tensors[i].get(), size_tensors[i]);
-            optim_tensors[i] = std::make_shared<Tensor>(dims);
-            optim_tensors[i]->setBackend("cpu");
-            optim_tensors[i]->getImpl()->copy(val_tensors[i].get(), size_tensors[i]);
-            // optim_tensors[i]->initGrad();
-
-            grad_tensors[i] = std::make_shared<Tensor>(dims);
-            grad_tensors[i]->setBackend("cpu");
-            grad_tensors[i]->getImpl()->setRawPtr(val_grad_tensors[i].get(), size_tensors[i]);
-
-            momentum_tensors[i] = std::make_shared<Tensor>(dims);
-            momentum_tensors[i]->setBackend("cpu");
-            momentum_tensors[i]->getImpl()->setRawPtr(val_momentum1_tensors[i].get(), size_tensors[i]);
-            momentum_tensors[i]->getImpl()->setRawPtr(val_momentum2_tensors[i].get(), size_tensors[i]);
-
-            REQUIRE((tensors[i]->hasImpl() &&
-                     optim_tensors[i]->hasImpl() &&
-                     grad_tensors[i]->hasImpl()));
-        }
+                grad_tensors[i] = std::make_shared<Tensor>(dims);
+                grad_tensors[i]->setBackend("cpu");
+                grad_tensors[i]->getImpl()->setRawPtr(val_grad_tensors[i].get(), size_tensors[i]);
-        // generate parameters
-        float lr = paramDist(gen);
-        float beta1 = paramDist(gen);
-        float beta2 = paramDist(gen);
-        float epsilon = paramDist(gen);
-
-        // set Optimizer
-        Adam opt = Adam(beta1, beta2, epsilon);
-        opt.setParameters(optim_tensors);
-        for (std::size_t t = 0; t < nb_tensors; ++t) {
-            optim_tensors[t]->grad()->getImpl()->setRawPtr(val_grad_tensors[t].get(), size_tensors[t]);
-        }
-        opt.setLearningRateScheduler(learning::ConstantLR(lr));
+                momentum_tensors[i] = std::make_shared<Tensor>(dims);
+                momentum_tensors[i]->setBackend("cpu");
+                momentum_tensors[i]->getImpl()->setRawPtr(val_momentum1_tensors[i].get(), size_tensors[i]);
+                momentum_tensors[i]->getImpl()->setRawPtr(val_momentum2_tensors[i].get(), size_tensors[i]);
-        for (std::size_t t = 0; t < nb_tensors; ++t) {
-            const Tensor tmpt1= *(opt.parameters().at(t));
-            const Tensor tmpt2= *tensors[t];
-            REQUIRE(approxEq<float,float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
-        }
+                REQUIRE((tensors[i]->hasImpl() &&
+                         optim_tensors[i]->hasImpl() &&
+                         grad_tensors[i]->hasImpl()));
+            }
+
+            // generate parameters
+            float lr = paramDist(gen);
+            float beta1 = paramDist(gen);
+            float beta2 = paramDist(gen);
+            float epsilon = paramDist(gen);
-        for (std::size_t step = 0; step < 10; ++step) {
-            // truth
-            float lr2 = lr * std::sqrt(1.0f - std::pow(beta2, step + 1)) / (1.0f - std::pow(beta1, step + 1));
-            float epsilon2 = epsilon * std::sqrt(1.0f - std::pow(beta2, step + 1));
+            // set Optimizer
+            Adam opt = Adam(beta1, beta2, epsilon);
+            opt.setParameters(optim_tensors);
             for (std::size_t t = 0; t < nb_tensors; ++t) {
-                for (std::size_t i = 0; i < size_tensors[t]; ++i) {
-                    val_momentum1_tensors[t][i] = beta1 * val_momentum1_tensors[t][i] + (1.0f - beta1) * val_grad_tensors[t][i];
-                    val_momentum2_tensors[t][i] = beta2 * val_momentum2_tensors[t][i] + (1.0f - beta2) * val_grad_tensors[t][i] * val_grad_tensors[t][i];
-                    val_tensors[t][i] = val_tensors[t][i]
-                    - lr2 * val_momentum1_tensors[t][i] / (std::sqrt(val_momentum2_tensors[t][i]) + epsilon2);
-                }
+                optim_tensors[t]->grad()->getImpl()->setRawPtr(val_grad_tensors[t].get(), size_tensors[t]);
             }
-            // optimizer
-            opt.update();
-            // tests
+            opt.setLearningRateScheduler(learning::ConstantLR(lr));
+
             for (std::size_t t = 0; t < nb_tensors; ++t) {
                 const Tensor tmpt1= *(opt.parameters().at(t));
                 const Tensor tmpt2= *tensors[t];
                 REQUIRE(approxEq<float,float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
             }
+
+            for (std::size_t step = 0; step < 10; ++step) {
+                // truth
+                float lr2 = lr * std::sqrt(1.0f - std::pow(beta2, step + 1)) / (1.0f - std::pow(beta1, step + 1));
+                float epsilon2 = epsilon * std::sqrt(1.0f - std::pow(beta2, step + 1));
+                for (std::size_t t = 0; t < nb_tensors; ++t) {
+                    for (std::size_t i = 0; i < size_tensors[t]; ++i) {
+                        val_momentum1_tensors[t][i] = beta1 * val_momentum1_tensors[t][i] + (1.0f - beta1) * val_grad_tensors[t][i];
+                        val_momentum2_tensors[t][i] = beta2 * val_momentum2_tensors[t][i] + (1.0f - beta2) * val_grad_tensors[t][i] * val_grad_tensors[t][i];
+                        val_tensors[t][i] = val_tensors[t][i]
+                        - lr2 * val_momentum1_tensors[t][i] / (std::sqrt(val_momentum2_tensors[t][i]) + epsilon2);
+                    }
+                }
+                // optimizer
+                opt.update();
+                // tests
+                for (std::size_t t = 0; t < nb_tensors; ++t) {
+                    const Tensor tmpt1= *(opt.parameters().at(t));
+                    const Tensor tmpt2= *tensors[t];
+                    REQUIRE(approxEq<float,float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
+                }
+            }
+        }
+    }
+
+#if USE_AIDGE_BACKEND_CUDA
+    SECTION("CUDA") {
+        for (std::size_t trial = 0; trial < NBTRIALS; ++trial) {
+            // create a random number of Tensor with random dims and random values
+            // Create random Tensor, Random Gradient and random
+            const std::size_t nb_tensors = dimSizeDist(gen);
+            std::vector<std::size_t> size_tensors(nb_tensors, 1);
+
+            std::vector<std::shared_ptr<Tensor>> tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_tensors(nb_tensors);
+
+            std::vector<std::shared_ptr<Tensor>> optim_tensors(nb_tensors);
+
+            std::vector<std::shared_ptr<Tensor>> grad_tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_grad_tensors(nb_tensors);
+
+            std::vector<std::shared_ptr<Tensor>> momentum_tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_momentum1_tensors(nb_tensors);
+            std::vector<std::unique_ptr<float[]>> val_momentum2_tensors(nb_tensors);
+
+            // Device pointers
+            std::vector<float*> d_val_tensors(nb_tensors);
+            std::vector<float*> d_val_grad_tensors(nb_tensors);
+            std::vector<float*> d_val_momentum1_tensors(nb_tensors);
+            std::vector<float*> d_val_momentum2_tensors(nb_tensors);
+
+            for (std::size_t i = 0; i < nb_tensors; ++i) {
+                std::vector<std::size_t> dims(nbDimsDist(gen));
+                for (std::size_t d = 0; d < dims.size(); ++d) {
+                    dims[d] = dimSizeDist(gen);
+                    size_tensors[i] *= dims[d];
+                }
+
+                val_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                val_grad_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                val_momentum1_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                val_momentum2_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+                for (std::size_t j = 0; j < size_tensors[i]; ++j) {
+                    val_tensors[i][j] = valueDist(gen);
+                    val_grad_tensors[i][j] = valueDist(gen);
+                    val_momentum1_tensors[i][j] = 0.0f;
+                    val_momentum2_tensors[i][j] = 0.0f;
+                }
+
+                // Allocate device memory
+                cudaMalloc(&d_val_tensors[i], size_tensors[i] * sizeof(float));
+                cudaMalloc(&d_val_grad_tensors[i], size_tensors[i] * sizeof(float));
+                cudaMalloc(&d_val_momentum1_tensors[i], size_tensors[i] * sizeof(float));
+                cudaMalloc(&d_val_momentum2_tensors[i], size_tensors[i] * sizeof(float));
+
+                // Copy data to device
+                cudaMemcpy(d_val_tensors[i], val_tensors[i].get(), size_tensors[i] * sizeof(float), cudaMemcpyHostToDevice);
+                cudaMemcpy(d_val_grad_tensors[i], val_grad_tensors[i].get(), size_tensors[i] * sizeof(float), cudaMemcpyHostToDevice);
+                cudaMemcpy(d_val_momentum1_tensors[i], val_momentum1_tensors[i].get(), size_tensors[i] * sizeof(float), cudaMemcpyHostToDevice);
+                cudaMemcpy(d_val_momentum2_tensors[i], val_momentum2_tensors[i].get(), size_tensors[i] * sizeof(float), cudaMemcpyHostToDevice);
+
+                tensors[i] = std::make_shared<Tensor>(dims);
+                tensors[i]->setBackend("cuda");
+                tensors[i]->getImpl()->setRawPtr(d_val_tensors[i], size_tensors[i]);
+
+                optim_tensors[i] = std::make_shared<Tensor>(dims);
+                optim_tensors[i]->setBackend("cuda");
+                optim_tensors[i]->getImpl()->copy(d_val_tensors[i], size_tensors[i]);
+                // optim_tensors[i]->initGrad();
+
+                grad_tensors[i] = std::make_shared<Tensor>(dims);
+                grad_tensors[i]->setBackend("cuda");
+                grad_tensors[i]->getImpl()->setRawPtr(d_val_grad_tensors[i], size_tensors[i]);
+
+                momentum_tensors[i] = std::make_shared<Tensor>(dims);
+                momentum_tensors[i]->setBackend("cuda");
+                momentum_tensors[i]->getImpl()->setRawPtr(d_val_momentum1_tensors[i], size_tensors[i]);
+                momentum_tensors[i]->getImpl()->setRawPtr(d_val_momentum2_tensors[i], size_tensors[i]);
+
+                REQUIRE((tensors[i]->hasImpl() &&
+                         optim_tensors[i]->hasImpl() &&
+                         grad_tensors[i]->hasImpl()));
+            }
+
+            // generate parameters
+            float lr = paramDist(gen);
+            float beta1 = paramDist(gen);
+            float beta2 = paramDist(gen);
+            float epsilon = paramDist(gen);
+
+            // set Optimizer
+            Adam opt = Adam(beta1, beta2, epsilon);
+            opt.setParameters(optim_tensors);
+            for (std::size_t t = 0; t < nb_tensors; ++t) {
+                optim_tensors[t]->grad()->getImpl()->setRawPtr(d_val_grad_tensors[t], size_tensors[t]);
+            }
+            opt.setLearningRateScheduler(learning::ConstantLR(lr));
+
+            for (std::size_t t = 0; t < nb_tensors; ++t) {
+                std::vector<float> temp1(opt.parameters().at(t)->size());
+                cudaMemcpy(temp1.data(), opt.parameters().at(t)->getImpl()->rawPtr(), sizeof(float) * opt.parameters().at(t)->size(), cudaMemcpyDeviceToHost);
+                std::vector<float> temp2(tensors[t]->size());
+                cudaMemcpy(temp2.data(), tensors[t]->getImpl()->rawPtr(), sizeof(float) * tensors[t]->size(), cudaMemcpyDeviceToHost);
+                for (std::size_t i = 0; i < tensors[t]->size(); ++i) {
+                    REQUIRE(std::abs(temp1[i] - temp2[i]) <= (1e-8f + 1e-5f * std::abs(temp1[i]))); // same tolerances as approxEq
+                }
+            }
+
+            for (std::size_t step = 0; step < 10; ++step) {
+                // truth
+                float lr2 = lr * std::sqrt(1.0f - std::pow(beta2, step + 1)) / (1.0f - std::pow(beta1, step + 1));
+                float epsilon2 = epsilon * std::sqrt(1.0f - std::pow(beta2, step + 1));
+                for (std::size_t t = 0; t < nb_tensors; ++t) {
+                    for (std::size_t i = 0; i < size_tensors[t]; ++i) {
+                        val_momentum1_tensors[t][i] = beta1 * val_momentum1_tensors[t][i] + (1.0f - beta1) * val_grad_tensors[t][i];
+                        val_momentum2_tensors[t][i] = beta2 * val_momentum2_tensors[t][i] + (1.0f - beta2) * val_grad_tensors[t][i] * val_grad_tensors[t][i];
+                        val_tensors[t][i] = val_tensors[t][i]
+                        - lr2 * val_momentum1_tensors[t][i] / (std::sqrt(val_momentum2_tensors[t][i]) + epsilon2);
+                    }
+                    cudaMemcpy(d_val_momentum1_tensors[t], val_momentum1_tensors[t].get(), size_tensors[t] * sizeof(float), cudaMemcpyHostToDevice);
+                    cudaMemcpy(d_val_momentum2_tensors[t], val_momentum2_tensors[t].get(), size_tensors[t] * sizeof(float), cudaMemcpyHostToDevice);
+                    cudaMemcpy(d_val_tensors[t], val_tensors[t].get(), size_tensors[t] * sizeof(float), cudaMemcpyHostToDevice);
+                }
+
+                // optimizer
+                opt.update();
+                // tests
+                for (std::size_t t = 0; t < nb_tensors; ++t) {
+                    std::vector<float> temp1(opt.parameters().at(t)->size());
+                    cudaMemcpy(temp1.data(), opt.parameters().at(t)->getImpl()->rawPtr(), sizeof(float) * opt.parameters().at(t)->size(), cudaMemcpyDeviceToHost);
+                    std::vector<float> temp2(tensors[t]->size());
+                    cudaMemcpy(temp2.data(), tensors[t]->getImpl()->rawPtr(), sizeof(float) * tensors[t]->size(), cudaMemcpyDeviceToHost);
+                    for (std::size_t i = 0; i < tensors[t]->size(); ++i) {
+                        REQUIRE(std::abs(temp1[i] - temp2[i]) <= (1e-8f + 1e-5f * std::abs(temp1[i]))); // same tolerances as approxEq
+                    }
+                }
+            }
         }
     }
+#endif
 }
 } // namespace Aidge
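
For reference, the "truth" values that both the CPU and CUDA sections recompute each step follow the standard bias-corrected Adam update. The sketch below is illustrative only and not part of the patch (the free function adamStep is a made-up name); it restates the same formulas the test uses, with a 0-based step index as in the loops above.

// Minimal standalone sketch of the reference Adam step mirrored by the test's "truth" loops.
#include <cmath>
#include <cstddef>
#include <vector>

void adamStep(std::vector<float>& w, const std::vector<float>& g,
              std::vector<float>& m, std::vector<float>& v,
              float lr, float beta1, float beta2, float epsilon, std::size_t step) {
    // Fold the bias correction into the learning rate and epsilon (lr2/epsilon2 in the test).
    const float lr2 = lr * std::sqrt(1.0f - std::pow(beta2, step + 1)) / (1.0f - std::pow(beta1, step + 1));
    const float epsilon2 = epsilon * std::sqrt(1.0f - std::pow(beta2, step + 1));
    for (std::size_t i = 0; i < w.size(); ++i) {
        m[i] = beta1 * m[i] + (1.0f - beta1) * g[i];        // first moment
        v[i] = beta2 * v[i] + (1.0f - beta2) * g[i] * g[i]; // second moment
        w[i] -= lr2 * m[i] / (std::sqrt(v[i]) + epsilon2);  // parameter update
    }
}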