diff --git a/include/aidge/learning/optimizer/Adam.hpp b/include/aidge/learning/optimizer/Adam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..550c10e2e6b9e87b48428ce47a7eb78fac669f37
--- /dev/null
+++ b/include/aidge/learning/optimizer/Adam.hpp
@@ -0,0 +1,145 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CORE_OPTIMIZER_ADAM_H_
+#define AIDGE_CORE_OPTIMIZER_ADAM_H_
+
+#include <functional>
+#include <memory>
+#include <vector>
+#include <cmath>  // std::sqrt, std::pow
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/learning/optimizer/Optimizer.hpp"
+#include "aidge/utils/StaticAttributes.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+//#include "aidge/operator/Sqrt.hpp"
+
+namespace Aidge {
+
+enum class AdamAttr {
+    Beta1,
+    Beta2,
+    Epsilon
+};
+
+class Adam: public Optimizer, public StaticAttributes<AdamAttr, float, float, float> {
+private:
+    std::vector<Tensor> mMomentum1;
+    std::vector<Tensor> mMomentum2;
+    Tensor mLR{std::vector<std::size_t>({1})};
+    Tensor mBeta1{std::vector<std::size_t>({1})};
+    Tensor mReversedBeta1{std::vector<std::size_t>({1})};
+    Tensor mBeta2{std::vector<std::size_t>({1})};
+    Tensor mReversedBeta2{std::vector<std::size_t>({1})};
+    Tensor mEpsilon{std::vector<std::size_t>({1})};
+
+public:
+    using Attributes_ = StaticAttributes<AdamAttr, float, float, float>;
+    template <AdamAttr e>
+    using attr = typename Attributes_::template attr<e>;
+
+    Adam(const float beta1 = 0.9f, const float beta2 = 0.999f, const float epsilon = 1.0e-8f)
+        : Optimizer(),
+          Attributes_(attr<AdamAttr::Beta1>(beta1),
+                      attr<AdamAttr::Beta2>(beta2),
+                      attr<AdamAttr::Epsilon>(epsilon))
+    {
+        mBeta1.setBackend("cpu");
+        mBeta1.set<float>(0, beta1);
+        mReversedBeta1.setBackend("cpu");
+        mReversedBeta1.set<float>(0, 1.0f - beta1);
+
+        mBeta2.setBackend("cpu");
+        mBeta2.set<float>(0, beta2);
+        mReversedBeta2.setBackend("cpu");
+        mReversedBeta2.set<float>(0, 1.0f - beta2);
+
+        mEpsilon.setBackend("cpu");
+        mEpsilon.set<float>(0, epsilon);
+    }
+
+    void update() override final {
+        mLR.setBackend(mParameters[0]->getImpl()->backend());
+        mLR.set<float>(0, learningRate());
+        if (mParameters[0]->getImpl()->backend() != mBeta1.getImpl()->backend()) {
+            mBeta1.setBackend(mParameters[0]->getImpl()->backend());
+            mReversedBeta1.setBackend(mParameters[0]->getImpl()->backend());
+            mBeta2.setBackend(mParameters[0]->getImpl()->backend());
+            mReversedBeta2.setBackend(mParameters[0]->getImpl()->backend());
+        }
+
+        Tensor alpha{std::vector<std::size_t>({1})};
+        alpha.setBackend(mParameters[0]->getImpl()->backend());
+        alpha.set<float>(0, learningRate()
+                                * std::sqrt(1.0f - std::pow(mBeta2.get<float>(0), mLRScheduler.step() + 1))
+                                / (1.0f - std::pow(mBeta1.get<float>(0), mLRScheduler.step() + 1)));
+
+        Tensor epsilon{std::vector<std::size_t>({1})};
+        epsilon.setBackend(mParameters[0]->getImpl()->backend());
+        epsilon.set<float>(0, mEpsilon.get<float>(0) * std::sqrt(1.0f - std::pow(mBeta2.get<float>(0), mLRScheduler.step() + 1)));
+
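+        // Adam keeps one running average per parameter of the gradient (mMomentum1, "m")
+        // and of the squared gradient (mMomentum2, "v"). The two scalars computed above fold
+        // the bias corrections into the step size:
+        //     alpha_t   = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
+        //     epsilon_t = epsilon * sqrt(1 - beta2^t)
+        // so that the update applied below, p <- p - alpha_t * m / (sqrt(v) + epsilon_t),
+        // is equivalent to the textbook bias-corrected rule p <- p - lr * m_hat / (sqrt(v_hat) + epsilon).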
+        if (mLRScheduler.step() == 0) {
+            for (std::size_t i = 0; i < mParameters.size(); ++i) {
+                mMomentum1[i].setBackend(mParameters[i]->getImpl()->backend());
+                mMomentum1[i].setDataType(mParameters[i]->grad()->dataType());
+                mMomentum1[i].zeros();
+                mMomentum2[i].setBackend(mParameters[i]->getImpl()->backend());
+                mMomentum2[i].setDataType(mParameters[i]->grad()->dataType());
+                mMomentum2[i].zeros();
+            }
+        }
+
+        for (std::size_t i = 0; i < mParameters.size(); ++i) {
+            mMomentum1[i] = mBeta1 * mMomentum1[i] + mReversedBeta1 * (*mParameters[i]->grad());
+            mMomentum2[i] = mBeta2 * mMomentum2[i] + mReversedBeta2 * (*mParameters[i]->grad()) * (*mParameters[i]->grad());
+
+            /*
+            auto sqrt_ = Sqrt_Op();
+            sqrt_.associateInput(0, std::make_shared<Tensor>(mMomentum2[i]));
+            sqrt_.setDataType(mMomentum2[i].dataType());
+            sqrt_.setDataFormat(mMomentum2[i].dataFormat());
+            sqrt_.setBackend(mMomentum2[i].getImpl()->backend());
+            sqrt_.forward();
+            const std::shared_ptr<Tensor> sqrt_mMomentum2 = sqrt_.getOutput(0);
+
+            *mParameters[i] = *mParameters[i] - alpha * mMomentum1[i] / (*sqrt_mMomentum2 + epsilon);
+            */
+
+            *mParameters[i] = *mParameters[i] - alpha * mMomentum1[i] / (mMomentum2[i].sqrt() + epsilon);
+        }
+
+        mLRScheduler.update();
+    }
+
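+    // Allocates one pair of moment tensors per parameter; their values are zeroed on the
+    // parameters' backend at the first call to update() (see the step == 0 branch above).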
+    void setParameters(const std::vector<std::shared_ptr<Tensor>>& parameters) override final {
+        Optimizer::setParameters(parameters);
+        mMomentum1 = std::vector<Tensor>(parameters.size());
+        mMomentum2 = std::vector<Tensor>(parameters.size());
+        for (std::size_t i = 0; i < parameters.size(); ++i) {
+            mMomentum1[i] = Tensor(parameters[i]->dims());
+            mMomentum2[i] = Tensor(parameters[i]->dims());
+        }
+    }
+};
+
+} // namespace Aidge
+
+
+namespace {
+template <>
+const char *const EnumStrings<Aidge::AdamAttr>::data[] = {
+    "Beta1",
+    "Beta2",
+    "Epsilon"
+};
+}
+#endif // AIDGE_CORE_OPTIMIZER_ADAM_H_
diff --git a/python_binding/learning/optimizer/pybind_Adam.cpp b/python_binding/learning/optimizer/pybind_Adam.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..03e8de97b06f40a7430294e6e55509178697450a
--- /dev/null
+++ b/python_binding/learning/optimizer/pybind_Adam.cpp
@@ -0,0 +1,27 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <pybind11/pybind11.h>
+
+#include "aidge/learning/optimizer/Optimizer.hpp"
+#include "aidge/learning/optimizer/Adam.hpp"
+
+namespace py = pybind11;
+namespace Aidge {
+// namespace learning {
+
+void init_Adam(py::module& m) {
+    py::class_<Adam, std::shared_ptr<Adam>, Attributes, Optimizer>(m, "Adam", py::multiple_inheritance())
+        .def(py::init<float, float, float>(), py::arg("beta1") = 0.9f, py::arg("beta2") = 0.999f, py::arg("epsilon") = 1.0e-8f)
+        .def("update", &Adam::update);
+}
+// } // namespace learning
+} // namespace Aidge
diff --git a/python_binding/pybind_learning.cpp b/python_binding/pybind_learning.cpp
index 3b4a16ceffb0db7bd7e1d407bcef5d5df830cb2f..c0566dd1bd4bcfc32977ad3372018d00a9c54259 100644
--- a/python_binding/pybind_learning.cpp
+++ b/python_binding/pybind_learning.cpp
@@ -19,12 +19,14 @@ namespace Aidge {
 void init_Loss(py::module&);
 void init_Optimizer(py::module&);
 void init_SGD(py::module&);
+void init_Adam(py::module&);
 void init_LRScheduler(py::module&);
 
 void init_Aidge(py::module& m) {
     init_Loss(m);
     init_Optimizer(m);
     init_SGD(m);
+    init_Adam(m);
     init_LRScheduler(m);
 }
diff --git a/unit_tests/optimizer/Test_Adam.cpp b/unit_tests/optimizer/Test_Adam.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5f516728057df42cfb55ad2fb0172dfd611cfdeb
--- /dev/null
+++ b/unit_tests/optimizer/Test_Adam.cpp
@@ -0,0 +1,142 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::uint16_t
+#include <cmath>    // std::sqrt, std::pow
+#include <memory>
+#include <random>   // std::random_device, std::mt19937, std::uniform_int_distribution
+#include <set>
+#include <vector>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/learning/learningRate/LRScheduler.hpp"
+#include "aidge/learning/learningRate/LRSchedulerList.hpp"
+#include "aidge/learning/optimizer/Optimizer.hpp"
+#include "aidge/learning/optimizer/Adam.hpp"
+//#include "aidge/backend/cpu/operator/AddImpl.hpp"
+//#include "aidge/backend/cpu/operator/MulImpl.hpp"
+//#include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+TEST_CASE("[learning/Adam] update", "[Optimizer][Adam]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.0f);   // random parameter and gradient values in [0.1, 1.0)
+    std::uniform_real_distribution<float> paramDist(0.001f, 1.0f); // random hyperparameters in [0.001, 1.0)
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+
+    for (std::size_t trial = 0; trial < NBTRIALS; ++trial) {
+        // Create a random number of Tensors with random dims and values, together with
+        // matching gradient buffers and zero-initialized momentum buffers.
+        const std::size_t nb_tensors = dimSizeDist(gen);
+        std::vector<std::size_t> size_tensors(nb_tensors, 1);
+
+        std::vector<std::shared_ptr<Tensor>> tensors(nb_tensors);
+        std::vector<std::unique_ptr<float[]>> val_tensors(nb_tensors);
+
+        std::vector<std::shared_ptr<Tensor>> optim_tensors(nb_tensors);
+
+        std::vector<std::shared_ptr<Tensor>> grad_tensors(nb_tensors);
+        std::vector<std::unique_ptr<float[]>> val_grad_tensors(nb_tensors);
+
+        std::vector<std::shared_ptr<Tensor>> momentum_tensors(nb_tensors);
+        std::vector<std::unique_ptr<float[]>> val_momentum1_tensors(nb_tensors);
+        std::vector<std::unique_ptr<float[]>> val_momentum2_tensors(nb_tensors);
+
+        for (std::size_t i = 0; i < nb_tensors; ++i) {
+            std::vector<std::size_t> dims(nbDimsDist(gen));
+            for (std::size_t d = 0; d < dims.size(); ++d) {
+                dims[d] = dimSizeDist(gen);
+                size_tensors[i] *= dims[d];
+            }
+
+            val_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+            val_grad_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+            val_momentum1_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+            val_momentum2_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
+            for (std::size_t j = 0; j < size_tensors[i]; ++j) {
+                val_tensors[i][j] = valueDist(gen);
+                val_grad_tensors[i][j] = valueDist(gen);
+                val_momentum1_tensors[i][j] = 0.0f;
+                val_momentum2_tensors[i][j] = 0.0f;
+            }
+            tensors[i] = std::make_shared<Tensor>(dims);
+            tensors[i]->setBackend("cpu");
+            tensors[i]->getImpl()->setRawPtr(val_tensors[i].get(), size_tensors[i]);
+            optim_tensors[i] = std::make_shared<Tensor>(dims);
+            optim_tensors[i]->setBackend("cpu");
+            optim_tensors[i]->getImpl()->copy(val_tensors[i].get(), size_tensors[i]);
+            optim_tensors[i]->initGrad();
+
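+            // The same raw gradient buffers (val_grad_tensors) are attached below to
+            // grad_tensors and to the optimizer parameters' grad(), so the optimizer and
+            // the reference computation read identical gradient values.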
+            grad_tensors[i] = std::make_shared<Tensor>(dims);
+            grad_tensors[i]->setBackend("cpu");
+            grad_tensors[i]->getImpl()->setRawPtr(val_grad_tensors[i].get(), size_tensors[i]);
+
+            momentum_tensors[i] = std::make_shared<Tensor>(dims);
+            momentum_tensors[i]->setBackend("cpu");
+            momentum_tensors[i]->getImpl()->setRawPtr(val_momentum1_tensors[i].get(), size_tensors[i]);
+            momentum_tensors[i]->getImpl()->setRawPtr(val_momentum2_tensors[i].get(), size_tensors[i]);
+
+            REQUIRE((tensors[i]->hasImpl() &&
+                     optim_tensors[i]->hasImpl() &&
+                     grad_tensors[i]->hasImpl()));
+        }
+
+        // generate random hyperparameters
+        float lr = paramDist(gen);
+        float beta1 = paramDist(gen);
+        float beta2 = paramDist(gen);
+        float epsilon = paramDist(gen);
+
+        // set up the Optimizer under test
+        Adam opt = Adam(beta1, beta2, epsilon);
+        opt.setParameters(optim_tensors);
+        for (std::size_t t = 0; t < nb_tensors; ++t) {
+            optim_tensors[t]->grad()->getImpl()->setRawPtr(val_grad_tensors[t].get(), size_tensors[t]);
+        }
+        opt.setLearningRateScheduler(learning::ConstantLR(lr));
+
+        for (std::size_t t = 0; t < nb_tensors; ++t) {
+            const Tensor tmpt1 = *(opt.parameters().at(t));
+            const Tensor tmpt2 = *tensors[t];
+            REQUIRE(approxEq<float, float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
+        }
+
+        // reference values: textbook Adam update with explicit bias correction
+        for (std::size_t step = 0; step < 10; ++step) {
+            for (std::size_t t = 0; t < nb_tensors; ++t) {
+                for (std::size_t i = 0; i < size_tensors[t]; ++i) {
+                    val_momentum1_tensors[t][i] = beta1 * val_momentum1_tensors[t][i] + (1.0f - beta1) * val_grad_tensors[t][i];
+                    val_momentum2_tensors[t][i] = beta2 * val_momentum2_tensors[t][i] + (1.0f - beta2) * val_grad_tensors[t][i] * val_grad_tensors[t][i];
+                    val_tensors[t][i] = val_tensors[t][i]
+                                      - lr * val_momentum1_tensors[t][i] / (1.0f - std::pow(beta1, step + 1))
+                                        / (std::sqrt(val_momentum2_tensors[t][i] / (1.0f - std::pow(beta2, step + 1))) + epsilon);
+                }
+            }
+            // optimizer step under test
+            opt.update();
+            // compare optimizer parameters against the reference values
+            for (std::size_t t = 0; t < nb_tensors; ++t) {
+                const Tensor tmpt1 = *(opt.parameters().at(t));
+                const Tensor tmpt2 = *tensors[t];
+                REQUIRE(approxEq<float, float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
+            }
+        }
+    }
+}
+} // namespace Aidge
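A minimal usage sketch of the new optimizer, restricted to the API exercised by the unit test above (the Adam constructor, setParameters(), setLearningRateScheduler() with learning::ConstantLR, and update()); the `params` vector, `nbSteps`, and the surrounding training loop are assumed placeholders:

    // Hypothetical training-loop excerpt; `params` is assumed to hold the model's parameter
    // tensors, with each parameter's grad() filled by a backward pass before every step.
    std::vector<std::shared_ptr<Aidge::Tensor>> params = /* model parameters */;
    const std::size_t nbSteps = 100;

    Aidge::Adam opt(0.9f, 0.999f, 1.0e-8f);  // beta1, beta2, epsilon
    opt.setParameters(params);
    opt.setLearningRateScheduler(Aidge::learning::ConstantLR(0.001f));

    for (std::size_t step = 0; step < nbSteps; ++step) {
        // ... forward pass, loss, backward pass updating each parameter's grad() ...
        opt.update();  // apply one Adam step to all registered parameters
    }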