Commit 51e3cf9e authored by Olivier BICHLER

Merge branch 'Adam_optimizer' into 'dev'

Add Adam optimizer

See merge request !9
parents 6f45a121 b76944f7
2 merge requests: !10 version 0.1.2, !9 Add Adam optimizer
Pipeline #49082 passed
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CORE_OPTIMIZER_ADAM_H_
#define AIDGE_CORE_OPTIMIZER_ADAM_H_
#include <functional>
#include <memory>
#include <vector>
#include <cmath> // std::sqrt, std::pow
#include "aidge/data/Tensor.hpp"
#include "aidge/learning/optimizer/Optimizer.hpp"
#include "aidge/utils/StaticAttributes.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
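// Static attributes of the Adam optimizer: exponential decay rates for the first
// and second moment estimates (Beta1, Beta2) and the numerical-stability term (Epsilon).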
enum class AdamAttr {
    Beta1,
    Beta2,
    Epsilon
};
class Adam: public Optimizer, public StaticAttributes<AdamAttr, float, float, float> {
private:
    std::vector<Tensor> mMomentum1;
    std::vector<Tensor> mMomentum2;
    Tensor mLR{std::vector<std::size_t>({1})};
    Tensor mBeta1{std::vector<std::size_t>({1})};
    Tensor mReversedBeta1{std::vector<std::size_t>({1})};
    Tensor mBeta2{std::vector<std::size_t>({1})};
    Tensor mReversedBeta2{std::vector<std::size_t>({1})};
    Tensor mEpsilon{std::vector<std::size_t>({1})};

public:
    using Attributes_ = StaticAttributes<AdamAttr, float, float, float>;
    template <AdamAttr e>
    using attr = typename Attributes_::template attr<e>;
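    // Defaults follow the original Adam paper (Kingma & Ba, 2015): beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8.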
    Adam(const float beta1 = 0.9f, const float beta2 = 0.999f, const float epsilon = 1.0e-8f)
        : Optimizer(),
          Attributes_(attr<AdamAttr::Beta1>(beta1),
                      attr<AdamAttr::Beta2>(beta2),
                      attr<AdamAttr::Epsilon>(epsilon))
    {
        mBeta1.setBackend("cpu");
        mBeta1.set<float>(0, beta1);
        mReversedBeta1.setBackend("cpu");
        mReversedBeta1.set<float>(0, 1.0f - beta1);
        mBeta2.setBackend("cpu");
        mBeta2.set<float>(0, beta2);
        mReversedBeta2.setBackend("cpu");
        mReversedBeta2.set<float>(0, 1.0f - beta2);
        mEpsilon.setBackend("cpu");
        mEpsilon.set<float>(0, epsilon);
    }
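    // Performs one Adam step on every registered parameter:
    //   m <- beta1 * m + (1 - beta1) * grad
    //   v <- beta2 * v + (1 - beta2) * grad^2
    //   p <- p - alpha_t * m / (sqrt(v) + epsilon_t)
    // with the bias corrections folded into the per-step alpha_t and epsilon_t computed below.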
    void update() override final {
        mLR.setBackend(mParameters[0]->getImpl()->backend());
        mLR.set<float>(0, learningRate());

        if (mParameters[0]->getImpl()->backend() != mBeta1.getImpl()->backend()) {
            mBeta1.setBackend(mParameters[0]->getImpl()->backend());
            mReversedBeta1.setBackend(mParameters[0]->getImpl()->backend());
            mBeta2.setBackend(mParameters[0]->getImpl()->backend());
            mReversedBeta2.setBackend(mParameters[0]->getImpl()->backend());
        }
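        // Bias-corrected step size and epsilon for step t = mLRScheduler.step() + 1:
        //   alpha_t   = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
        //   epsilon_t = epsilon * sqrt(1 - beta2^t)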
        Tensor alpha{std::vector<std::size_t>({1})};
        alpha.setBackend(mParameters[0]->getImpl()->backend());
        alpha.set<float>(0, learningRate() * std::sqrt(1.0f - std::pow(mBeta2.get<float>(0), mLRScheduler.step() + 1))
                                / (1.0f - std::pow(mBeta1.get<float>(0), mLRScheduler.step() + 1)));

        Tensor epsilon{std::vector<std::size_t>({1})};
        epsilon.setBackend(mParameters[0]->getImpl()->backend());
        epsilon.set<float>(0, mEpsilon.get<float>(0) * std::sqrt(1.0f - std::pow(mBeta2.get<float>(0), mLRScheduler.step() + 1)));
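        // First step: place the moment estimates on the parameters' backend and zero-initialize them.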
        if (mLRScheduler.step() == 0) {
            for (std::size_t i = 0; i < mParameters.size(); ++i) {
                mMomentum1[i].setBackend(mParameters[i]->getImpl()->backend());
                mMomentum1[i].setDataType(mParameters[i]->grad()->dataType());
                mMomentum1[i].zeros();
                mMomentum2[i].setBackend(mParameters[i]->getImpl()->backend());
                mMomentum2[i].setDataType(mParameters[i]->grad()->dataType());
                mMomentum2[i].zeros();
            }
        }
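        // Update the moment estimates and apply the parameter update.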
        for (std::size_t i = 0; i < mParameters.size(); ++i) {
            mMomentum1[i] = mBeta1 * mMomentum1[i] + mReversedBeta1 * (*mParameters[i]->grad());
            mMomentum2[i] = mBeta2 * mMomentum2[i] + mReversedBeta2 * (*mParameters[i]->grad()) * (*mParameters[i]->grad());
            *mParameters[i] = *mParameters[i] - alpha * mMomentum1[i] / (mMomentum2[i].sqrt() + epsilon);
        }

        mLRScheduler.update();
    }
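    // Registers the parameters to optimize and allocates one pair of moment buffers per parameter;
    // the buffers are initialized on the first call to update().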
    void setParameters(const std::vector<std::shared_ptr<Tensor>>& parameters) override final {
        Optimizer::setParameters(parameters);
        mMomentum1 = std::vector<Tensor>(parameters.size());
        mMomentum2 = std::vector<Tensor>(parameters.size());
        for (std::size_t i = 0; i < parameters.size(); ++i) {
            mMomentum1[i] = Tensor(parameters[i]->dims());
            mMomentum2[i] = Tensor(parameters[i]->dims());
        }
    }
};
} // namespace Aidge
namespace {
template <>
const char *const EnumStrings<Aidge::AdamAttr>::data[] = {
    "Beta1",
    "Beta2",
    "Epsilon"
};
}
#endif // AIDGE_CORE_OPTIMIZER_ADAM_H_
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <pybind11/pybind11.h>
#include "aidge/learning/optimizer/Optimizer.hpp"
#include "aidge/learning/optimizer/Adam.hpp"
namespace py = pybind11;
namespace Aidge {
// namespace learning {
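// Expose the Adam optimizer to Python: constructor hyper-parameters (beta1, beta2, epsilon) and the update() method.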
void init_Adam(py::module& m) {
    py::class_<Adam, std::shared_ptr<Adam>, Attributes, Optimizer>(m, "Adam", py::multiple_inheritance())
        .def(py::init<float, float, float>(), py::arg("beta1") = 0.9f, py::arg("beta2") = 0.999f, py::arg("epsilon") = 1.0e-8f)
        .def("update", &Adam::update);
}
// } // namespace learning
} // namespace Aidge
@@ -19,12 +19,14 @@ namespace Aidge {
 void init_Loss(py::module&);
 void init_Optimizer(py::module&);
 void init_SGD(py::module&);
+void init_Adam(py::module&);
 void init_LRScheduler(py::module&);

 void init_Aidge(py::module& m) {
     init_Loss(m);
     init_Optimizer(m);
     init_SGD(m);
+    init_Adam(m);
     init_LRScheduler(m);
 }
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <cstddef> // std::size_t
#include <cmath> // std::sqrt, std::pow
#include <memory>
#include <random> // std::random_device, std::mt19937, std::uniform_int_distribution
#include <set>
#include <vector>
#include "aidge/data/Tensor.hpp"
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/learning/learningRate/LRScheduler.hpp"
#include "aidge/learning/learningRate/LRSchedulerList.hpp"
#include "aidge/learning/optimizer/Optimizer.hpp"
#include "aidge/learning/optimizer/Adam.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/backend/cpu/operator/SubImpl.hpp"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[learning/Adam] update", "[Optimizer][Adam]") {
    constexpr std::uint16_t NBTRIALS = 10;
    // Create a random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(0.1f, 1.0f);   // random values and gradients in [0.1, 1.0]
    std::uniform_real_distribution<float> paramDist(0.001f, 1.0f); // random hyper-parameters in [0.001, 1.0]
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
    for (std::size_t trial = 0; trial < NBTRIALS; ++trial) {
        // Create a random number of Tensors with random dims and random values,
        // along with their gradients and zero-initialized momentum buffers.
        const std::size_t nb_tensors = dimSizeDist(gen);
        std::vector<std::size_t> size_tensors(nb_tensors, 1);

        std::vector<std::shared_ptr<Tensor>> tensors(nb_tensors);
        std::vector<std::unique_ptr<float[]>> val_tensors(nb_tensors);
        std::vector<std::shared_ptr<Tensor>> optim_tensors(nb_tensors);

        std::vector<std::shared_ptr<Tensor>> grad_tensors(nb_tensors);
        std::vector<std::unique_ptr<float[]>> val_grad_tensors(nb_tensors);

        std::vector<std::shared_ptr<Tensor>> momentum_tensors(nb_tensors);
        std::vector<std::unique_ptr<float[]>> val_momentum1_tensors(nb_tensors);
        std::vector<std::unique_ptr<float[]>> val_momentum2_tensors(nb_tensors);
        for (std::size_t i = 0; i < nb_tensors; ++i) {
            std::vector<std::size_t> dims(nbDimsDist(gen));
            for (std::size_t d = 0; d < dims.size(); ++d) {
                dims[d] = dimSizeDist(gen);
                size_tensors[i] *= dims[d];
            }

            val_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
            val_grad_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
            val_momentum1_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
            val_momentum2_tensors[i] = std::make_unique<float[]>(size_tensors[i]);
            for (std::size_t j = 0; j < size_tensors[i]; ++j) {
                val_tensors[i][j] = valueDist(gen);
                val_grad_tensors[i][j] = valueDist(gen);
                val_momentum1_tensors[i][j] = 0.0f;
                val_momentum2_tensors[i][j] = 0.0f;
            }

            tensors[i] = std::make_shared<Tensor>(dims);
            tensors[i]->setBackend("cpu");
            tensors[i]->getImpl()->setRawPtr(val_tensors[i].get(), size_tensors[i]);

            optim_tensors[i] = std::make_shared<Tensor>(dims);
            optim_tensors[i]->setBackend("cpu");
            optim_tensors[i]->getImpl()->copy(val_tensors[i].get(), size_tensors[i]);
            optim_tensors[i]->initGrad();

            grad_tensors[i] = std::make_shared<Tensor>(dims);
            grad_tensors[i]->setBackend("cpu");
            grad_tensors[i]->getImpl()->setRawPtr(val_grad_tensors[i].get(), size_tensors[i]);

            momentum_tensors[i] = std::make_shared<Tensor>(dims);
            momentum_tensors[i]->setBackend("cpu");
            momentum_tensors[i]->getImpl()->setRawPtr(val_momentum1_tensors[i].get(), size_tensors[i]);
            momentum_tensors[i]->getImpl()->setRawPtr(val_momentum2_tensors[i].get(), size_tensors[i]);

            REQUIRE((tensors[i]->hasImpl() &&
                     optim_tensors[i]->hasImpl() &&
                     grad_tensors[i]->hasImpl()));
        }
        // generate parameters
        float lr = paramDist(gen);
        float beta1 = paramDist(gen);
        float beta2 = paramDist(gen);
        float epsilon = paramDist(gen);

        // set Optimizer
        Adam opt = Adam(beta1, beta2, epsilon);
        opt.setParameters(optim_tensors);
        for (std::size_t t = 0; t < nb_tensors; ++t) {
            optim_tensors[t]->grad()->getImpl()->setRawPtr(val_grad_tensors[t].get(), size_tensors[t]);
        }
        opt.setLearningRateScheduler(learning::ConstantLR(lr));
        for (std::size_t t = 0; t < nb_tensors; ++t) {
            const Tensor tmpt1 = *(opt.parameters().at(t));
            const Tensor tmpt2 = *tensors[t];
            REQUIRE(approxEq<float, float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
        }
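        // Run 10 optimization steps; at each step, advance a plain float reference
        // implementation of Adam (with the bias corrections folded into lr2 and epsilon2,
        // as in Adam::update()) and check the optimizer's parameters against it.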
        for (std::size_t step = 0; step < 10; ++step) {
            // truth
            float lr2 = lr * std::sqrt(1.0f - std::pow(beta2, step + 1)) / (1.0f - std::pow(beta1, step + 1));
            float epsilon2 = epsilon * std::sqrt(1.0f - std::pow(beta2, step + 1));
            for (std::size_t t = 0; t < nb_tensors; ++t) {
                for (std::size_t i = 0; i < size_tensors[t]; ++i) {
                    val_momentum1_tensors[t][i] = beta1 * val_momentum1_tensors[t][i] + (1.0f - beta1) * val_grad_tensors[t][i];
                    val_momentum2_tensors[t][i] = beta2 * val_momentum2_tensors[t][i] + (1.0f - beta2) * val_grad_tensors[t][i] * val_grad_tensors[t][i];
                    val_tensors[t][i] = val_tensors[t][i]
                                      - lr2 * val_momentum1_tensors[t][i] / (std::sqrt(val_momentum2_tensors[t][i]) + epsilon2);
                }
            }

            // optimizer
            opt.update();

            // tests
            for (std::size_t t = 0; t < nb_tensors; ++t) {
                const Tensor tmpt1 = *(opt.parameters().at(t));
                const Tensor tmpt2 = *tensors[t];
                REQUIRE(approxEq<float, float>(tmpt2, tmpt1, 1e-5f, 1e-8f));
            }
        }
    }
}
} // namespace Aidge