GitOrigin-RevId: befb85fd43
tags/v1.7.2.m1
@@ -0,0 +1,100 @@
/**
 * \file src/opr/impl/training/dataview.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/training/dataview.h"
#include "megbrain/exception.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"

#include <algorithm>  // std::shuffle
#include <random>

namespace mgb {

DataLoader::DataLoader(
        std::shared_ptr<IDataView> dataview, mgb::CompNode comp_node,
        unsigned long batchsize, bool shuffle, bool drop_last)
        : m_dataview(dataview),
          m_comp_node(comp_node),
          m_batchsize(batchsize),
          m_shuffle(shuffle),
          m_drop_last(drop_last),
          m_idx(0) {
    if (!m_comp_node.valid()) {
        m_comp_node = CompNode::load("xpu0");
    }
    for (size_t i = 0; i < m_dataview->size(); i++) {
        m_index_collection.push_back(i);
    }
    if (m_dataview->size() > 0) {
        auto data_sample = m_dataview->get_item(0);
        // Batched data shape: [batchsize, <sample data dims>].
        SmallVector<size_t> dshape;
        dshape.push_back(static_cast<size_t>(batchsize));
        for (size_t i = 0; i < data_sample.first->layout().ndim; i++) {
            dshape.push_back(data_sample.first->shape()[i]);
        }
        m_data_shape = dshape;
        // Batched label shape: the leading sample dim is replaced by batchsize.
        SmallVector<size_t> lshape;
        lshape.push_back(m_batchsize);
        for (size_t i = 1; i < data_sample.second->layout().ndim; i++) {
            lshape.push_back(data_sample.second->shape()[i]);
        }
        m_label_shape = lshape;
        m_data_type = data_sample.first->dtype();
        m_label_type = data_sample.second->dtype();
    } else {
        mgb_throw(AssertionError, "The dataset is empty.");
    }
}

size_t DataLoader::size() {
    return m_dataview->size() / m_batchsize;
}

DataPair DataLoader::next() {
    // Wrap around when fewer than a full batch remains. The comparison is
    // written as an addition to avoid unsigned underflow when the dataset is
    // smaller than one batch.
    if (m_idx + m_batchsize > m_index_collection.size()) {
        m_idx = 0;
    }
    // Re-shuffle at the start of each epoch, with a freshly seeded engine so
    // epochs do not replay the same permutation.
    if (m_idx == 0 && m_shuffle) {
        std::shuffle(
                m_index_collection.begin(), m_index_collection.end(),
                std::default_random_engine(std::random_device{}()));
    }
    auto data = std::make_shared<HostTensorND>(m_comp_node, m_data_shape, m_data_type);
    auto label =
            std::make_shared<HostTensorND>(m_comp_node, m_label_shape, m_label_type);
    size_t data_bytes = m_dataview->get_item(m_index_collection.at(m_idx))
                                .first->layout()
                                .access_bytes();
    size_t label_bytes = m_dataview->get_item(m_index_collection.at(m_idx))
                                 .second->layout()
                                 .access_bytes();
    auto data_ptr = data->raw_ptr();
    auto label_ptr = label->raw_ptr();
    for (unsigned int i = 0; i < m_batchsize; i++) {
        auto item = m_dataview->get_item(m_index_collection.at(m_idx));
        auto pre_data = item.first;
        auto pre_label = item.second;
        auto pre_data_ptr = pre_data->raw_ptr();
        auto pre_label_ptr = pre_label->raw_ptr();
        // raw_ptr() returns dt_byte*, so pointer arithmetic advances in bytes.
        memcpy(data_ptr + data_bytes * i, pre_data_ptr, data_bytes);
        memcpy(label_ptr + label_bytes * i, pre_label_ptr, label_bytes);
        m_idx++;
    }
    return {data, label};
}
} // namespace mgb
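For orientation, here is a minimal sketch of how the loader above is meant to be driven. `ArrayDataView` is a hypothetical in-memory dataset written against the `IDataView` interface declared in dataview.h; nothing in this patch defines it, and the snippet is illustrative rather than part of the change.

#include <memory>
#include <utility>
#include <vector>

#include "megbrain/opr/training/dataview.h"

namespace {
// Hypothetical dataset: serves pre-built (data, label) host-tensor pairs.
class ArrayDataView final : public mgb::IDataView {
public:
    explicit ArrayDataView(std::vector<mgb::DataPair> samples)
            : m_samples(std::move(samples)) {}
    mgb::DataPair get_item(size_t idx) override { return m_samples.at(idx); }
    size_t size() override { return m_samples.size(); }

private:
    std::vector<mgb::DataPair> m_samples;
};
} // namespace

// One epoch: DataLoader::size() batches, one batch per next() call.
void run_epoch(std::shared_ptr<mgb::IDataView> dataset) {
    mgb::DataLoader loader(
            dataset, mgb::CompNode::load("cpu0"), /*batchsize=*/32,
            /*shuffle=*/true, /*drop_last=*/true);
    for (size_t i = 0; i < loader.size(); ++i) {
        mgb::DataPair batch = loader.next();
        // batch.first: [32, ...] data; batch.second: [32, ...] labels.
    }
}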
@@ -0,0 +1,82 @@
/**
 * \file src/opr/impl/training/loss.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/training/loss.h"
#include "megbrain/exception.h"
#include "megbrain/opr/indexing.h"

namespace mgb {
namespace loss {

CrossEntropyLoss::CrossEntropyLoss(
        bool with_logits, float label_smooth, ReduceMode reduce_mode, int axis)
        : m_with_logits(with_logits),
          m_label_smooth(label_smooth),
          m_reduce_mode(reduce_mode),
          m_axis(axis) {}

SymbolVar CrossEntropyLoss::operator()(
        mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) {
    mgb_assert(
            symbol_pred.shape().ndim >= symbol_label.shape().ndim,
            "The label must not have more dimensions than the pred.");
    for (size_t i = 0; i < symbol_label.shape().ndim; i++) {
        mgb_assert(
                symbol_pred.shape()[i] == symbol_label.shape()[i] || (int)i == m_axis,
                "Unmatched shape for pred and label.");
    }
    mgb_assert(
            m_label_smooth >= .0f, "The label_smooth must be a non-negative value.");
    SymbolVar symbol_loss;
    SymbolVar symbol_middle;
    SymbolVar symbol_max = opr::reduce_ax_max(symbol_pred, m_axis);
    SymbolVar symbol_primary_item =
            opr::IndexingOneHot::make(symbol_pred, symbol_label, {m_axis});
    if (m_with_logits) {
        symbol_middle = opr::reduce_ax_sum(symbol_pred, m_axis) /
                        opr::GetVarShape::make(symbol_pred, {m_axis});
        // Numerically stable log-sum-exp: max + log(sum(exp(x - max))).
        SymbolVar symbol_logits =
                symbol_max + opr::log(opr::reduce_ax_sum(
                                     opr::exp(symbol_pred - symbol_max), m_axis));
        symbol_loss = symbol_logits;
    } else {
        symbol_middle = opr::reduce_ax_sum(opr::log(symbol_pred), m_axis) /
                        opr::GetVarShape::make(symbol_pred, {m_axis});
        symbol_primary_item = opr::log(symbol_primary_item);
        // In the probability (non-logits) case the loss accumulates from zero;
        // leaving symbol_loss unset here would feed an empty SymbolVar into the
        // subtraction below.
        symbol_loss = symbol_pred.make_scalar(.0f);
    }
    if (m_label_smooth > .0f) {
        symbol_loss = symbol_loss - m_label_smooth * symbol_middle -
                      (1 - m_label_smooth) * symbol_primary_item;
    } else {
        symbol_loss = symbol_loss - symbol_primary_item;
    }
    if (m_reduce_mode == ReduceMode::MEAN) {
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1)) /
                (float)(symbol_loss.shape().total_nr_elems());
    } else if (m_reduce_mode == ReduceMode::SUM) {
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1));
    }
    return symbol_loss;
}

MSELoss::MSELoss(ReduceMode reduce_mode) : m_reduce_mode(reduce_mode) {}

mgb::SymbolVar MSELoss::operator()(
        mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) {
    SymbolVar symbol_loss =
            opr::pow(symbol_pred - symbol_label, symbol_pred.make_scalar(2.f));
    // Apply the configured reduction; previously m_reduce_mode was ignored.
    if (m_reduce_mode == ReduceMode::MEAN) {
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1)) /
                (float)(symbol_loss.shape().total_nr_elems());
    } else if (m_reduce_mode == ReduceMode::SUM) {
        symbol_loss =
                opr::reduce_sum(symbol_loss.flatten(), symbol_loss.make_scalar(1));
    }
    return symbol_loss;
}
} // namespace loss
} // namespace mgb
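With label_smooth at zero, the with_logits branch above reduces per sample to loss = logsumexp(pred) - pred[label]. The standalone snippet below mirrors that arithmetic in plain C++ as a sanity check against the first row of the unit test further down; it uses no MegBrain API.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Plain-C++ mirror of the with_logits path (label_smooth = 0):
// loss = max(x) + log(sum(exp(x - max(x)))) - x[label] = logsumexp(x) - x[label].
float cross_entropy_with_logits(const std::vector<float>& logits, int label) {
    float mx = *std::max_element(logits.begin(), logits.end());
    float sum = 0.f;
    for (float v : logits)
        sum += std::exp(v - mx);
    return mx + std::log(sum) - logits[label];
}

int main() {
    // First row of the prediction tensor from the unit test below (label = 5).
    std::vector<float> row = {-0.22847f, -0.65020f, -0.42470f, 1.32903f, -0.58377f,
                              -0.15881f, -0.23134f, -0.36147f, -1.05848f, -0.23285f};
    // Prints ~2.44; the test's expected 1.8120441 is the mean over both rows.
    std::printf("row 0 loss: %f\n", cross_entropy_with_logits(row, 5));
}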
@@ -0,0 +1,143 @@
/**
 * \file src/opr/impl/training/optimizer.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/training/optimizer.h"
#include "megbrain/exception.h"
#include "megbrain/opr/training/utils.h"

namespace mgb {
namespace optimizer {

SymbolVarArray Optimizer::make_multiple(
        SymbolVarArray symbol_weights, SymbolVarArray symbol_grads,
        std::shared_ptr<mgb::cg::ComputingGraph> graph) {
    if (symbol_weights.size() != symbol_grads.size()) {
        mgb_throw(AssertionError, "The count of weights differs from that of grads.");
    }
    SymbolVarArray r;
    for (size_t i = 0; i < symbol_weights.size(); i++) {
        r.push_back(make(symbol_weights[i], symbol_grads[i], graph));
    }
    return r;
}

SGD::SGD(float lr, float weight_decay, float momentum)
        : m_lr(lr), m_weight_decay(weight_decay), m_momentum(momentum) {
    if (m_lr <= 0) {
        mgb_throw(AssertionError, "Invalid learning rate: must be positive.");
    }
    if (m_weight_decay < 0) {
        mgb_throw(AssertionError, "Invalid weight_decay value: negative value.");
    }
    if (m_momentum < 0) {
        mgb_throw(AssertionError, "Invalid momentum value: negative value.");
    }
}

SymbolVar SGD::make(
        SymbolVar symbol_weight, SymbolVar symbol_grad,
        std::shared_ptr<cg::ComputingGraph> graph) {
    SymbolVar symbol_pre_grad;
    // Momentum buffer; kept alive in m_pre_grads so the state persists across
    // executions of the compiled function.
    auto pre_grad = TensorGen::zeros<dtype::Float32>(
            symbol_grad.shape(), symbol_grad.node()->comp_node());
    m_pre_grads.push_back(pre_grad);
    symbol_pre_grad = opr::SharedDeviceTensor::make(*graph, *pre_grad);
    if (m_weight_decay != .0f) {
        symbol_grad = symbol_grad + m_weight_decay * symbol_weight;
    }
    if (m_momentum != .0f) {
        // pre_grad <- momentum * pre_grad + grad
        symbol_pre_grad =
                opr::AddUpdate::make(symbol_pre_grad, symbol_grad, {m_momentum, 1.0f});
        // weight <- weight - lr * pre_grad
        return opr::AddUpdate::make(symbol_weight, -symbol_pre_grad, {1.f, m_lr});
    } else {
        return opr::AddUpdate::make(symbol_weight, -symbol_grad, {1.f, m_lr});
    }
}

Adam::Adam(
        float lr, float weight_decay, std::pair<float, float> betas, float eps,
        bool amsgrad)
        : m_lr(lr),
          m_weight_decay(weight_decay),
          m_betas(betas),
          m_eps(eps),
          m_amsgrad(amsgrad) {
    mgb_assert(m_lr > 0, "Invalid learning rate: must be positive.");
    mgb_assert(m_weight_decay >= 0, "Invalid weight_decay value: negative value.");
    mgb_assert(
            m_betas.first >= 0 && m_betas.second >= 0 && m_betas.first < 1 &&
                    m_betas.second < 1,
            "Invalid betas value: each beta must lie in [0, 1).");
}

SymbolVar Adam::make(
        SymbolVar symbol_weight, SymbolVar symbol_grad,
        std::shared_ptr<cg::ComputingGraph> graph) {
    CompNode comp_node = symbol_grad.node()->comp_node();
    DType dt = symbol_grad.dtype();
    // correction1/2 start at 1 and are multiplied by beta1/beta2 once per step,
    // so after t steps they hold beta^t; (1 - correction) is the bias-correction
    // denominator.
    m_correction1 = TensorGen::ones<dtype::Float32>({1}, comp_node);
    m_correction2 = TensorGen::ones<dtype::Float32>({1}, comp_node);
    std::shared_ptr<DeviceTensorND> exp_avg =
            std::make_shared<DeviceTensorND>(comp_node, symbol_grad.shape(), dt);
    mgb::fill_zero_dev_tensor(*exp_avg);
    std::shared_ptr<DeviceTensorND> exp_avg_sq =
            std::make_shared<DeviceTensorND>(comp_node, symbol_grad.shape(), dt);
    mgb::fill_zero_dev_tensor(*exp_avg_sq);
    m_exp_avg.push_back(exp_avg);
    m_exp_avg_sq.push_back(exp_avg_sq);
    SymbolVar symbol_correction1 =
            opr::SharedDeviceTensor::make(*graph, *m_correction1);
    SymbolVar symbol_correction2 =
            opr::SharedDeviceTensor::make(*graph, *m_correction2);
    SymbolVar symbol_exp_avg = opr::SharedDeviceTensor::make(*graph, exp_avg);
    SymbolVar symbol_exp_avg_sq = opr::SharedDeviceTensor::make(*graph, exp_avg_sq);
    symbol_correction1 = opr::AddUpdate::make(
            symbol_correction1, symbol_correction1, {m_betas.first, .0f});
    symbol_correction2 = opr::AddUpdate::make(
            symbol_correction2, symbol_correction2, {m_betas.second, .0f});
    if (m_weight_decay != .0f) {
        symbol_grad = symbol_grad + m_weight_decay * symbol_weight;
    }
    // exp_avg <- beta1 * exp_avg + (1 - beta1) * grad
    symbol_exp_avg = opr::AddUpdate::make(
            symbol_exp_avg, symbol_grad, {m_betas.first, 1.f - m_betas.first});
    // exp_avg_sq <- beta2 * exp_avg_sq + (1 - beta2) * grad^2
    symbol_exp_avg_sq = opr::AddUpdate::make(
            symbol_exp_avg_sq, symbol_grad * symbol_grad,
            {m_betas.second, 1.f - m_betas.second});
    SymbolVar delta;
    if (m_amsgrad) {
        std::shared_ptr<DeviceTensorND> max_exp_avg_sq =
                std::make_shared<DeviceTensorND>(comp_node, symbol_grad.shape(), dt);
        mgb::fill_zero_dev_tensor(*max_exp_avg_sq);
        // Keep the buffer alive so the running maximum persists across steps.
        m_max_exp_avg_sq.push_back(max_exp_avg_sq);
        SymbolVar symbol_max_exp_avg_sq =
                opr::SharedDeviceTensor::make(*graph, max_exp_avg_sq);
        // max_exp_avg_sq <- max(max_exp_avg_sq, exp_avg_sq): overwrite the
        // stored maximum rather than accumulating onto exp_avg_sq.
        symbol_max_exp_avg_sq = opr::AddUpdate::make(
                symbol_max_exp_avg_sq,
                opr::max(symbol_max_exp_avg_sq, symbol_exp_avg_sq), {.0f, 1.0f});
        delta = (symbol_exp_avg / (1.f - symbol_correction1)) /
                (opr::powf(symbol_max_exp_avg_sq / (1.f - symbol_correction2), 0.5f) +
                 m_eps);
    } else {
        delta = (symbol_exp_avg / (1.f - symbol_correction1)) /
                (opr::pow(
                         symbol_exp_avg_sq / (1.f - symbol_correction2),
                         symbol_exp_avg.make_scalar(0.5f)) +
                 m_eps);
    }
    // weight <- weight - lr * delta
    return opr::AddUpdate::make(symbol_weight, -delta, {1.0f, m_lr});
}
} // namespace optimizer
} // namespace mgb
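As a cross-check of the bias-correction bookkeeping above, the standalone scalar loop below replays ten Adam steps with the constants from the AdamTest unit test further down (lr = 0.01, weight_decay = 0.9, default betas). It is a plain-C++ sketch of the same arithmetic, not MegBrain code, and small ordering differences in the graph may shift the result in the last digits.

#include <cmath>
#include <cstdio>

// Scalar replay of the Adam update graph above for a single parameter.
// c1/c2 start at 1 and are multiplied by beta1/beta2 each step, so after t
// steps they hold beta^t; (1 - c) is the bias-correction denominator.
int main() {
    float w = 1.62957f, g = 1.02605f;              // weight and (constant) grad
    float lr = 0.01f, wd = 0.9f;                   // lr, weight_decay from the test
    float beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
    float exp_avg = 0.f, exp_avg_sq = 0.f, c1 = 1.f, c2 = 1.f;
    for (int t = 0; t < 10; ++t) {
        c1 *= beta1;
        c2 *= beta2;
        float gd = g + wd * w;                     // weight decay folded into grad
        exp_avg = beta1 * exp_avg + (1.f - beta1) * gd;
        exp_avg_sq = beta2 * exp_avg_sq + (1.f - beta2) * gd * gd;
        float delta = (exp_avg / (1.f - c1)) /
                      (std::sqrt(exp_avg_sq / (1.f - c2)) + eps);
        w -= lr * delta;
    }
    std::printf("w after 10 steps: %f\n", w);      // ~1.5296 (test truth: 1.52969)
}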
@@ -0,0 +1,69 @@
/**
 * \file src/opr/include/training/dataview.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/tensor_manip.h"

#include <type_traits>

namespace mgb {

using DataPair = std::pair<
        std::shared_ptr<mgb::HostTensorND>, std::shared_ptr<mgb::HostTensorND>>;

//! The interface of the dataset.
class IDataView {
public:
    /*!
     * Get the (data, label) pair at the given index.
     */
    virtual DataPair get_item(size_t idx) = 0;
    /*!
     * Get the number of samples in the dataset.
     */
    virtual size_t size() = 0;

    virtual ~IDataView() = default;
};

//! The data loader, corresponding to `DataLoader` in MegEngine's Python API.
class DataLoader {
public:
    DataLoader(
            std::shared_ptr<IDataView> dataview, mgb::CompNode comp_node,
            unsigned long batchsize = 1U, bool shuffle = false,
            bool drop_last = true);
    /*!
     * Get the next batched (data, label) pair of the dataset.
     */
    DataPair next();
    /*!
     * Get the number of batches per epoch.
     */
    size_t size();

private:
    std::shared_ptr<IDataView> m_dataview;
    mgb::CompNode m_comp_node;
    unsigned long m_batchsize;
    bool m_shuffle;
    bool m_drop_last;
    size_t m_idx;
    mgb::TensorShape m_data_shape;
    mgb::TensorShape m_label_shape;
    mgb::DType m_data_type;
    mgb::DType m_label_type;
    //! Only used in the temporary shuffle implementation.
    std::vector<size_t> m_index_collection;
};
} // namespace mgb
@@ -0,0 +1,70 @@
/**
 * \file src/opr/include/training/loss.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"

namespace mgb {
namespace loss {

//! The interface that every loss class should inherit.
class ILoss {
public:
    /*!
     * The reduce mode used to convert the loss output to a scalar.
     */
    enum ReduceMode { SUM = 0, MEAN = 1 };
    /*!
     * Compute the loss; the output is expected to be a scalar SymbolVar.
     */
    virtual mgb::SymbolVar operator()(
            mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) = 0;

    virtual ~ILoss() = default;
};

/*!
 * The cross entropy loss. The definition can be found here:
 * https://en.wikipedia.org/wiki/Cross_entropy
 *
 * It corresponds to `CrossEntropy` in MegEngine's Python API.
 */
class CrossEntropyLoss : public ILoss {
public:
    CrossEntropyLoss(
            bool with_logits = true, float label_smooth = .0f,
            ReduceMode reduce_mode = ReduceMode::MEAN, int axis = 1);
    mgb::SymbolVar operator()(
            mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) override;

protected:
    bool m_with_logits;
    float m_label_smooth;
    ReduceMode m_reduce_mode;
    int m_axis;
};

/*!
 * The MSE (mean squared error) loss. The definition can be found here:
 * https://en.wikipedia.org/wiki/Mean_squared_error
 *
 * It corresponds to `MSE` in MegEngine's Python API.
 */
class MSELoss : public ILoss {
public:
    MSELoss(ReduceMode reduce_mode = ReduceMode::MEAN);
    mgb::SymbolVar operator()(
            mgb::SymbolVar symbol_pred, mgb::SymbolVar symbol_label) override;

protected:
    ReduceMode m_reduce_mode;
};
} // namespace loss
} // namespace mgb
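To make the label_smooth parameter concrete: with logits inputs, CrossEntropyLoss evaluates per sample loss = logsumexp(x) - eps * mean(x) - (1 - eps) * x[label], which reduces to plain cross entropy at eps = 0. A standalone scalar transcription, for reference only:

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar transcription of CrossEntropyLoss (with_logits = true, smoothing eps).
float smoothed_ce(const std::vector<float>& x, int label, float eps) {
    float mx = *std::max_element(x.begin(), x.end());
    float sum = 0.f, mean = 0.f;
    for (float v : x) {
        sum += std::exp(v - mx);
        mean += v / x.size();
    }
    float lse = mx + std::log(sum);  // numerically stable logsumexp
    return lse - eps * mean - (1.f - eps) * x[label];
}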
@@ -0,0 +1,135 @@
/**
 * \file src/opr/include/training/optimizer.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"

namespace mgb {
namespace optimizer {

//! The interface that every optimizer should inherit.
class IOptimizer {
public:
    /*!
     * Add the operators that update the weights to the graph, for inputs
     * given as SymbolVarArrays.
     */
    virtual mgb::SymbolVarArray make_multiple(
            mgb::SymbolVarArray symbol_weights, mgb::SymbolVarArray symbol_grads,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) = 0;
    /*!
     * Add the operators that update a single weight with a certain strategy.
     * The output is expected to be the SymbolVar of the updated weight.
     */
    virtual mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) = 0;

    virtual ~IOptimizer() = default;
};

/*!
 * An abstract class that simplifies the implementation of optimizers.
 * It gives a default implementation of make_multiple based on the make
 * method defined by its derived class.
 */
class Optimizer : public IOptimizer {
public:
    mgb::SymbolVarArray make_multiple(
            mgb::SymbolVarArray symbol_weights, mgb::SymbolVarArray symbol_grads,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) override;
    virtual mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) = 0;
    virtual ~Optimizer() = default;
};

/*!
 * The SGD (stochastic gradient descent) optimizer.
 * The definition can be found here:
 * https://en.wikipedia.org/wiki/Stochastic_gradient_descent
 * It corresponds to `SGD` in MegEngine's Python API.
 */
class SGD : public Optimizer {
public:
    SGD() = default;
    SGD(float lr, float weight_decay = .0f, float momentum = .0f);
    SGD(const SGD& that) {
        m_lr = that.m_lr;
        m_momentum = that.m_momentum;
        m_weight_decay = that.m_weight_decay;
    }
    mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) override;
    const SGD& operator=(const SGD& that) {
        m_lr = that.m_lr;
        m_momentum = that.m_momentum;
        m_weight_decay = that.m_weight_decay;
        return *this;
    }

protected:
    float m_lr;
    float m_weight_decay;
    float m_momentum;
    std::vector<std::shared_ptr<mgb::HostTensorND>> m_pre_grads;
};

/*!
 * The Adam optimizer. The definition can be found here:
 * https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam
 * It corresponds to `Adam` in MegEngine's Python API.
 */
class Adam : public Optimizer {
public:
    Adam() = default;
    Adam(float lr, float weight_decay = .0f,
         std::pair<float, float> betas = {0.9f, 0.999f}, float eps = 1e-8f,
         bool amsgrad = false);
    Adam(const Adam& that) {
        m_lr = that.m_lr;
        m_betas = that.m_betas;
        m_eps = that.m_eps;
        m_weight_decay = that.m_weight_decay;
        m_amsgrad = that.m_amsgrad;
    }
    mgb::SymbolVar make(
            mgb::SymbolVar symbol_weight, mgb::SymbolVar symbol_grad,
            std::shared_ptr<mgb::cg::ComputingGraph> graph) override;
    const Adam& operator=(const Adam& that) {
        m_lr = that.m_lr;
        m_betas = that.m_betas;
        m_eps = that.m_eps;
        m_weight_decay = that.m_weight_decay;
        m_amsgrad = that.m_amsgrad;
        return *this;
    }

protected:
    float m_lr;
    float m_weight_decay;
    std::pair<float, float> m_betas;
    float m_eps;
    bool m_amsgrad;
    std::vector<std::shared_ptr<mgb::DeviceTensorND>> m_exp_avg;
    std::vector<std::shared_ptr<mgb::DeviceTensorND>> m_exp_avg_sq;
    std::vector<std::shared_ptr<mgb::DeviceTensorND>> m_max_exp_avg_sq;
    std::shared_ptr<mgb::HostTensorND> m_correction1;
    std::shared_ptr<mgb::HostTensorND> m_correction2;
};
} // namespace optimizer
} // namespace mgb
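A sketch of how these classes are meant to compose with a computing graph. It assumes mgb::cg::grad, MegBrain's symbolic differentiation entry point (reachable via megbrain/graph.h), and is pseudo-wiring under that assumption rather than a verified training loop; compiling the returned update vars as extra outputs makes each func->execute() perform one optimizer step.

#include "megbrain/graph.h"
#include "megbrain/opr/training/optimizer.h"

// Take d(loss)/d(weight) for each weight and append one update opr per pair.
mgb::SymbolVarArray append_updates(
        std::shared_ptr<mgb::cg::ComputingGraph> graph, mgb::SymbolVar loss,
        const mgb::SymbolVarArray& weights, mgb::optimizer::Optimizer& opt) {
    mgb::SymbolVarArray grads;
    for (auto&& w : weights)
        grads.push_back(mgb::cg::grad(loss, w));  // assumed autodiff API
    return opt.make_multiple(weights, grads, graph);
}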
@@ -0,0 +1,81 @@
/**
 * \file src/opr/include/tensor_gen.h
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#pragma once

#include "megbrain/tensor.h"

#include <cstring>  // std::memset
#include <type_traits>

namespace mgb {

/*!
 * A utility class of static methods that generate host tensors.
 */
class TensorGen {
public:
    /*!
     * \brief Generate a tensor with all elements equal to the given value
     */
    template <typename ctype, typename = typename mgb::ctype_enable_if<ctype>::type>
    static std::shared_ptr<mgb::HostTensorND> constant(
            mgb::TensorShape shape, ctype value,
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        std::shared_ptr<mgb::HostTensorND> r = std::make_shared<mgb::HostTensorND>(
                comp_node, shape, typename mgb::DTypeTrait<ctype>::dtype());
        auto ptr = r->ptr<ctype>();
        for (size_t i = 0, it = r->layout().total_nr_elems(); i < it; i++) {
            ptr[i] = value;
        }
        return r;
    }

    /*!
     * \brief Generate a tensor with all elements equal to 0
     */
    template <typename T>
    static std::shared_ptr<mgb::HostTensorND> zeros(
            mgb::TensorShape shape,
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        static_assert(
                std::is_base_of<mgb::DType, T>(),
                "Please use a dtype from namespace mgb or use "
                "TensorGen::constant.");
        using ctype = typename mgb::DTypeTrait<T>::ctype;
        return constant(shape, (ctype)0, comp_node);
    }

    /*!
     * \brief Generate a tensor with all elements equal to 0. In this overload
     * the dtype is passed as a runtime value, so no template parameter is
     * required.
     */
    static std::shared_ptr<mgb::HostTensorND> zeros(
            mgb::TensorShape shape, mgb::DType dtype = mgb::dtype::Float32(),
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        std::shared_ptr<mgb::HostTensorND> r =
                std::make_shared<mgb::HostTensorND>(comp_node, shape, dtype);
        auto ptr = r->raw_ptr();
        // Zero the whole buffer; the previous memset covered only a single byte.
        std::memset(ptr, 0, r->layout().span().dist_byte());
        return r;
    }

    /*!
     * \brief Generate a tensor with all elements equal to 1
     */
    template <typename T>
    static std::shared_ptr<mgb::HostTensorND> ones(
            mgb::TensorShape shape,
            mgb::CompNode comp_node = mgb::CompNode::load("xpu0")) {
        static_assert(
                std::is_base_of<mgb::DType, T>(),
                "Please use a dtype from namespace mgb or use "
                "TensorGen::constant.");
        using ctype = typename mgb::DTypeTrait<T>::ctype;
        return constant(shape, (ctype)1, comp_node);
    }
};
} // namespace mgb
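A short usage sketch of the generators above; unless a CompNode is passed explicitly, all of them allocate on the default "xpu0" node. The include path follows the one used by optimizer.cpp in this patch.

#include "megbrain/opr/training/utils.h"

void tensor_gen_examples() {
    using namespace mgb;
    // 2x3 Float32 tensor filled with 0.5f; the dtype is deduced from the ctype.
    auto half = TensorGen::constant({2, 3}, 0.5f);
    // Typed zeros/ones: the template parameter is an mgb dtype tag.
    auto z = TensorGen::zeros<dtype::Float32>({4, 4});
    auto o = TensorGen::ones<dtype::Int32>({8});
    // Runtime-dtype zeros: no template parameter required.
    auto z2 = TensorGen::zeros({4, 4}, dtype::Int32());
}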
@@ -0,0 +1,106 @@
/**
 * \file src/opr/test/training/loss.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/indexing.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"
#include "megbrain/test/helper.h"

#include "megbrain/opr/training/loss.h"

using namespace mgb;
using namespace loss;

namespace {
//! Copies a device tensor back to the given host tensor after each execution.
class Device2HostCallback {
public:
    Device2HostCallback(std::shared_ptr<HostTensorND> host) : m_host{host} {}
    void operator()(const DeviceTensorND& device) { m_host->copy_from(device).sync(); }

private:
    std::shared_ptr<HostTensorND> m_host;
};

class CrossEntropyTest : public ::testing::Test {
private:
    std::shared_ptr<HostTensorND> pred, label, truth, loss;
    TensorShape pred_shape = {2, 10};
    TensorShape label_shape = {2};
    TensorShape truth_shape = {1};
    std::vector<float> pred_values = {
            -0.22847f, -0.65020f, -0.42470f, 1.32903f, -0.58377f, -0.15881f, -0.23134f,
            -0.36147f, -1.05848f, -0.23285f, 0.32360f, -0.36430f, -0.03172f, 1.18970f,
            -0.23465f, -0.16139f, -0.22942f, -0.22538f, -0.68029f, -0.41004f};
    std::vector<int> label_values = {5, 3};
    std::vector<float> truth_values = {1.8120441f};
    CompNode node = CompNode::load("cpu0");
    std::shared_ptr<cg::ComputingGraph> graph;
    CrossEntropyLoss cross_entropy_loss;

public:
    std::unique_ptr<cg::AsyncExecutable> func;
    void setup();
    void build_model(float label_smooth = .0f);
    void verify();
    template <typename T>
    void assign_value(std::shared_ptr<HostTensorND> tensor, std::vector<T> values);
};
} // namespace

void CrossEntropyTest::setup() {
    pred = std::make_shared<HostTensorND>(node, pred_shape, dtype::Float32());
    label = std::make_shared<HostTensorND>(node, label_shape, dtype::Int32());
    truth = std::make_shared<HostTensorND>(node, truth_shape, dtype::Float32());
    loss = std::make_shared<HostTensorND>(node, truth_shape, dtype::Float32());
    assign_value<float>(pred, pred_values);
    assign_value<int>(label, label_values);
    assign_value<float>(truth, truth_values);
}

template <typename T>
void CrossEntropyTest::assign_value(
        std::shared_ptr<HostTensorND> tensor, std::vector<T> values) {
    ASSERT_EQ(values.size(), tensor->shape().total_nr_elems());
    auto ptr = tensor->ptr<T>();
    for (size_t i = 0, it = tensor->shape().total_nr_elems(); i < it; i += 1) {
        ptr[i] = values.at(i);
    }
}

void CrossEntropyTest::build_model(float label_smooth) {
    graph = cg::ComputingGraph::make();
    // Rebuild the loss with the requested smoothing so the parameter takes effect.
    cross_entropy_loss = CrossEntropyLoss(true, label_smooth);
    SymbolVar symbol_pred = opr::SharedDeviceTensor::make(*graph, *pred);
    SymbolVar symbol_label = opr::SharedDeviceTensor::make(*graph, *label);
    SymbolVar symbol_loss = cross_entropy_loss(symbol_pred, symbol_label);
    cg::ComputingGraph::OutputSpec spec;
    spec.push_back({symbol_loss, Device2HostCallback(loss)});
    func = graph->compile(spec);
}

void CrossEntropyTest::verify() {
    func->execute().wait();
    ASSERT_NEAR(loss->ptr<float>()[0], truth->ptr<float>()[0], 0.001f);
}

TEST_F(CrossEntropyTest, CrossEntropy) {
    setup();
    build_model();
    verify();
}
@@ -0,0 +1,98 @@
/**
 * \file src/opr/test/training/optimizer.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 */
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/indexing.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"
#include "megbrain/test/helper.h"

#include "megbrain/opr/training/optimizer.h"
#include "megbrain/opr/training/utils.h"

using namespace mgb;
using namespace optimizer;

namespace {
//! Copies a device tensor back to the given host tensor after each execution.
class Device2HostCallback {
public:
    Device2HostCallback(std::shared_ptr<HostTensorND> host) : m_host{host} {}
    void operator()(const DeviceTensorND& device) { m_host->copy_from(device).sync(); }

private:
    std::shared_ptr<HostTensorND> m_host;
};

template <typename T>
void assign_value(std::shared_ptr<HostTensorND>& tensor, std::vector<T>& values) {
    ASSERT_EQ(values.size(), tensor->layout().total_nr_elems());
    auto ptr = tensor->ptr<T>();
    for (size_t i = 0, it = tensor->layout().total_nr_elems(); i < it; i += 1) {
        ptr[i] = values.at(i);
    }
}

class OptimizerTest : public ::testing::Test {
public:
    void verify(
            std::shared_ptr<IOptimizer> optimizer, std::shared_ptr<HostTensorND> weight,
            std::shared_ptr<HostTensorND> grad, std::shared_ptr<HostTensorND> truth,
            int execute_times);

protected:
    std::shared_ptr<IOptimizer> optimizer;
    std::shared_ptr<cg::ComputingGraph> graph;
};

void OptimizerTest::verify(
        std::shared_ptr<IOptimizer> optimizer, std::shared_ptr<HostTensorND> weight,
        std::shared_ptr<HostTensorND> grad, std::shared_ptr<HostTensorND> truth,
        int execute_times) {
    graph = cg::ComputingGraph::make();
    SymbolVar symbol_weight = opr::SharedDeviceTensor::make(*graph, *weight);
    SymbolVar symbol_grad = opr::SharedDeviceTensor::make(*graph, *grad);
    cg::ComputingGraph::OutputSpec spec;
    spec.push_back(
            {optimizer->make(symbol_weight, symbol_grad, graph),
             Device2HostCallback(weight)});
    auto func = graph->compile(spec);
    for (int i = 0; i < execute_times; i++) {
        func->execute();
    }
    auto weight_ptr = weight->ptr<float>();
    auto truth_ptr = truth->ptr<float>();
    for (size_t i = 0, it = weight->shape().total_nr_elems(); i < it; i += 1) {
        ASSERT_NEAR(weight_ptr[i], truth_ptr[i], 0.001f);
    }
}
} // namespace

TEST_F(OptimizerTest, SGD) {
    auto weight = TensorGen::constant({1}, 0.30542f);
    auto grad = TensorGen::constant({1}, -1.81453f);
    auto truth = TensorGen::constant({1}, 1.04673f);
    int execute_times = 10;
    std::shared_ptr<SGD> sgd = std::make_shared<SGD>(0.01f, 5e-2f, 0.9f);
    verify(sgd, weight, grad, truth, execute_times);
}

TEST_F(OptimizerTest, AdamTest) {
    auto weight = TensorGen::constant({1}, 1.62957f);
    auto grad = TensorGen::constant({1}, 1.02605f);
    auto truth = TensorGen::constant({1}, 1.52969f);
    int execute_times = 10;
    std::shared_ptr<Adam> adam = std::make_shared<Adam>(0.01f, 0.9f);
    verify(adam, weight, grad, truth, execute_times);
}