| @@ -26,6 +26,7 @@ | |||||
| #include "device/cpu/cpu_device_address.h" | #include "device/cpu/cpu_device_address.h" | ||||
| #include "utils/context/ms_context.h" | #include "utils/context/ms_context.h" | ||||
| #include "utils/config_manager.h" | #include "utils/config_manager.h" | ||||
| #include "utils/profile.h" | |||||
| #include "common/utils.h" | #include "common/utils.h" | ||||
| #include "session/anf_runtime_algorithm.h" | #include "session/anf_runtime_algorithm.h" | ||||
| #include "session/session_basic.h" | #include "session/session_basic.h" | ||||
| @@ -270,6 +271,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { | |||||
| auto kernels = kernel_graph->execution_order(); | auto kernels = kernel_graph->execution_order(); | ||||
| for (const auto &kernel : kernels) { | for (const auto &kernel : kernels) { | ||||
| #ifdef ENABLE_PROFILE | |||||
| double start_time = GetTime(); | |||||
| #endif | |||||
| std::vector<kernel::AddressPtr> kernel_inputs; | std::vector<kernel::AddressPtr> kernel_inputs; | ||||
| std::vector<kernel::AddressPtr> kernel_workspaces; | std::vector<kernel::AddressPtr> kernel_workspaces; | ||||
| std::vector<kernel::AddressPtr> kernel_outputs; | std::vector<kernel::AddressPtr> kernel_outputs; | ||||
| @@ -297,6 +301,10 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph) { | |||||
| if (!ret) { | if (!ret) { | ||||
| MS_LOG(EXCEPTION) << "Launch kernel failed."; | MS_LOG(EXCEPTION) << "Launch kernel failed."; | ||||
| } | } | ||||
| #ifdef ENABLE_PROFILE | |||||
| double cost_time = GetTime() - start_time; | |||||
| MS_LOG(INFO) << "cpu kernel: " << kernel->fullname_with_scope() << " costs " << cost_time * 1e6 << " us"; | |||||
| #endif | |||||
| } | } | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -29,7 +29,7 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en | |||||
| auto linear = input_params->linear_; | auto linear = input_params->linear_; | ||||
| auto lr = input_params->lr_; | auto lr = input_params->lr_; | ||||
| auto l1 = input_params->l1_; | auto l1 = input_params->l1_; | ||||
| auto l2 = input_params->l2_; | |||||
| auto l2_plus = 2 * input_params->l2_; | |||||
| auto lr_power = input_params->lr_power_; | auto lr_power = input_params->lr_power_; | ||||
| auto unique_sparse_grad = input_params->sparse_grad_; | auto unique_sparse_grad = input_params->sparse_grad_; | ||||
| auto var_first_dim_size = input_params->var_first_dim_size_; | auto var_first_dim_size = input_params->var_first_dim_size_; | ||||
| @@ -44,21 +44,18 @@ void ComputeFtrl(MultiThreadComputeParams *input_params, size_t start, size_t en | |||||
| for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { | for (size_t j = start_index, k = var_outer_dim_size * i; j < end_index; ++j, ++k) { | ||||
| auto summed_grad = unique_sparse_grad.value_[k]; | auto summed_grad = unique_sparse_grad.value_[k]; | ||||
| auto accum_new = accum[j] + summed_grad * summed_grad; | auto accum_new = accum[j] + summed_grad * summed_grad; | ||||
| if (lr_power == -0.5) { | |||||
| linear[j] += summed_grad - (std::sqrt(accum_new) - std::sqrt(accum[j])) / lr * var[j]; | |||||
| } else { | |||||
| linear[j] += summed_grad - (std::pow(accum_new, -lr_power) - std::pow(accum[j], -lr_power)) / lr * var[j]; | |||||
| } | |||||
| auto x = Sign(linear[j]) * l1 - linear[j]; | |||||
| float y; | float y; | ||||
| if (lr_power == -0.5) { | if (lr_power == -0.5) { | ||||
| y = std::sqrt(accum_new) / lr + 2 * l2; | |||||
| y = std::sqrt(accum_new); | |||||
| linear[j] += summed_grad - (y - std::sqrt(accum[j])) / lr * var[j]; | |||||
| } else { | } else { | ||||
| y = std::pow(accum_new, -lr_power) / lr + 2 * l2; | |||||
| y = std::pow(accum_new, -lr_power); | |||||
| linear[j] += summed_grad - (y - std::pow(accum[j], -lr_power)) / lr * var[j]; | |||||
| } | } | ||||
| auto pre_shrink = x / y; | |||||
| var[j] = std::fabs(linear[j]) > l1 ? pre_shrink : 0; | |||||
| accum[j] = accum_new; | accum[j] = accum_new; | ||||
| auto x = Sign(linear[j]) * l1 - linear[j]; | |||||
| y = y / lr + l2_plus; | |||||
| var[j] = std::fabs(linear[j]) > l1 ? x / y : 0; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -112,10 +112,10 @@ void GPUSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph, | |||||
| auto tensor_address = tensor->device_address(); | auto tensor_address = tensor->device_address(); | ||||
| bool need_sync = false; | bool need_sync = false; | ||||
| if (ms_context->enable_pynative_infer()) { | if (ms_context->enable_pynative_infer()) { | ||||
| if (tensor_address.get() == nullptr || tensor_address != device_address) { | |||||
| if (tensor_address == nullptr || tensor_address != device_address) { | |||||
| need_sync = true; | need_sync = true; | ||||
| } | } | ||||
| } else if (tensor->is_dirty()) { | |||||
| } else if (tensor->is_dirty() || tensor_address == nullptr) { | |||||
| need_sync = true; | need_sync = true; | ||||
| } else if (tensor_address != device_address) { | } else if (tensor_address != device_address) { | ||||
| if (tensor_address->DeviceType() == device_address->DeviceType()) { | if (tensor_address->DeviceType() == device_address->DeviceType()) { | ||||