Merge pull request !2905 from kisnwang/parallel-reduce-sparse-gradient (tag: v0.6.0-beta)
| @@ -632,6 +632,75 @@ void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradie | |||
| unique_grad->indices_size_ = slice_positions.size(); | |||
| } | |||
| void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads, | |||
| SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim, | |||
| size_t outer_dim) { | |||
| if (unique_slice_grads.empty()) { | |||
| return; | |||
| } | |||
| size_t index_data_size = outer_dim * sizeof(float); | |||
| size_t unique_indices_size = 0; | |||
| for (size_t i = 0; i < unique_slice_grads.size(); ++i) { | |||
| auto &slice_grad = unique_slice_grads[i]; | |||
| auto ret_code = memcpy_s(tmp_grad->value_ + unique_indices_size * outer_dim, | |||
| (tmp_grad->indices_size_ - unique_indices_size) * index_data_size, slice_grad->value_, | |||
| slice_grad->indices_size_ * index_data_size); | |||
| if (ret_code != EOK) { | |||
| MS_LOG(EXCEPTION) << "Failed to copy data!"; | |||
| } | |||
| ret_code = | |||
| memcpy_s(tmp_grad->indices_ + unique_indices_size, (tmp_grad->indices_size_ - unique_indices_size) * sizeof(int), | |||
| slice_grad->indices_, slice_grad->indices_size_ * sizeof(int)); | |||
| if (ret_code != EOK) { | |||
| MS_LOG(EXCEPTION) << "Failed to copy data!"; | |||
| } | |||
| unique_indices_size += slice_grad->indices_size_; | |||
| } | |||
| tmp_grad->indices_size_ = unique_indices_size; | |||
| ReduceSparseGradient(*tmp_grad, unique_grad, first_dim, outer_dim); | |||
| } | |||
| void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad, | |||
| SparseGradient *unique_grad, size_t first_dim, size_t outer_dim) { | |||
| MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_); | |||
| MS_EXCEPTION_IF_NULL(origin_sparse_grad.indices_); | |||
| MS_EXCEPTION_IF_NULL(unique_grad); | |||
| MS_EXCEPTION_IF_NULL(unique_grad->value_); | |||
| MS_EXCEPTION_IF_NULL(unique_grad->indices_); | |||
| MS_EXCEPTION_IF_NULL(tmp_grad); | |||
| MS_EXCEPTION_IF_NULL(tmp_grad->value_); | |||
| MS_EXCEPTION_IF_NULL(tmp_grad->indices_); | |||
| size_t thread_num = 24; | |||
| if (origin_sparse_grad.indices_size_ < thread_num) { | |||
| thread_num = origin_sparse_grad.indices_size_; | |||
| } | |||
| size_t thread_indices_size = origin_sparse_grad.indices_size_ / thread_num; | |||
| size_t left_indices_size = origin_sparse_grad.indices_size_ % thread_num; | |||
| std::vector<std::thread> threads; | |||
| threads.reserve(thread_num); | |||
| std::vector<std::shared_ptr<SparseGradient>> unique_slice_grads; | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| size_t indices_size = thread_indices_size; | |||
| if (i == thread_num - 1) { | |||
| indices_size = thread_indices_size + left_indices_size; | |||
| } | |||
| size_t value_offset = i * thread_indices_size * outer_dim; | |||
| size_t indices_offset = i * thread_indices_size; | |||
| auto slice_grad = SparseGradient( | |||
| {origin_sparse_grad.value_ + value_offset, origin_sparse_grad.indices_ + indices_offset, indices_size}); | |||
| unique_slice_grads.emplace_back(std::make_shared<SparseGradient>()); | |||
| unique_slice_grads[i]->value_ = unique_grad->value_ + value_offset; | |||
| unique_slice_grads[i]->indices_ = unique_grad->indices_ + indices_offset; | |||
| unique_slice_grads[i]->indices_size_ = indices_size; | |||
| threads.emplace_back( | |||
| std::thread(ReduceSparseGradient, slice_grad, unique_slice_grads[i].get(), first_dim, outer_dim)); | |||
| } | |||
| for (size_t i = 0; i < thread_num; ++i) { | |||
| threads[i].join(); | |||
| } | |||
| ReduceMultiSparseGradient(unique_slice_grads, tmp_grad, unique_grad, first_dim, outer_dim); | |||
| } | |||
| std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index) { | |||
| MS_EXCEPTION_IF_NULL(anf_node); | |||
| @@ -130,6 +130,11 @@ void GetGraphRealOutput(const FuncGraphPtr &func_graph, std::vector<std::pair<An | |||
| bool IsWeightBoundary(const AnfNodePtr &node); | |||
| void MultiThreadCompute(const MultiThreadComputeFunc &func, MultiThreadComputeParams *params, | |||
| size_t total_compute_size); | |||
| void ReduceMultiSparseGradient(const std::vector<std::shared_ptr<SparseGradient>> &unique_slice_grads, | |||
| SparseGradient *tmp_grad, SparseGradient *unique_grad, size_t first_dim, | |||
| size_t outer_dim); | |||
| void TwoLevelReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *tmp_grad, | |||
| SparseGradient *unique_grad, size_t first_dim, size_t outer_dim); | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -66,6 +66,8 @@ void SparseApplyFtrlCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | |||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | |||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | |||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | |||
| } | |||
| void SparseApplyFtrlCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| @@ -130,9 +132,12 @@ bool SparseApplyFtrlCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inp | |||
| auto indices = reinterpret_cast<int *>(inputs[4]->addr); | |||
| auto new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto new_indices = reinterpret_cast<int *>(workspace[1]->addr); | |||
| auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr); | |||
| SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); | |||
| ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, | |||
| var_outer_dim_size_); | |||
| SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_}); | |||
| TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad, | |||
| var_first_dim_size_, var_outer_dim_size_); | |||
| MultiThreadComputeParams input_params; | |||
| input_params.var_ = var; | |||
| @@ -61,6 +61,8 @@ void SparseApplyLazyAdamCPUKernel::InitInputOutputSize(const CNodePtr &kernel_no | |||
| MS_EXCEPTION_IF_NULL(kernel_node); | |||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | |||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | |||
| workspace_size_list_.emplace_back(indices_size_ * var_outer_dim_size_ * sizeof(float)); | |||
| workspace_size_list_.emplace_back(indices_size_ * sizeof(int)); | |||
| } | |||
| void SparseApplyLazyAdamCPUKernel::InitKernel(const CNodePtr &kernel_node) { | |||
| @@ -121,10 +123,13 @@ bool SparseApplyLazyAdamCPUKernel::Launch(const std::vector<kernel::AddressPtr> | |||
| auto indices = reinterpret_cast<int *>(inputs[10]->addr); | |||
| auto new_grad = reinterpret_cast<float *>(workspace[0]->addr); | |||
| auto new_indices = reinterpret_cast<int *>(workspace[1]->addr); | |||
| auto tmp_grad = reinterpret_cast<float *>(workspace[2]->addr); | |||
| auto tmp_indices = reinterpret_cast<int *>(workspace[3]->addr); | |||
| SparseGradient unique_sparse_grad({new_grad, new_indices, indices_size_}); | |||
| ReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &unique_sparse_grad, var_first_dim_size_, | |||
| var_outer_dim_size_); | |||
| SparseGradient tmp_sparse_grad({tmp_grad, tmp_indices, indices_size_}); | |||
| TwoLevelReduceSparseGradient(SparseGradient({grad, indices, indices_size_}), &tmp_sparse_grad, &unique_sparse_grad, | |||
| var_first_dim_size_, var_outer_dim_size_); | |||
| lr = lr * std::sqrt(1 - beta2_power) / (1 - beta1_power); | |||
| MultiThreadComputeParams input_params; | |||
| @@ -58,9 +58,12 @@ class SparseApplyAdamCpuKernelTest : public UT::Common { | |||
| inputs_.push_back(CreateKernelAddress(indices.data())); | |||
| } | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &m_t) { | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad, | |||
| std::vector<int> &tmp_indices, std::vector<float> &m_t) { | |||
| workspace_.push_back(CreateKernelAddress(new_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(new_indices.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_indices.data())); | |||
| workspace_.push_back(CreateKernelAddress(m_t.data())); | |||
| } | |||
| @@ -95,8 +98,10 @@ TEST_F(SparseApplyAdamCpuKernelTest, dense_test) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| std::vector<float> m_t(3 * 3 * 3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, m_t); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t); | |||
| sparse_adam_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6); | |||
| @@ -120,8 +125,10 @@ TEST_F(SparseApplyAdamCpuKernelTest, sparse_test1) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| std::vector<float> m_t(3 * 3 * 3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, m_t); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t); | |||
| sparse_adam_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6); | |||
| @@ -149,8 +156,10 @@ TEST_F(SparseApplyAdamCpuKernelTest, sparse_test2) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| std::vector<float> m_t(3 * 3 * 3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, m_t); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices, m_t); | |||
| sparse_adam_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.999715) < 1e-6); | |||
| @@ -56,9 +56,12 @@ class SparseApplyFtrlCpuKernelTest : public UT::Common { | |||
| inputs_.push_back(CreateKernelAddress(indices.data())); | |||
| } | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices) { | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad, | |||
| std::vector<int> &tmp_indices) { | |||
| workspace_.push_back(CreateKernelAddress(new_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(new_indices.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_indices.data())); | |||
| } | |||
| std::vector<float> var_; | |||
| @@ -86,7 +89,9 @@ TEST_F(SparseApplyFtrlCpuKernelTest, dense_test) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_ftrl_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.291479) < 1e-6); | |||
| @@ -110,7 +115,9 @@ TEST_F(SparseApplyFtrlCpuKernelTest, sparse_test1) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_ftrl_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.291479) < 1e-6); | |||
| @@ -138,7 +145,9 @@ TEST_F(SparseApplyFtrlCpuKernelTest, sparse_test2) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_ftrl_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_EQ(var_[i], 1.0); | |||
| @@ -58,9 +58,12 @@ class SparseApplyLazyAdamCpuKernelTest : public UT::Common { | |||
| inputs_.push_back(CreateKernelAddress(indices.data())); | |||
| } | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices) { | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad, | |||
| std::vector<int> &tmp_indices) { | |||
| workspace_.push_back(CreateKernelAddress(new_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(new_indices.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_indices.data())); | |||
| } | |||
| std::vector<float> var_; | |||
| @@ -94,7 +97,9 @@ TEST_F(SparseApplyLazyAdamCpuKernelTest, dense_test) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_lazy_adam_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6); | |||
| @@ -118,7 +123,9 @@ TEST_F(SparseApplyLazyAdamCpuKernelTest, sparse_test1) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_lazy_adam_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.999684) < 1e-6); | |||
| @@ -146,7 +153,9 @@ TEST_F(SparseApplyLazyAdamCpuKernelTest, sparse_test2) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_lazy_adam_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_EQ(var_[i], 1.0); | |||
| @@ -54,9 +54,12 @@ class SparseApplyProximalAdagradCpuKernelTest : public UT::Common { | |||
| inputs_.push_back(CreateKernelAddress(indices.data())); | |||
| } | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices) { | |||
| void CreateWorkspaceAddress(std::vector<float> &new_grad, std::vector<int> &new_indices, std::vector<float> &tmp_grad, | |||
| std::vector<int> &tmp_indices) { | |||
| workspace_.push_back(CreateKernelAddress(new_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(new_indices.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_grad.data())); | |||
| workspace_.push_back(CreateKernelAddress(tmp_indices.data())); | |||
| } | |||
| std::vector<float> var_; | |||
| @@ -85,7 +88,9 @@ TEST_F(SparseApplyProximalAdagradCpuKernelTest, dense_test) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_proximal_adagrad_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.9929289) < 1e-6); | |||
| @@ -108,7 +113,9 @@ TEST_F(SparseApplyProximalAdagradCpuKernelTest, sparse_test1) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_proximal_adagrad_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_TRUE(std::fabs(var_[i] - 0.9929289) < 1e-6); | |||
| @@ -135,7 +142,9 @@ TEST_F(SparseApplyProximalAdagradCpuKernelTest, sparse_test2) { | |||
| CreateInputAddress(indices); | |||
| std::vector<float> new_grad(3 * 3 * 3); | |||
| std::vector<int> new_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices); | |||
| std::vector<float> tmp_grad(3 * 3 * 3); | |||
| std::vector<int> tmp_indices(3); | |||
| CreateWorkspaceAddress(new_grad, new_indices, tmp_grad, tmp_indices); | |||
| sparse_proximal_adagrad_->Launch(inputs_, workspace_, outputs_); | |||
| for (size_t i = 0; i < 3 * 3; ++i) { | |||
| EXPECT_EQ(var_[i], 1.0); | |||