Route GroupManager's communication-group creation and destruction through two private
helper methods, and add a stub ExecutorManager/Executor that is compiled in when NO_DLIB
is defined and ENABLE_GPU is not.

NOTE(review): include targets and template arguments were missing from the patch as
received and have been restored from how the names are used; line breaks, blank context
lines and indentation were rebuilt as well. Confirm the context lines against the target
tree before applying. The "index" hashes are those of the original patch.

diff --git a/mindspore/ccsrc/frontend/parallel/group_manager.cc b/mindspore/ccsrc/frontend/parallel/group_manager.cc
index 58eb60b284..811f98596a 100644
--- a/mindspore/ccsrc/frontend/parallel/group_manager.cc
+++ b/mindspore/ccsrc/frontend/parallel/group_manager.cc
@@ -18,7 +18,11 @@
 #include <algorithm>
 #include <vector>
 #include <utility>
+#if !defined(NO_DLIB) || defined(ENABLE_GPU)
 #include "backend/session/executor_manager.h"
+#else
+#include "frontend/parallel/parallel_stub/executor_manager_stub.h"
+#endif
 #include "frontend/parallel/device_manager.h"
 #include "utils/comm_manager.h"
 #include "utils/ms_context.h"
@@ -66,6 +70,84 @@ Status Group::GetIndex(size_t *index) {
 
 GroupManager::GroupManager() { groups_.clear(); }
 
+#if !defined(NO_DLIB) || defined(ENABLE_GPU)
+// Creates comm group `group_name` over `ranks` via the session executor; returns the executor's result.
+bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
+                                         const std::vector<uint32_t> &ranks, uint32_t device_id) {
+  auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
+  MS_EXCEPTION_IF_NULL(executor);
+  return executor->CreateCommGroup(group_name, ranks);
+}
+
+// Destroys comm group `group_name` via the session executor; returns the executor's result.
+bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
+                                          uint32_t device_id) {
+  auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
+  MS_EXCEPTION_IF_NULL(executor);
+  return executor->DestroyCommGroup(group_name);
+}
+
+// Creates each (group name, ranks) pair of `group_info` on the executor for the context's device target and id.
+// Returns FAILED at the first group the executor fails to create, SUCCESS otherwise.
+Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
+  // Create group through the executor
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
+  MS_EXCEPTION_IF_NULL(executor);
+  for (const auto &group : group_info) {
+    bool ret = executor->CreateCommGroup(group.first, group.second);
+    if (!ret) {
+      MS_LOG(ERROR) << "Create group failed, group name is " << group.first << ", ranks is " << group.second;
+      return FAILED;
+    }
+    MS_LOG(INFO) << "Create group success, group name is " << group.first << ", ranks is " << group.second;
+  }
+
+  return SUCCESS;
+}
+#else
+// Stub build: same contract, served by parallel::ExecutorManager; logs a warning before delegating.
+bool GroupManager::CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
+                                         const std::vector<uint32_t> &ranks, uint32_t device_id) {
+  MS_LOG(WARNING) << "Create group in stub";
+  auto executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
+  MS_EXCEPTION_IF_NULL(executor);
+  return executor->CreateCommGroup(group_name, ranks);
+}
+
+// Stub build: same contract, served by parallel::ExecutorManager; logs a warning before delegating.
+bool GroupManager::DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name,
+                                          uint32_t device_id) {
+  MS_LOG(WARNING) << "Destroy group in stub";
+  auto executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
+  MS_EXCEPTION_IF_NULL(executor);
+  return executor->DestroyCommGroup(group_name);
+}
+
+// Stub build: same loop as the session-executor variant, run against the stub executor.
+Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
+  // Create group through the executor
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
+  uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  auto executor = parallel::ExecutorManager::Instance().GetExecutor(device_name, device_id);
+  MS_EXCEPTION_IF_NULL(executor);
+  for (const auto &group : group_info) {
+    bool ret = executor->CreateCommGroup(group.first, group.second);
+    if (!ret) {
+      MS_LOG(ERROR) << "Create group failed, group name is " << group.first << ", ranks is " << group.second;
+      return FAILED;
+    }
+    MS_LOG(INFO) << "Create group success, group name is " << group.first << ", ranks is " << group.second;
+  }
+
+  return SUCCESS;
+}
+#endif
 Status GroupManager::CreateGroup(const std::string &group_name, const std::vector<Device> &devices,
                                  mindspore::parallel::Group *const group) {
   // it is simple to use size to determine whether it is a world group
@@ -102,9 +184,7 @@ Status GroupManager::CreateGroup(const std::string &group_name, const std::vecto
   MS_EXCEPTION_IF_NULL(context_ptr);
   std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
   uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
-  auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
-  MS_EXCEPTION_IF_NULL(executor);
-  bool ret = executor->CreateCommGroup(group_name, ranks);
+  bool ret = CreateGroupByExecutor(device_name, group_name, ranks, device_id);
   if (!ret) {
     MS_LOG(ERROR) << "Create group failed, group name is " << group_name;
     return Status::FAILED;
@@ -123,9 +203,7 @@ Status GroupManager::DestroyGroup(const std::string &group_name) {
   MS_EXCEPTION_IF_NULL(context_ptr);
   std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
   uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
-  auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
-  MS_EXCEPTION_IF_NULL(executor);
-  bool ret = executor->DestroyCommGroup(group_name);
+  bool ret = DestroyGroupByExecutor(device_name, group_name, device_id);
   if (!ret) {
     return Status::FAILED;
   }
@@ -192,26 +270,5 @@ Status GroupManager::FindGroup(const std::string &name, mindspore::parallel::Gro
 
 void GroupManager::Clear() { (void)DestroyAllGroups(); }
 
-Status CreateGroups(const std::vector<std::pair<std::string, std::vector<uint32_t>>> &group_info) {
-  // Create group through the executor
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  std::string device_name = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
-  uint32_t device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
-  auto executor = session::ExecutorManager::Instance().GetExecutor(device_name, device_id);
-  MS_EXCEPTION_IF_NULL(executor);
-
-  for (auto &group : group_info) {
-    bool ret = executor->CreateCommGroup(group.first, group.second);
-    if (!ret) {
-      MS_LOG(ERROR) << "Create group failed, group name is " << group.first << ", ranks is " << group.second;
-      return FAILED;
-    }
-    MS_LOG(INFO) << "Create group success, group name is " << group.first << ", ranks is " << group.second;
-  }
-
-  return SUCCESS;
-}
-
 }  // namespace parallel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/frontend/parallel/group_manager.h b/mindspore/ccsrc/frontend/parallel/group_manager.h
index 3c106e8624..fa8fb15ddf 100644
--- a/mindspore/ccsrc/frontend/parallel/group_manager.h
+++ b/mindspore/ccsrc/frontend/parallel/group_manager.h
@@ -67,6 +67,11 @@ class GroupManager {
   void Clear();
 
  private:
+  // Create/destroy a communication group through the executor obtained for (device_name, device_id).
+  // Both return the result reported by the executor.
+  bool CreateGroupByExecutor(const std::string &device_name, const std::string &group_name,
+                             const std::vector<uint32_t> &ranks, uint32_t device_id);
+  bool DestroyGroupByExecutor(const std::string &device_name, const std::string &group_name, uint32_t device_id);
   Status DestroyGroup(const std::string &group_name);
   // the key is group name (name_)
   std::map<std::string, Group> groups_;
diff --git a/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_manager_stub.cc b/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_manager_stub.cc
new file mode 100644
index 0000000000..2c056bc451
--- /dev/null
+++ b/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_manager_stub.cc
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "frontend/parallel/parallel_stub/executor_manager_stub.h"
+namespace mindspore {
+namespace parallel {
+// Returns the cached stub executor for "<device_name>_<device_id>", creating and caching it on first request.
+std::shared_ptr<Executor> ExecutorManager::GetExecutor(const std::string &device_name, uint32_t device_id) {
+  std::string device_key = device_name + "_" + std::to_string(device_id);
+  auto iter = executors_.find(device_key);
+  if (iter != executors_.end()) {
+    return iter->second;
+  }
+  auto executor = std::make_shared<Executor>(device_name, device_id);
+  executors_[device_key] = executor;
+  return executor;
+}
+
+}  // namespace parallel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_manager_stub.h b/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_manager_stub.h
new file mode 100644
index 0000000000..67a771262b
--- /dev/null
+++ b/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_manager_stub.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
+#define MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include "frontend/parallel/parallel_stub/executor_stub.h"
+namespace mindspore {
+namespace parallel {
+class Executor;
+// Singleton that hands out stub executors, keyed by "<device_name>_<device_id>".
+class ExecutorManager {
+ public:
+  static ExecutorManager &Instance() {
+    static ExecutorManager instance;
+    return instance;
+  }
+  // Returns the executor cached for the given device, creating it on first use.
+  std::shared_ptr<Executor> GetExecutor(const std::string &device_name, uint32_t device_id);
+
+ private:
+  ExecutorManager() = default;
+  ~ExecutorManager() = default;
+  std::map<std::string, std::shared_ptr<Executor>> executors_;
+};
+}  // namespace parallel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PARALLEL_EXECUTOR_MANAGER_STUB_H_
diff --git a/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_stub.h b/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_stub.h
new file mode 100644
index 0000000000..bb655f9178
--- /dev/null
+++ b/mindspore/ccsrc/frontend/parallel/parallel_stub/executor_stub.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
+#define MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace mindspore {
+namespace parallel {
+// Stand-in for the session executor: group operations do nothing and always report success.
+class Executor {
+ public:
+  Executor(const std::string &device_name, uint32_t device_id) : device_name_(device_name), device_id_(device_id) {}
+  ~Executor() = default;
+  bool CreateCommGroup(const std::string &group_name, const std::vector<uint32_t> &ranks) const { return true; }
+  bool DestroyCommGroup(const std::string &group_name) const { return true; }
+
+ private:
+  std::string device_name_;
+  uint32_t device_id_;
+};
+}  // namespace parallel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PARALLEL_EXECUTOR_STUB_H