/** * Copyright 2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_ #define MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_ #include #include #include #include #include #include #include #include "distributed/constants.h" #include "utils/log_adapter.h" #include "utils/ms_utils.h" #include "ps/core/cluster_config.h" #include "ps/core/node.h" #include "ps/core/worker_node.h" #include "ps/core/server_node.h" #include "ps/core/scheduler_node.h" namespace mindspore { namespace distributed { namespace cluster { // Node role based cluster built by MindSpore communication framework. class ClusterContext { public: ~ClusterContext(); DISABLE_COPY_AND_ASSIGN(ClusterContext) static std::shared_ptr instance(); // Initialize the cluster configuration and build network. bool Initialize(); // Finalize the cluster and process exits. If timeout is set to UINT32_MAX, this method will block without timeout. bool Finalize(uint32_t timeout = kDefaultFinishTimeout); // Return node object of this process. const std::shared_ptr &node() const; // Return cluster is initialized. bool initialized() const; private: ClusterContext(); // This initializing cluster configurations. They can be exported by environment variables, set by python API or // configuration file. void InitClusterConfig(); // Build the cluster with other processes. This method will not return until the networking is done. bool BuildCluster(); // Load the cluster configuration like worker number, server number and etc. void InitNodeRole(); void InitSchedulerIp(); void InitSchedulerPort(); // Register event callbacks for NODE_TIMEOUT, SCHEDULER_TIMEOUT, etc. void RegisterEventCallback(); // The flag that whether this cluster context instance is already initialized. std::atomic_bool inited_; // The flag that whether this cluster context instance is already finalized. std::atomic_bool finalized_; // The mutex about exiting status of this node. std::mutex finish_mutex_; // Node role to role number map. std::map node_num_each_role_; // Scheduler information. std::string scheduler_host_; uint16_t scheduler_port_; // The node could be Worker, Server or Scheduler, etc. std::shared_ptr node_; // The role of this process in the cluster. std::string node_role_; // The configuration of this cluster. std::unique_ptr cluster_config_; }; } // namespace cluster } // namespace distributed } // namespace mindspore #endif // MINDSPORE_CCSRC_DISTRIBUTED_CLUSTER_CLUSTER_CONTEXT_H_