You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

stream_synchronizer.cc 3.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. /**
  2. * Copyright 2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "runtime/device/stream_synchronizer.h"
  17. #include "utils/ms_context.h"
  18. #include "distributed/collective/collective_manager.h"
  19. #include "runtime/recovery/recovery_context.h"
  20. namespace mindspore {
  21. namespace device {
  22. std::mutex StreamSynchronizer::instance_lock_;
  23. std::shared_ptr<StreamSynchronizer> StreamSynchronizer::instance_ = nullptr;
  24. void StreamSynchronizer::Initialize() {
  25. // Non disaster recovery mode does not need to start thread and timeout mechanisms.
  26. if (!runtime::recovery::RecoveryContext::GetInstance()->enable_recovery()) {
  27. return;
  28. }
  29. worker_thread_ = std::thread(&StreamSynchronizer::DoSyncStreamTask, this);
  30. }
  31. void StreamSynchronizer::Finalize() {
  32. {
  33. std::unique_lock<std::mutex> lock(task_mutex_);
  34. stop_ = true;
  35. }
  36. do_sync_stream_cv_.notify_all();
  37. if (worker_thread_.joinable()) {
  38. worker_thread_.join();
  39. }
  40. device_context_ = nullptr;
  41. }
  42. bool StreamSynchronizer::SyncStream(const std::string &device_name, uint32_t timeout) {
  43. std::unique_lock<std::mutex> reentrant_lock(reentrant_mutex_);
  44. auto ms_context = MsContext::GetInstance();
  45. MS_EXCEPTION_IF_NULL(ms_context);
  46. uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  47. const auto &device_context =
  48. device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({device_name, device_id});
  49. MS_EXCEPTION_IF_NULL(device_context);
  50. // If disable recovery or timeout==0, sync stream directly to improve performance.
  51. if (!runtime::recovery::RecoveryContext::GetInstance()->enable_recovery() || timeout == 0) {
  52. device_context->Initialize();
  53. return device_context->SyncStream();
  54. }
  55. std::unique_lock<std::mutex> lock(task_mutex_);
  56. if (stop_) {
  57. MS_LOG(EXCEPTION) << "The synchronization stream task has stopped";
  58. }
  59. device_context_ = device_context;
  60. do_sync_stream_cv_.notify_one();
  61. if (sync_stream_time_out_) {
  62. // If sync stream timeout has happened, increase the timeout by 4 times.
  63. const uint32_t kTimeOutScaleFactor = 4;
  64. timeout *= kTimeOutScaleFactor;
  65. }
  66. if (time_out_cv_.wait_for(lock, std::chrono::seconds(timeout)) == std::cv_status::no_timeout) {
  67. if (!sync_stream_ret_) {
  68. MS_LOG(ERROR) << "Synchronize stream failed.";
  69. }
  70. return sync_stream_ret_;
  71. } else {
  72. sync_stream_time_out_ = true;
  73. runtime::recovery::RecoveryContext::GetInstance()->set_need_reinit_collective(true);
  74. if (!distributed::collective::CollectiveManager::instance()->Finalize()) {
  75. MS_LOG(ERROR) << "Finalize collective manager failed.";
  76. return false;
  77. }
  78. time_out_cv_.wait(lock, [this]() { return device_context_ == nullptr; });
  79. MS_LOG(WARNING) << "Synchronize stream time out.";
  80. return true;
  81. }
  82. }
  83. void StreamSynchronizer::DoSyncStreamTask() {
  84. for (;;) {
  85. {
  86. std::unique_lock<std::mutex> lock(task_mutex_);
  87. do_sync_stream_cv_.wait(lock, [this]() { return stop_ || device_context_ != nullptr; });
  88. if (stop_) {
  89. return;
  90. }
  91. }
  92. device_context_->Initialize();
  93. // Really sync stream.
  94. sync_stream_ret_ = device_context_->SyncStream();
  95. {
  96. std::unique_lock<std::mutex> lock(task_mutex_);
  97. device_context_ = nullptr;
  98. }
  99. time_out_cv_.notify_one();
  100. }
  101. }
  102. } // namespace device
  103. } // namespace mindspore