You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

task.cc 5.7 kB

5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "minddata/dataset/util/task.h"
  17. #include "utils/ms_utils.h"
  18. #include "minddata/dataset/util/task_manager.h"
  19. #include "utils/log_adapter.h"
  20. #if defined(__ANDROID__) || defined(ANDROID)
  21. #include "minddata/dataset/util/services.h"
  22. #endif
  23. namespace mindspore {
  24. namespace dataset {
  25. thread_local Task *gMyTask = nullptr;
  26. void Task::operator()() {
  27. #if !defined(_WIN32) && !defined(_WIN64)
  28. gMyTask = this;
  29. #endif
  30. id_ = this_thread::get_id();
  31. std::stringstream ss;
  32. ss << id_;
  33. #if defined(__ANDROID__) || defined(ANDROID)
  34. // The thread id in Linux may be duplicate
  35. ss << Services::GetUniqueID();
  36. #endif
  37. MS_LOG(DEBUG) << my_name_ << " Thread ID " << ss.str() << " Started.";
  38. try {
  39. // Previously there is a timing hole where the thread is spawn but hit error immediately before we can set
  40. // the TaskGroup pointer and register. We move the registration logic to here (after we spawn) so we can
  41. // get the thread id.
  42. TaskGroup *vg = MyTaskGroup();
  43. rc_ = vg->GetIntrpService()->Register(ss.str(), this);
  44. if (rc_.IsOk()) {
  45. // Now we can run the given task.
  46. rc_ = fnc_obj_();
  47. }
  48. // Some error codes are ignored, e.g. interrupt. Others we just shutdown the group.
  49. if (rc_.IsError() && !rc_.IsInterrupted()) {
  50. ShutdownGroup();
  51. }
  52. } catch (const std::bad_alloc &e) {
  53. rc_ = Status(StatusCode::kOutOfMemory, __LINE__, __FILE__, e.what());
  54. ShutdownGroup();
  55. } catch (const std::exception &e) {
  56. rc_ = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, e.what());
  57. ShutdownGroup();
  58. }
  59. }
  60. void Task::ShutdownGroup() { // Wake up watch dog and shutdown the engine.
  61. {
  62. std::lock_guard<std::mutex> lk(mux_);
  63. caught_severe_exception_ = true;
  64. }
  65. TaskGroup *vg = MyTaskGroup();
  66. // If multiple threads hit severe errors in the same group. Keep the first one and
  67. // discard the rest.
  68. if (vg->rc_.IsOk()) {
  69. std::unique_lock<std::mutex> rcLock(vg->rc_mux_);
  70. // Check again after we get the lock
  71. if (vg->rc_.IsOk()) {
  72. vg->rc_ = rc_;
  73. rcLock.unlock();
  74. TaskManager::InterruptMaster(rc_);
  75. TaskManager::InterruptGroup(*this);
  76. }
  77. }
  78. }
  79. Status Task::GetTaskErrorIfAny() const {
  80. std::lock_guard<std::mutex> lk(mux_);
  81. if (caught_severe_exception_) {
  82. return rc_;
  83. } else {
  84. return Status::OK();
  85. }
  86. }
  87. Task::Task(const std::string &myName, const std::function<Status()> &f)
  88. : my_name_(myName),
  89. rc_(),
  90. fnc_obj_(f),
  91. task_group_(nullptr),
  92. is_master_(false),
  93. running_(false),
  94. caught_severe_exception_(false) {
  95. IntrpResource::ResetIntrpState();
  96. wp_.ResetIntrpState();
  97. wp_.Clear();
  98. }
  99. Status Task::Run() {
  100. Status rc;
  101. if (running_ == false) {
  102. try {
  103. thrd_ = std::async(std::launch::async, std::ref(*this));
  104. running_ = true;
  105. caught_severe_exception_ = false;
  106. } catch (const std::exception &e) {
  107. rc = Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, e.what());
  108. }
  109. }
  110. return rc;
  111. }
  112. Status Task::Join(WaitFlag blocking) {
  113. if (running_) {
  114. RETURN_UNEXPECTED_IF_NULL(MyTaskGroup());
  115. auto interrupt_svc = MyTaskGroup()->GetIntrpService();
  116. try {
  117. if (blocking == WaitFlag::kBlocking) {
  118. // If we are asked to wait, then wait
  119. thrd_.get();
  120. } else if (blocking == WaitFlag::kNonBlocking) {
  121. // There is a race condition in the global resource tracking such that a thread can miss the
  122. // interrupt and becomes blocked on a conditional variable forever. As a result, calling
  123. // join() will not come back. We need some timeout version of join such that if the thread
  124. // doesn't come back in a reasonable of time, we will send the interrupt again.
  125. while (thrd_.wait_for(std::chrono::seconds(1)) != std::future_status::ready) {
  126. // We can't tell which conditional_variable this thread is waiting on. So we may need
  127. // to interrupt everything one more time.
  128. MS_LOG(INFO) << "Some threads not responding. Interrupt again";
  129. interrupt_svc->InterruptAll();
  130. }
  131. } else {
  132. RETURN_STATUS_UNEXPECTED("Unknown WaitFlag");
  133. }
  134. std::stringstream ss;
  135. ss << get_id();
  136. MS_LOG(DEBUG) << MyName() << " Thread ID " << ss.str() << " Stopped.";
  137. running_ = false;
  138. RETURN_IF_NOT_OK(wp_.Deregister());
  139. RETURN_IF_NOT_OK(interrupt_svc->Deregister(ss.str()));
  140. } catch (const std::exception &e) {
  141. RETURN_STATUS_UNEXPECTED(e.what());
  142. }
  143. }
  144. return Status::OK();
  145. }
  146. TaskGroup *Task::MyTaskGroup() { return task_group_; }
  147. void Task::set_task_group(TaskGroup *vg) { task_group_ = vg; }
  148. Task::~Task() { task_group_ = nullptr; }
  149. Status Task::OverrideInterruptRc(const Status &rc) {
  150. if (rc.IsInterrupted() && this_thread::is_master_thread()) {
  151. // If we are interrupted, override the return value if this is the master thread.
  152. // Master thread is being interrupted mostly because of some thread is reporting error.
  153. return TaskManager::GetMasterThreadRc();
  154. }
  155. return rc;
  156. }
  157. } // namespace dataset
  158. } // namespace mindspore