You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

task_manager.cc 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <algorithm>
  17. #include <functional>
  18. #include <set>
  19. #include "./securec.h"
  20. #include "dataset/util/task_manager.h"
  21. namespace mindspore {
  22. namespace dataset {
  23. // This takes the same parameter as Task constructor.
  24. Status TaskManager::CreateAsyncTask(const std::string &my_name, const std::function<Status()> &f, TaskGroup *vg,
  25. Task **task) {
  26. // We need to block destructor coming otherwise we will deadlock. We will grab the
  27. // stateLock in shared allowing CreateAsyncTask to run concurrently.
  28. SharedLock stateLck(&state_lock_);
  29. // Now double check the state
  30. if (ServiceState() == STATE::kStopInProg || ServiceState() == STATE::kStopped) {
  31. return Status(StatusCode::kInterrupted, __LINE__, __FILE__, "TaskManager is shutting down");
  32. }
  33. RETURN_IF_NOT_OK(GetFreeTask(my_name, f, task));
  34. if (vg == nullptr) {
  35. RETURN_STATUS_UNEXPECTED("TaskGroup is null");
  36. }
  37. // Previously there is a timing hole where the thread is spawn but hit error immediately before we can set
  38. // the TaskGroup pointer. We will do the set here before we call run(). The run() will do the registration.
  39. (*task)->set_task_group(vg);
  40. // Link to the master lru list.
  41. {
  42. UniqueLock lck(&lru_lock_);
  43. lru_.Append(*task);
  44. }
  45. // Link to the group list as well before we spawn.
  46. {
  47. UniqueLock lck(&vg->rw_lock_);
  48. vg->grp_list_.Append(*task);
  49. }
  50. // Track all the TaskGroup. Used for control-c
  51. {
  52. LockGuard lck(&tg_lock_);
  53. this->grp_list_.insert(vg);
  54. }
  55. RETURN_IF_NOT_OK((*task)->wp_.Register(vg));
  56. RETURN_IF_NOT_OK((*task)->Run());
  57. // Wait for the thread to initialize successfully.
  58. RETURN_IF_NOT_OK((*task)->Wait());
  59. return Status::OK();
  60. }
  61. Status TaskManager::join_all() {
  62. Status rc;
  63. Status rc2;
  64. SharedLock lck(&lru_lock_);
  65. for (Task &tk : lru_) {
  66. rc = tk.Join();
  67. if (rc.IsError()) {
  68. rc2 = rc;
  69. }
  70. }
  71. return rc2;
  72. }
  73. void TaskManager::interrupt_all() noexcept {
  74. global_interrupt_ = 1;
  75. LockGuard lck(&tg_lock_);
  76. for (TaskGroup *vg : grp_list_) {
  77. auto svc = vg->GetIntrpService();
  78. if (svc) {
  79. // Stop the interrupt service. No new request is accepted.
  80. svc->ServiceStop();
  81. svc->InterruptAll();
  82. }
  83. }
  84. (void)master_->Interrupt();
  85. }
  86. Task *TaskManager::FindMe() { return gMyTask; }
  87. TaskManager::TaskManager() try : global_interrupt_(0),
  88. lru_(&Task::node),
  89. free_lst_(&Task::free),
  90. watchdog_grp_(nullptr),
  91. watchdog_(nullptr) {
  92. std::shared_ptr<MemoryPool> mp = Services::GetInstance().GetServiceMemPool();
  93. Allocator<Task> alloc(mp);
  94. // Create a dummy Task for the master thread (this thread)
  95. master_ = std::allocate_shared<Task>(alloc, "master", []() -> Status { return Status::OK(); });
  96. master_->id_ = this_thread::get_id();
  97. master_->running_ = true;
  98. master_->is_master_ = true;
  99. gMyTask = master_.get();
  100. // Initialize the semaphore for the watchdog
  101. errno_t rc = sem_init(&sem_, 0, 0);
  102. if (rc == -1) {
  103. MS_LOG(INFO) << "Unable to initialize a semaphore. Errno = " << rc << ".";
  104. std::terminate();
  105. }
  106. } catch (const std::exception &e) {
  107. MS_LOG(ERROR) << "MindData initialization failed: " << e.what() << ".";
  108. std::terminate();
  109. }
  110. TaskManager::~TaskManager() {
  111. if (watchdog_) {
  112. WakeUpWatchDog();
  113. watchdog_->thrd_.join();
  114. // watchdog_grp_ and watchdog_ pointers come from Services::GetInstance().GetServiceMemPool() which we will free it
  115. // on shutdown. So no need to free these pointers one by one.
  116. watchdog_grp_ = nullptr;
  117. watchdog_ = nullptr;
  118. }
  119. (void)sem_destroy(&sem_);
  120. }
  121. Status TaskManager::DoServiceStart() {
  122. MS_LOG(INFO) << "Starting Task Manager.";
  123. // Create a watchdog for control-c
  124. std::shared_ptr<MemoryPool> mp = Services::GetInstance().GetServiceMemPool();
  125. // A dummy group just for the watchdog. We aren't really using it. But most code assumes a thread must
  126. // belong to a group.
  127. auto f = std::bind(&TaskManager::WatchDog, this);
  128. Status rc;
  129. watchdog_grp_ = new (&rc, mp) TaskGroup();
  130. RETURN_IF_NOT_OK(rc);
  131. rc = watchdog_grp_->CreateAsyncTask("Watchdog", f, &watchdog_);
  132. if (rc.IsError()) {
  133. ::operator delete(watchdog_grp_, mp);
  134. watchdog_grp_ = nullptr;
  135. return rc;
  136. }
  137. grp_list_.erase(watchdog_grp_);
  138. lru_.Remove(watchdog_);
  139. return Status::OK();
  140. }
  141. Status TaskManager::DoServiceStop() {
  142. WakeUpWatchDog();
  143. interrupt_all();
  144. return Status::OK();
  145. }
  146. Status TaskManager::WatchDog() {
  147. TaskManager::FindMe()->Post();
  148. errno_t err = sem_wait(&sem_);
  149. if (err == -1) {
  150. RETURN_STATUS_UNEXPECTED("Errno = " + std::to_string(errno));
  151. }
  152. // We are woken up by control-c and we are going to stop all threads that are running.
  153. // In addition, we also want to prevent new thread from creating. This can be done
  154. // easily by calling the parent function.
  155. RETURN_IF_NOT_OK(ServiceStop());
  156. return Status::OK();
  157. }
  158. // Follow the group link and interrupt other
  159. // Task in the same group. It is used by
  160. // Watchdog only.
  161. void TaskManager::InterruptGroup(Task &curTk) {
  162. TaskGroup *vg = curTk.MyTaskGroup();
  163. vg->interrupt_all();
  164. }
  165. void TaskManager::InterruptMaster(const Status &rc) {
  166. TaskManager &tm = TaskManager::GetInstance();
  167. std::shared_ptr<Task> master = tm.master_;
  168. std::lock_guard<std::mutex> lck(master->mux_);
  169. (void)master->Interrupt();
  170. if (rc.IsError() && master->rc_.IsOk()) {
  171. master->rc_ = rc;
  172. master->caught_severe_exception_ = true;
  173. }
  174. }
  175. Status TaskManager::GetMasterThreadRc() {
  176. TaskManager &tm = TaskManager::GetInstance();
  177. std::shared_ptr<Task> master = tm.master_;
  178. Status rc = tm.master_->GetTaskErrorIfAny();
  179. if (rc.IsError()) {
  180. // Reset the state once we retrieve the value.
  181. std::lock_guard<std::mutex> lck(master->mux_);
  182. master->rc_ = Status::OK();
  183. master->caught_severe_exception_ = false;
  184. master->ResetIntrpState();
  185. }
  186. return rc;
  187. }
  188. void TaskManager::ReturnFreeTask(Task *p) noexcept {
  189. // Take it out from lru_ if any
  190. {
  191. UniqueLock lck(&lru_lock_);
  192. auto it = std::find(lru_.begin(), lru_.end(), *p);
  193. if (it != lru_.end()) {
  194. lru_.Remove(p);
  195. }
  196. }
  197. // We need to deallocate the string resources associated with the Task class
  198. // before we cache its memory for future use.
  199. p->~Task();
  200. // Put it back into free list
  201. {
  202. LockGuard lck(&free_lock_);
  203. free_lst_.Append(p);
  204. }
  205. }
  206. Status TaskManager::GetFreeTask(const std::string &my_name, const std::function<Status()> &f, Task **p) {
  207. if (p == nullptr) {
  208. RETURN_STATUS_UNEXPECTED("p is null");
  209. }
  210. Task *q = nullptr;
  211. // First try the free list
  212. {
  213. LockGuard lck(&free_lock_);
  214. if (free_lst_.count > 0) {
  215. q = free_lst_.head;
  216. free_lst_.Remove(q);
  217. }
  218. }
  219. if (q) {
  220. new (q) Task(my_name, f);
  221. } else {
  222. std::shared_ptr<MemoryPool> mp = Services::GetInstance().GetServiceMemPool();
  223. Status rc;
  224. q = new (&rc, mp) Task(my_name, f);
  225. RETURN_IF_NOT_OK(rc);
  226. }
  227. *p = q;
  228. return Status::OK();
  229. }
  230. Status TaskGroup::CreateAsyncTask(const std::string &my_name, const std::function<Status()> &f, Task **ppTask) {
  231. auto pMytask = TaskManager::FindMe();
  232. // We need to block ~TaskGroup coming otherwise we will deadlock. We will grab the
  233. // stateLock in shared allowing CreateAsyncTask to run concurrently.
  234. SharedLock state_lck(&state_lock_);
  235. // Now double check the state
  236. if (ServiceState() != STATE::kRunning) {
  237. return Status(StatusCode::kInterrupted, __LINE__, __FILE__, "Taskgroup is shutting down");
  238. }
  239. TaskManager &dm = TaskManager::GetInstance();
  240. Task *pTask = nullptr;
  241. // If the group is already in error, early exit too.
  242. // We can't hold the rc_mux_ throughout because the thread spawned by CreateAsyncTask may hit error which
  243. // will try to shutdown the group and grab the rc_mux_ and we will deadlock.
  244. {
  245. std::unique_lock<std::mutex> rcLock(rc_mux_);
  246. if (rc_.IsError()) {
  247. return pMytask->IsMasterThread() ? rc_ : Status(StatusCode::kInterrupted);
  248. }
  249. }
  250. RETURN_IF_NOT_OK(dm.CreateAsyncTask(my_name, f, this, &pTask));
  251. if (ppTask) {
  252. *ppTask = pTask;
  253. }
  254. return Status::OK();
  255. }
  256. void TaskGroup::interrupt_all() noexcept { (void)intrp_svc_->InterruptAll(); }
  257. Status TaskGroup::join_all() {
  258. Status rc;
  259. Status rc2;
  260. SharedLock lck(&rw_lock_);
  261. for (Task &tk : grp_list_) {
  262. rc = tk.Join();
  263. if (rc.IsError()) {
  264. rc2 = rc;
  265. }
  266. }
  267. return rc2;
  268. }
  269. Status TaskGroup::DoServiceStop() {
  270. intrp_svc_->ServiceStop();
  271. interrupt_all();
  272. return (join_all());
  273. }
  274. TaskGroup::TaskGroup() : grp_list_(&Task::group), intrp_svc_(nullptr) {
  275. std::shared_ptr<MemoryPool> mp = Services::GetInstance().GetServiceMemPool();
  276. Allocator<IntrpService> alloc(mp);
  277. intrp_svc_ = std::allocate_shared<IntrpService>(alloc);
  278. (void)Service::ServiceStart();
  279. }
  280. TaskGroup::~TaskGroup() {
  281. (void)Service::ServiceStop();
  282. // The TaskGroup is going out of scope, and we can return the Task list to the free list.
  283. Task *cur = grp_list_.head;
  284. TaskManager &tm = TaskManager::GetInstance();
  285. while (cur) {
  286. Task *next = cur->group.next;
  287. grp_list_.Remove(cur);
  288. tm.ReturnFreeTask(cur);
  289. cur = next;
  290. }
  291. {
  292. LockGuard lck(&tm.tg_lock_);
  293. (void)tm.grp_list_.erase(this);
  294. }
  295. }
  296. Status TaskGroup::GetTaskErrorIfAny() {
  297. SharedLock lck(&rw_lock_);
  298. for (Task &tk : grp_list_) {
  299. RETURN_IF_NOT_OK(tk.GetTaskErrorIfAny());
  300. }
  301. return Status::OK();
  302. }
  303. std::shared_ptr<IntrpService> TaskGroup::GetIntrpService() { return intrp_svc_; }
  304. } // namespace dataset
  305. } // namespace mindspore