enable refactor thread pool

4 years ago · a6622a70ba
--- a/mindspore/ccsrc/CMakeLists.txt
+++ b/mindspore/ccsrc/CMakeLists.txt
@@ -3,6 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR}/mindspore/core)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_BINARY_DIR})
 include_directories(${CMAKE_SOURCE_DIR}/mindspore/core/mindrt/include)
 include_directories(${CMAKE_SOURCE_DIR}/mindspore/core/mindrt/src)

 if(ENABLE_CPU)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/backend/kernel_compiler/cpu)
--- a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc
@@ -312,6 +312,8 @@ GraphScheduler::~GraphScheduler() {
  // Local maps clear.
  actor_name_to_actor_.clear();
  graph_output_to_actor_.clear();
  delete thread_pool_;
  thread_pool_ = nullptr;
 }

 void GraphScheduler::Initialize() {
@@ -326,11 +328,13 @@ void GraphScheduler::Initialize() {

  auto actorMgr = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actorMgr);
  actorMgr->Initialize();

  // Create the thread pool of actor runtime.
  auto max_thread_num = GetMaxThreadNum();
  MS_LOG(INFO) << "Max available thread number: " << max_thread_num;
  actorMgr->Initialize(max_thread_num);
  thread_pool_ = InterThreadPool::CreateThreadPool(max_thread_num);
  MS_EXCEPTION_IF_NULL(thread_pool_);

  // Create memory manager actor.
  auto memory_manager_actor = std::make_shared<MemoryManagerActor>();
@@ -338,6 +342,7 @@ void GraphScheduler::Initialize() {
  memory_manager_aid_ = memory_manager_actor->GetAID();
  // Schedule the memory manager actor and bind a single thread to it so that it responds to memory alloc and free quickly.
  auto base_actor = static_cast<ActorReference>(memory_manager_actor);
  base_actor->set_thread_pool(thread_pool_);
  (void)actorMgr->Spawn(base_actor, false);
 }

@@ -400,6 +405,7 @@ void GraphScheduler::Schedule(const ActorSet *actor_set) {
  auto actorMgr = ActorMgr::GetActorMgrRef();
  MS_EXCEPTION_IF_NULL(actorMgr);
  for (auto actor : actors) {
    actor->set_thread_pool(thread_pool_);
    (void)actorMgr->Spawn(actor);
  }
 }
--- a/mindspore/ccsrc/runtime/framework/graph_scheduler.h
+++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.h
@@ -33,6 +33,7 @@
 #include "runtime/framework/actor/copy_actor.h"
 #include "runtime/hardware/device_context.h"
 #include "backend/session/kernel_graph.h"
 #include "thread/inter_threadpool.h"

 namespace mindspore {
 namespace runtime {
@@ -242,6 +243,8 @@ class GraphScheduler {
  // The id of memory manager actor.
  AID memory_manager_aid_;

  InterThreadPool *thread_pool_{nullptr};

  bool init_{false};
 };
 }  // namespace runtime
--- a/mindspore/ccsrc/vm/CMakeLists.txt
+++ b/mindspore/ccsrc/vm/CMakeLists.txt
@@ -1,4 +1,5 @@
 include_directories(${CMAKE_SOURCE_DIR}/mindspore/core/mindrt/include)
 include_directories(${CMAKE_SOURCE_DIR}/mindspore/core/mindrt/src)

 file(GLOB_RECURSE _VM_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_VM_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_VM)
--- a/mindspore/core/mindrt/CMakeLists.txt
+++ b/mindspore/core/mindrt/CMakeLists.txt
@@ -8,6 +8,7 @@ file(GLOB MINDRT_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/src/async/*.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/src/evloop/*.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/src/timer/*.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/src/thread/*.cc
    )

 add_library(mindrt_mid OBJECT ${MINDRT_SRC})
--- a/mindspore/core/mindrt/include/actor/actor.h
+++ b/mindspore/core/mindrt/include/actor/actor.h
@@ -31,6 +31,7 @@ namespace mindspore {
 class ActorBase;
 class ActorMgr;
 class ActorPolicy;
 class InterThreadPool;

 using ActorReference = std::shared_ptr<ActorBase>;

@@ -78,6 +79,8 @@ class ActorBase {
  // delete the send/receive message package size
  void DelRuleUdp(const std::string &peer, bool outputLog);

  void set_thread_pool(InterThreadPool *pool) { pool_ = pool; }

 protected:
  using ActorFunction = std::function<void(const std::unique_ptr<MessageBase> &msg)>;

@@ -146,7 +149,7 @@ class ActorBase {

 private:
  friend class ActorMgr;
  friend class ActorThread;
  friend class InterThreadPool;

  // KMSG Msg Handler
  virtual void HandlekMsg(const std::unique_ptr<MessageBase> &msg);
@@ -194,6 +197,7 @@ class ActorBase {
  void SetRunningStatus(bool start);

  std::unique_ptr<ActorPolicy> actorThread;
  InterThreadPool *pool_{nullptr};

  AID id;
  std::map<std::string, ActorFunction> actionFunctions;
--- a/mindspore/core/mindrt/include/mindrt.hpp
+++ b/mindspore/core/mindrt/include/mindrt.hpp
@@ -33,9 +33,6 @@ struct MindrtAddress {
 int Initialize(const std::string &tcpUrl, const std::string &tcpUrlAdv = "", const std::string &udpUrl = "",
               const std::string &udpUrlAdv = "", int threadCount = 0);

 // brief terminate the threads for current session
 void TerminateCurThreads(int threadCount = 0);

 // brief spawn a process to run an actor
 AID Spawn(ActorReference actor, bool sharedThread = true, bool start = true);

--- a/mindspore/core/mindrt/src/actor/actormgr.cc
+++ b/mindspore/core/mindrt/src/actor/actormgr.cc
@@ -98,16 +98,11 @@ void ActorMgr::TerminateAll() {
  }
 }

 void ActorMgr::Initialize(int threadCount) { threadPool.AddThread(threadCount); }

 void ActorMgr::TerminateCurThreads(int threadCount) { threadPool.TerminateThread(threadCount); }

 void ActorMgr::Finalize() {
  this->TerminateAll();
  MS_LOG(INFO) << "mindrt Actors finish exiting.";

  // stop all actor threads;
  threadPool.Finalize();
  MS_LOG(INFO) << "mindrt Threads finish exiting.";

  // stop iomgr thread
@@ -115,7 +110,6 @@ void ActorMgr::Finalize() {
    MS_LOG(INFO) << "finalize IOMgr=" << mgrIt->first.c_str();
    mgrIt->second->Finish();
  }

  MS_LOG(INFO) << "mindrt IOMGRS finish exiting.";
 }

--- a/mindspore/core/mindrt/src/actor/actormgr.h
+++ b/mindspore/core/mindrt/src/actor/actormgr.h
@@ -23,7 +23,8 @@
 #include <memory>
 #include <string>

 #include "actor/actorthread.h"
 #include "actor/actor.h"
 #include "thread/inter_threadpool.h"

 namespace mindspore {

@@ -47,8 +48,7 @@ class ActorMgr {
  ~ActorMgr();

  void Finalize();
  void Initialize(int threadCount);
  void TerminateCurThreads(int threadCount);
  void Initialize() {}
  void RemoveActor(const std::string &name);
  ActorReference GetActor(const AID &id);
  const std::string GetUrl(const std::string &protocol = "tcp");
@@ -62,7 +62,14 @@ class ActorMgr {
  inline const std::string &GetDelegate() const { return delegate; }

  inline void SetDelegate(const std::string &d) { delegate = d; }
  inline void SetActorReady(const std::shared_ptr<ActorBase> &actor) { threadPool.EnqueReadyActor(actor); }
  inline void SetActorReady(const std::shared_ptr<ActorBase> &actor) const {
    auto pool = actor->pool_;
    if (pool == nullptr) {
      MS_LOG(ERROR) << "ThreadPOol is nullptr, actor: " << actor->GetAID().Name();
      return;
    }
    pool->EnqueReadyActor(actor);
  }
  void SetActorStatus(const AID &pid, bool start);

 private:
@@ -77,8 +84,6 @@ class ActorMgr {
  std::map<std::string, ActorReference> actors;
  std::mutex actorsMutex;

  ActorThread threadPool;

  std::map<std::string, std::string> procotols;
  std::set<std::string> urls;
  std::string delegate;
--- a/mindspore/core/mindrt/src/actor/actorthread.cc
+++ b/mindspore/core/mindrt/src/actor/actorthread.cc
@@ -1,138 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "actor/actorthread.h"
 #ifdef __WIN32__
 #include <windows.h>
 #else
 #include <unistd.h>
 #endif
 #include <atomic>
 #include <utility>
 #include <memory>

 namespace mindspore {
 constexpr int MAXTHREADNAMELEN = 12;

 // Returns the number of processors reported by the platform, never less than 1.
 // On POSIX, sysconf() signals failure with -1; storing that directly in a size_t would wrap to a huge value,
 // so the result is validated before it is converted.
 size_t GetMaxThreadCount() {
 #ifdef __WIN32__
  SYSTEM_INFO sys_info;
  GetSystemInfo(&sys_info);
  const size_t max_num = sys_info.dwNumberOfProcessors;
 #else
  const long online = sysconf(_SC_NPROCESSORS_ONLN);
  const size_t max_num = online > 0 ? static_cast<size_t>(online) : 0;
 #endif
  // Fall back to a single thread when the platform cannot report a usable count.
  return max_num > 0 ? max_num : 1;
 }

 // Builds an empty pool: chooses the worker-thread name prefix and caches the processor-count limit.
 ActorThread::ActorThread() : readyActors(), workers() {
  readyActors.clear();
  workers.clear();

  // MINDRT_THREAD_NAME, when set, replaces the default prefix; it is cut to MAXTHREADNAMELEN characters.
  const char *customName = getenv("MINDRT_THREAD_NAME");
  if (customName == nullptr) {
    threadName = "MINDRT_ACT";
  } else {
    threadName = customName;
    if (threadName.size() > MAXTHREADNAMELEN) {
      threadName.resize(MAXTHREADNAMELEN);
    }
  }

  maxThreads_ = GetMaxThreadCount();
 }

 // The destructor performs no work of its own; workers are joined in Finalize().
 ActorThread::~ActorThread() = default;
 void ActorThread::AddThread(int threadCount) {
  std::unique_lock<std::mutex> lock(initLock_);
  int threadsNeed = threadCount - (workers.size() - threadsInUse_);
  for (int i = 0; i < threadsNeed; ++i) {
    if (workers.size() >= maxThreads_) {
      MS_LOG(DEBUG) << "threads number in mindrt reach upper limit. maxThreads:" << maxThreads_;
      break;
    }
    std::unique_ptr<std::thread> worker(new (std::nothrow) std::thread(&ActorThread::Run, this));
    MINDRT_OOM_EXIT(worker)

    workers.push_back(std::move(worker));
    threadsInUse_ += 1;
  }
 }

 void ActorThread::TerminateThread(int threadCount) {
  // temp scheme, not actually terminate the threads when current session destructs
  threadsInUse_ -= threadCount;
 }

 void ActorThread::Finalize() {
  MS_LOG(INFO) << "Actor's threads are exiting.";
  // terminate all thread; enqueue nullptr actor to terminate;
  std::shared_ptr<ActorBase> exitActor(nullptr);
  for (auto it = workers.begin(); it != workers.end(); ++it) {
    EnqueReadyActor(exitActor);
  }
  // wait all thread to exit
  for (auto it = workers.begin(); it != workers.end(); ++it) {
    std::unique_ptr<std::thread> &worker = *it;
    if (worker->joinable()) {
      worker->join();
    }
  }
  workers.clear();
  MS_LOG(INFO) << "Actor's threads finish exiting.";
 }

 // Blocks until the ready queue is non-empty, then moves its front entry into `actor`.
 void ActorThread::DequeReadyActor(std::shared_ptr<ActorBase> &actor) {
  std::unique_lock<std::mutex> guard(readyActorMutex);
  // Re-check after every wake-up so spurious notifications are ignored.
  while (readyActors.empty()) {
    conditionVar.wait(guard);
  }
  actor = readyActors.front();
  readyActors.pop_front();
 }

 // Appends `actor` to the ready queue and wakes one waiting worker.
 void ActorThread::EnqueReadyActor(const std::shared_ptr<ActorBase> &actor) {
  std::unique_lock<std::mutex> guard(readyActorMutex);
  readyActors.push_back(actor);
  // Release the queue lock before notifying, as the woken worker needs it immediately.
  guard.unlock();
  conditionVar.notify_one();
 }

 void ActorThread::Run() {
 #if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 12
  static std::atomic<int> actorCount(1);
  int ret = pthread_setname_np(pthread_self(), (threadName + std::to_string(actorCount.fetch_add(1))).c_str());
  if (0 != ret) {
    MS_LOG(INFO) << "set pthread name fail]ret:" << ret;
  } else {
    MS_LOG(INFO) << "set pthread name success]threadID:" << pthread_self();
  }
 #endif

  bool terminate = false;
  do {
    std::shared_ptr<ActorBase> actor;
    DequeReadyActor(actor);
    if (actor != nullptr) {
      actor->Run();
    } else {
      terminate = true;
      MS_LOG(DEBUG) << "Actor this Threads have finished exiting.";
    }
  } while (!terminate);
 }

 };  // end of namespace mindspore
--- a/mindspore/core/mindrt/src/actor/actorthread.h
+++ b/mindspore/core/mindrt/src/actor/actorthread.h
@@ -1,56 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CORE_MINDRT_SRC_ACTOR_ACTORTHREAD_H
 #define MINDSPORE_CORE_MINDRT_SRC_ACTOR_ACTORTHREAD_H

 #include <condition_variable>
 #include <list>
 #include <thread>
 #include <memory>
 #include <string>

 #include "actor/actor.h"

 namespace mindspore {

 // Pool of worker threads that run actors taken from a shared ready queue.
 class ActorThread {
 public:
  ActorThread();
  ~ActorThread();
  // Queues one null actor per worker as a stop signal, joins every worker and clears the worker list.
  void Finalize();
  // Creates workers for a session asking for threadCount threads, bounded by maxThreads_.
  void AddThread(int threadCount);
  // Lowers the in-use counter by threadCount; the threads themselves are not stopped.
  void TerminateThread(int threadCount);
  // Appends actor to the ready queue and wakes one waiting worker.
  void EnqueReadyActor(const std::shared_ptr<ActorBase> &actor);

 private:
  // Worker thread body: dequeues and runs actors until a null actor is received.
  void Run();
  // Blocks until the ready queue is non-empty, then pops its front into actor.
  void DequeReadyActor(std::shared_ptr<ActorBase> &actor);

  // Actors waiting to be run; guarded by readyActorMutex and signalled through conditionVar.
  std::list<std::shared_ptr<ActorBase>> readyActors;
  std::mutex readyActorMutex;
  std::condition_variable conditionVar;

  // Worker threads created by AddThread().
  std::list<std::unique_ptr<std::thread>> workers;
  // Name prefix for worker threads; a sequence number is appended in Run().
  std::string threadName;

  // Number of workers counted as in use by sessions.
  size_t threadsInUse_ = 0;
  // Upper bound on workers.size(), set from GetMaxThreadCount() in the constructor.
  size_t maxThreads_;
  // Held by AddThread() while it creates workers.
  std::mutex initLock_;
 };

 };  // end of namespace mindspore
 #endif
--- a/mindspore/core/mindrt/src/mindrt.cc
+++ b/mindspore/core/mindrt/src/mindrt.cc
@@ -57,10 +57,6 @@ const MindrtAddress &GetMindrtAddress() {
  return *local::g_mindrtAddress;
 }

 void SetThreadCount(int threadCount) { ActorMgr::GetActorMgrRef()->Initialize(threadCount); }

 void TerminateCurThreads(int threadCount) { ActorMgr::GetActorMgrRef()->TerminateCurThreads(threadCount); }

 class MindrtExit {
 public:
  MindrtExit() { MS_LOG(DEBUG) << "trace: enter MindrtExit()---------"; }
@@ -73,10 +69,7 @@ class MindrtExit {
 int InitializeImp(const std::string &tcpUrl, const std::string &tcpUrlAdv, const std::string &udpUrl,
                  const std::string &udpUrlAdv, int threadCount) {
  MS_LOG(DEBUG) << "mindrt starts ......";

  // start actor's thread
  SetThreadCount(threadCount);

  ActorMgr::GetActorMgrRef()->Initialize();
  MS_LOG(DEBUG) << "mindrt has started.";
  return MINDRT_OK;
 }
--- a/mindspore/core/mindrt/src/thread/core_affinity.cc
+++ b/mindspore/core/mindrt/src/thread/core_affinity.cc
@@ -19,9 +19,6 @@
 #include <stdlib.h>
 #include <string>
 #include <algorithm>
 #ifdef __ANDROID__
 #include <sched.h>
 #endif
 #ifdef MS_COMPILE_IOS
 #include <sys/types.h>
 #include <sys/sysctl.h>
@@ -29,7 +26,6 @@
 #endif  // MS_COMPILE_IOS
 #include "thread/threadpool.h"

 #ifdef BIND_CORE
 namespace mindspore {

 #define MAX_PATH_SIZE (256)
@@ -240,6 +236,8 @@ int CoreAffinity::SortCPUProcessors() {
      }
    }
  }
  higher_num_ = 0;
  sorted_id_.clear();
  int max_freq = freq_set.front().max_freq;
  for (const auto &info : freq_set) {
    THREAD_INFO("sorted core id: %d, max frequency: %d, arch: %d", info.core_id, info.max_freq, info.arch);
@@ -274,14 +272,15 @@ int CoreAffinity::InitBindCoreId(size_t thread_num, BindMode bind_mode) {
  return THREAD_OK;
 }

 int CoreAffinity::SetAffinity(pthread_t thread_id, cpu_set_t *cpuSet) const {
 #ifdef BIND_CORE
 int CoreAffinity::SetAffinity(const pthread_t &thread_id, cpu_set_t *cpu_set) const {
 #ifdef __ANDROID__
 #if __ANDROID_API__ >= 21
  THREAD_INFO("thread: %d, mask: %lu", pthread_gettid_np(thread_id), cpuSet->__bits[0]);
  int ret = sched_setaffinity(pthread_gettid_np(thread_id), sizeof(cpu_set_t), cpuSet);
  THREAD_INFO("thread: %d, mask: %lu", pthread_gettid_np(thread_id), cpu_set->__bits[0]);
  int ret = sched_setaffinity(pthread_gettid_np(thread_id), sizeof(cpu_set_t), cpu_set);
  if (ret != THREAD_OK) {
    THREAD_ERROR("bind thread %d to cpu failed. ERROR %d", pthread_gettid_np(thread_id), ret);
    return THREAD_OK;
    return THREAD_ERROR;
  }
 #endif
 #else
@@ -289,7 +288,7 @@ int CoreAffinity::SetAffinity(pthread_t thread_id, cpu_set_t *cpuSet) const {
  THREAD_ERROR("not bind thread to apple's cpu.");
  return THREAD_ERROR;
 #else
  int ret = pthread_setaffinity_np(thread_id, sizeof(cpu_set_t), cpuSet);
  int ret = pthread_setaffinity_np(thread_id, sizeof(cpu_set_t), cpu_set);
  if (ret != THREAD_OK) {
    THREAD_ERROR("set thread: %lu to cpu failed", thread_id);
    return THREAD_ERROR;
@@ -298,8 +297,10 @@ int CoreAffinity::SetAffinity(pthread_t thread_id, cpu_set_t *cpuSet) const {
 #endif
  return THREAD_OK;
 }
 #endif  // BIND_CORE

 int CoreAffinity::FreeScheduleThreads(const std::vector<Worker *> &workers) const {
 #ifdef BIND_CORE
  if (thread_num_ != workers.size()) {
    return THREAD_ERROR;
  }
@@ -315,11 +316,13 @@ int CoreAffinity::FreeScheduleThreads(const std::vector<Worker *> &workers) cons
      return THREAD_ERROR;
    }
  }
 #endif  // BIND_CORE
  return THREAD_OK;
 }

 int CoreAffinity::BindThreadsToCoreList(const std::vector<Worker *> &workers) const {
  if (bind_id_.size() != thread_num_) {
 #ifdef BIND_CORE
  if (thread_num_ != workers.size()) {
    THREAD_ERROR("invalid core list");
    return THREAD_ERROR;
  }
@@ -330,14 +333,32 @@ int CoreAffinity::BindThreadsToCoreList(const std::vector<Worker *> &workers) co
    // affinity mask determines the CPU core which it is eligible to run
    int ret = SetAffinity(workers[i]->thread.native_handle(), &mask);
    if (ret != THREAD_OK) {
      THREAD_ERROR("set thread[%zu] affinity failed", i);
      THREAD_ERROR("set thread[%zu] affinity to core[%d] failed", i, bind_id_[i]);
      return THREAD_ERROR;
    }
    THREAD_INFO("bind thread[%zu] success", i);
    THREAD_ERROR("set thread[%zu] affinity to core[%d] success", i, bind_id_[i]);
  }
 #endif  // BIND_CORE
  return THREAD_OK;
 }

 // Sets the CPU affinity of the calling thread from the core ids in bind_id_.
 // With Power_NoBind the mask contains every id in bind_id_; with any other mode it contains only the first id.
 // Returns the SetAffinity() result, THREAD_ERROR when bind_id_ is empty, or THREAD_OK when BIND_CORE is not
 // defined (binding is then a no-op).
 int CoreAffinity::BindProcess(BindMode bind_mode) const {
 #ifdef BIND_CORE
  // front() on an empty list is undefined, and the no-bind branch would build a mask with no CPU in it.
  if (bind_id_.empty()) {
    THREAD_ERROR("bind core id list is empty");
    return THREAD_ERROR;
  }
  cpu_set_t mask;
  CPU_ZERO(&mask);
  if (bind_mode != Power_NoBind) {
    CPU_SET(bind_id_.front(), &mask);
  } else {
    for (int id : bind_id_) {
      CPU_SET(id, &mask);
    }
  }
  return SetAffinity(pthread_self(), &mask);
 #else
  return THREAD_OK;
 #endif  // BIND_CORE
 }

 int CoreAffinity::BindThreads(const std::vector<Worker *> &workers, BindMode bind_mode) const {
  if (bind_mode == Power_NoBind) {
    return FreeScheduleThreads(workers);
@@ -351,5 +372,3 @@ int CoreAffinity::BindThreads(const std::vector<Worker *> &workers, const std::v
  return BindThreadsToCoreList(workers);
 }
 }  // namespace mindspore

 #endif  // BIND_CORE
--- a/mindspore/core/mindrt/src/thread/core_affinity.h
+++ b/mindspore/core/mindrt/src/thread/core_affinity.h
@@ -18,28 +18,41 @@
 #define MINDSPORE_CORE_MINDRT_RUNTIME_CORE_AFFINITY_H_

 #include <vector>
 #include "thread/threadpool.h"
 #ifdef BIND_CORE
 #include <thread>

 #ifdef __ANDROID__
 #define BIND_CORE
 #include <sched.h>
 #endif

 namespace mindspore {

 // Thread-to-core binding policy passed to CoreAffinity and the thread pools.
 enum BindMode {
  Power_NoBind = 0,  // free schedule
  Power_Higher = 1,  // NOTE(review): presumably binds to the higher-frequency cores — confirm
  Power_Middle = 2,  // NOTE(review): presumably binds to the mid-frequency cores — confirm
 };

 struct Worker;
 class CoreAffinity {
 public:
  static CoreAffinity *GetInstance() {
    static CoreAffinity affinity;
    return &affinity;
  }
  CoreAffinity() = default;
  ~CoreAffinity() = default;

  int InitBindCoreId(size_t thread_num, BindMode bind_mode);

  int BindThreads(const std::vector<Worker *> &workers, const std::vector<int> &core_list);
  int BindThreads(const std::vector<Worker *> &workers, BindMode bind_mode) const;
  int BindProcess(BindMode bind_mode) const;

 private:
  CoreAffinity() = default;
  ~CoreAffinity() = default;
 #ifdef BIND_CORE
  int SetAffinity(const pthread_t &thread_id, cpu_set_t *cpu_set) const;
 #endif  // BIND_CORE

  int BindThreadsToCoreList(const std::vector<Worker *> &workers) const;
  int FreeScheduleThreads(const std::vector<Worker *> &workers) const;
  int SetAffinity(pthread_t thread_id, cpu_set_t *cpuSet) const;

  int SortCPUProcessors();

  // bind_id contains the CPU cores to bind
@@ -52,7 +65,7 @@ class CoreAffinity {
  size_t higher_num_{0};
  size_t thread_num_{0};
 };

 }  // namespace mindspore

 #endif  // BIND_CORE
 #endif  // MINDSPORE_CORE_MINDRT_RUNTIME_CORE_AFFINITY_H_
--- a/mindspore/core/mindrt/src/thread/inter_threadpool.cc
+++ b/mindspore/core/mindrt/src/thread/inter_threadpool.cc
@@ -20,6 +20,11 @@
 namespace mindspore {

 InterThreadPool::~InterThreadPool() {
  {
    THREAD_INFO("wait util actor queue is empty");
    std::unique_lock<std::mutex> _l(actor_mutex_);
    finish_cond_var_.wait(_l, [this]() { return actor_queue_.empty(); });
  }
  exit_ = true;
  alive_ = false;
  actor_cond_var_.notify_all();
@@ -38,15 +43,12 @@ void InterThreadPool::ActorThreadRun() {
    actor_queue_.pop();
  }
  actor->Run();
  finish_cond_var_.notify_one();
 }

 void InterThreadPool::ThreadAsyncRun(size_t thread_id) {
  {
    std::unique_lock<std::mutex> _l(pool_mutex_);
    start_cond_.wait(_l, [this]() { return workers_.size() == thread_num_; });
  }
  Worker *worker = workers_[thread_id];
 void InterThreadPool::ThreadAsyncRun(Worker *worker) {
  THREAD_RETURN_IF_NULL(worker);
  sem_post(&worker->init);
  while (alive_) {
    if (worker->type == kKernelThread) {
      KernelThreadRun(worker);
@@ -78,7 +80,7 @@ InterThreadPool *InterThreadPool::CreateThreadPool(size_t inter_thread_num, size
    return nullptr;
  }
 #ifdef BIND_CORE
  ret = CoreAffinity::GetInstance()->InitBindCoreId(thread_num, bind_mode);
  ret = pool->InitAffinityInfo(bind_mode);
  if (ret != THREAD_OK) {
    delete pool;
    return nullptr;
--- a/mindspore/core/mindrt/src/thread/inter_threadpool.h
+++ b/mindspore/core/mindrt/src/thread/inter_threadpool.h
@@ -39,12 +39,13 @@ class InterThreadPool : public ThreadPool {
 private:
  explicit InterThreadPool(size_t inter_thread_num) { inter_thread_num_ = inter_thread_num; }

  void ThreadAsyncRun(size_t thread_id) override;
  void ThreadAsyncRun(Worker *worker) override;

  void ActorThreadRun();

  std::mutex actor_mutex_;
  std::condition_variable actor_cond_var_;
  std::condition_variable finish_cond_var_;
  std::queue<ActorReference> actor_queue_;

  std::atomic_bool exit_{false};
--- a/mindspore/core/mindrt/src/thread/threadpool.cc
+++ b/mindspore/core/mindrt/src/thread/threadpool.cc
@@ -24,12 +24,11 @@ namespace mindspore {
 constexpr int kDefaultSpinCount = 30000;

 ThreadPool::~ThreadPool() {
  alive_ = false;
  alive_.store(false);
  DestructThreads();
 }

 void ThreadPool::DestructThreads() {
  std::lock_guard<std::mutex> lock(pool_mutex_);
  for (auto &worker : workers_) {
    sem_post(&worker->sem);
    if (worker->thread.joinable()) {
@@ -39,12 +38,15 @@ void ThreadPool::DestructThreads() {
    delete worker;
    worker = nullptr;
  }
  THREAD_INFO("deconstruct threads success");
  workers_.clear();
  if (affinity_ != nullptr) {
    delete affinity_;
    affinity_ = nullptr;
  }
  THREAD_INFO("deconstruct threads success");
 }

 int ThreadPool::CreateThreads(size_t thread_num) {
  std::lock_guard<std::mutex> lock(pool_mutex_);
  size_t core_num = std::thread::hardware_concurrency();
  thread_num_ = std::min(thread_num, core_num);
  if (thread_num_ <= 0) {
@@ -54,14 +56,17 @@ int ThreadPool::CreateThreads(size_t thread_num) {
  for (size_t i = 0; i < thread_num_; ++i) {
    Worker *worker = new (std::nothrow) Worker();
    THREAD_ERROR_IF_NULL(worker);
    worker->type = i < inter_thread_num_ ? kActorThread : kKernelThread;
    worker->thread = std::thread(&ThreadPool::ThreadAsyncRun, this, i);
    sem_init(&worker->sem, 0, 0);
    sem_init(&worker->init, 0, 0);
    worker->type = i < inter_thread_num_ ? kActorThread : kKernelThread;
    if (worker->type == kKernelThread) {
      freelist_.push_back(worker);
    }
    worker->thread = std::thread(&ThreadPool::ThreadAsyncRun, this, worker);
    sem_wait(&worker->init);
    workers_.push_back(worker);
    THREAD_INFO("create thread[%zu]", i);
  }
  freelist_.insert(freelist_.begin(), workers_.begin() + inter_thread_num_, workers_.end());
  start_cond_.notify_all();
  return THREAD_OK;
 }

@@ -89,20 +94,16 @@ void ThreadPool::KernelThreadRun(Worker *worker) {
  }
 }

 void ThreadPool::ThreadAsyncRun(size_t thread_id) {
  {
    // wait for all threads to be created
    std::unique_lock<std::mutex> _l(pool_mutex_);
    start_cond_.wait(_l, [this]() { return workers_.size() == thread_num_; });
  }
  Worker *worker = workers_[thread_id];
 void ThreadPool::ThreadAsyncRun(Worker *worker) {
  THREAD_RETURN_IF_NULL(worker);
  sem_post(&worker->init);
  while (alive_) {
    KernelThreadRun(worker);
  }
 }

 int ThreadPool::ParallelLaunch(const Func &func, Contend contend, int task_num) {
  THREAD_INFO("parallel launch, task num: %d", task_num);
  // distribute task to the KernelThread and the free ActorThread,
  // if the task num is greater than the KernelThread num
  Task task = Task(func, contend);
@@ -136,12 +137,25 @@ void ThreadPool::DistributeTask(Task *task, int task_num) {
  }
 }

 // Allocates this pool's CoreAffinity helper and calls InitBindCoreId(thread_num_, bind_mode) on it.
 // Returns THREAD_OK on success. If initialisation fails, the helper is deleted, affinity_ is reset to nullptr
 // and THREAD_ERROR is returned.
 int ThreadPool::InitAffinityInfo(BindMode bind_mode) {
  affinity_ = new (std::nothrow) CoreAffinity();
  THREAD_ERROR_IF_NULL(affinity_);
  if (affinity_->InitBindCoreId(thread_num_, bind_mode) == THREAD_OK) {
    return THREAD_OK;
  }
  // Initialisation failed: do not keep a half-configured helper around.
  delete affinity_;
  affinity_ = nullptr;
  return THREAD_ERROR;
 }

 int ThreadPool::SetCpuAffinity(BindMode bind_mode) {
  if (workers_.empty()) {
    return THREAD_ERROR;
  }
 #ifdef BIND_CORE
  return CoreAffinity::GetInstance()->BindThreads(workers_, bind_mode);
  THREAD_ERROR_IF_NULL(affinity_);
  return affinity_->BindThreads(workers_, bind_mode);
 #else
  return THREAD_OK;
 #endif  // BIND_CORE
@@ -152,7 +166,17 @@ int ThreadPool::SetCpuAffinity(const std::vector<int> &core_list) {
    return THREAD_ERROR;
  }
 #ifdef BIND_CORE
  return CoreAffinity::GetInstance()->BindThreads(workers_, core_list);
  THREAD_ERROR_IF_NULL(affinity_);
  return affinity_->BindThreads(workers_, core_list);
 #else
  return THREAD_OK;
 #endif  // BIND_CORE
 }

 int ThreadPool::SetProcessAffinity(BindMode bind_mode) const {
 #ifdef BIND_CORE
  THREAD_ERROR_IF_NULL(affinity_);
  return affinity_->BindProcess(bind_mode);
 #else
  return THREAD_OK;
 #endif  // BIND_CORE
@@ -169,7 +193,7 @@ ThreadPool *ThreadPool::CreateThreadPool(size_t thread_num, BindMode bind_mode)
    return nullptr;
  }
 #ifdef BIND_CORE
  ret = CoreAffinity::GetInstance()->InitBindCoreId(thread_num, bind_mode);
  ret = pool->InitAffinityInfo(bind_mode);
  if (ret != THREAD_OK) {
    delete pool;
    return nullptr;
--- a/mindspore/core/mindrt/src/thread/threadpool.h
+++ b/mindspore/core/mindrt/src/thread/threadpool.h
@@ -25,10 +25,7 @@
 #include <condition_variable>
 #include <mutex>
 #include <new>

 #ifdef __ANDROID__
 #define BIND_CORE
 #endif
 #include "thread/core_affinity.h"

 namespace mindspore {

@@ -60,12 +57,6 @@ namespace mindspore {
 enum ThreadRet { THREAD_OK = 0, THREAD_ERROR = 1 };
 enum ThreadType { kActorThread = 0, kKernelThread = 1 };

 enum BindMode {
  Power_NoBind = 0,  // free schedule
  Power_Higher = 1,
  Power_Middle = 2,
 };

 using Func = int (*)(void *arg, int);
 using Contend = void *;

@@ -83,6 +74,7 @@ typedef struct Worker {
  std::atomic_int type{kActorThread};
  Task *task{nullptr};
  sem_t sem;
  sem_t init;
  int spin{0};
 } Worker;

@@ -96,6 +88,8 @@ class ThreadPool {
  int SetCpuAffinity(const std::vector<int> &core_list);
  int SetCpuAffinity(BindMode bind_mode);

  int SetProcessAffinity(BindMode bind_mode) const;

  int ParallelLaunch(const Func &func, Contend contend, int task_num);

 protected:
@@ -104,13 +98,14 @@ class ThreadPool {
  int CreateThreads(size_t thread_num);
  void DestructThreads();

  virtual void ThreadAsyncRun(size_t thread_id);
  int InitAffinityInfo(BindMode bind_mode);

  virtual void ThreadAsyncRun(Worker *worker);
  void KernelThreadRun(Worker *worker);

  void DistributeTask(Task *task, int task_num);

  std::mutex pool_mutex_;
  std::condition_variable start_cond_;

  std::vector<Worker *> workers_;
  std::vector<Worker *> freelist_;
@@ -118,6 +113,8 @@ class ThreadPool {

  size_t inter_thread_num_{0};
  size_t thread_num_{1};

  CoreAffinity *affinity_{nullptr};
 };

 }  // namespace mindspore
--- a/mindspore/lite/CMakeLists.txt
+++ b/mindspore/lite/CMakeLists.txt
@@ -130,10 +130,6 @@ if(ENABLE_CONVERTER OR BUILD_MINDDATA STREQUAL "full" OR BUILD_MINDDATA STREQUAL
    include(${TOP_DIR}/cmake/external_libs/json.cmake)
 endif()

 if(SUPPORT_TRAIN OR WIN32)
    set(ENABLE_MINDRT "off")
 endif()

 if(DEFINED ARCHS)
    add_definitions(-DMS_COMPILE_IOS)
 endif()
@@ -210,6 +206,7 @@ endif()

 if(ENABLE_MINDRT)
    include_directories(${CORE_DIR}/mindrt/include)
    include_directories(${CORE_DIR}/mindrt/src)
 endif()

 if(NOT WIN32 AND NOT APPLE)
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -54,7 +54,6 @@ set(LITE_SRC
        ${CMAKE_CURRENT_SOURCE_DIR}/common/quant_utils.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/runtime/allocator.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/runtime/runtime_api.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/runtime/thread_pool.c
        ${CMAKE_CURRENT_SOURCE_DIR}/runtime/infer_manager.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/tensor.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/ms_tensor.cc
--- a/mindspore/lite/src/executor.cc
+++ b/mindspore/lite/src/executor.cc
@@ -23,8 +23,12 @@ namespace mindspore::lite {
 int Executor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                  const std::vector<kernel::LiteKernel *> &kernels, mindspore::Allocator *allocator,
                  const KernelCallBack &before, const KernelCallBack &after) {
  InterThreadPool *thread_pool = ctx_->thread_pool_;
  if (thread_pool == nullptr) {
    return RET_ERROR;
  }
  CpuBindMode cpu_bind_mode = ctx_->device_list_.front().device_info_.cpu_device_info_.cpu_bind_mode_;
  BindThreads(ctx_->thread_pool_, true, cpu_bind_mode);
  thread_pool->SetCpuAffinity(static_cast<BindMode>(cpu_bind_mode));

  MS_ASSERT(nullptr != allocator);
  auto ret = CheckTensorsInvalid(in_tensors);
@@ -59,8 +63,7 @@ int Executor::Run(const std::vector<Tensor *> &in_tensors, const std::vector<Ten
      }
    }
  }

  BindThreads(ctx_->thread_pool_, false, cpu_bind_mode);
  thread_pool->SetCpuAffinity(static_cast<BindMode>(NO_BIND));
  return RET_OK;
 }
 }  // namespace mindspore::lite
--- a/mindspore/lite/src/inner_context.cc
+++ b/mindspore/lite/src/inner_context.cc
@@ -72,9 +72,9 @@ int InnerContext::Init() {
    return RET_NOT_SUPPORT;
  }
  if (this->thread_pool_ == nullptr && this->IsCpuEnabled()) {
    this->thread_pool_ =
      CreateLiteThreadPool(this->thread_num_, this->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_);
    if (this->thread_pool_ == nullptr) {
    thread_pool_ = InterThreadPool::CreateThreadPool(
      1, this->thread_num_, static_cast<BindMode>(this->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_));
    if (thread_pool_ == nullptr) {
      MS_LOG(ERROR) << "Create ThreadPool failed";
      return RET_NULL_PTR;
    }
@@ -110,8 +110,7 @@ int InnerContext::Init() {

 InnerContext::~InnerContext() {
  if (this->thread_pool_ != nullptr) {
    DestroyThreadPool(this->thread_pool_);
    free(this->thread_pool_);
    delete thread_pool_;
    this->thread_pool_ = nullptr;
  }
 #ifdef ENABLE_ARM
--- a/mindspore/lite/src/inner_context.h
+++ b/mindspore/lite/src/inner_context.h
@@ -21,6 +21,7 @@
 #include "include/context.h"
 #include "src/runtime/runtime_api.h"
 #include "src/runtime/allocator.h"
 #include "thread/inter_threadpool.h"
 #ifdef ENABLE_ARM
 #include "src/cpu_info.h"
 #endif
@@ -31,7 +32,7 @@
 namespace mindspore::lite {
 struct InnerContext : public Context {
 public:
  struct ThreadPool *thread_pool_ = nullptr;
  InterThreadPool *thread_pool_{nullptr};

 public:
  InnerContext() = default;
--- a/mindspore/lite/src/lite_mindrt.cc
+++ b/mindspore/lite/src/lite_mindrt.cc
@@ -199,6 +199,11 @@ int LiteOpActor::PrepareOutputData() {
 std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel::LiteKernel *> &kernels) {
  std::vector<std::shared_ptr<LiteOpActor>> actors;
  std::unordered_map<size_t, AID> partial_map{};
  auto thread_pool = kernels[0]->Context()->thread_pool_;
  if (thread_pool == nullptr) {
    MS_LOG(ERROR) << "thread pool is nullptr";
    return actors;
  }
  for (size_t i = 0; i < kernels.size(); ++i) {
    if ((kernel::LiteKernelUtil::IsSwitchCall(kernels[i]))) {
      auto switch_actor = std::make_shared<LiteSwitchOpActor>(kernels[i]);
@@ -207,6 +212,7 @@ std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel
        actors.clear();
        return actors;
      }
      switch_actor->set_thread_pool(thread_pool);
      partial_map[i] = switch_actor->GetAID();
      actors.push_back(switch_actor);
    } else {
@@ -216,6 +222,7 @@ std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel
        actors.clear();
        return actors;
      }
      actor->set_thread_pool(thread_pool);
      partial_map[i] = actor->GetAID();
      actors.push_back(actor);
    }
@@ -423,7 +430,6 @@ void MindrtTerminate(const std::vector<std::shared_ptr<LiteOpActor>> &actor_list
  for (const auto &actor : actor_list) {
    mindspore::Terminate(actor->GetAID());
  }
  mindspore::TerminateCurThreads(1);
 }

 }  // namespace mindspore::lite
--- a/mindspore/lite/src/lite_mindrt.h
+++ b/mindspore/lite/src/lite_mindrt.h
@@ -47,8 +47,13 @@ class LiteOpActor : public OpActor<lite::Tensor> {
      return;
    }

    InterThreadPool *thread_pool = kernel_->Context()->thread_pool_;
    if (thread_pool == nullptr) {
      MS_LOG(ERROR) << "ThreadPool is nullptr, kernel: " << kernel_->name();
      return;
    }
    CpuBindMode cpu_bind_mode = kernel_->Context()->device_list_.front().device_info_.cpu_device_info_.cpu_bind_mode_;
    BindThreads(static_cast<const lite::InnerContext *>(kernel_->Context())->thread_pool_, true, cpu_bind_mode);
    thread_pool->SetCpuAffinity(static_cast<BindMode>(cpu_bind_mode));

    int ret = CheckInputData();
    if (ret != RET_OK) {
@@ -78,7 +83,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
    inputs_data_.clear();
    AsyncOutput(context);

    BindThreads(static_cast<const lite::InnerContext *>(kernel_->Context())->thread_pool_, false, cpu_bind_mode);
    thread_pool->SetCpuAffinity(static_cast<BindMode>(NO_BIND));
    SetOutputData(context);

    for (auto &input_data : inputs_data_) {
--- a/mindspore/lite/src/lite_session.cc
+++ b/mindspore/lite/src/lite_session.cc
@@ -570,11 +570,16 @@ int LiteSession::Init(const Context *context) {
    is_running_.store(false);
    return ret;
  }
  BindThreads(context_->thread_pool_, true,
              context_->device_list_.front().device_info_.cpu_device_info_.cpu_bind_mode_);
  CpuBindMode cpu_bind_mode = this->context_->device_list_.front().device_info_.cpu_device_info_.cpu_bind_mode_;
  InterThreadPool *thread_pool = this->context_->thread_pool_;
  if (thread_pool == nullptr) {
    MS_LOG(ERROR) << "thread pool is nullptr";
    is_running_.store(false);
    return RET_NULL_PTR;
  }
  thread_pool->SetProcessAffinity(static_cast<BindMode>(cpu_bind_mode));
  ret = InitGPURuntime();
  BindThreads(context_->thread_pool_, false,
              context_->device_list_.front().device_info_.cpu_device_info_.cpu_bind_mode_);
  thread_pool->SetProcessAffinity(static_cast<BindMode>(NO_BIND));
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init GPU runtime failed.";
    is_running_.store(false);
@@ -614,7 +619,6 @@ LiteSession::~LiteSession() {
  output_node_map_.clear();
  output_tensor_map_.clear();
  input_vec_.clear();
  delete this->context_;
  delete this->executor_;
  this->executor_ = nullptr;
 #if SUPPORT_NPU
@@ -628,6 +632,8 @@ LiteSession::~LiteSession() {
 #if GPU_OPENCL
  delete opencl_runtime_wrapper_;
 #endif
  delete this->context_;
  this->context_ = nullptr;
  delete (model_);
  is_running_.store(false);
 }
--- a/mindspore/lite/src/runtime/kernel/arm/base/constant_of_shape.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/constant_of_shape.cc
@@ -73,8 +73,8 @@ int ConstantOfShapeCPUKernel::Run() {
  int thread_count = MSMIN(op_parameter_->thread_num_, param_->element_size_);
  thread_stride_ = UP_DIV(param_->element_size_, thread_count);

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConstantOfShapeRun,
                            this, thread_count);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ConstantOfShapeRun, this, thread_count);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConstantOfShapeRun error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/base/detection_post_process_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/detection_post_process_base.cc
@@ -236,8 +236,8 @@ int DetectionPostProcessBaseCPUKernel::Run() {
      return status;
    }
  } else {
    status = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                            NmsMultiClassesFastCoreRun, this, op_parameter_->thread_num_);
    status = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(NmsMultiClassesFastCoreRun, this, op_parameter_->thread_num_);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "NmsMultiClassesFastCoreRun error error_code[" << status << "]";
      FreeAllocatedBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc
@@ -166,8 +166,8 @@ int RunPriorBox(void *cdata, int task_id) {
 }

 int PriorBoxCPUKernel::Run() {
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, RunPriorBox,
                                  this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(RunPriorBox, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "PriorBox run error, error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc
@@ -172,8 +172,8 @@ int QuantDTypeCastCPUKernel::Run() {
    uint8_ptr_ = reinterpret_cast<uint8_t *>(out_tensors_[0]->data_c());
  }

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, QuantDTypeCastRun,
                            this, thread_n_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(QuantDTypeCastRun, this, thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    if (in_tensors_[0]->data_type() == TypeId::kNumberTypeInt8 &&
--- a/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reshape_base.cc
@@ -66,8 +66,8 @@ int ReshapeRun(void *cdata, int task_id) {
 int ReshapeBaseCPUKernel::Run() {
  input_ptr_ = reinterpret_cast<uint8_t *>(in_tensors_.at(kInputIndex)->data_c());
  output_ptr_ = reinterpret_cast<uint8_t *>(out_tensors_.at(kOutputIndex)->data_c());
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ReshapeRun, this,
                            context_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ReshapeRun, this, context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Reshape run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/base/slice_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/slice_base.cc
@@ -82,8 +82,8 @@ int SliceCPUKernel::Run() {
                      lite::DataTypeSize(in_tensors_.at(0)->data_type()));
    return RET_OK;
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SliceLaunch, this,
                            op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(SliceLaunch, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "slice launch fail!ret: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/split_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/split_base.cc
@@ -126,8 +126,8 @@ int SplitBaseCPUKernel::Run() {
    output_ptr_.at(i) = output_tensor->data_c();
  }

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SplitRun, this,
                            thread_n_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(SplitRun, this, thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "split error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/base/split_with_over_lap_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/split_with_over_lap_base.cc
@@ -118,8 +118,8 @@ int SplitWithOverlapBaseCPUKernel::Run() {
    inner_stride_ *= input_shape[i];
  }

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SplitWithOverlapRun,
                            this, param_->num_split_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(SplitWithOverlapRun, this, param_->num_split_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ParallelLaunch for SplitWIthOverlapRun run fail. errorcode:[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/stack_base.cc
@@ -101,7 +101,7 @@ int StackBaseCPUKernel::Run() {
  // run stack
  num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_);
  auto ret =
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, StackRun, this, num_threads_);
    static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(StackRun, this, num_threads_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/strided_slice.cc
@@ -163,8 +163,8 @@ int StridedSliceCPUKernel::FastRun() {
  }
  input_ptr_ = reinterpret_cast<uint8_t *>(in_tensors_.front()->data_c());
  output_ptr_ = reinterpret_cast<uint8_t *>(out_tensors_.front()->data_c());
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, StrideRun, this,
                            context_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(StrideRun, this, context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Stride run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/base/tile_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/tile_base.cc
@@ -128,8 +128,8 @@ int TileCPUKernel::SimpleTileImpl(int task_id) {
 }

 int TileCPUKernel::RunSimpleTile() {
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SimpleTile, this,
                            context_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(SimpleTile, this, context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "RunSimpleTile error code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc
@@ -100,8 +100,8 @@ int ActivationFp16CPUKernel::Run() {
  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());

  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ActivationFp16Run, this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ActivationFp16Run, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_compare_fp16.cc
@@ -168,8 +168,8 @@ int ArithmeticCompareFP16CPUKernel::Run() {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticsRunFp16,
                            this, context_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ArithmeticsRunFp16, this, context_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticsRunFp16 run error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc
@@ -178,8 +178,8 @@ int ArithmeticFP16CPUKernel::Run() {
    FreeFp16Buffer();
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticsRun, this,
                            context_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ArithmeticsRun, this, context_->thread_num_);
  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
    Float16ToFloat32(static_cast<float16_t *>(output_ptr_), reinterpret_cast<float *>(output_tensor->MutableData()),
                     output_tensor->ElementsNum());
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_self_fp16.cc
@@ -83,8 +83,8 @@ int ArithmeticSelfFp16CPUKernel::Run() {
  }
  output_fp16_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticSelfRun,
                            this, op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ArithmeticSelfRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc
@@ -59,8 +59,8 @@ int BatchnormFp16CPUKernel::Run() {
    return RET_ERROR;
  }

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, BatchNormRun, this,
                            op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(BatchNormRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc
@@ -132,8 +132,8 @@ int CastFp16CPUKernel::Run() {
  if (data_num_ == 0) {
    return RET_OK;
  }
  return ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CastFp16Run, this,
                        op_parameter_->thread_num_);
  return static_cast<const lite::InnerContext *>(this->context_)
    ->thread_pool_->ParallelLaunch(CastFp16Run, this, op_parameter_->thread_num_);
 }

 REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Cast, LiteKernelCreator<CastFp16CPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -236,16 +236,16 @@ int Convolution1x1FP16CPUKernel::Run() {

    int ret = RET_ERROR;
    if (multi_thread_by_hw_) {
      ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                           Convolution1x1Fp16RunHw, this, thread_count_);
      ret = static_cast<const lite::InnerContext *>(this->context_)
              ->thread_pool_->ParallelLaunch(Convolution1x1Fp16RunHw, this, thread_count_);
    } else {
 #ifdef ENABLE_ARM64
      RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
 #else
      RowMajor2Col12MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
 #endif
      ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                           Convolution1x1Fp16RunOc, this, thread_count_);
      ret = static_cast<const lite::InnerContext *>(this->context_)
              ->thread_pool_->ParallelLaunch(Convolution1x1Fp16RunOc, this, thread_count_);
    }
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "ParallelLaunch failed.";
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -104,8 +104,8 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
 }

 int ConvolutionDepthwiseFp16CPUKernel::Run() {
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwFp16Run, this,
                            conv_param_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@@ -155,8 +155,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
    packed_output_ = output_ptr;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwSWFp16Run, this,
                       conv_param_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ConvDwSWFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -144,8 +144,8 @@ int ConvolutionFP16CPUKernel::Run() {
    return RET_ERROR;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvolutionFp16Impl, this,
                       thread_count_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ConvolutionFp16Impl, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -213,8 +213,8 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
    return RET_ERROR;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                       ConvolutionWinogradFp16Impl, this, thread_count_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/crop_fp16.cc
@@ -53,8 +53,8 @@ int CropFp16CPUKernel::Run() {
  input_ptr_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  output_ptr_ = reinterpret_cast<float16_t *>(output_tensor->data_c());

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CropFp16Run, this,
                            crop_para_->thread_count_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(CropFp16Run, this, crop_para_->thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ParallelLaunch failed: " << ret;
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -173,8 +173,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
    memset(output_ptr, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t));
    packed_output_ = output_ptr;
  }
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeconvDwFp16Run, this,
                       conv_param_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -218,8 +218,8 @@ int DeConvolutionFp16CPUKernel::Run() {

    RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);

    error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvFp16Run,
                                this, thread_count_);
    error_code = static_cast<const lite::InnerContext *>(this->context_)
                   ->thread_pool_->ParallelLaunch(DeConvFp16Run, this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv fp16 run error! error_code[" << error_code << "]";
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
@@ -392,12 +392,12 @@ int DeConvWinogradFp16CPUKernel::Run() {
    nhwc_output_ = output_ptr + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;

    ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float16_t));
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgFp16Run, this,
                   deconv_param_->thread_num_);
    static_cast<const lite::InnerContext *>(this->context_)
      ->thread_pool_->ParallelLaunch(DeConvWgFp16Run, this, deconv_param_->thread_num_);

    /*post bias activate and nhwc */
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgPostFp16Run, this,
                   thread_num_hw_);
    static_cast<const lite::InnerContext *>(this->context_)
      ->thread_pool_->ParallelLaunch(DeConvWgPostFp16Run, this, thread_num_hw_);
  }

  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/gather_fp16.cc
@@ -148,8 +148,8 @@ int GatherFp16CPUKernel::Run() {
      Float32ToFloat16(reinterpret_cast<float *>(input_tensor->data_c()), input_data_, input_tensor->ElementsNum());
    }
  }
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, GatherRunFp16, this,
                       op_parameter_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(GatherRunFp16, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Gather function error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/instance_norm_fp16.cc
@@ -109,8 +109,8 @@ int InstanceNormFp16Run(void *cdata, int task_id) {
 int InstanceNormFp16CPUKernel::Run() {
  src_data_ = reinterpret_cast<float16_t *>(in_tensors_[0]->data_c());
  dst_data_ = reinterpret_cast<float16_t *>(out_tensors_[0]->data_c());
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, InstanceNormFp16Run,
                            this, op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(InstanceNormFp16Run, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InstanceNormFp16Run error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/layer_norm_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/layer_norm_fp16.cc
@@ -95,8 +95,8 @@ int LayerNormFp16CPUKernel::Run() {
    var_data_ =
      reinterpret_cast<float16_t *>(context_->allocator->Malloc(param_->norm_outer_size_ * sizeof(float16_t)));
  }
  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, LayerNormFp16Run, this,
                       op_parameter_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(LayerNormFp16Run, this, op_parameter_->thread_num_);
  if (out_tensors_.size() != 3) {
    context_->allocator->Free(mean_data_);
    context_->allocator->Free(var_data_);
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/log_softmax_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/log_softmax_fp16.cc
@@ -95,8 +95,8 @@ int LogSoftmaxLastAxisFp16Run(void *cdata, int task_id) {

 int LogSoftmaxFp16CPUKernel::Run() {
  if (in_plane_size_ == 1) {
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                              LogSoftmaxLastAxisFp16Run, this, context_->thread_num_);
    auto ret = static_cast<const lite::InnerContext *>(this->context_)
                 ->thread_pool_->ParallelLaunch(LogSoftmaxLastAxisFp16Run, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "LogSoftmaxFp16CPUKernel ParallelLaunch failed, ret: " << ret;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
@@ -294,8 +294,8 @@ int MatmulBaseFP16CPUKernel::Run() {
      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
    }
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, MatmulBaseFP16Run,
                              this, thread_count_);
    auto ret = static_cast<const lite::InnerContext *>(this->context_)
                 ->thread_pool_->ParallelLaunch(MatmulBaseFP16Run, this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulBaseFloatRun failed";
      return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pad_fp16.cc
@@ -89,8 +89,8 @@ int PadFp16CPUKernel::Run() {
        output_[i] = pad_param_->constant_value_;
      }
    }
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PadImpl, this,
                         op_parameter_->thread_num_);
    ret = static_cast<const lite::InnerContext *>(this->context_)
            ->thread_pool_->ParallelLaunch(PadImpl, this, op_parameter_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
    }
@@ -102,8 +102,8 @@ int PadFp16CPUKernel::Run() {
      return ret;
    }

    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, MirrorPadImpl, this,
                         context_->thread_num_);
    ret = static_cast<const lite::InnerContext *>(this->context_)
            ->thread_pool_->ParallelLaunch(MirrorPadImpl, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Pad Reflect or Symmetric mode run error, error_code[" << ret << "]";
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc
@@ -90,8 +90,8 @@ int PoolingFp16CPUKernel::Run() {
  fp16_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
  fp16_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());

  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  PoolingFp16Impl, this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(PoolingFp16Impl, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/power_fp16.cc
@@ -87,8 +87,8 @@ int PowerFp16CPUKernel::Run() {
      return ret;
    }
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, PowerImplFp16, this,
                            thread_count_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(PowerImplFp16, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "PowerFp16CPUKernel error: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/quant_dtype_cast_fp16.cc
@@ -164,8 +164,8 @@ int QuantDTypeCastFp16CPUKernel::Run() {
    return RET_ERROR;
  }

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                            QuantDTypeCastFP16Run, this, thread_n_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(QuantDTypeCastFP16Run, this, thread_n_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc
@@ -93,8 +93,8 @@ int ReduceFp16CPUKernel::Run() {
    outer_size_ = outer_sizes_.at(i);
    inner_size_ = inner_sizes_.at(i);
    axis_size_ = axis_sizes_.at(i);
    auto error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                     ReduceFp16Impl, this, context_->thread_num_);
    auto error_code = static_cast<const lite::InnerContext *>(this->context_)
                        ->thread_pool_->ParallelLaunch(ReduceFp16Impl, this, context_->thread_num_);
    if (error_code != RET_OK) {
      FreeTmpBuffer();
      MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
@@ -109,8 +109,8 @@ int ReduceFp16CPUKernel::Run() {
  outer_size_ = outer_sizes_.back();
  inner_size_ = inner_sizes_.back();
  axis_size_ = axis_sizes_.back();
  auto error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                   ReduceFp16Impl, this, context_->thread_num_);
  auto error_code = static_cast<const lite::InnerContext *>(this->context_)
                      ->thread_pool_->ParallelLaunch(ReduceFp16Impl, this, context_->thread_num_);
  if (error_code != RET_OK) {
    FreeTmpBuffer();
    MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc
@@ -115,8 +115,8 @@ int ScaleFp16CPUKernel::Run() {
    return ret;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ScaleFp16Run, this,
                       op_parameter_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ScaleFp16Run, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Scale error error_code[" << ret << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/slice_fp16.cc
@@ -63,8 +63,8 @@ int SliceFp16CPUKernel::Run() {
    DoSliceNoParallel(input_data, out_tensors_.at(0)->data_c(), param_, lite::DataTypeSize(kNumberTypeFloat16));
    return RET_OK;
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SliceFp16Launch,
                            this, op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(SliceFp16Launch, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "fp16 slice launch fail!ret: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/softmax_fp16.cc
@@ -95,8 +95,8 @@ int SoftmaxLastAxisFp16Run(void *cdata, int task_id) {

 int SoftmaxFp16CPUKernel::Run() {
  if (in_plane_size_ == 1) {
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                              SoftmaxLastAxisFp16Run, this, context_->thread_num_);
    auto ret = static_cast<const lite::InnerContext *>(this->context_)
                 ->thread_pool_->ParallelLaunch(SoftmaxLastAxisFp16Run, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "SoftmaxFp16CPUKernel ParallelLaunch failed, ret: " << ret;
    }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc
@@ -102,7 +102,7 @@ int StackFp16CPUKernel::Run() {
  // run stack
  num_threads_ = MSMIN(UP_DIV(outer_size_, 64), this->context_->thread_num_);
  ret =
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, StackRun, this, num_threads_);
    static_cast<const lite::InnerContext *>(this->context_)->thread_pool_->ParallelLaunch(StackRun, this, num_threads_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "StackBaseCPUKernel Run error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/activation_fp16_grad.cc
@@ -79,8 +79,8 @@ int ActivationGradRunFp16(void *cdata, int task_id) {
 }

 int ActivationGradCPUKernelFp16::Run() {
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ActivationGradRunFp16, this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ActivationGradRunFp16, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_self_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16_grad/arithmetic_fp16_self_grad.cc
@@ -73,8 +73,8 @@ int ArithmeticSelfGradFp16Run(void *cdata, int task_id) {
 }

 int ArithmeticSelfGradFp16CPUKernel::Run() {
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ArithmeticSelfGradFp16Run, this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ArithmeticSelfGradFp16Run, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation Grad function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_fp32.cc
@@ -105,8 +105,8 @@ int ActivationRun(void *cdata, int task_id) {
 }

 int ActivationCPUKernel::Run() {
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ActivationRun,
                                  this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ActivationRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc
@@ -122,8 +122,8 @@ int AdderCPUKernel::Run() {
    return RET_ERROR;
  }

  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AdderImpl,
                                  this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(AdderImpl, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "adder error error_code[" << error_code << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/addn_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/addn_fp32.cc
@@ -89,8 +89,8 @@ int AddNCPUKernel::Run() {
  in1_addr_ = input0_data;
  in2_addr_ = input1_data;
  out_addr_ = output_data;
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AddNLaunch, this,
                            op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(AddNLaunch, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "addn launch fail!ret: " << ret;
    return RET_ERROR;
@@ -98,8 +98,8 @@ int AddNCPUKernel::Run() {
  for (size_t i = 2; i < in_tensors_.size(); ++i) {
    in1_addr_ = reinterpret_cast<float *>(in_tensors_[i]->MutableData());
    in2_addr_ = output_data;
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, AddNLaunch, this,
                         op_parameter_->thread_num_);
    ret = static_cast<const lite::InnerContext *>(this->context_)
            ->thread_pool_->ParallelLaunch(AddNLaunch, this, op_parameter_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "addn launch fail!ret: " << ret << ", input index: " << i;
      return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_fp32.cc
@@ -419,8 +419,8 @@ int ArithmeticCPUKernel::Run() {
    input1_ptr_ = in_tensors_[1]->data_c();
  }
  output_ptr_ = out_tensors_[0]->data_c();
  return ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticsRun, this,
                        context_->thread_num_);
  return static_cast<const lite::InnerContext *>(this->context_)
    ->thread_pool_->ParallelLaunch(ArithmeticsRun, this, context_->thread_num_);
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MulFusion, LiteKernelCreator<ArithmeticCPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self_fp32.cc
@@ -113,8 +113,8 @@ int ArithmeticSelfRun(void *cdata, int task_id) {
 }

 int ArithmeticSelfCPUKernel::Run() {
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ArithmeticSelfRun,
                            this, op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ArithmeticSelfRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm_fp32.cc
@@ -75,8 +75,8 @@ int BatchnormCPUKernel::InitConstTensor() {
 }

 int BatchnormCPUKernel::Run() {
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, BatchNormRun, this,
                            op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(BatchNormRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/cast_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cast_fp32.cc
@@ -141,8 +141,8 @@ int CastCPUKernel::Run() {
  if (data_num_ == 0) {
    return RET_OK;
  }
  return ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CastRun, this,
                        op_parameter_->thread_num_);
  return static_cast<const lite::InnerContext *>(this->context_)
    ->thread_pool_->ParallelLaunch(CastRun, this, op_parameter_->thread_num_);
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Cast, LiteKernelCreator<CastCPUKernel>)
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/concat_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/concat_fp32.cc
@@ -69,8 +69,8 @@ int ConcatRun(void *cdata, int task_id) {
 }

 int ConcatCPUKernel::Run() {
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConcatRun,
                                  this, op_parameter_->thread_num_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ConcatRun, this, op_parameter_->thread_num_);
  return error_code;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/concat_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/concat_fp32.h
@@ -22,7 +22,6 @@
 #include "nnacl/concat_parameter.h"
 #include "include/errorcode.h"
 #include "src/runtime/runtime_api.h"
 #include "src/runtime/thread_pool.h"
 #include "include/context.h"

 using mindspore::lite::InnerContext;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
@@ -247,12 +247,12 @@ int Convolution1x1CPUKernel::Run() {
    }

    if (multi_thread_by_hw_) {
      ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, Convolution1x1RunHw, this,
                     thread_count_);
      static_cast<const lite::InnerContext *>(this->context_)
        ->thread_pool_->ParallelLaunch(Convolution1x1RunHw, this, thread_count_);
    } else {
      PackMatmulInput(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
      ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, Convolution1x1Run, this,
                     thread_count_);
      static_cast<const lite::InnerContext *>(this->context_)
        ->thread_pool_->ParallelLaunch(Convolution1x1Run, this, thread_count_);
    }
  }

--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_3x3_fp32.cc
@@ -126,8 +126,8 @@ int ConvolutionDepthwise3x3CPUKernel::Run() {
  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_ptr_ = reinterpret_cast<float *>(output_tensor->data_c());

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDw3x3Run, this,
                            conv_param_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ConvDw3x3Run, this, conv_param_->thread_num_);
  ctx_->allocator->Free(buffer_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDw3x3Run error: error_code[" << ret << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32.cc
@@ -107,8 +107,8 @@ int ConvolutionDepthwiseCPUKernel::Run() {
  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_ptr_ = reinterpret_cast<float *>(output_tensor->MutableData());

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwRun, this,
                            conv_param_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ConvDwRun, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.cc
@@ -194,8 +194,8 @@ int ConvolutionDepthwiseIndirectCPUKernel::Run() {

  ConvDwInitIndirection(indirect_buffer_, packed_input_, zero_ptr_, conv_param_, step_h, step_w);

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwIndirectRun,
                            this, conv_param_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ConvDwIndirectRun, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwIndirectRun error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.cc
@@ -163,8 +163,8 @@ int ConvolutionDepthwiseSWCPUKernel::Run() {
    packed_output_ = output_ptr;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvDwSWRun, this,
                       conv_param_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ConvDwSWRun, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwSWRun error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
@@ -152,8 +152,8 @@ int ConvolutionCPUKernel::Run() {
    PackWeight();
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvolutionImpl, this,
                       thread_count_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ConvolutionImpl, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow_fp32.cc
@@ -179,8 +179,8 @@ int ConvolutionSWCPUKernel::Run() {
  auto input_data = in_tensors_.at(kInputIndex)->MutableData();
  MS_ASSERT(input_data != nullptr);
  ori_input_data_ = reinterpret_cast<float *>(input_data);
  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  ConvolutionSWImpl, this, thread_count_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(ConvolutionSWImpl, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
@@ -219,8 +219,8 @@ int ConvolutionWinogradCPUKernel::Run() {
    InitWeightBias();
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ConvolutionWinogradImpl,
                       this, thread_count_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(ConvolutionWinogradImpl, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/crop_and_resize_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/crop_and_resize_fp32.cc
@@ -159,8 +159,8 @@ int CropAndResizeCPUKernel::Run() {
    return ret;
  }

  int error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                                  CropAndResizeImpl, this, context_->thread_num_);
  int error_code = static_cast<const lite::InnerContext *>(this->context_)
                     ->thread_pool_->ParallelLaunch(CropAndResizeImpl, this, context_->thread_num_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "CropAndResize run error, error_code[" << error_code << "]";
    FreeTmpBuffer();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/crop_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/crop_fp32.cc
@@ -62,8 +62,8 @@ int CropCPUKernel::Run() {
    return RET_OK;
  }

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CropLaunch, this,
                            crop_para_->thread_count_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(CropLaunch, this, crop_para_->thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Crop launch fail!ret: " << ret;
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cumsum_fp32.cc
@@ -137,8 +137,8 @@ int CumSumCPUKernel::DoCumsumInt(int task_id) {
 }

 int CumSumCPUKernel::Run() {
  int ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, CumsumLaunch, this,
                           op_parameter_->thread_num_);
  int ret = static_cast<const lite::InnerContext *>(this->context_)
              ->thread_pool_->ParallelLaunch(CumsumLaunch, this, op_parameter_->thread_num_);

  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Crop launch fail!ret: " << ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise_fp32.cc
@@ -168,8 +168,8 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
    packed_output_ = output_addr;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeconvDwRun, this,
                       conv_param_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_fp32.cc
@@ -227,8 +227,8 @@ int DeConvolutionCPUKernel::Run() {
    RowMajor2Col12Major(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
 #endif

    error_code = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvFp32Run,
                                this, thread_count_);
    error_code = static_cast<const lite::InnerContext *>(this->context_)
                   ->thread_pool_->ParallelLaunch(DeConvFp32Run, this, thread_count_);
    if (error_code != RET_OK) {
      MS_LOG(ERROR) << "deconv fp32 run error! error_code[" << error_code << "]";
      FreeRunBuf();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd_fp32.cc
@@ -411,12 +411,12 @@ int DeConvolutionWinogradCPUKernel::Run() {
    nhwc_output_ = src_out + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;

    ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float));
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgFp32Run, this,
                   deconv_param_->thread_num_);
    static_cast<const lite::InnerContext *>(this->context_)
      ->thread_pool_->ParallelLaunch(DeConvWgFp32Run, this, deconv_param_->thread_num_);

    /*post bias activate and nhwc */
    ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, DeConvWgPostFp32Run, this,
                   thread_num_hw_);
    static_cast<const lite::InnerContext *>(this->context_)
      ->thread_pool_->ParallelLaunch(DeConvWgPostFp32Run, this, thread_num_hw_);
  }

  FreeRunBuf();
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/elu_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/elu_fp32.cc
@@ -55,8 +55,8 @@ int EluRun(void *cdata, int task_id) {
 }

 int EluCPUKernel::Run() {
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, EluRun, this,
                            op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(EluRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Elu error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup_fp32.cc
@@ -87,8 +87,8 @@ int EmbeddingLookupCPUKernel::Run() {
    memcpy(input_addr_ + dest_loc, input_t, sizeof(float) * in_tensors_.at(i)->ElementsNum());
    dest_loc += in_tensors_.at(i)->ElementsNum();
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, EmbeddingLookupRun,
                            this, op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(EmbeddingLookupRun, this, op_parameter_->thread_num_);
  FreeRunBuff();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "EmbeddingLookup error: error_code[" << ret << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/exp_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/exp_fp32.cc
@@ -73,8 +73,8 @@ int ExpCPUKernel::Run() {
  output_addr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
  exp_parameter_->element_num_ = in_tensors_.front()->ElementsNum();

  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, ExpRun, this,
                            exp_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(ExpRun, this, exp_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Exp error: error_code[" << ret << "]";
    return RET_ERROR;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fill_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fill_fp32.cc
@@ -91,8 +91,8 @@ int FillCPUKernel::Run() {
    MS_LOG(ERROR) << "unsupported fill data type " << fill_input->data_type();
    return RET_ERROR;
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, FillRun, this,
                            thread_sz_count_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(FillRun, this, thread_sz_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "FillRun error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fused_batchnorm_fp32.cc
@@ -93,8 +93,8 @@ int FusedBatchnormCPUKernel::Run() {

    trained_ = true;  // trained at least once
  }
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, BatchNormRun, this,
                            op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(BatchNormRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd_fp32.cc
@@ -128,8 +128,8 @@ int GatherNdCPUKernel::Run() {
  in_ptr_ = reinterpret_cast<float *>(in_tensors_.front()->MutableData());
  out_ptr_ = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
  InitOffset();
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, GatherNdRun, this,
                            thread_sz_count_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(GatherNdRun, this, thread_sz_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "gatherNd error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather_fp32.cc
@@ -92,8 +92,8 @@ int GatherCPUKernel::Run() {
    return ret;
  }

  ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, GatherRun, this,
                       op_parameter_->thread_num_);
  ret = static_cast<const lite::InnerContext *>(this->context_)
          ->thread_pool_->ParallelLaunch(GatherRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Gather function error error_code[" << ret << "]";
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/instance_norm_fp32.cc
@@ -66,8 +66,8 @@ int InstanceNormCPUKernel::Run() {
  gamma_data_ = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
  beta_data_ = reinterpret_cast<float *>(in_tensors_.at(2)->data_c());
  dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
  auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, InstanceNormRun,
                            this, op_parameter_->thread_num_);
  auto ret = static_cast<const lite::InnerContext *>(this->context_)
               ->thread_pool_->ParallelLaunch(InstanceNormRun, this, op_parameter_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InstanceNormRun error error_code[" << ret << "]";
    return ret;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/l2_norm_fp32.cc
@@ -146,8 +146,8 @@ int L2NormCPUKernel::Run() {
  output_ptr_ = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
  if (l2_norm_param_->axis_num_ == 0 || l2_norm_param_->axis_num_ == input_shape.size()) {
    // all axis
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, SquareSumRun, this,
                              context_->thread_num_);
    auto ret = static_cast<const lite::InnerContext *>(this->context_)
                 ->thread_pool_->ParallelLaunch(SquareSumRun, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
      return RET_ERROR;
@@ -157,15 +157,15 @@ int L2NormCPUKernel::Run() {
      sum += tmp_sum_[i];
    }
    sqrt_sum_ = sqrt(sum > l2_norm_param_->epsilon_ ? sum : l2_norm_param_->epsilon_);
    ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_, L2NormRun, this,
                         context_->thread_num_);
    ret = static_cast<const lite::InnerContext *>(this->context_)
            ->thread_pool_->ParallelLaunch(L2NormRun, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
      return RET_ERROR;
    }
  } else if (l2_norm_param_->axis_num_ == 1 && l2_norm_param_->axis_[0] == static_cast<int>(input_shape.size()) - 1) {
    auto ret = ParallelLaunch(static_cast<const lite::InnerContext *>(this->context_)->thread_pool_,
                              L2NormTrailingAxisRun, this, context_->thread_num_);
    auto ret = static_cast<const lite::InnerContext *>(this->context_)
                 ->thread_pool_->ParallelLaunch(L2NormTrailingAxisRun, this, context_->thread_num_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "L2Norm error: error_code[" << ret << "]";
      return RET_ERROR;