| @@ -13,6 +13,11 @@ set(FBS_FILES | |||
| ) | |||
# Generate flatbuffer sources from the shared schema directory.
# fix: a path separator was missing between CMAKE_CURRENT_SOURCE_DIR and "../../schema",
# producing a malformed path like ".../subdir../../schema".
ms_build_flatbuffers(FBS_FILES ${CMAKE_CURRENT_SOURCE_DIR}/../../schema generated_fbs_files ${SERVER_FLATBUFFER_OUTPUT})
| if(ENABLE_D) | |||
| include_directories(${CMAKE_CURRENT_SOURCE_DIR}/backend/kernel_compiler/aicpu/aicpu_ops) | |||
| add_subdirectory(backend/kernel_compiler/aicpu/aicpu_ops) | |||
| endif() | |||
| if(ENABLE_CPU) | |||
| if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "aarch64") | |||
| set(PLATFORM_ARM64 "on") | |||
| @@ -18,6 +18,10 @@ if(ENABLE_D) | |||
| "rts/*.cc" | |||
| "hccl/*.cc" | |||
| ) | |||
# Collect the aicpu_ops sources (built separately with the device toolchain)
# so they can be excluded from the host-side D_SRC_LIST.
file(GLOB_RECURSE AICPU_OPS_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
    "aicpu/aicpu_ops/*.cc"
)
# fix: list(REMOVE_ITEM) errors out when called with no items, so only filter
# when the glob actually matched something.
if(AICPU_OPS_SRC)
    list(REMOVE_ITEM D_SRC_LIST ${AICPU_OPS_SRC})
endif()
| add_compile_definitions(ENABLE_D) | |||
| endif() | |||
| @@ -0,0 +1,64 @@ | |||
# Cross-compile the AICPU kernels shared library with the Ascend HCC (aarch64)
# toolchain. The host compilers are saved here and restored at the end of this
# file so the rest of the build is unaffected.
set(NORMAL_CMAKE_C_COMPILER ${CMAKE_C_COMPILER})
set(NORMAL_CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER})
# Toolkit location: honor ASCEND_CUSTOM_PATH when set, else the default install path.
if(DEFINED ENV{ASCEND_CUSTOM_PATH})
    set(TOOLCHAIN_PATH $ENV{ASCEND_CUSTOM_PATH}/toolkit/toolchain)
else()
    set(TOOLCHAIN_PATH /usr/local/Ascend/toolkit/toolchain)
endif()
# NOTE(review): swapping CMAKE_C(XX)_COMPILER after project() is not officially
# supported by CMake; this relies on current behavior — a dedicated toolchain
# file would be the robust alternative.
set(CMAKE_C_COMPILER ${TOOLCHAIN_PATH}/hcc/bin/aarch64-target-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PATH}/hcc/bin/aarch64-target-linux-gnu-g++)
# Only build the device-side library when the cross toolchain is actually installed.
if(EXISTS ${CMAKE_C_COMPILER} AND EXISTS ${CMAKE_CXX_COMPILER})
    # Proto messages exchanged between host and AICPU kernels.
    set(AICPU_PROTO_SRC
        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_op_proto/aicpu_tensor.proto
    )
    ms_protobuf_generate(PROTO_SRCS PROTO_HDRS ${AICPU_PROTO_SRC})
    set(AICPU_SRC
        ${PROTO_SRCS}
        ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_base.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_log.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_async_event.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_context.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_pulse.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/aicpu_sharder/aicpu_sharder.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/random_choice_with_mask_kernels.cc
    )
    add_library(aicpu_kernels SHARED
        ${AICPU_SRC}
    )
    # Hidden visibility keeps the exported symbol surface minimal for the device runtime.
    target_compile_options(aicpu_kernels PRIVATE
        -march=armv8-a
        -O2
        -fvisibility-inlines-hidden
        -fvisibility=hidden
        -fno-strict-aliasing
        -fno-common
    )
    # NOTE(review): the --whole-archive/--no-whole-archive pair encloses nothing
    # and "-shared" is implied by SHARED — presumably leftovers; confirm before removing.
    target_link_libraries(aicpu_kernels PRIVATE
        -ldl
        -shared
        PUBLIC
        ${SECUREC_LIBRARY}
        -Wl,--whole-archive
        -Wl,--no-whole-archive
        -Wl,-Bsymbolic
        -rdynamic
        mindspore::protobuf
        -pthread
    )
    set(INSTALL_LIBRARY_DIR lib)
    # OPTIONAL: skip install silently when the library was not built (no toolchain).
    install(TARGETS aicpu_kernels OPTIONAL
        EXPORT aicpu_kernels-targets
        LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR}
    )
endif()
# Restore the host compilers saved at the top of this file.
set(CMAKE_C_COMPILER ${NORMAL_CMAKE_C_COMPILER})
set(CMAKE_CXX_COMPILER ${NORMAL_CMAKE_CXX_COMPILER})
| @@ -0,0 +1,118 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| syntax = "proto3"; | |||
| package aicpuops; | |||
| message AttrValue { | |||
| message ArrayValue { | |||
| repeated bytes s = 2; //"array(string)" | |||
| repeated int64 i = 3 [ packed = true ]; //"array(int)" | |||
| repeated float f = 4 [ packed = true ]; //"array(float)" | |||
| repeated bool b = 5 [ packed = true ]; //"array(bool)" | |||
| repeated int32 type = 6 [ packed = true ]; //"array(type)" | |||
| repeated TensorShape shape = 7; //"array(shape)" | |||
| repeated Tensor tensor = 8; //"array(tensor)" | |||
| } | |||
| oneof value { | |||
| ArrayValue array = 1; | |||
| bytes s = 2; //"string" | |||
| int64 i = 3; //"int" | |||
| float f = 4; //"float" | |||
| bool b = 5; //"bool" | |||
| int32 type = 6; //"type" | |||
| TensorShape shape = 7; //"shape" | |||
| Tensor tensor = 8; //"tensor" | |||
| } | |||
| } | |||
| message DynamicIdx { | |||
| int32 idx = 1; | |||
| int32 num = 2; | |||
| } | |||
| message NodeDef { | |||
| string op = 2; | |||
| map<string, AttrValue> attrs = 3; | |||
| repeated Tensor inputs = 4; | |||
| repeated Tensor outputs = 5; | |||
| map<string, DynamicIdx> dym_inputs = 6; | |||
| map<string, DynamicIdx> dym_outputs = 7; | |||
| } | |||
| message TensorShape { | |||
| // One dimension of the tensor. | |||
| message Dim { | |||
| // size must >=0 | |||
| int64 size = 1; | |||
| }; | |||
| // group dim info | |||
| repeated Dim dim = 2; | |||
| // If true, the number of dimensions in the shape is unknown. | |||
| // If true, "dim.size()" must be 0. | |||
| bool unknown_rank = 3; | |||
| // data format "NHWC" "NCHW" "NC1HWC0" OR "NONE" | |||
| int32 data_format = 4; | |||
| }; | |||
| message Tensor { | |||
| // tensor shape info | |||
| TensorShape tensor_shape = 1; | |||
| // tensor content data type | |||
| int32 tensor_type = 2; | |||
| // tensor memory device | |||
| // data located memory device , "DDR" "HBM" OR "NONE" | |||
| string mem_device = 3; | |||
| string name = 4; | |||
| uint64 data_ptr = 5; | |||
| uint64 data_size = 6; | |||
| } | |||
// Tensor element data types exchanged between host and AICPU kernels.
// NOTE(review): value 5 is skipped in the numbering — presumably reserved;
// confirm against the host-side dtype enum before reusing it.
enum DataType {
  MS_FLOAT32 = 0;
  MS_FLOAT16 = 1;
  MS_INT8 = 2;
  MS_INT32 = 3;
  MS_UINT8 = 4;
  MS_INT16 = 6;
  MS_UINT16 = 7;
  MS_UINT32 = 8;
  MS_INT64 = 9;
  MS_UINT64 = 10;
  MS_FLOAT64 = 11;
  MS_BOOL = 12;
  MS_STRING = 13;
  MS_DUAL_SUB_INT8 = 14;
  MS_DUAL_SUB_UINT8 = 15;
  MS_COMPLEX64 = 16;
  MS_COMPLEX128 = 17;
  MS_QINT8 = 18;
  MS_QINT16 = 19;
  MS_QINT32 = 20;
  MS_QUINT8 = 21;
  MS_QUINT16 = 22;
  MS_RESOURCE = 23;
  MS_STRING_REF = 24;
  MS_DUAL = 25;
  MS_UNKNOWN = 26;
}
| @@ -0,0 +1,137 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "aicpu_sharder/aicpu_async_event.h" | |||
| #include <string> | |||
| #include "common/kernel_log.h" | |||
| #include "aicpu_sharder/aicpu_context.h" | |||
| namespace aicpu { | |||
| AsyncEventManager &AsyncEventManager::GetInstance() { | |||
| static AsyncEventManager async_event_manager; | |||
| return async_event_manager; | |||
| } | |||
| void AsyncEventManager::Register(const NotifyFunc ¬ify) { notify_func_ = notify; } | |||
| void AsyncEventManager::NotifyWait(void *notify_param, const uint32_t param_len) { | |||
| if (notify_func_ != nullptr) { | |||
| notify_func_(notify_param, param_len); | |||
| } | |||
| } | |||
| bool AsyncEventManager::GenTaskInfoFromCtx(AsyncTaskInfo *task_info) { | |||
| if (task_info == nullptr) { | |||
| AICPU_LOGE("AsyncEventManager GenTaskInfoFromCtx failed, task_info is nullptr."); | |||
| return false; | |||
| } | |||
| (void)aicpu::GetTaskAndStreamId(&task_info->task_id, &task_info->stream_id); | |||
| std::string wait_id_value; | |||
| std::string ker_wait_id(aicpu::kContextKeyWaitId); | |||
| auto status = aicpu::GetThreadLocalCtx(ker_wait_id, &wait_id_value); | |||
| if (status != aicpu::AICPU_ERROR_NONE) { | |||
| AICPU_LOGE("GetThreadLocalCtx failed, ret=%d, key=%s.", status, ker_wait_id.c_str()); | |||
| return false; | |||
| } | |||
| task_info->wait_id = atoi(wait_id_value.c_str()); | |||
| std::string wait_type_value; | |||
| std::string key_wait_type(aicpu::kContextKeyWaitType); | |||
| status = aicpu::GetThreadLocalCtx(key_wait_type, &wait_type_value); | |||
| if (status != aicpu::AICPU_ERROR_NONE) { | |||
| AICPU_LOGE("GetThreadLocalCtx failed, ret=%d, key=%s.", status, key_wait_type.c_str()); | |||
| return false; | |||
| } | |||
| task_info->wait_type = atoi(wait_type_value.c_str()); | |||
| std::string start_tick_value; | |||
| std::string key_start_tick(aicpu::kContextKeyStartTick); | |||
| status = aicpu::GetThreadLocalCtx(key_start_tick, &start_tick_value); | |||
| if (status != aicpu::AICPU_ERROR_NONE) { | |||
| AICPU_LOGE("GetThreadLocalCtx failed, ret=%d, key=%s.", status, key_start_tick.c_str()); | |||
| return false; | |||
| } | |||
| task_info->start_tick = atol(start_tick_value.c_str()); | |||
| status = aicpu::GetOpname(aicpu::GetAicpuThreadIndex(), &task_info->op_name); | |||
| if (status != aicpu::AICPU_ERROR_NONE) { | |||
| AICPU_LOGE("GetOpname failed, ret=%d.", status); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
// Register callback cb for the (event_id, sub_event_id) pair. The current
// thread's task/stream/wait context is captured into an AsyncTaskInfo and
// stored in asyncTaskMap_ under the event key.
// Fails when cb is null, context capture fails, or the pair is already registered.
bool AsyncEventManager::RegEventCb(const uint32_t event_id, const uint32_t sub_event_id,
                                   const EventProcessCallBack &cb) {
  if (cb == nullptr) {
    AICPU_LOGE("AsyncEventManager RegEventCb failed, cb is nullptr.");
    return false;
  }
  AsyncTaskInfo task_info;
  task_info.task_cb = cb;
  if (!GenTaskInfoFromCtx(&task_info)) {
    AICPU_LOGE("AsyncEventManager GenTaskInfoFromCtx failed.");
    return false;
  }
  AsyncEventInfo info;
  info.event_id = event_id;
  info.sub_event_id = sub_event_id;
  {
    // Scope the lock to the map insertion only.
    std::unique_lock<std::mutex> lk(map_mutex_);
    auto iter = asyncTaskMap_.find(info);
    if (iter != asyncTaskMap_.end()) {
      // Duplicate registration for the same (event_id, sub_event_id).
      AICPU_LOGE("AsyncEventManager RegEventCb failed.");
      return false;
    }
    asyncTaskMap_[info] = task_info;
  }
  AICPU_LOGI(
    "AsyncEventManager RegEventCb success, event_id[%u], subeventId[%u], taskId[%lu],"
    " streamId[%u], waitType[%u], waitId[%u], opName[%s], startTick[%lu].",
    event_id, sub_event_id, task_info.task_id, task_info.stream_id, task_info.wait_type, task_info.wait_id,
    task_info.op_name.c_str(), task_info.start_tick);
  return true;
}
| void AsyncEventManager::ProcessEvent(const uint32_t event_id, const uint32_t sub_event_id, void *param) { | |||
| AICPU_LOGI("AsyncEventManager proc event_id = %d, sub_event_id = %d", event_id, sub_event_id); | |||
| AsyncEventInfo info; | |||
| info.event_id = event_id; | |||
| info.sub_event_id = sub_event_id; | |||
| EventProcessCallBack taskCb = nullptr; | |||
| { | |||
| std::unique_lock<std::mutex> lk(map_mutex_); | |||
| auto iter = asyncTaskMap_.find(info); | |||
| if (iter == asyncTaskMap_.end()) { | |||
| AICPU_LOGW("AsyncEventManager no async task to deal with."); | |||
| return; | |||
| } | |||
| taskCb = iter->second.task_cb; | |||
| asyncTaskMap_.erase(iter); | |||
| } | |||
| if (taskCb != nullptr) { | |||
| taskCb(param); | |||
| } | |||
| AICPU_LOGI("AsyncEventManager proc end!"); | |||
| return; | |||
| } | |||
| } // namespace aicpu | |||
| void AicpuNotifyWait(void *notify_param, const uint32_t param_len) { | |||
| aicpu::AsyncEventManager::GetInstance().NotifyWait(notify_param, param_len); | |||
| return; | |||
| } | |||
| bool AicpuRegEventCb(const uint32_t event_id, const uint32_t sub_event_id, const aicpu::EventProcessCallBack &cb) { | |||
| return aicpu::AsyncEventManager::GetInstance().RegEventCb(event_id, sub_event_id, cb); | |||
| } | |||
| @@ -0,0 +1,144 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_ASYNC_EVENT_H_ | |||
| #define AICPU_OPS_AICPU_ASYNC_EVENT_H_ | |||
| #include <functional> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <map> | |||
| #include <mutex> | |||
| #include "aicpu_sharder/aicpu_context.h" | |||
| namespace aicpu { | |||
| using NotifyFunc = std::function<void(void *param, const uint32_t param_len)>; | |||
| using EventProcessCallBack = std::function<void(void *param)>; | |||
// Key identifying an async event: (event_id, sub_event_id).
// Members are value-initialized so a default-constructed key is deterministic.
struct AsyncEventInfo {
  uint32_t event_id = 0;
  uint32_t sub_event_id = 0;
  // fix: const-qualified so equality comparison works on const objects/references.
  bool operator==(const AsyncEventInfo &info) const {
    return (event_id == info.event_id) && (sub_event_id == info.sub_event_id);
  }
};
// Strict weak ordering (event_id first, then sub_event_id) so AsyncEventInfo
// can serve as a std::map key.
inline bool operator<(const AsyncEventInfo &info1, const AsyncEventInfo &info2) {
  return (info1.event_id < info2.event_id) ||
         ((info1.event_id == info2.event_id) && (info1.sub_event_id < info2.sub_event_id));
}
| struct AsyncTaskInfo { | |||
| uint64_t start_tick; | |||
| std::string op_name; | |||
| uint8_t wait_type; | |||
| uint32_t wait_id; | |||
| uint64_t task_id; | |||
| uint32_t stream_id; | |||
| EventProcessCallBack task_cb; | |||
| }; | |||
| struct AsyncNotifyInfo { | |||
| uint8_t wait_type; | |||
| uint32_t wait_id; | |||
| uint64_t task_id; | |||
| uint32_t stream_id; | |||
| uint32_t ret_code; | |||
| aicpu::aicpuContext_t ctx; | |||
| }; | |||
| class AsyncEventManager { | |||
| public: | |||
| /** | |||
| * Get the unique object of this class | |||
| */ | |||
| static AsyncEventManager &GetInstance(); | |||
| /** | |||
| * Register notify callback function | |||
| * @param notify wait notify callback function | |||
| */ | |||
| void Register(const NotifyFunc ¬ify); | |||
| /** | |||
| * Notify wait task | |||
| * @param notify_param notify param info | |||
| * @param param_len notify_param len | |||
| */ | |||
| void NotifyWait(void *notify_param, const uint32_t param_len); | |||
| /** | |||
| * Register Event callback function, async op call | |||
| * @param eventID EventId | |||
| * @param sub_event_id queue id | |||
| * @param cb Event callback function | |||
| * @return whether register success | |||
| */ | |||
| bool RegEventCb(const uint32_t event_id, const uint32_t sub_event_id, const EventProcessCallBack &cb); | |||
| /** | |||
| * Process event | |||
| * @param event_id EventId | |||
| * @param sub_event_id queue id | |||
| * @param param event param | |||
| */ | |||
| void ProcessEvent(const uint32_t event_id, const uint32_t sub_event_id, void *param = nullptr); | |||
| private: | |||
| AsyncEventManager() : notify_func_(nullptr) {} | |||
| ~AsyncEventManager() = default; | |||
| AsyncEventManager(const AsyncEventManager &) = delete; | |||
| AsyncEventManager &operator=(const AsyncEventManager &) = delete; | |||
| AsyncEventManager(AsyncEventManager &&) = delete; | |||
| AsyncEventManager &operator=(AsyncEventManager &&) = delete; | |||
| // generate task info from ctx | |||
| bool GenTaskInfoFromCtx(AsyncTaskInfo *task_info); | |||
| // wait notify function | |||
| NotifyFunc notify_func_; | |||
| std::mutex map_mutex_; | |||
| std::map<AsyncEventInfo, AsyncTaskInfo> asyncTaskMap_; | |||
| }; | |||
| } // namespace aicpu | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| /** | |||
| * Notify wait task | |||
| * @param notify_param notify info | |||
| * @param param_len | |||
| */ | |||
| __attribute__((weak)) void AicpuNotifyWait(void *notify_param, const uint32_t param_len); | |||
| /** | |||
| * Register Event callback function, async op call | |||
| * @param info Registered event information | |||
| * @param cb Event callback function | |||
| * @return whether register success | |||
| */ | |||
| __attribute__((weak)) bool AicpuRegEventCb(const uint32_t event_id, const uint32_t sub_event_id, | |||
| const aicpu::EventProcessCallBack &cb); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif // AICPU_OPS_AICPU_ASYNC_EVENT_H_ | |||
| @@ -0,0 +1,308 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "aicpu_sharder/aicpu_context.h" | |||
| #include <map> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include <thread> | |||
| #include <utility> | |||
| #include "common/kernel_log.h" | |||
| namespace { | |||
| // current thread context | |||
| aicpu::aicpuContext_t g_cur_ctx; | |||
| // task monitor context | |||
| std::unique_ptr<std::string[]> g_opsname(nullptr); | |||
| thread_local uint32_t g_thread_index = UINT32_MAX; | |||
| uint32_t g_aicpu_core_cnt = 0; | |||
| thread_local std::map<std::string, std::string> g_thread_local_ctx; | |||
| thread_local aicpu::streamAndTaskId_t g_stream_and_task_id; | |||
| // aicpu run mode | |||
| uint32_t g_run_mode = aicpu::AicpuRunMode::THREAD_MODE; | |||
| // context info | |||
| std::mutex default_mutex; | |||
| std::vector<std::map<std::string, std::string>> g_default_thread_ctx; | |||
| std::mutex prof_mutex; | |||
| std::vector<std::map<std::string, std::string>> g_prof_thread_ctx; | |||
| std::mutex debug_mutex; | |||
| std::vector<std::map<std::string, std::string>> g_debug_thread_ctx; | |||
| std::mutex func_map_mutex; | |||
| std::map<uint32_t, std::map<uint32_t, std::function<void(void *)>>> g_func_map; | |||
| std::map<std::string, std::string> &GetThreadCtx(aicpu::CtxType type, uint32_t thread_index) { | |||
| if (type == aicpu::CTX_DEBUG) { | |||
| std::unique_lock<std::mutex> mutex(default_mutex); | |||
| if (thread_index >= g_debug_thread_ctx.size()) { | |||
| g_debug_thread_ctx.resize(thread_index + 1); | |||
| } | |||
| return g_debug_thread_ctx[thread_index]; | |||
| } else if (type == aicpu::CTX_PROF) { | |||
| std::unique_lock<std::mutex> mutex(prof_mutex); | |||
| if (thread_index >= g_prof_thread_ctx.size()) { | |||
| g_prof_thread_ctx.resize(thread_index + 1); | |||
| } | |||
| return g_prof_thread_ctx[thread_index]; | |||
| } else { | |||
| std::unique_lock<std::mutex> mutex(debug_mutex); | |||
| if (thread_index >= g_default_thread_ctx.size()) { | |||
| g_default_thread_ctx.resize(thread_index + 1); | |||
| } | |||
| return g_default_thread_ctx[thread_index]; | |||
| } | |||
| } | |||
| } // namespace | |||
| namespace aicpu { | |||
| status_t aicpuSetContext(aicpuContext_t *ctx) { | |||
| g_cur_ctx = *ctx; | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t aicpuGetContext(aicpuContext_t *ctx) { | |||
| *ctx = g_cur_ctx; | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t InitTaskMonitorContext(uint32_t aicpu_core_cnt) { | |||
| if (aicpu_core_cnt == 0) { | |||
| AICPU_LOGE("invalid aicpu core count[%u]", aicpu_core_cnt); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| g_aicpu_core_cnt = aicpu_core_cnt; | |||
| AICPU_LOGI("aicpu core count[%u]", aicpu_core_cnt); | |||
| g_opsname.reset(new (std::nothrow) std::string[aicpu_core_cnt]); | |||
| if (g_opsname == nullptr) { | |||
| AICPU_LOGE("malloc ops name memory for task monitor failed"); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| for (uint32_t index = 0; index < aicpu_core_cnt; ++index) { | |||
| g_opsname[index] = "null"; | |||
| } | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t SetAicpuThreadIndex(uint32_t thread_index) { | |||
| g_thread_index = thread_index; | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| uint32_t GetAicpuThreadIndex() { return g_thread_index; } | |||
| status_t SetOpname(const std::string &opname) { | |||
| if (g_opsname != nullptr && g_thread_index < g_aicpu_core_cnt) { | |||
| AICPU_LOGI("set op name to %s for thread[%u]", opname.c_str(), g_thread_index); | |||
| g_opsname[g_thread_index] = opname; | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| // maintenance function, if failed just print event log | |||
| AICPU_LOGEVENT( | |||
| "set op name[%s] failed, thread index[%u] should be less than total aicpu core count[%u]," | |||
| " and ops name array addr[%p] cannot null", | |||
| opname.c_str(), g_thread_index, g_aicpu_core_cnt, g_opsname.get()); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t GetOpname(uint32_t thread_index, std::string *opname) { | |||
| *opname = "null"; | |||
| if (g_opsname != nullptr && thread_index < g_aicpu_core_cnt) { | |||
| *opname = g_opsname[thread_index]; | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| // maintenance function, if failed just print event log | |||
| AICPU_LOGEVENT( | |||
| "get op name failed, thread index[%u] should be less than total aicpu core count[%u]," | |||
| " and ops name array addr[%p] cannot null", | |||
| g_thread_index, g_aicpu_core_cnt, g_opsname.get()); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t SetTaskAndStreamId(uint64_t task_id, uint32_t stream_id) { | |||
| g_stream_and_task_id.task_id = task_id; | |||
| g_stream_and_task_id.stream_id = stream_id; | |||
| AICPU_LOGI("Set task_id:[%lu] and stream_id:[%u] success.", task_id, stream_id); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t GetTaskAndStreamId(uint64_t *task_id, uint32_t *stream_id) { | |||
| *task_id = g_stream_and_task_id.task_id; | |||
| *stream_id = g_stream_and_task_id.stream_id; | |||
| AICPU_LOGI("Get task_id:[%lu] and stream_id:[%u] success.", *task_id, *stream_id); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t SetAicpuRunMode(uint32_t run_mode) { | |||
| g_run_mode = run_mode; | |||
| AICPU_LOGI("Set run_mode:[%u] success.", run_mode); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t GetAicpuRunMode(uint32_t *run_mode) { | |||
| *run_mode = g_run_mode; | |||
| AICPU_LOGI("Get run_mode:[%u] success.", *run_mode); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t SetThreadLocalCtx(const std::string &key, const std::string &value) { | |||
| if (key.empty()) { | |||
| AICPU_LOGE("set thread local context failed, key is empty"); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| try { | |||
| g_thread_local_ctx[key] = value; | |||
| } catch (std::exception &e) { | |||
| AICPU_LOGE("set thread local context failed, %s", e.what()); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t GetThreadLocalCtx(const std::string &key, std::string *value) { | |||
| if (key.empty()) { | |||
| AICPU_LOGE("get thread local context failed, key is empty"); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| auto iter = g_thread_local_ctx.find(key); | |||
| if (iter != g_thread_local_ctx.end()) { | |||
| *value = iter->second; | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| AICPU_LOGW("get thread local context failed, no such key[%s]", key.c_str()); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| status_t RemoveThreadLocalCtx(const std::string &key) { | |||
| auto iter = g_thread_local_ctx.find(key); | |||
| if (iter != g_thread_local_ctx.end()) { | |||
| g_thread_local_ctx.erase(iter); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| AICPU_LOGE("remove thread local context failed, no such key[%s]", key.c_str()); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| const std::map<std::string, std::string> &GetAllThreadCtxInfo(aicpu::CtxType type, uint32_t thread_index) { | |||
| AICPU_LOGI("Get all thread ctx info begin, thread index:%u", thread_index); | |||
| auto &ctx = GetThreadCtx(type, thread_index); | |||
| return ctx; | |||
| } | |||
| status_t RegisterEventCallback(uint32_t event_id, uint32_t subevent_id, std::function<void(void *)> func) { | |||
| std::lock_guard<std::mutex> lock(func_map_mutex); | |||
| std::map<uint32_t, std::function<void(void *)>> &sub_map = g_func_map[event_id]; | |||
| auto it = sub_map.insert(std::make_pair(subevent_id, func)); | |||
| if (it.second == false) { | |||
| AICPU_LOGE( | |||
| "register event call function failed, repulicate register callback function by event_id[%u] " | |||
| "subevent_id[%u]", | |||
| event_id, subevent_id); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t DoEventCallback(uint32_t event_id, uint32_t subevent_id, void *param) { | |||
| std::lock_guard<std::mutex> lock(func_map_mutex); | |||
| auto iter = g_func_map.find(event_id); | |||
| if (iter == g_func_map.end()) { | |||
| AICPU_LOGE("do event callback function failed, cannot find callback function by event_id[%u] subevent_id[%u]", | |||
| event_id, event_id); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| std::map<uint32_t, std::function<void(void *)>> &sub_map = iter->second; | |||
| auto sub_iter = sub_map.find(subevent_id); | |||
| if (sub_iter == sub_map.end()) { | |||
| AICPU_LOGE("do event callback function failed, cannot find callback function by event_id[%u] subevent_id[%u]", | |||
| event_id, event_id); | |||
| return AICPU_ERROR_FAILED; | |||
| } | |||
| (sub_iter->second)(param); | |||
| // erase func after call | |||
| sub_map.erase(sub_iter); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| status_t UnRegisterCallback(uint32_t event_id, uint32_t subevent_id) { | |||
| std::lock_guard<std::mutex> lock(func_map_mutex); | |||
| auto iter = g_func_map.find(event_id); | |||
| if (iter == g_func_map.end()) { | |||
| AICPU_LOGEVENT( | |||
| "skip unregister event callback function, cannot find callback function by event_id[%u] " | |||
| "subevent_id[%u]", | |||
| event_id, event_id); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| std::map<uint32_t, std::function<void(void *)>> &sub_map = iter->second; | |||
| auto sub_iter = sub_map.find(subevent_id); | |||
| if (sub_iter == sub_map.end()) { | |||
| AICPU_LOGEVENT( | |||
| "skip unregister event callback function, cannot find callback function by event_id[%u] " | |||
| "subevent_id[%u]", | |||
| event_id, event_id); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| sub_map.erase(sub_iter); | |||
| return AICPU_ERROR_NONE; | |||
| } | |||
| } // namespace aicpu | |||
| aicpu::status_t SetThreadCtxInfo(aicpu::CtxType type, const std::string &key, const std::string &value) { | |||
| if (key.empty()) { | |||
| AICPU_LOGE("Set thread context failed, context type[%d], key is empty", type); | |||
| return aicpu::AICPU_ERROR_FAILED; | |||
| } | |||
| auto &ctx = GetThreadCtx(type, g_thread_index); | |||
| try { | |||
| ctx[key] = value; | |||
| } catch (std::exception &e) { | |||
| AICPU_LOGE("Set thread context failed, context type[%d], %s", type, e.what()); | |||
| return aicpu::AICPU_ERROR_FAILED; | |||
| } | |||
| return aicpu::AICPU_ERROR_NONE; | |||
| } | |||
| aicpu::status_t GetThreadCtxInfo(aicpu::CtxType type, const std::string &key, std::string *value) { | |||
| if (key.empty()) { | |||
| AICPU_LOGE("Get thread context failed, context type[%d], key is empty", type); | |||
| return aicpu::AICPU_ERROR_FAILED; | |||
| } | |||
| auto &ctx = GetThreadCtx(type, g_thread_index); | |||
| auto iter = ctx.find(key); | |||
| if (iter != ctx.end()) { | |||
| *value = iter->second; | |||
| return aicpu::AICPU_ERROR_NONE; | |||
| } | |||
| AICPU_LOGE("Get thread context failed, context type[%d], no such key[%s]", type, key.c_str()); | |||
| return aicpu::AICPU_ERROR_FAILED; | |||
| } | |||
| aicpu::status_t RemoveThreadCtxInfo(aicpu::CtxType type, const std::string &key) { | |||
| auto &ctx = GetThreadCtx(type, g_thread_index); | |||
| auto iter = ctx.find(key); | |||
| if (iter != ctx.end()) { | |||
| ctx.erase(iter); | |||
| return aicpu::AICPU_ERROR_NONE; | |||
| } | |||
| AICPU_LOGE("Remove thread context failed, context type[%d], no such key[%s]", type, key.c_str()); | |||
| return aicpu::AICPU_ERROR_FAILED; | |||
| } | |||
| @@ -0,0 +1,231 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_CONTEXT_H_ | |||
| #define AICPU_OPS_AICPU_CONTEXT_H_ | |||
| #include <sys/types.h> | |||
| #include <cstdint> | |||
| #include <string> | |||
| #include <map> | |||
| #include <functional> | |||
| #include "common/kernel_util.h" | |||
namespace aicpu {
// Per-thread execution context identifying where an aicpu task runs.
typedef struct {
  uint32_t device_id;  // device id
  uint32_t tsId;       // ts id
  pid_t host_pid;      // host pid
  uint32_t vf_id;      // vf id
} aicpuContext_t;

// Deployment mode of the aicpu runtime.
enum AicpuRunMode : uint32_t {
  PROCESS_PCIE_MODE = 0,    // 1910/1980/1951 dc, with host mode
  PROCESS_SOCKET_MODE = 1,  // MDC
  THREAD_MODE = 2,          // ctrlcpu/minirc/lhisi
  INVALID_MODE,
};

// Pair of stream id and task id identifying one submitted task.
typedef struct {
  uint32_t stream_id;
  uint64_t task_id;
} streamAndTaskId_t;

// Status code returned by the context APIs below.
typedef enum {
  AICPU_ERROR_NONE = 0,    // success
  AICPU_ERROR_FAILED = 1,  // failed
} status_t;

// Category of thread-local context storage.
enum CtxType : int32_t { CTX_DEFAULT = 0, CTX_PROF, CTX_DEBUG };

// Well-known keys for the thread context key/value store.
constexpr auto kContextKeyOpName = "opname";
constexpr auto kContextKeyPhaseOneFlag = "phaseOne";
constexpr auto kContextKeyWaitType = "waitType";
constexpr auto kContextKeyWaitId = "waitId";
constexpr auto kContextKeyStartTick = "startTick";
constexpr auto kContextKeyDrvSubmitTick = "drvSubmitTick";
constexpr auto kContextKeyDrvSchedTick = "drvSchedTick";
constexpr auto kContextKeyKernelType = "kernelType";

// NOTE: functions declared with __attribute__((weak)) may be unresolved at
// runtime if no provider library is loaded; callers should null-check them.

/**
 * set aicpu context for current thread.
 * @param [in]ctx aicpu context
 * @return status whether this operation success
 */
AICPU_VISIBILITY_API status_t aicpuSetContext(aicpuContext_t *ctx);

/**
 * get aicpu context from current thread.
 * @param [out]ctx aicpu context
 * @return status whether this operation success
 */
AICPU_VISIBILITY_API status_t aicpuGetContext(aicpuContext_t *ctx);

/**
 * init context for task monitor, called in compute process start.
 * @param [in]aicpu_core_cnt aicpu core number
 * @return status whether this operation success
 */
status_t InitTaskMonitorContext(uint32_t aicpu_core_cnt);

/**
 * set aicpu thread index for task monitor, called in thread callback function.
 * @param [in]thread_index aicpu thread index
 * @return status whether this operation success
 */
status_t SetAicpuThreadIndex(uint32_t thread_index);

/**
 * get aicpu thread index.
 * @return uint32
 */
uint32_t GetAicpuThreadIndex();

/**
 * set op name for task monitor.
 * called in libtf_kernels.so(tf op) or libaicpu_processer.so(others) or cpu kernel framework.
 * @param [in]opname op name
 * @return status whether this operation success
 */
status_t __attribute__((weak)) SetOpname(const std::string &opname);

/**
 * get op name for task monitor
 * @param [in]thread_index thread index
 * @param [out]opname op name
 * @return status whether this operation success
 */
status_t GetOpname(uint32_t thread_index, std::string *opname);

/**
 * get task and stream id.
 * @param [out]task_id task id.
 * @param [out]stream_id stream id.
 * @return status whether this operation success
 */
status_t __attribute__((weak)) GetTaskAndStreamId(uint64_t *task_id, uint32_t *stream_id);

/**
 * set task and stream id.
 * @param [in]task_id task id.
 * @param [in]stream_id stream id.
 * @return status whether this operation success
 */
status_t __attribute__((weak)) SetTaskAndStreamId(uint64_t task_id, uint32_t stream_id);

/**
 * set thread local context of key
 * @param [in]key context key
 * @param [in]value context value
 * @return status whether this operation success
 * @note Deprecated from 20201216, Replaced by SetThreadCtxInfo
 */
status_t __attribute__((weak)) SetThreadLocalCtx(const std::string &key, const std::string &value);

/**
 * get thread local context of key
 * @param [in]key context key
 * @param [out]value context value
 * @return status whether this operation success
 * @note Deprecated from 20201216, Replaced by GetThreadCtxInfo
 */
status_t GetThreadLocalCtx(const std::string &key, std::string *value);

/**
 * remove local context of key
 * @param [in]key context key
 * @return status whether this operation success
 * @note Deprecated from 20201216, Replaced by RemoveThreadCtxInfo
 */
status_t RemoveThreadLocalCtx(const std::string &key);

/**
 * get all thread context info of type
 * @param [in]type: ctx type
 * @param [in]thread_index: thread index
 * @return const std::map<std::string, std::string> &: all thread context info
 */
const std::map<std::string, std::string> &GetAllThreadCtxInfo(aicpu::CtxType type, uint32_t thread_index);

/**
 * set run mode.
 * @param [in]run_mode: run mode, one of AicpuRunMode.
 * @return status whether this operation success
 */
status_t __attribute__((weak)) SetAicpuRunMode(uint32_t run_mode);

/**
 * get run mode.
 * @param [out]run_mode: run mode.
 * @return status whether this operation success
 */
status_t __attribute__((weak)) GetAicpuRunMode(uint32_t *run_mode);

/**
 * Register callback function by event_id and subevent_id
 * @param event_id event id
 * @param subevent_id subevent id
 * @param func call back function
 */
status_t __attribute__((weak))
RegisterEventCallback(uint32_t event_id, uint32_t subevent_id, std::function<void(void *)> func);

/**
 * Do callback function by event_id and subevent_id
 * @param event_id event id
 * @param subevent_id subevent id
 * @param param event param
 */
status_t __attribute__((weak)) DoEventCallback(uint32_t event_id, uint32_t subevent_id, void *param);

/**
 * Unregister callback function by event_id and subevent_id
 * @param event_id event id
 * @param subevent_id subevent id
 */
status_t __attribute__((weak)) UnRegisterCallback(uint32_t event_id, uint32_t subevent_id);
}  // namespace aicpu
// C-linkage accessors for the typed thread context key/value store.
extern "C" {
/**
 * set thread context info of type
 * @param [in]type: ctx type
 * @param [in]key: key of context info
 * @param [in]value: value of context info
 * @return status whether this operation success
 */
AICPU_VISIBILITY_API aicpu::status_t SetThreadCtxInfo(aicpu::CtxType type, const std::string &key,
                                                      const std::string &value);

/**
 * get thread context info of type
 * @param [in]type: ctx type
 * @param [in]key: key of context info
 * @param [out]value: value of context info
 * @return status whether this operation success
 */
AICPU_VISIBILITY_API aicpu::status_t GetThreadCtxInfo(aicpu::CtxType type, const std::string &key, std::string *value);

/**
 * remove thread context info of type
 * @param [in]type: ctx type
 * @param [in]key: key of context info
 * @return status whether this operation success
 */
AICPU_VISIBILITY_API aicpu::status_t RemoveThreadCtxInfo(aicpu::CtxType type, const std::string &key);
}
| #endif // AICPU_OPS_AICPU_CONTEXT_H_ | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "aicpu_sharder/aicpu_pulse.h" | |||
| #include <unordered_map> | |||
| #include <mutex> | |||
| #include <string> | |||
| #include "common/kernel_log.h" | |||
| namespace { | |||
| static std::unordered_map<std::string, PulseNotifyFunc> pulse_notify_func_map; | |||
| static std::mutex mtx; | |||
| } // namespace | |||
| __attribute__((visibility("default"))) void AicpuPulseNotify() { | |||
| std::unique_lock<std::mutex> lck(mtx); | |||
| AICPU_LOGD("Aicpu pulse notify start, notify func num=%zu.", pulse_notify_func_map.size()); | |||
| for (auto ¬ify_func : pulse_notify_func_map) { | |||
| AICPU_LOGD("Aicpu pulse notify %s start.", notify_func.first.c_str()); | |||
| notify_func.second(); | |||
| AICPU_LOGD("Aicpu pulse notify %s end.", notify_func.first.c_str()); | |||
| } | |||
| AICPU_LOGD("Aicpu pulse notify end."); | |||
| } | |||
| __attribute__((visibility("default"))) int32_t RegisterPulseNotifyFunc(const char *name, PulseNotifyFunc func) { | |||
| if (name == nullptr) { | |||
| AICPU_LOGE("Register pulse notify func failed as param name is null"); | |||
| return -1; | |||
| } | |||
| if (func == nullptr) { | |||
| AICPU_LOGE("Register pulse notify func for %s failed as param func is null", name); | |||
| return -1; | |||
| } | |||
| std::unique_lock<std::mutex> lck(mtx); | |||
| auto ret = pulse_notify_func_map.emplace(name, func); | |||
| if (!ret.second) { | |||
| AICPU_LOGE("Register pulse notify func for %s failed.", name); | |||
| return -1; | |||
| } | |||
| AICPU_LOGI("Register pulse notify func for %s success.", name); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
#ifndef AICPU_OPS_AICPU_PULSE_H_
#define AICPU_OPS_AICPU_PULSE_H_
#include <cstdint>

#ifdef __cplusplus
extern "C" {
#endif

// Signature of a per-kernel-lib pulse callback.
typedef void (*PulseNotifyFunc)(void);

/**
 * aicpu pulse notify.
 * timer will call this method per second.
 */
void AicpuPulseNotify(void);

/**
 * Register kernel pulse notify func.
 * @param name name of kernel lib, must end with '\0' and unique.
 * @param func pulse notify function.
 * @return 0:success, other:failed.
 */
int32_t RegisterPulseNotifyFunc(const char *name, PulseNotifyFunc func);

#ifdef __cplusplus
}
#endif
#endif  // AICPU_OPS_AICPU_PULSE_H_
| @@ -0,0 +1,218 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "aicpu_sharder/aicpu_sharder.h" | |||
| #include <semaphore.h> | |||
| #include <unistd.h> | |||
| #include <error.h> | |||
| #include <atomic> | |||
| #include <algorithm> | |||
| #include <cerrno> | |||
| #include <cstring> | |||
| #include "common/kernel_log.h" | |||
| namespace aicpu { | |||
// Execute `run` when `expr` is true; one-line helper for error-path logging.
#define AICPU_SHARDER_IF_TRUE_RUN(expr, run) \
  do {                                       \
    if (expr) {                              \
      run;                                   \
    }                                        \
  } while (0)
| void SharderNonBlock::Register(const RunnerBool &schedule, const ClosureBool &do_task, uint32_t cpu_core_num) { | |||
| schedule_ = schedule; | |||
| do_task_ = do_task; | |||
| cpu_core_num_ = cpu_core_num; | |||
| } | |||
| bool SharderNonBlock::Enqueue(const Closure &closure, bool submit_topic) { | |||
| if (schedule_ != nullptr) { | |||
| return schedule_(closure, submit_topic); | |||
| } | |||
| return false; | |||
| } | |||
| void SharderNonBlock::Schedule(const Closure &closure) { | |||
| if (!Enqueue(closure)) { | |||
| closure(); | |||
| } | |||
| } | |||
// Return the aicpu core count supplied via Register() (0 if never registered).
uint32_t SharderNonBlock::GetCPUNum() { return cpu_core_num_; }
// Meyers singleton: thread-safe lazy initialization since C++11.
SharderNonBlock &SharderNonBlock::GetInstance() {
  static SharderNonBlock sharder_non_block;
  return sharder_non_block;
}
| int64_t SharderNonBlock::CeilMultiple(int64_t x, int64_t base) { | |||
| if (base == 0) { | |||
| return 0; | |||
| } | |||
| int64_t ret = x / base; | |||
| if ((x % base) != 0) { | |||
| ret++; | |||
| } | |||
| return ret; | |||
| } | |||
// Split `total` units of work into shards of roughly `per_unit_size` and run
// them through the registered scheduler, blocking until all shards finish.
// Falls back to running everything on the calling thread when no scheduler
// is registered, there is at most one core, or semaphore setup fails.
void SharderNonBlock::ParallelFor(int64_t total, int64_t per_unit_size, const SharderWork &work) {
  // NOTE(review): int64_t is printed with %lld; on LP64 targets int64_t may be
  // `long`, not `long long` — confirm the log macro tolerates this.
  AICPU_LOGI("total: %lld, per_unit_size: %lld", total, per_unit_size);
  if ((total <= 0) || (work == nullptr)) {
    AICPU_LOGE("invalid param: total<=0 or work is nullptr");
    return;
  }

  // work itself
  if ((schedule_ == nullptr) || (cpu_core_num_ <= 1)) {
    AICPU_LOGI("work itself all");
    work(0, total);
    return;
  }

  // In order to ensure a smaller scheduling delay, the maximum number of slices is twice the number of CPU cores
  const int64_t max_shard_num = static_cast<int64_t>(cpu_core_num_) * 2;

  // calculate shard number and block size
  // i.e., if total is 118, perUintSize is 2, and cpu_core_num_ is 13
  // then shard_num is 24, block_size is 5
  int64_t block_size = std::max(int64_t{1}, std::min(total, per_unit_size));
  int64_t shard_num = CeilMultiple(total, block_size);
  shard_num = std::min(max_shard_num, shard_num);
  // Recompute block size from the clamped shard count, then re-derive the
  // final shard count so the two stay consistent.
  block_size = CeilMultiple(total, shard_num);
  shard_num = CeilMultiple(total, block_size);
  AICPU_LOGI("shard number: %lld, block size: %lld", shard_num, block_size);

  // There is no need to submit an event if shard_num is 1
  if (shard_num == 1) {
    AICPU_LOGI("executes on the current thread");
    work(0, total);
    return;
  }

  std::atomic<int64_t> count(shard_num);  // a counter of shards not yet started
  sem_t sem;
  int32_t sem_init_ret = sem_init(&sem, 0, 0);
  if (sem_init_ret == -1) {
    AICPU_LOGE("sem_init error with message: %s", strerror(errno));
    work(0, total);  // degrade to serial execution rather than fail
    return;
  }

  for (int64_t start = 0; start < total; start += block_size) {
    auto limit = std::min(start + block_size, total);
    Closure closure = [&sem, &work, &count, start, limit]() {
      count--;
      // In order to ensure that user's work function exception does not affect multithread services,
      // exception capture is needed. Exception type is not cared here, and error log is printed.
      try {
        work(start, limit);
      } catch (...) {
        AICPU_LOGE("exception occurred in work function with start: %lld, limit: %lld", start, limit);
      }
      // Each shard posts exactly once; the waiter below collects shard_num posts.
      int32_t sem_post_ret = sem_post(&sem);
      AICPU_SHARDER_IF_TRUE_RUN(sem_post_ret == -1, AICPU_LOGE("sem_post error with message: %s", strerror(errno)));
    };

    // if enqueue fail, work itself
    if (!Enqueue(closure, true)) {
      AICPU_LOGI("Enqueue fail, [%lld, %lld), work itself", start, limit);
      closure();
    }
  }

  // Let the calling thread drain queued tasks while shards remain unstarted.
  if (do_task_ != nullptr) {
    bool ret = true;
    while ((count > 0) && ret) {
      AICPU_LOGI("Main thread do task begin.");
      ret = do_task_();
      AICPU_LOGI("Main thread do task end.");
    }
  }

  // Block until every shard has posted completion.
  for (int64_t i = 0; i < shard_num; ++i) {
    int sem_wait_ret = sem_wait(&sem);
    AICPU_SHARDER_IF_TRUE_RUN(sem_wait_ret == -1, AICPU_LOGE("sem_wait error with message: %s", strerror(errno)));
  }
  int32_t sem_des_ret = sem_destroy(&sem);
  AICPU_SHARDER_IF_TRUE_RUN(sem_des_ret == -1, AICPU_LOGE("sem_destroy error with message: %s", strerror(errno)));
}
| void SharderNonBlock::ParallelForHash(int64_t total, int64_t cpu_nums, const SharderWork &work) { | |||
| AICPU_LOGI("total: %lld, cpu_nums: %d", total, cpu_nums); | |||
| if (total <= 0 || work == nullptr) { | |||
| AICPU_LOGE("invalid param: total<=0 or work is nullptr"); | |||
| return; | |||
| } | |||
| if ((schedule_ == nullptr) || (cpu_core_num_ <= 1)) { | |||
| AICPU_LOGE("schedule is nullptr or cpu core num is not enough"); | |||
| return; | |||
| } | |||
| std::atomic<int64_t> count(cpu_nums); // a counter | |||
| sem_t sem; | |||
| int32_t sem_init_ret = sem_init(&sem, 0, 0); | |||
| if (sem_init_ret == -1) { | |||
| AICPU_LOGE("sem_init error with message: %s", strerror(errno)); | |||
| return; | |||
| } | |||
| for (int64_t cur = 0; cur < cpu_nums; cur++) { | |||
| Closure closure = [&sem, &work, &count, total, cur]() { | |||
| work(total, cur); | |||
| count--; | |||
| int32_t sem_post_ret = sem_post(&sem); | |||
| AICPU_SHARDER_IF_TRUE_RUN(sem_post_ret == -1, AICPU_LOGE("sem_post error with message: %s", strerror(errno))); | |||
| }; | |||
| // if enqueue fail, work itself | |||
| if (!Enqueue(closure, true)) { | |||
| closure(); | |||
| } | |||
| } | |||
| if (do_task_ != nullptr) { | |||
| bool ret = true; | |||
| while ((count > 0) && ret) { | |||
| ret = do_task_(); | |||
| } | |||
| } | |||
| for (int64_t i = 0; i < cpu_nums; i++) { | |||
| int sem_wait_ret = sem_wait(&sem); | |||
| AICPU_SHARDER_IF_TRUE_RUN(sem_wait_ret == -1, AICPU_LOGE("sem_wait error with message: %s", strerror(errno))); | |||
| } | |||
| int32_t sem_des_ret = sem_destroy(&sem); | |||
| AICPU_SHARDER_IF_TRUE_RUN(sem_des_ret == -1, AICPU_LOGE("sem_destroy error with message: %s", strerror(errno))); | |||
| } | |||
| } // namespace aicpu | |||
/**
 * Shards the "total" unit of work refer "perUintSize".
 * C-linkage wrapper delegating to the SharderNonBlock singleton.
 */
void ParallelFor(int64_t total, int64_t per_unit_size, const aicpu::SharderWork &work) {
  aicpu::SharderNonBlock::GetInstance().ParallelFor(total, per_unit_size, work);
}
/**
 * Get CPU number.
 * C-linkage wrapper delegating to the SharderNonBlock singleton.
 */
uint32_t GetCPUNum() { return aicpu::SharderNonBlock::GetInstance().GetCPUNum(); }
| @@ -0,0 +1,132 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_SHARDER_H_ | |||
| #define AICPU_OPS_AICPU_SHARDER_H_ | |||
| #include <functional> | |||
| #include <vector> | |||
| #include "common/kernel_util.h" | |||
namespace aicpu {
using Closure = std::function<void()>;                 // unit of schedulable work
using ClosureBool = std::function<bool()>;             // drain-one-task callback
using RunnerBool = std::function<bool(Closure, bool)>; // enqueue runner
using SharderWork = std::function<void(int64_t, int64_t)>;

// Non-blocking work sharder: splits work into closures and dispatches them
// through callbacks registered by the compute process. Singleton.
class SharderNonBlock {
 public:
  /**
   * Get the unique object of this class
   */
  static SharderNonBlock &GetInstance();

  /**
   * Register schedule callback function, do_task function and cpu core number
   * called by compute process
   * @param schedule Schedule callback function
   * @param do_task Callback function for itself schedule
   * @param cpu_core_num aicpu core number
   */
  void Register(const RunnerBool &schedule, const ClosureBool &do_task, uint32_t cpu_core_num);

  /**
   * Shards the "total" unit of work refer "perUintSize"
   * @param total Total unit of work
   * @param per_unit_size Minimum shard unit
   * @param work should be a callable taking (int64, int64) arguments.
                 work(start, limit) computes the work units from [start, limit),
                 i.e., [start, limit) is a shard.
   */
  void ParallelFor(int64_t total, int64_t per_unit_size, const SharderWork &work);

  /**
   * Shards the unit of work refer for hash
   * @param total, Total unit of work
   * @param cpu_nums Number of cpu cores
   * @param work should be a callable taking (int64, int64) arguments.
                 work(cur, cpu_nums) computes the work units with input hash with (cpu_nums-1) equals cur,
                 i.e. specially used by parallel unique op
   * NOTE(review): the implementation invokes work(total, cur) — confirm the
   * documented argument order against the .cc file.
   */
  void ParallelForHash(int64_t total, int64_t cpu_nums, const SharderWork &work);

  /**
   * Schedule a task use schedule function registered by compute process,
   * note that the task will actually executed asynchronously
   * @param closure Closure function with nothrow
   */
  void Schedule(const Closure &closure);

  /**
   * Get CPU number
   * @param None
   * @return CPU number
   */
  uint32_t GetCPUNum();

 private:
  SharderNonBlock() : schedule_(nullptr), do_task_(nullptr), cpu_core_num_(0) {}
  ~SharderNonBlock() = default;

  // Singleton: neither copyable nor movable.
  SharderNonBlock(const SharderNonBlock &) = delete;
  SharderNonBlock &operator=(const SharderNonBlock &) = delete;
  SharderNonBlock(SharderNonBlock &&) = delete;
  SharderNonBlock &operator=(SharderNonBlock &&) = delete;

  /**
   * Closure function enqueue
   * @param closure Closure function can be called
   * @param submit_topic whether submit topic, true means submit topic
   * @return whether enqueue of closure success
   */
  bool Enqueue(const Closure &closure, bool submit_topic = false);

  /**
   * Calculate how many times, which ceiled, "x" is "base".
   * i.e., x is 1, base is 2, this function will return 1
   * @param x An integral
   * @param base An integral as base when cal multiple
   * @return ceiled multiple
   */
  inline int64_t CeilMultiple(int64_t x, int64_t base);

 private:
  RunnerBool schedule_;    // enqueue runner
  ClosureBool do_task_;    // a callback, do task from task queue
  uint32_t cpu_core_num_;  // aicpu core number
};  // SharderNonBlock
}  // namespace aicpu
// C-linkage entry points wrapping the SharderNonBlock singleton.
extern "C" {
/**
 * Shards the "total" unit of work refer "perUintSize"
 * @param total Total unit of work
 * @param per_unit_size Minimum shard unit
 * @param work should be a callable taking (int64, int64) arguments.
               work(start, limit) computes the work units from [start, limit),
               i.e., [start, limit) is a shard.
 */
AICPU_VISIBILITY_API void ParallelFor(int64_t total, int64_t per_unit_size, const aicpu::SharderWork &work);

/**
 * Get CPU number
 * @param None
 * @return CPU number
 */
AICPU_VISIBILITY_API uint32_t GetCPUNum();
}
| #endif // AICPU_OPS_AICPU_SHARDER_H_ | |||
| @@ -0,0 +1,62 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_DISTINCT_UNIFORM_INT_DISTRIBUTION_H_ | |||
| #define AICPU_OPS_AICPU_DISTINCT_UNIFORM_INT_DISTRIBUTION_H_ | |||
| #include <random> | |||
| #include <unordered_set> | |||
namespace aicpu {
// Uniformly samples integers from the closed range [inf, sup] without
// repetition: each call to exec() returns a value not produced since the
// last reset(). Calling exec() after the range is exhausted terminates.
template <typename IntType = int>
class distinct_uniform_int_distribution {
 public:
  using result_type = IntType;

 private:
  using set_type = std::unordered_set<result_type>;
  using distr_type = std::uniform_int_distribution<result_type>;

 public:
  distinct_uniform_int_distribution(result_type inf, result_type sup)
      : inf_(inf), sup_(sup), range_(sup_ - inf_ + 1), distr_(inf_, sup_) {}
  ~distinct_uniform_int_distribution() = default;

  // Forget all previously returned values and reset the underlying distribution.
  void reset() {
    distr_.reset();
    uset_.clear();
  }

  // Draw one value not yet returned since the last reset().
  // Terminates the program if every value in [inf_, sup_] was already drawn.
  template <typename Generator>
  result_type exec(Generator *engine) {
    if (uset_.size() >= range_) {
      std::terminate();  // nothing left to hand out
    }
    // Rejection-sample until an unseen value comes up.
    result_type candidate = distr_(*engine);
    while (uset_.count(candidate) > 0) {
      candidate = distr_(*engine);
    }
    uset_.insert(candidate);
    return candidate;
  }

 private:
  const result_type inf_;   // lower bound (inclusive)
  const result_type sup_;   // upper bound (inclusive)
  const size_t range_;      // number of distinct values available
  distr_type distr_;        // underlying uniform distribution over [inf_, sup_]
  set_type uset_;           // values already returned
};
}  // namespace aicpu
| #endif // AICPU_OPS_AICPU_DISTINCT_UNIFORM_INT_DISTRIBUTION_H_ | |||
| @@ -0,0 +1,255 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <map> | |||
| #include "common/kernel_base.h" | |||
| #include "common/kernel_errcode.h" | |||
| #include "common/tensor.h" | |||
| namespace aicpu { | |||
namespace {
// max param len limit 10k.
constexpr uint32_t MAX_PARAM_LEN = 10240;
// max io address num limit 1024
constexpr uint32_t MAX_IO_ADDR_NUMPARAM_LEN = 1024;
}  // namespace
// Byte size of each supported aicpuops data type.
// Note: MS_FLOAT16 is encoded as sizeof(float) / 2 == 2 bytes.
static const std::map<const ::aicpuops::DataType, size_t> kKernelBaseDataTypeSize = {
  {aicpuops::MS_BOOL, sizeof(bool)},       {aicpuops::MS_INT8, sizeof(int8_t)},
  {aicpuops::MS_UINT8, sizeof(uint8_t)},   {aicpuops::MS_INT16, sizeof(int16_t)},
  {aicpuops::MS_UINT16, sizeof(uint16_t)}, {aicpuops::MS_INT32, sizeof(int32_t)},
  {aicpuops::MS_UINT32, sizeof(uint32_t)}, {aicpuops::MS_INT64, sizeof(int64_t)},
  {aicpuops::MS_UINT64, sizeof(uint64_t)}, {aicpuops::MS_FLOAT16, sizeof(float) / 2},
  {aicpuops::MS_FLOAT32, sizeof(float)},   {aicpuops::MS_FLOAT64, sizeof(double)}};
// Construct a kernel with empty parse state; ParseParam() fills the rest.
KernelBase::KernelBase(const std::string &kernel_name)
    : kernel_name_(kernel_name),
      extend_param_len_(0),
      extend_param_base_(nullptr),
      param_head_(nullptr),
      unknow_shape_(false) {}
// Parse the raw kernel parameter blob: header, io addresses, node def,
// ext info, and (for unknown-shape ops) updated shapes; finally defers to
// the subclass via ParseKernelParam().
// @param param pointer to an AicpuParamHead followed by extend data
// @return AICPU_KERNEL_STATE_SUCCESS or AICPU_KERNEL_STATE_PARAM_INVALID
uint32_t KernelBase::ParseParam(void *param) {
  if (param == nullptr) {
    AICPU_LOGE("Kernel:%s ParseParam param is null.", kernel_name_.c_str());
    return AICPU_KERNEL_STATE_PARAM_INVALID;
  }

  // parse param_len: total length must cover the head and stay within bounds.
  param_head_ = static_cast<AicpuParamHead *>(param);
  if (param_head_->length < sizeof(AicpuParamHead) || param_head_->length > MAX_PARAM_LEN) {
    AICPU_LOGE("Kernel:%s param length=%u not in [%zu, %u].", kernel_name_.c_str(), param_head_->length,
               sizeof(AicpuParamHead), MAX_PARAM_LEN);
    return AICPU_KERNEL_STATE_PARAM_INVALID;
  }

  // extend area starts right after the fixed-size head.
  auto param_base = static_cast<uint8_t *>(param);
  extend_param_base_ = param_base + sizeof(AicpuParamHead);
  extend_param_len_ = param_head_->length - sizeof(AicpuParamHead);

  if (param_head_->ioAddrNum > 0) {
    if (param_head_->ioAddrNum > MAX_IO_ADDR_NUMPARAM_LEN) {
      AICPU_LOGE("Kernel:%s param ioAddrNum=%u is over %u.", kernel_name_.c_str(), param_head_->ioAddrNum,
                 MAX_IO_ADDR_NUMPARAM_LEN);
      return AICPU_KERNEL_STATE_PARAM_INVALID;
    }
    // The extend area begins with ioAddrNum uint64 device addresses.
    uint32_t addr_len = param_head_->ioAddrNum * sizeof(uint64_t);
    if (extend_param_len_ < addr_len) {
      AICPU_LOGE("Kernel:%s extend param is not enough for io addr, ioAddrNum=%u, extendParamLen=%u.",
                 kernel_name_.c_str(), param_head_->ioAddrNum, extend_param_len_);
      return AICPU_KERNEL_STATE_PARAM_INVALID;
    }
    auto io_addr_base = reinterpret_cast<uint64_t *>(extend_param_base_);
    for (uint32_t i = 0; i < param_head_->ioAddrNum; ++i) {
      io_addrs_.push_back(static_cast<uintptr_t>(io_addr_base[i]));
    }
    // Advance the cursor past the consumed address table.
    extend_param_base_ = extend_param_base_ + addr_len;
    extend_param_len_ -= addr_len;
  }

  AICPU_CHK_STATUS_RET(ParseNodeDef())
  AICPU_CHK_STATUS_RET(ParseExtInfo())

  if (unknow_shape_) {
    AICPU_LOGI("Unknown shape op: %s", kernel_name_.c_str());
    AICPU_CHK_STATUS_RET(UpdateInputShape())
    AICPU_CHK_STATUS_RET(UpdateOutputShape())
  }
  return ParseKernelParam();
}
| uint32_t KernelBase::Compute(void *param) { | |||
| uint32_t ret = ParseParam(param); | |||
| if (ret != AICPU_KERNEL_STATE_SUCCESS) { | |||
| AICPU_LOGE("Kernel:%s ParseParam failed, ret=%u.", kernel_name_.c_str(), ret); | |||
| return ret; | |||
| } | |||
| return DoCompute(); | |||
| } | |||
| size_t KernelBase::GetDataTypeSize(::aicpuops::DataType data_type) { | |||
| auto it = kKernelBaseDataTypeSize.find(data_type); | |||
| if (it == kKernelBaseDataTypeSize.end()) { | |||
| AICPU_LOGE("don't support input tensor types"); | |||
| return 0; | |||
| } | |||
| return it->second; | |||
| } | |||
| template <typename T> | |||
| uint32_t KernelBase::ParseExtendParam(T *param_var, std::string param_name) { | |||
| if (extend_param_len_ < sizeof(T)) { | |||
| AICPU_LOGE("Kernel:%s extend param is not enough for [%s] addr, need_len=%u, extendParamLen=%u.", | |||
| kernel_name_.c_str(), param_name.c_str(), sizeof(T), extend_param_len_); | |||
| return AICPU_KERNEL_STATE_PARAM_INVALID; | |||
| } | |||
| T *param = reinterpret_cast<T *>(extend_param_base_); | |||
| if (param != nullptr) { | |||
| *param_var = *param; | |||
| extend_param_base_ += sizeof(T); | |||
| extend_param_len_ -= sizeof(T); | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
| AICPU_LOGE("Kernel:%s extend param for [%s] addr is invalid.", kernel_name_.c_str(), param_name.c_str()); | |||
| return AICPU_KERNEL_STATE_PARAM_INVALID; | |||
| } | |||
// Parse the serialized NodeDef proto from the extend parameter area:
// a uint32 length prefix followed by that many bytes of proto data.
uint32_t KernelBase::ParseNodeDef() {
  uint32_t node_def_len;
  AICPU_CHK_STATUS_RET(ParseExtendParam(&node_def_len, "node_def_len"))

  // Remaining extend data must hold the full serialized proto.
  if (extend_param_len_ < node_def_len) {
    AICPU_LOGE("Kernel:%s extend param is not enough for customizeAttr addr, node_def_len=%u, extendParamLen=%u.",
               kernel_name_.c_str(), node_def_len, extend_param_len_);
    return AICPU_KERNEL_STATE_PARAM_INVALID;
  }

  std::string std_data(reinterpret_cast<char *>(extend_param_base_), node_def_len);
  if (!node_def_.ParseFromString(std_data)) {
    AICPU_LOGE("parse %s KernelBase proto failed, nodeDef=%s.", kernel_name_.c_str(), std_data.c_str());
    return AICPU_KERNEL_STATE_PARAM_INVALID;
  }
  // Advance the cursor past the consumed proto bytes.
  extend_param_base_ += node_def_len;
  extend_param_len_ -= node_def_len;
  return AICPU_KERNEL_STATE_SUCCESS;
}
| uint32_t KernelBase::ParseExtShapeType(FWKAdapter::ExtInfo *ext_info) { | |||
| if (ext_info->infoLen != sizeof(int32_t)) { | |||
| AICPU_LOGE("Kernel:%s parse ext shape type failed as infoLen must be %zu but %u.", kernel_name_.c_str(), | |||
| sizeof(int32_t), ext_info->infoLen); | |||
| return AICPU_KERNEL_STATE_PARAM_INVALID; | |||
| } | |||
| unknow_shape_ = true; | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
// Parse the input-shape ext record: one ShapeAndType entry per NodeDef input,
// keeping raw pointers into ext_info->infoMsg (no copy — the buffer must
// outlive input_shape_and_type_).
uint32_t KernelBase::ParseExtInputShape(FWKAdapter::ExtInfo *ext_info) {
  // no overflow
  auto need_len = node_def_.inputs_size() * sizeof(FWKAdapter::ShapeAndType);
  if (ext_info->infoLen != need_len) {
    AICPU_LOGE(
      "Kernel:%s parse ext input shape failed as infoLen must be "
      "input_num[%d]*sizeof(ShapeAndType)[%zu], but %u.",
      kernel_name_.c_str(), node_def_.inputs_size(), sizeof(FWKAdapter::ShapeAndType), ext_info->infoLen);
    return AICPU_KERNEL_STATE_PARAM_INVALID;
  }

  input_shape_and_type_.clear();
  auto input = reinterpret_cast<FWKAdapter::ShapeAndType *>(ext_info->infoMsg);
  for (int index = 0; index < node_def_.inputs_size(); ++index) {
    input_shape_and_type_.emplace_back(&input[index]);
  }
  return AICPU_KERNEL_STATE_SUCCESS;
}
| uint32_t KernelBase::ParseExtOutputShape(FWKAdapter::ExtInfo *ext_info) { | |||
| // no overflow | |||
| auto need_len = node_def_.outputs_size() * sizeof(FWKAdapter::ShapeAndType); | |||
| if (ext_info->infoLen != need_len) { | |||
| AICPU_LOGE( | |||
| "Kernel:%s parse ext output shape failed as infoLen must be " | |||
| "output_num[%d]*sizeof(ShapeAndType)[%zu], but %u.", | |||
| kernel_name_.c_str(), node_def_.outputs_size(), sizeof(FWKAdapter::ShapeAndType), ext_info->infoLen); | |||
| return AICPU_KERNEL_STATE_PARAM_INVALID; | |||
| } | |||
| output_shape_and_type_.clear(); | |||
| auto output = reinterpret_cast<FWKAdapter::ShapeAndType *>(ext_info->infoMsg); | |||
| for (int index = 0; index < node_def_.outputs_size(); ++index) { | |||
| output_shape_and_type_.emplace_back(&output[index]); | |||
| } | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
| uint32_t KernelBase::ParseExtInfo() { | |||
| uint32_t offset = 0; | |||
| FWKAdapter::ExtInfo *ext_info_ptr = nullptr; | |||
| char *ext_info_buf = reinterpret_cast<char *>(static_cast<uintptr_t>(param_head_->extInfoAddr)); | |||
| while (offset + sizeof(FWKAdapter::ExtInfo) <= param_head_->extInfoLength) { | |||
| ext_info_ptr = reinterpret_cast<FWKAdapter::ExtInfo *>(ext_info_buf + offset); | |||
| if (ext_info_ptr == nullptr) { | |||
| AICPU_LOGE("Kernel:%s ext_info is nullptr, extInfoLength=%u, extInfoAddr=%p, offset=%zu.", kernel_name_.c_str(), | |||
| param_head_->extInfoLength, param_head_->extInfoAddr, offset); | |||
| return AICPU_KERNEL_STATE_PARAM_INVALID; | |||
| } | |||
| switch (ext_info_ptr->infoType) { | |||
| case FWKAdapter::FWK_ADPT_EXT_SHAPE_TYPE: | |||
| AICPU_CHK_STATUS_RET(ParseExtShapeType(ext_info_ptr)) | |||
| break; | |||
| case FWKAdapter::FWK_ADPT_EXT_INPUT_SHAPE: | |||
| AICPU_CHK_STATUS_RET(ParseExtInputShape(ext_info_ptr)) | |||
| break; | |||
| case FWKAdapter::FWK_ADPT_EXT_OUTPUT_SHAPE: | |||
| AICPU_CHK_STATUS_RET(ParseExtOutputShape(ext_info_ptr)) | |||
| break; | |||
| default: | |||
| AICPU_LOGI("Kernel:%s ignore infoType=%d, infoLen=%u.", kernel_name_.c_str(), ext_info_ptr->infoType, | |||
| ext_info_ptr->infoLen); | |||
| break; | |||
| } | |||
| // not overflow | |||
| offset += FWKAdapter::kExtInfoHeadSize; | |||
| offset += ext_info_ptr->infoLen; | |||
| } | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
| uint32_t KernelBase::UpdateInputShape() { | |||
| for (int i = 0; i < node_def_.inputs_size(); ++i) { | |||
| aicpuops::Tensor *input_tensor = node_def_.mutable_inputs(i); | |||
| aicpuops::TensorShape *input_tensor_shape = input_tensor->mutable_tensor_shape(); | |||
| input_tensor_shape->clear_dim(); | |||
| for (uint32_t index = 0; index < FWKAdapter::kMaxShapeDims; ++index) { | |||
| // LLONG_MIN for dim end flag | |||
| if (input_shape_and_type_[i]->dims[index] == LLONG_MIN) { | |||
| break; | |||
| } | |||
| input_tensor_shape->add_dim()->set_size(input_shape_and_type_[i]->dims[index]); | |||
| } | |||
| } | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
| uint32_t KernelBase::UpdateOutputShape() { | |||
| for (int i = 0; i < node_def_.outputs_size(); ++i) { | |||
| aicpuops::Tensor *output_tensor = node_def_.mutable_outputs(i); | |||
| aicpuops::TensorShape *output_tensor_shape = output_tensor->mutable_tensor_shape(); | |||
| output_tensor_shape->clear_dim(); | |||
| for (uint32_t index = 0; index < FWKAdapter::kMaxShapeDims; ++index) { | |||
| // LLONG_MIN for dim end flag | |||
| if (output_shape_and_type_[i]->dims[index] == LLONG_MIN) { | |||
| break; | |||
| } | |||
| output_tensor_shape->add_dim()->set_size(output_shape_and_type_[i]->dims[index]); | |||
| } | |||
| } | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
| } // namespace aicpu | |||
| @@ -0,0 +1,82 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_COMMON_KERNEL_BASE_H_ | |||
| #define AICPU_OPS_AICPU_COMMON_KERNEL_BASE_H_ | |||
| #include <cstdint> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "common/kernel_util.h" | |||
| #include "aicpu/common/aicpu_task_struct.h" | |||
| #include "securec/include/securec.h" | |||
| #include "common/tensor.h" | |||
| #include "cce/fwk_adpt_struct.h" | |||
| #include "common/kernel_log.h" | |||
| #include "proto/aicpu_tensor.pb.h" | |||
| namespace aicpu { | |||
// Base class for AICPU kernel implementations. Compute() parses the raw
// parameter blob handed over by the framework (AicpuParamHead + serialized
// NodeDef proto + TLV ext-info records) and then invokes the subclass's
// DoCompute().
class KernelBase {
 public:
  // kernel_name is used only for log messages in this translation unit.
  explicit KernelBase(const std::string &kernel_name);
  // NOTE(review): destructor is non-virtual although the class has virtual
  // methods — deleting a subclass through a KernelBase* would be UB; confirm
  // kernels are never owned polymorphically.
  ~KernelBase() = default;
  // Framework entry point; `param` points at an AicpuParamHead blob.
  uint32_t Compute(void *param);
  // Byte width of an aicpuops::DataType (implementation not visible here).
  size_t GetDataTypeSize(::aicpuops::DataType data_type);
 protected:
  // Subclass hooks: extract op attributes / run the actual computation.
  virtual uint32_t ParseKernelParam() = 0;
  virtual uint32_t DoCompute() = 0;
  // Reads one POD value from the extend-param area and advances the cursor.
  template <typename T>
  uint32_t ParseExtendParam(T *param_var, std::string param_name);
  // Deserializes the NodeDef proto from the extend-param area.
  uint32_t ParseNodeDef();
  // Walks the TLV ext-info buffer (shape type / input / output shapes).
  uint32_t ParseExtInfo();
  uint32_t ParseExtShapeType(FWKAdapter::ExtInfo *ext_info);
  uint32_t ParseExtInputShape(FWKAdapter::ExtInfo *ext_info);
  uint32_t ParseExtOutputShape(FWKAdapter::ExtInfo *ext_info);
  // Copy runtime shapes from the ext-info records back into node_def_.
  uint32_t UpdateInputShape();
  uint32_t UpdateOutputShape();
 private:
  KernelBase(const KernelBase &) = delete;
  KernelBase &operator=(const KernelBase &) = delete;
  KernelBase(KernelBase &&) = delete;
  KernelBase &operator=(KernelBase &&) = delete;
  uint32_t ParseParam(void *param);
 protected:
  std::string kernel_name_;          // op name, used in log messages
  std::vector<uintptr_t> io_addrs_;  // raw input/output buffer addresses
  uint32_t extend_param_len_;        // bytes remaining in the extend-param area
  uint8_t *extend_param_base_;       // cursor into the extend-param area
  AicpuParamHead *param_head_;       // header of the parameter blob
  bool unknow_shape_;                // set when ext-info carries SHAPE_TYPE
  aicpuops::NodeDef node_def_;       // parsed op definition
  // Pointers into the ext-info buffer (not owned), one per input/output.
  std::vector<FWKAdapter::ShapeAndType *> input_shape_and_type_;
  std::vector<FWKAdapter::ShapeAndType *> output_shape_and_type_;
};
| } // namespace aicpu | |||
| #endif // AICPU_OPS_AICPU_COMMON_KERNEL_BASE_H_ | |||
| @@ -0,0 +1,30 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_COMMON_KENERL_ERRCODE_H_ | |||
| #define AICPU_OPS_AICPU_COMMON_KENERL_ERRCODE_H_ | |||
| namespace aicpu { | |||
// Status codes returned by AICPU kernel entry points.
enum AicpuKernelErrCode {
  // 0-3 is fixed error code, runtime need interpret 0-3 error codes
  AICPU_KERNEL_STATE_SUCCESS = 0,
  AICPU_KERNEL_STATE_PARAM_INVALID = 1,
  AICPU_KERNEL_STATE_FAILED = 2,
  AICPU_KERNEL_STATE_EXECUTE_TIMEOUT = 3,
  AICPU_KERNEL_STATE_INTERNAL_ERROR = 4,
  // Not referenced in this file; presumably signals the end of an input
  // sequence for dataset-style ops — confirm against the runtime contract.
  AICPU_KERNEL_STATE_END_OF_SEQUENCE = 201,
};
| } // namespace aicpu | |||
| #endif // AICPU_OPS_AICPU_COMMON_KENERL_ERRCODE_H_ | |||
| @@ -0,0 +1,29 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "common/kernel_log.h" | |||
| namespace aicpu { | |||
| static int log_level = AICPU_LOG_ERROR; | |||
| int LogSetLevel(int level) { | |||
| log_level = level; | |||
| return log_level; | |||
| } | |||
| int LogGetLevel(void) { return log_level; } | |||
| bool CheckLogLevel(int log_level_check) { return log_level >= log_level_check; } | |||
| } // namespace aicpu | |||
| @@ -0,0 +1,77 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_COMMON_KERNEL_LOG_H_ | |||
| #define AICPU_OPS_AICPU_COMMON_KERNEL_LOG_H_ | |||
| #include <unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <iostream> | |||
| #include <utility> | |||
| #include "common/kernel_errcode.h" | |||
// Returns the calling thread's kernel thread id, cached once per thread so
// the syscall is paid only on first use.
inline int GetTid(void) {
  thread_local static int cached_tid = static_cast<int>(syscall(__NR_gettid));
  return cached_tid;
}
| static const int LOG_COUNT = 0; | |||
| namespace aicpu { | |||
| #define AICPU_LOG_DEBUG 0 | |||
| #define AICPU_LOG_INFO 1 | |||
| #define AICPU_LOG_WARN 2 | |||
| #define AICPU_LOG_ERROR 3 | |||
| #define AICPU_LOG_EVENT 0x10 | |||
// Terminal case: finishes the line by emitting the numeric level and a newline.
inline void PrintLog(const int level) { std::cerr << level << std::endl; }
// Streams each argument to stderr separated by single spaces. The printf-style
// format string the AICPU_LOG* macros pass in is printed verbatim; its
// conversion specifiers are never interpreted.
template <typename T, typename... Args>
inline void PrintLog(const int level, T &&first, Args &&... rest) {
  std::cerr << std::forward<T>(first) << " ";
  PrintLog(level, std::forward<Args>(rest)...);
}
| int LogSetLevel(int level); | |||
| int LogGetLevel(void); | |||
| bool CheckLogLevel(int log_level_check); | |||
// Severity-specific logging helpers; each prepends function, file, line and
// thread id. NOTE(review): `#fmt` stringizes the fmt argument, so passing a
// string literal yields escaped quote characters in the output, and the
// printf-style specifiers are never interpreted (PrintLog just streams its
// arguments) — confirm this output shape is intentional.
#define AICPU_LOGD(fmt, ...) \
  AICPU_LOG(AICPU_LOG_DEBUG, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
#define AICPU_LOGI(fmt, ...) \
  AICPU_LOG(AICPU_LOG_INFO, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
#define AICPU_LOGW(fmt, ...) \
  AICPU_LOG(AICPU_LOG_WARN, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
#define AICPU_LOGE(fmt, ...) \
  AICPU_LOG(AICPU_LOG_ERROR, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
#define AICPU_LOGEVENT(fmt, ...) \
  AICPU_LOG(AICPU_LOG_EVENT, "%s:%s:%d[tid:%lu]:" #fmt, __FUNCTION__, __FILE__, __LINE__, GetTid(), ##__VA_ARGS__);
// Core macro: emits only when the current threshold admits `level`.
// `while (LOG_COUNT != 0)` is a constant-false condition, i.e. execute once.
#define AICPU_LOG(level, fmt, ...) \
  do { \
    if (aicpu::CheckLogLevel(level)) { \
      aicpu::PrintLog(level, "[%s:%d]" fmt, __FILE__, __LINE__, ##__VA_ARGS__); \
    } \
  } while (LOG_COUNT != 0)
// Evaluates `expr` and early-returns its status from the enclosing function
// on failure. NOTE(review): the trailing `;` makes this macro
// self-terminating, so call sites in this file invoke it WITHOUT a semicolon
// — keep both in sync if this is ever changed.
#define AICPU_CHK_STATUS_RET(expr...) \
  do { \
    const uint32_t status = (expr); \
    if (status != AICPU_KERNEL_STATE_SUCCESS) { \
      return status; \
    } \
  } while (0);
| } // namespace aicpu | |||
| #endif // AICPU_OPS_AICPU_COMMON_KERNEL_LOG_H_ | |||
| @@ -0,0 +1,22 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_COMMON_KERNEL_UTIL_H_ | |||
| #define AICPU_OPS_AICPU_COMMON_KERNEL_UTIL_H_ | |||
// Marks symbols to be exported from the AICPU ops shared object when building
// with default-hidden visibility; guarded so a build system may pre-define it.
#ifndef AICPU_VISIBILITY_API
#define AICPU_VISIBILITY_API __attribute__((visibility("default")))
#endif
| #endif // AICPU_OPS_AICPU_COMMON_KERNEL_UTIL_H_ | |||
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_COMMON_TENSOR_H_ | |||
| #define AICPU_OPS_COMMON_TENSOR_H_ | |||
| #include <atomic> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <map> | |||
| namespace aicpu { | |||
| namespace ms { | |||
// Minimal non-owning wrapper around a flat byte buffer (pointer + length).
// Accessor bodies are defined elsewhere; only the interface lives here.
class Tensor {
 public:
  Tensor() = default;
  ~Tensor() = default;
  // Returns the wrapped buffer; null until SetData has been called.
  const uint8_t *GetData() const;
  // Size of the wrapped buffer in bytes; 0 until SetData has been called.
  size_t GetSize() const;
  // Stores the buffer pointer and size without taking ownership.
  void SetData(uint8_t *data, size_t size);

 private:
  // Fix: default member initializers so a freshly constructed Tensor reads as
  // empty instead of exposing indeterminate values through GetData/GetSize
  // (the defaulted constructor previously left both members uninitialized).
  uint8_t *tensor_ptr_ = nullptr;
  size_t tensor_len_ = 0;
};
| } // namespace ms | |||
| } // namespace aicpu | |||
| #endif // AICPU_OPS_COMMON_TENSOR_H_ | |||
| @@ -0,0 +1,280 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "./random_choice_with_mask_kernels.h" | |||
| #include <random> | |||
| #include <climits> | |||
| #include <vector> | |||
| #include <algorithm> | |||
| #include <string> | |||
| #include "aicpu_sharder/aicpu_sharder.h" | |||
| #include "proto/aicpu_tensor.pb.h" | |||
| #include "common/distinct_uniform_int_distribution.h" | |||
| #include "common/tensor.h" | |||
| namespace aicpu { | |||
// Converts each flat index in tmp_output into an input_dim_size-dimensional
// coordinate, written consecutively into output (row-major decomposition).
static void ParseOutputCoordinate(std::vector<int64_t> dims_, int32_t output_length, int32_t input_dim_size,
                                  int32_t input_total_count, const int *tmp_output, int *output) {
  int it = 0;
  // Elements per slice of the leading dimension: product of dims_[1..].
  int column = input_total_count / dims_[0];
  for (int i = 0; i < output_length; i++) {
    int32_t remainder = tmp_output[i];
    int stride = column;
    for (int j = 0; j < input_dim_size; j++) {
      if (j == input_dim_size - 1) {
        // Innermost coordinate is whatever offset remains.
        output[it++] = remainder;
        continue;
      }
      // BUG FIX: divide by the per-level stride (which shrinks by dims_[j+1]
      // each level), not by the constant `column`. The original computed
      // tmp_column but never used it, producing wrong coordinates for any
      // input with more than two dimensions.
      output[it++] = remainder / stride;
      remainder = remainder % stride;
      stride = stride / dims_[j + 1];
    }
  }
}
| static void GetOutputLength(bool *padding_flag, int32_t *output_length, int32_t *output_non_zero_length, int32_t count, | |||
| int32_t non_zero_num) { | |||
| if (count == 0) { | |||
| *padding_flag = false; | |||
| *output_length = non_zero_num; | |||
| *output_non_zero_length = non_zero_num; | |||
| } else if (count > 0 && count <= non_zero_num) { | |||
| *padding_flag = false; | |||
| *output_length = count; | |||
| *output_non_zero_length = count; | |||
| } else if (count > non_zero_num) { | |||
| *padding_flag = true; | |||
| *output_length = count; | |||
| *output_non_zero_length = non_zero_num; | |||
| } else { | |||
| AICPU_LOGI("input count must greater or equal to 0 but instead is %d", count); | |||
| } | |||
| } | |||
| static bool GetInputTotalCount(const std::vector<int64_t> &dims_, int32_t *input_total_count, | |||
| const int32_t &input_dim_size) { | |||
| const int32_t max_inpu_dim = 5; | |||
| if (input_dim_size < 1 || input_dim_size > max_inpu_dim) { | |||
| AICPU_LOGE( | |||
| "input dim size is %d, it must greater or equal to 1 channels " | |||
| "and less than or equal to 5 channels!", | |||
| input_dim_size); | |||
| return false; | |||
| } | |||
| for (int32_t i = 0; i < input_dim_size; i++) { | |||
| *input_total_count *= dims_[i]; | |||
| } | |||
| if (*input_total_count <= 0) { | |||
| AICPU_LOGE("input_total_count is %d, please check setting.", *input_total_count); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
// Finalizes the two outputs: zeroes the coordinate slots past the last real
// (non-zero) entry, copies the per-sample validity flags from mask_dim, and
// clears the padded tail of the mask.
static void UpdateOutput(const std::vector<int64_t> &dims_, const int32_t &non_zero_num, const int32_t &count_,
                         const int32_t &output_length, const int *mask_dim, int32_t *output_coordinate, bool *mask) {
  const int32_t rank = static_cast<int32_t>(dims_.size());
  for (int32_t i = non_zero_num * rank; i < count_ * rank; i++) {
    output_coordinate[i] = 0;
  }
  for (int32_t i = 0; i < output_length; i++) {
    mask[i] = static_cast<bool>(mask_dim[i]);
  }
  for (int32_t i = non_zero_num; i < count_; i++) {
    mask[i] = false;
  }
}
| static bool GenerateRandomMask(const int32_t &output_length, const int32_t &non_zero_num, | |||
| const int32_t &output_non_zero_length, int **input_dim, int **tmp_output, | |||
| int **mask_dim) { | |||
| *tmp_output = reinterpret_cast<int *>(malloc(output_length * sizeof(int))); | |||
| if (*tmp_output == nullptr) { | |||
| AICPU_LOGE("malloc memory failed!"); | |||
| free(*input_dim); | |||
| return false; | |||
| } | |||
| std::random_device rd; | |||
| std::mt19937 gen(rd()); | |||
| aicpu::distinct_uniform_int_distribution<> dis(0, non_zero_num - 1); | |||
| *mask_dim = reinterpret_cast<int *>(malloc(output_length * sizeof(int))); | |||
| if (*mask_dim == nullptr) { | |||
| AICPU_LOGE("malloc memory failed!"); | |||
| free(*input_dim); | |||
| free(*tmp_output); | |||
| return false; | |||
| } | |||
| if (memset_s(*mask_dim, output_length, 0x00, output_length) != EOK) { | |||
| AICPU_LOGE("memset_s to mask_dim failed!"); | |||
| free(*input_dim); | |||
| free(*tmp_output); | |||
| free(*mask_dim); | |||
| return false; | |||
| } | |||
| if (memset_s(*tmp_output, output_length, 0x00, output_length) != EOK) { | |||
| AICPU_LOGE("memset_s to tmp_output failed!"); | |||
| free(*input_dim); | |||
| free(*tmp_output); | |||
| free(*mask_dim); | |||
| return false; | |||
| } | |||
| if (output_non_zero_length > output_length) { | |||
| AICPU_LOGE("output_non_zero_length size is too long!"); | |||
| free(*input_dim); | |||
| free(*tmp_output); | |||
| free(*mask_dim); | |||
| return false; | |||
| } | |||
| for (int32_t i = 0; i < output_non_zero_length; i++) { | |||
| int32_t mean = dis.exec(&gen); | |||
| *((*tmp_output) + i) = *((*input_dim) + mean); | |||
| *((*mask_dim) + i) = 1; | |||
| } | |||
| return true; | |||
| } | |||
// Samples up to count_ coordinates of true elements from the boolean input.
// io_addrs_: [0] input bool tensor, [1] output int32 coordinates, [2] output
// bool validity mask. Uses raw malloc/free for scratch buffers; every error
// path below frees whatever has been allocated so far.
uint32_t RandomChoiceWithMaskKernel::DoCompute() {
  auto *input = reinterpret_cast<bool *>(io_addrs_[0]);
  auto *output_coordinate = reinterpret_cast<int32_t *>(io_addrs_[1]);
  auto *mask = reinterpret_cast<bool *>(io_addrs_[2]);
  int32_t input_dim_size = dims_.size();
  int32_t non_zero_num = 0;
  int32_t input_total_count = 1;
  // Validates rank 1..5 and computes the flat element count.
  bool ret = GetInputTotalCount(dims_, &input_total_count, input_dim_size);
  if (!ret) {
    AICPU_LOGE("Get input total count failed!");
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  int *input_dim = reinterpret_cast<int *>(malloc(input_total_count * sizeof(int)));
  if (input_dim == nullptr) {
    AICPU_LOGE("Malloc memory failed!");
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  // Collect the flat indices of all true input elements.
  for (int32_t i = 0; i < input_total_count; i++) {
    if (input[i] != 0) {
      input_dim[non_zero_num] = i;
      non_zero_num++;
    }
  }
  bool padding_flag = false;
  int32_t output_length = 0;
  int32_t output_non_zero_length = 0;
  GetOutputLength(&padding_flag, &output_length, &output_non_zero_length, count_, non_zero_num);
  int *tmp_output = nullptr;
  int *mask_dim = nullptr;
  // On failure GenerateRandomMask frees input_dim and its own allocations.
  ret = GenerateRandomMask(output_length, non_zero_num, output_non_zero_length, &input_dim, &tmp_output, &mask_dim);
  if (!ret) {
    AICPU_LOGE("Generate random mask failed!");
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  if (padding_flag) {
    // NOTE(review): `i > non_zero_num` runs output_length - 1 - non_zero_num
    // iterations, so slot output_length - 1 is never written here — looks like
    // an off-by-one that relies on the memset in GenerateRandomMask; confirm.
    int32_t index = 0;
    for (int32_t i = output_length - 1; i > non_zero_num; i--) {
      tmp_output[non_zero_num + index] = 0;
      mask_dim[non_zero_num + index] = 0;
      index++;
    }
  }
  int32_t copy_output_length = 0;
  // NOTE(review): the product is evaluated before this guard, so the signed
  // overflow it tries to catch has already occurred when the guard fires.
  if (output_length * input_dim_size >= INT_MAX || output_length * input_dim_size < 0) {
    AICPU_LOGE("Output size exceed INT_MAX");
    free(input_dim);
    free(tmp_output);
    free(mask_dim);
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  copy_output_length = output_length * input_dim_size;
  int *output = reinterpret_cast<int *>(malloc(copy_output_length * sizeof(int)));
  if (output == nullptr) {
    AICPU_LOGE("malloc memory failed!");
    free(input_dim);
    free(tmp_output);
    free(mask_dim);
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  // NOTE(review): memset_s sizes are element counts, not byte counts — only a
  // quarter of the buffer is zeroed; confirm whether this is a latent bug.
  if (memset_s(output, copy_output_length, 0x00, copy_output_length) != EOK) {
    AICPU_LOGE("memset_s memory failed!");
    free(input_dim);
    free(mask_dim);
    free(tmp_output);
    free(output);
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  // Expand each sampled flat index into an N-dimensional coordinate.
  ParseOutputCoordinate(dims_, output_length, input_dim_size, input_total_count, tmp_output, output);
  int32_t actual_output_length = count_ * dims_.size();
  copy_output_length = std::min(actual_output_length, copy_output_length);
  int32_t copy_output_bytes = 0;
  if (INT_MAX / static_cast<int>(sizeof(int32_t)) < copy_output_length) {
    AICPU_LOGE("The output length is out of range!");
    free(input_dim);
    free(mask_dim);
    free(tmp_output);
    free(output);
    return AICPU_KERNEL_STATE_INTERNAL_ERROR;
  }
  copy_output_bytes = copy_output_length * sizeof(int32_t);
  // NOTE(review): memcpy_s return value is ignored here, unlike the memset_s
  // calls above — confirm whether failure should abort.
  memcpy_s(output_coordinate, copy_output_bytes, output, copy_output_bytes);
  UpdateOutput(dims_, non_zero_num, count_, output_length, mask_dim, output_coordinate, mask);
  AICPU_LOGI("no zero num is %d, output_length is %d ", non_zero_num, output_length);
  // Report actual dims when running with dynamic shapes.
  UpdateOutputShapeValue(non_zero_num, output_length);
  free(input_dim);
  free(mask_dim);
  free(tmp_output);
  free(output);
  return AICPU_KERNEL_STATE_SUCCESS;
}
| void RandomChoiceWithMaskKernel::UpdateOutputShapeValue(int32_t non_zero_num, int32_t output_length) { | |||
| if (unknow_shape_) { | |||
| output_shape_and_type_[0]->dims[0] = non_zero_num; | |||
| output_shape_and_type_[1]->dims[0] = output_length; | |||
| } | |||
| } | |||
| uint32_t RandomChoiceWithMaskKernel::ParseKernelParam() { | |||
| ::google::protobuf::Map<::std::string, ::aicpuops::AttrValue> nodedef_map = node_def_.attrs(); | |||
| aicpuops::AttrValue random_choice_count_attrs = nodedef_map["count"]; | |||
| count_ = random_choice_count_attrs.i(); | |||
| AICPU_LOGI("This op attr count is %d", count_); | |||
| if ((count_ == 0) && (!unknow_shape_)) { | |||
| AICPU_LOGE("This op attr count is 0, but the shapetype is %d", unknow_shape_); | |||
| return AICPU_KERNEL_STATE_PARAM_INVALID; | |||
| } | |||
| size_t inputs_size = node_def_.inputs_size(); | |||
| for (size_t i = 0; i < inputs_size; i++) { | |||
| aicpuops::Tensor input_tensor = node_def_.inputs(i); | |||
| aicpuops::TensorShape input_shape = input_tensor.tensor_shape(); | |||
| for (int j = 0; j < input_shape.dim_size(); j++) { | |||
| dims_.push_back(input_shape.dim(j).size()); | |||
| } | |||
| } | |||
| return AICPU_KERNEL_STATE_SUCCESS; | |||
| } | |||
| } // namespace aicpu | |||
| extern "C" { | |||
| __attribute__((visibility("default"))) uint32_t RandomChoiceWithMask(void *param) { | |||
| aicpu::RandomChoiceWithMaskKernel randomChoiceWithMaskKernel; | |||
| return randomChoiceWithMaskKernel.Compute(param); | |||
| } | |||
| } | |||
| @@ -0,0 +1,36 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef AICPU_OPS_AICPU_RANDOM_CHOICE_WITH_MASK_KERNELS_H_ | |||
| #define AICPU_OPS_AICPU_RANDOM_CHOICE_WITH_MASK_KERNELS_H_ | |||
| #include <vector> | |||
| #include "common/kernel_base.h" | |||
| namespace aicpu { | |||
// AICPU kernel for the RandomChoiceWithMask op: samples coordinates of true
// elements from a boolean input tensor plus a validity mask (see DoCompute
// in the .cc file).
class RandomChoiceWithMaskKernel : public KernelBase {
 public:
  RandomChoiceWithMaskKernel() : KernelBase("RandomChoiceWithMask") {}
  ~RandomChoiceWithMaskKernel() = default;
 protected:
  // `count` attribute from the NodeDef; 0 means "all non-zero elements"
  // (valid only with dynamic shapes).
  int32_t count_ = 0;
  // Flattened input tensor dimensions, filled by ParseKernelParam.
  std::vector<int64_t> dims_;
  uint32_t DoCompute() override;
  uint32_t ParseKernelParam() override;
  // Publishes actual output dims when running with unknown (dynamic) shapes.
  void UpdateOutputShapeValue(int32_t non_zero_num, int32_t output_length);
};
| } // namespace aicpu | |||
| #endif // AICPU_OPS_AICPU_RANDOM_CHOICE_WITH_MASK_KERNELS_H_ | |||