Merge pull request !28310 from TuDouNi/dynamic_shape_stage1tags/v1.6.0
| @@ -1,5 +1,6 @@ | |||
| file(GLOB_RECURSE KERNEL_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "kernel_build_info.cc" | |||
| "kernel.cc" | |||
| "kash/*.cc" | |||
| "common_utils.cc" | |||
| "oplib/*.cc" | |||
| @@ -12,6 +13,7 @@ endif() | |||
| if(ENABLE_D) | |||
| file(GLOB_RECURSE D_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "ascend_kernel_mod.cc" | |||
| "kernel_query.cc" | |||
| "tbe/*.cc" | |||
| "host/*.cc" | |||
| @@ -36,13 +36,12 @@ using HostDynamicKernel = mindspore::device::ascend::HostDynamicKernel; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| AicpuOpKernelMod::AicpuOpKernelMod() : anf_node_(nullptr) {} | |||
| AicpuOpKernelMod::AicpuOpKernelMod() {} | |||
| AicpuOpKernelMod::~AicpuOpKernelMod() { | |||
| args_.clear(); | |||
| inputList_.clear(); | |||
| outputList_.clear(); | |||
| anf_node_ = nullptr; | |||
| input_list_.clear(); | |||
| output_list_.clear(); | |||
| input_size_list_.clear(); | |||
| output_size_list_.clear(); | |||
| workspace_size_list_.clear(); | |||
| @@ -55,9 +54,9 @@ void AicpuOpKernelMod::SetOutputSizeList(const std::vector<size_t> &size_list) { | |||
| const std::vector<size_t> &AicpuOpKernelMod::GetOutputSizeList() const { return output_size_list_; } | |||
| void AicpuOpKernelMod::SetWorkspaceSizeList(const std::vector<size_t> &size_list) { workspace_size_list_ = size_list; } | |||
| const std::vector<size_t> &AicpuOpKernelMod::GetWorkspaceSizeList() const { return workspace_size_list_; } | |||
| void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &inputList) { inputList_ = inputList; } | |||
| void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &outputList) { outputList_ = outputList; } | |||
| void AicpuOpKernelMod::SetNodeDef(const std::string &nodeDef) { (void)node_def_str_.assign(nodeDef); } | |||
| void AicpuOpKernelMod::SetInputList(const std::vector<int64_t> &input_list) { input_list_ = input_list; } | |||
| void AicpuOpKernelMod::SetOutputList(const std::vector<int64_t> &output_list) { output_list_ = output_list; } | |||
| void AicpuOpKernelMod::SetNodeDef(const std::string &node_def) { (void)node_def_str_.assign(node_def); } | |||
| void AicpuOpKernelMod::SetExtInfo(const std::string &ext_info) { ext_info_ = ext_info; } | |||
| void AicpuOpKernelMod::SetNodeName(const std::string &node_name) { node_name_ = node_name; } | |||
| void AicpuOpKernelMod::SetCustSo(const std::string &cust_so) { | |||
| @@ -85,11 +84,18 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs | |||
| node_so_ = kLibAicpuKernelSoName; | |||
| } | |||
| } | |||
| } else { | |||
| if (kCpuKernelBaseOps.find(node_name_) == kCpuKernelBaseOps.end()) { | |||
| node_name_ = kCpuRunApi; | |||
| } | |||
| } else if (kCpuKernelBaseOps.find(node_name_) == kCpuKernelBaseOps.end()) { | |||
| node_name_ = kCpuRunApi; | |||
| } | |||
| if (node_name_ == kTopK) { | |||
| node_name_ = kTopKV2; | |||
| } | |||
| if (node_name_ == kStack) { | |||
| node_name_ = kPack; | |||
| } | |||
| // InputOutputAddr | |||
| vector<void *> io_addrs; | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(io_addrs), | |||
| @@ -120,6 +126,8 @@ void AicpuOpKernelMod::CreateCpuKernelInfo(const std::vector<AddressPtr> &inputs | |||
| aicpu_param_head.extInfoAddr = 0; | |||
| } else { | |||
| MS_LOG(INFO) << "Dynamic Kernel Ext Info size:" << ext_info_.size(); | |||
| aicpu_param_head.extInfoLength = SizeToUint(ext_info_.size()); | |||
| aicpu_param_head.extInfoAddr = reinterpret_cast<uint64_t>(ext_info_addr_dev_); | |||
| } | |||
| args_.clear(); | |||
| @@ -162,6 +170,8 @@ bool AicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std:: | |||
| } | |||
| MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ | |||
| << ", args_size:" << args_.length(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| if (rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()), | |||
| reinterpret_cast<const void *>(node_name_.c_str()), 1, | |||
| reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), | |||
| @@ -25,6 +25,8 @@ namespace kernel { | |||
| class AicpuOpKernelMod : public AscendKernelMod { | |||
| public: | |||
| AicpuOpKernelMod(); | |||
| explicit AicpuOpKernelMod(const AnfNodePtr &anf_node_ptr) : AscendKernelMod(anf_node_ptr) {} | |||
| ~AicpuOpKernelMod() override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| @@ -33,10 +35,10 @@ class AicpuOpKernelMod : public AscendKernelMod { | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) override; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| void SetInputList(const std::vector<int64_t> &inputList); | |||
| void SetOutputList(const std::vector<int64_t> &outputList); | |||
| void SetInputList(const std::vector<int64_t> &input_list); | |||
| void SetOutputList(const std::vector<int64_t> &output_list); | |||
| void SetAnfNode(const AnfNodePtr &anf_node); | |||
| void SetNodeDef(const std::string &nodeDef); | |||
| void SetNodeDef(const std::string &node_def); | |||
| void SetExtInfo(const std::string &ext_info); | |||
| void SetNodeName(const std::string &node_name); | |||
| void SetCustSo(const std::string &cust_so); | |||
| @@ -56,16 +58,18 @@ class AicpuOpKernelMod : public AscendKernelMod { | |||
| const std::vector<size_t> &GetOutputSizeList() const override; | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override; | |||
| private: | |||
| bool cust_kernel_{false}; | |||
| protected: | |||
| std::string args_; | |||
| std::string node_def_str_; | |||
| std::string ext_info_; | |||
| std::string node_name_; | |||
| std::string node_so_; | |||
| std::string ext_info_; | |||
| std::vector<int64_t> inputList_; | |||
| std::vector<int64_t> outputList_; | |||
| AnfNodePtr anf_node_; | |||
| bool cust_kernel_{false}; | |||
| std::string node_def_str_; | |||
| void *ext_info_addr_dev_ = nullptr; | |||
| private: | |||
| std::vector<int64_t> input_list_; | |||
| std::vector<int64_t> output_list_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -0,0 +1,231 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/aicpu/dynamic_aicpu_kernel_mod.h" | |||
| #include <memory> | |||
| #include <vector> | |||
| #include <string> | |||
| #include <algorithm> | |||
| #include "runtime/mem.h" | |||
| #include "acl/acl_rt.h" | |||
| #include "utils/convert_utils.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_util.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/kernel_runtime.h" | |||
| #include "runtime/kernel.h" | |||
| #include "utils/utils.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| DynamicAicpuOpKernelMod::DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr) : AicpuOpKernelMod(anf_node_ptr) { | |||
| unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_IN_SHAPE; | |||
| auto cnode = anf_node_ptr->cast<CNodePtr>(); | |||
| if (cnode != nullptr) { | |||
| auto op_name = AnfAlgo::GetCNodeName(cnode); | |||
| if (kComputeDepend.find(op_name) != kComputeDepend.end()) { | |||
| unknow_type_ = device::ascend::UnknowShapeOpType::DEPEND_COMPUTE; | |||
| } | |||
| } | |||
| } | |||
| DynamicAicpuOpKernelMod::~DynamicAicpuOpKernelMod() { | |||
| // free dev ptr | |||
| if (ext_info_addr_dev_ == nullptr) { | |||
| return; | |||
| } | |||
| auto ret = rtFree(ext_info_addr_dev_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "rtFree failed"; | |||
| } | |||
| } | |||
| void DynamicAicpuOpKernelMod::InferOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!AnfAlgo::IsDynamicShape(node)) { | |||
| MS_LOG(EXCEPTION) << "The node is not dynamic shape."; | |||
| } | |||
| KernelMod::InferShape(); | |||
| } | |||
| void DynamicAicpuOpKernelMod::InitOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope(); | |||
| } | |||
| MS_LOG(INFO) << "UpdateExtInfo of " << cnode->fullname_with_scope() << " start"; | |||
| auto input_num = AnfAlgo::GetInputTensorNum(cnode); | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(cnode); | |||
| if (input_num == 0 && output_num == 0) { | |||
| MS_LOG(INFO) << "Node:" << cnode->fullname_with_scope() << " no need to update output shape"; | |||
| return; | |||
| } | |||
| // Parse aicpu ext info | |||
| ext_info_handler_ = std::make_shared<device::ascend::AicpuExtInfoHandler>( | |||
| cnode->fullname_with_scope(), static_cast<uint32_t>(input_num), static_cast<uint32_t>(output_num), unknow_type_); | |||
| MS_EXCEPTION_IF_NULL(ext_info_handler_); | |||
| if (!ext_info_handler_->Parse(ext_info_)) { | |||
| MS_LOG(EXCEPTION) << "Parse AiCpu ext_info_handler failed"; | |||
| } | |||
| if (ext_info_.empty()) { | |||
| MS_LOG(INFO) << "No need to copy to device, ext_info_ is empty. "; | |||
| return; | |||
| } | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| if (!ext_info_handler_->UpdateInputShapeAndType(i, NOT_NULL(cnode))) { | |||
| MS_LOG(EXCEPTION) << "Update input shape failed, cnode:" << cnode->fullname_with_scope() << " input:" << i; | |||
| } | |||
| } | |||
| if (unknow_type_ != device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) { | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| if (!ext_info_handler_->UpdateOutputShapeAndType(i, NOT_NULL(cnode))) { | |||
| MS_LOG(EXCEPTION) << "Update output shape failed, cnode:" << cnode->fullname_with_scope() << " output:" << i; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void DynamicAicpuOpKernelMod::AllocateExtInfoDeviceAddr(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (ext_info_addr_dev_ != nullptr) { | |||
| return; | |||
| } | |||
| // Allocate ext info addr in device | |||
| if (ext_info_.size() != 0) { | |||
| auto ret = rtMalloc(&ext_info_addr_dev_, ext_info_.size(), RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call rtMalloc ext_info_addr_dev_ failed. Op name: " << cnode->fullname_with_scope(); | |||
| } | |||
| } | |||
| ext_info_size_ = ext_info_.size(); | |||
| } | |||
| bool DynamicAicpuOpKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (stream_ == nullptr) { | |||
| stream_ = stream_ptr; | |||
| } | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "Start launch of node: " << cnode->fullname_with_scope(); | |||
| // is dynamic shape | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope(); | |||
| } | |||
| // copy extinfo to device | |||
| AllocateExtInfoDeviceAddr(cnode); | |||
| MS_EXCEPTION_IF_NULL(ext_info_handler_); | |||
| auto ret = aclrtMemcpy(ext_info_addr_dev_, ext_info_size_, ext_info_handler_->GetExtInfo(), | |||
| ext_info_handler_->GetExtInfoLen(), ACL_MEMCPY_HOST_TO_DEVICE); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "UpdateExtInfo aclrtMemcpy failed. Node info: " << cnode->fullname_with_scope(); | |||
| return false; | |||
| } | |||
| AicpuOpKernelMod::CreateCpuKernelInfo(inputs, outputs); | |||
| MS_LOG(INFO) << "Aicpu launch, node_so_:" << node_so_ << ", node name:" << node_name_ | |||
| << ", args_size:" << args_.length(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(node_so_.c_str()), | |||
| reinterpret_cast<const void *>(node_name_.c_str()), 1, | |||
| reinterpret_cast<const void *>(args_.data()), static_cast<uint32_t>(args_.length()), | |||
| nullptr, stream_, RT_KERNEL_DEFAULT); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Aicpu op launch failed!"; | |||
| return false; | |||
| } | |||
| if (unknow_type_ == device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) { | |||
| ret = aclrtMemcpyAsync(ext_info_handler_->GetExtInfo(), ext_info_handler_->GetExtInfoLen(), ext_info_addr_dev_, | |||
| ext_info_size_, ACL_MEMCPY_DEVICE_TO_HOST, stream_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "aclrtMemcpyAsync output shape failed. Op name: " << cnode->fullname_with_scope(); | |||
| return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
| void DynamicAicpuOpKernelMod::UpdateOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "Aicpu " << cnode->fullname_with_scope() << " PostExecute"; | |||
| // is dynamic shape | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope(); | |||
| } | |||
| if (unknow_type_ != device::ascend::UnknowShapeOpType::DEPEND_COMPUTE) { | |||
| MS_LOG(INFO) << "Node " << node->fullname_with_scope() << " update op skip."; | |||
| return; | |||
| } | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = rtStreamSynchronize(stream_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "Call runtime rtStreamSynchronize failed. Op name: " << cnode->fullname_with_scope(); | |||
| } | |||
| MS_LOG(INFO) << "Update aicpu kernel output shape from ext_info. Op name: " << cnode->fullname_with_scope(); | |||
| UpdateOutputShapeFromExtInfo(cnode); | |||
| } | |||
| bool DynamicAicpuOpKernelMod::UpdateOutputShapeFromExtInfo(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "UpdateOutputShapeFromExtInfo start. Op name " << cnode->fullname_with_scope(); | |||
| MS_EXCEPTION_IF_NULL(ext_info_handler_); | |||
| std::vector<TypeId> type_ids; | |||
| std::vector<std::vector<size_t>> shapes; | |||
| auto output_num = AnfAlgo::GetOutputTensorNum(cnode); | |||
| for (size_t i = 0; i < output_num; ++i) { | |||
| MS_LOG(INFO) << "Get output:" << output_num << " Shape"; | |||
| std::vector<int64_t> shape; | |||
| TypeId type_id; | |||
| (void)ext_info_handler_->GetOutputShapeAndType(SizeToUint(i), NOT_NULL(&shape), NOT_NULL(&type_id)); | |||
| type_ids.emplace_back(type_id); | |||
| std::vector<size_t> size_t_shape; | |||
| std::transform(shape.begin(), shape.end(), std::back_inserter(size_t_shape), LongToSize); | |||
| shapes.emplace_back(size_t_shape); | |||
| } | |||
| AnfAlgo::SetOutputInferTypeAndShape(type_ids, shapes, cnode.get()); | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,54 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/aicpu/aicpu_kernel_mod.h" | |||
| #include "backend/kernel_compiler/aicpu/aicpu_util.h" | |||
| #include "runtime/device/ascend/executor/aicpu_ext_info_handle.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class DynamicAicpuOpKernelMod : public AicpuOpKernelMod { | |||
| public: | |||
| DynamicAicpuOpKernelMod() : unknow_type_(device::ascend::UnknowShapeOpType::DEPEND_IN_SHAPE) {} | |||
| explicit DynamicAicpuOpKernelMod(const AnfNodePtr &anf_node_ptr); | |||
| ~DynamicAicpuOpKernelMod() override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| void InferOp() override; | |||
| void InitOp() override; | |||
| void UpdateOp() override; | |||
| private: | |||
| void AllocateExtInfoDeviceAddr(const CNodePtr &cnode); | |||
| bool UpdateOutputShapeFromExtInfo(const CNodePtr &cnode); | |||
| std::shared_ptr<device::ascend::AicpuExtInfoHandler> ext_info_handler_ = nullptr; | |||
| size_t ext_info_size_ = 0; | |||
| device::ascend::UnknowShapeOpType unknow_type_; | |||
| }; | |||
| using DynamicAicpuOpKernelModPtr = std::shared_ptr<DynamicAicpuOpKernelMod>; | |||
| using DynamicAicputOpKernelModPtrList = std::vector<DynamicAicpuOpKernelModPtr>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_AICPU_DYNAMIC_AICPU_KERNEL_MOD_H_ | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/ascend_kernel_mod.h" | |||
| #include "runtime/rt.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| void AscendKernelMod::UpdateOp() { | |||
| MS_EXCEPTION_IF_NULL(stream_); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = LockRuntime(); | |||
| if (RT_ERROR_NONE != rtStreamSynchronize(stream_)) { | |||
| MS_LOG(EXCEPTION) << "Call runtime rtStreamSynchronize failed."; | |||
| } | |||
| } | |||
| std::lock_guard<std::mutex> AscendKernelMod::LockRuntime() { | |||
| static std::mutex mutex; | |||
| return std::lock_guard<std::mutex>(mutex); | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -31,6 +31,8 @@ namespace mindspore { | |||
| namespace kernel { | |||
| class AscendKernelMod : public KernelMod { | |||
| public: | |||
| AscendKernelMod() {} | |||
| explicit AscendKernelMod(const AnfNodePtr &anf_node_ptr) : KernelMod(anf_node_ptr) {} | |||
| virtual std::vector<TaskInfoPtr> GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, uint32_t) = 0; | |||
| uint32_t block_dim() { return block_dim_; } | |||
| @@ -44,6 +46,7 @@ class AscendKernelMod : public KernelMod { | |||
| return false; | |||
| #endif | |||
| } | |||
| void UpdateOp() override; | |||
| void InitDynamicKernel(const CNodePtr &cnode_ptr, void *stream) { | |||
| if (dynamic_kernel_ == nullptr) { | |||
| @@ -54,6 +57,8 @@ class AscendKernelMod : public KernelMod { | |||
| } | |||
| device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; } | |||
| static std::lock_guard<std::mutex> LockRuntime(); | |||
| protected: | |||
| uint32_t block_dim_{1}; | |||
| uint32_t stream_id_{0}; | |||
| @@ -66,7 +66,13 @@ HcclKernelFactory &HcclKernelFactory::Get() { | |||
| HcclKernel::HcclKernel() | |||
| : hccl_count_(0), op_type_(::HcclReduceOp::HCCL_REDUCE_SUM), root_id_(0), src_rank_(0), dest_rank_(0) {} | |||
| HcclKernel::HcclKernel(const AnfNodePtr &anf_node) | |||
| : AscendKernelMod(), | |||
| hccl_count_(0), | |||
| op_type_(::HcclReduceOp::HCCL_REDUCE_SUM), | |||
| root_id_(0), | |||
| src_rank_(0), | |||
| dest_rank_(0) {} | |||
| HcclKernel::~HcclKernel() { | |||
| hccl_kernel_input_shape_list_.clear(); | |||
| hccl_kernel_output_shape_list_.clear(); | |||
| @@ -294,5 +300,99 @@ device::DynamicKernelPtr HcclKernel::GenDynamicKernel(const CNodePtr &cnode_ptr, | |||
| hccl_type, input_data_addr, output_data_addr, hccl_count_, data_type, op_type_, root_id_, stream_ptr, cnode_ptr); | |||
| return executor; | |||
| } | |||
| void HcclKernel::InferOp() { | |||
| if (AnfAlgo::IsDynamicShape(anf_node_.lock())) { | |||
| KernelMod::InferShape(); | |||
| } | |||
| } | |||
| bool HcclKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (inputs.empty() && outputs.empty()) { | |||
| MS_LOG(ERROR) << "Hccl kernel input or output is empty"; | |||
| return false; | |||
| } | |||
| if (hccl_data_type_list_.empty()) { | |||
| MS_LOG(ERROR) << "Hccl data type list is empty"; | |||
| return false; | |||
| } | |||
| MS_EXCEPTION_IF_NULL(stream_ptr); | |||
| MS_LOG(INFO) << "Start Execute: " << cnode->DebugString(); | |||
| std::string hccl_type = MsOpNameToHcomOpType(AnfAlgo::GetCNodeName(anf_node_.lock())); | |||
| HcclDataType data_type = hccl_data_type_list_[0]; | |||
| ::HcomOperation op_info; | |||
| op_info.hcclType = hccl_type; | |||
| op_info.inputPtr = inputs[0]->addr; | |||
| op_info.outputPtr = outputs[0]->addr; | |||
| op_info.dataType = static_cast<HcclDataType>(data_type); | |||
| op_info.opType = static_cast<HcclReduceOp>(op_type_); | |||
| op_info.root = IntToUint(root_id_); | |||
| op_info.count = hccl_count_; | |||
| auto callback = [this](HcclResult status) { | |||
| if (status != HCCL_SUCCESS) { | |||
| MS_LOG(ERROR) << "HcomExcutorInitialize failed, ret:" << status; | |||
| } | |||
| std::lock_guard<std::mutex> lock(this->hccl_mutex_); | |||
| this->cond_.notify_all(); | |||
| MS_LOG(INFO) << "hccl callback success."; | |||
| }; | |||
| auto hccl_ret = hccl::HcclAdapter::GetInstance().HcclExecEnqueueOp(op_info, callback); | |||
| if (hccl_ret != HCCL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Call EnqueueHcomOperation failed, node info: " << cnode->DebugString(); | |||
| return false; | |||
| } | |||
| std::unique_lock<std::mutex> ulock(hccl_mutex_); | |||
| cond_.wait(ulock); | |||
| MS_LOG(INFO) << "Execute " << cnode->DebugString() << " success"; | |||
| return true; | |||
| } | |||
| void HcclKernel::InitOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(DEBUG) << "The node is not dynamic shape: " << cnode->fullname_with_scope(); | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Start to InitOp. Node info: " << cnode->DebugString(); | |||
| std::vector<std::vector<size_t>> hccl_kernel_input_shape_list; | |||
| if (!HcomUtil::GetKernelInputShape(cnode, &hccl_kernel_input_shape_list)) { | |||
| MS_LOG(EXCEPTION) << "GetKernelInputShape fail! Node info: " << cnode->DebugString(); | |||
| } | |||
| std::vector<HcclDataType> hccl_data_type_list; | |||
| if (!HcomUtil::GetHcomDataType(cnode, &hccl_data_type_list)) { | |||
| MS_LOG(EXCEPTION) << "GetHcomDataType fail! Node info: " << cnode->DebugString(); | |||
| } | |||
| // Update Hccl count | |||
| if (!HcomUtil::GetHcomCount(cnode, hccl_data_type_list, hccl_kernel_input_shape_list, &hccl_count_)) { | |||
| MS_LOG(EXCEPTION) << "GetHcomCount fail! Node info: " << cnode->DebugString(); | |||
| } | |||
| MS_LOG(INFO) << "Update Hccl count:" << hccl_count_; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -34,6 +34,7 @@ namespace kernel { | |||
| class HcclKernel : public AscendKernelMod { | |||
| public: | |||
| HcclKernel(); | |||
| explicit HcclKernel(const AnfNodePtr &anf_node); | |||
| ~HcclKernel() override; | |||
| virtual bool Init(const AnfNodePtr &anf_node); | |||
| const std::vector<size_t> &GetInputSizeList() const override; | |||
| @@ -43,6 +44,12 @@ class HcclKernel : public AscendKernelMod { | |||
| const std::vector<AddressPtr> &outputs, uint32_t stream_id) override; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| void InferOp() override; | |||
| void InitOp() override; | |||
| protected: | |||
| std::vector<std::vector<size_t>> hccl_kernel_input_shape_list_; | |||
| std::vector<std::vector<size_t>> hccl_kernel_output_shape_list_; | |||
| @@ -56,9 +63,10 @@ class HcclKernel : public AscendKernelMod { | |||
| mutable std::vector<size_t> input_size_list_; | |||
| mutable std::vector<size_t> output_size_list_; | |||
| mutable std::vector<size_t> workspace_size_list_; | |||
| AnfNodeWeakPtr anf_node_; | |||
| std::string op_name_; | |||
| std::string group_; | |||
| std::mutex hccl_mutex_; | |||
| std::condition_variable cond_; | |||
| }; | |||
| using HcclKernelCreater = std::function<std::shared_ptr<HcclKernel>()>; | |||
| @@ -16,6 +16,7 @@ | |||
| #include "backend/kernel_compiler/host/dynamic_broadcast_gradient_args_kernel.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/device/ascend/ascend_kernel_runtime.h" | |||
| #include "utils/trace_base.h" | |||
| namespace mindspore { | |||
| @@ -195,6 +196,15 @@ void DynamicBroadcastGradientArgsKernel::Execute() { | |||
| input_shapes[1] = GetInputShape(cnode, 1); | |||
| auto grad_reduce_idx = CalculateOutput(input_shapes); | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime(); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = runtime_instance->SyncStream(); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Sync stream error!"; | |||
| } | |||
| auto r0_size = SetOutputValue(cnode, grad_reduce_idx, 0, input_shapes[0].size()); | |||
| auto r1_size = SetOutputValue(cnode, grad_reduce_idx, 1, input_shapes[1].size()); | |||
| @@ -209,5 +219,26 @@ device::DynamicKernelPtr DynamicBroadcastGradientArgsKernelMod::GenDynamicKernel | |||
| void *stream_ptr) { | |||
| return std::make_shared<DynamicBroadcastGradientArgsKernel>(stream_ptr, cnode_ptr); | |||
| } | |||
| bool DynamicBroadcastGradientArgsKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| stream_ = stream_ptr; | |||
| auto broadcast_grad_kernel = std::make_shared<DynamicBroadcastGradientArgsKernel>(stream_ptr, cnode); | |||
| try { | |||
| broadcast_grad_kernel->Execute(); | |||
| } catch (const std::exception &e) { | |||
| MS_LOG(ERROR) << "DynamicBroadcastGradientArgsKernel Launch failed. node: " << cnode->fullname_with_scope() | |||
| << ", Error message is " << e.what(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -36,6 +36,8 @@ class DynamicBroadcastGradientArgsKernelMod : public HostKernelMod { | |||
| DynamicBroadcastGradientArgsKernelMod() = default; | |||
| ~DynamicBroadcastGradientArgsKernelMod() override = default; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| }; | |||
| MS_HOST_REG_KERNEL(DynamicBroadcastGradientArgs, DynamicBroadcastGradientArgsKernelMod); | |||
| } // namespace kernel | |||
| @@ -114,5 +114,26 @@ void DynamicReshapeKernel::Execute() { | |||
| device::DynamicKernelPtr DynamicReshapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) { | |||
| return std::make_shared<DynamicReshapeKernel>(stream_ptr, cnode_ptr); | |||
| } | |||
| bool DynamicReshapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| stream_ = stream_ptr; | |||
| auto reshape_kernel = std::make_shared<DynamicReshapeKernel>(stream_ptr, cnode); | |||
| try { | |||
| reshape_kernel->Execute(); | |||
| } catch (const std::exception &e) { | |||
| MS_LOG(ERROR) << "DynamicReshapeKernel Launch failed. node: " << cnode->fullname_with_scope() | |||
| << ", Error message is " << e.what(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -35,6 +35,9 @@ class DynamicReshapeKernelMod : public HostKernelMod { | |||
| DynamicReshapeKernelMod() = default; | |||
| ~DynamicReshapeKernelMod() override = default; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| void UpdateOp() override { AscendKernelMod::UpdateOp(); } | |||
| }; | |||
| MS_HOST_REG_KERNEL(DynamicReshape, DynamicReshapeKernelMod); | |||
| } // namespace kernel | |||
| @@ -57,6 +57,8 @@ void DynamicShapeKernel::Execute() { | |||
| } else { | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime(); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = runtime_instance->SyncStream(); | |||
| if (!ret) { | |||
| MS_LOG(EXCEPTION) << "Sync stream error!"; | |||
| @@ -106,5 +108,23 @@ void DynamicShapeKernel::Execute(const std::vector<AddressPtr> &inputs, const st | |||
// Creates the host-side dynamic kernel object that computes the shape output for this node.
device::DynamicKernelPtr DynamicShapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) {
  return std::make_shared<DynamicShapeKernel>(stream_ptr, cnode_ptr);
}
| bool DynamicShapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, void *stream_ptr) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| stream_ = stream_ptr; | |||
| auto shape_kernel = std::make_shared<DynamicShapeKernel>(stream_ptr, cnode); | |||
| try { | |||
| shape_kernel->Execute(); | |||
| } catch (const std::exception &e) { | |||
| MS_LOG(ERROR) << "DynamicShapeKernelMod Launch failed. node: " << cnode->fullname_with_scope() | |||
| << ", Error message is " << e.what(); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -38,18 +38,7 @@ class DynamicShapeKernelMod : public HostKernelMod { | |||
| ~DynamicShapeKernelMod() override = default; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override { | |||
| if (kernel_ == nullptr) { | |||
| kernel_ = | |||
| std::dynamic_pointer_cast<DynamicShapeKernel>(GenDynamicKernel(anf_node_->cast<CNodePtr>(), stream_ptr)); | |||
| kernel_->Initialize(); | |||
| } | |||
| kernel_->Execute(inputs, outputs); | |||
| return true; | |||
| } | |||
| private: | |||
| std::shared_ptr<DynamicShapeKernel> kernel_; | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) override; | |||
| }; | |||
| MS_HOST_REG_KERNEL(DynamicShape, DynamicShapeKernelMod); | |||
| } // namespace kernel | |||
| @@ -77,6 +77,16 @@ bool HostKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<Ad | |||
| const std::vector<AddressPtr> &, void *) { | |||
| return true; | |||
| } | |||
// Re-infers the bound node's output shape/type before launch.
// Host kernel mods are only meaningful for dynamic-shape nodes, so a
// static-shape node reaching here indicates a scheduling error.
void HostKernelMod::InferOp() {
  auto node = anf_node_.lock();
  MS_EXCEPTION_IF_NULL(node);
  if (!AnfAlgo::IsDynamicShape(node)) {
    MS_LOG(EXCEPTION) << "The node is not dynamic shape.";
  }
  KernelMod::InferShape();
}
| std::vector<TaskInfoPtr> HostKernelMod::GenTask(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &, | |||
| const std::vector<AddressPtr> &, uint32_t) { | |||
| return {}; | |||
| @@ -36,9 +36,10 @@ class HostKernelMod : public AscendKernelMod { | |||
| const std::vector<AddressPtr> &, uint32_t) override; | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override = 0; | |||
| bool Init(const AnfNodePtr &anf_node); | |||
| void InferOp() override; | |||
| void UpdateOp() override {} | |||
| protected: | |||
| AnfNodePtr anf_node_; | |||
| std::string op_name_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -0,0 +1,184 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/kernel.h" | |||
| #include <algorithm> | |||
| #include <stack> | |||
| #include <utility> | |||
| #include "utils/ms_context.h" | |||
| #include "utils/anf_utils.h" | |||
| #include "utils/ms_device_shape_transfer.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| constexpr int64_t kInvalidShape = -2; | |||
| void KernelMod::InferShape() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| MS_LOG(INFO) << "InferShape start, node:" << cnode->fullname_with_scope(); | |||
| GetDepndLists(cnode); | |||
| auto ret = InferShapeForDefiniteOutputNode(cnode); | |||
| if (ret) { | |||
| return; | |||
| } | |||
| depend_tensor_map_.clear(); | |||
| auto inputs = cnode->inputs(); | |||
| if (inputs.empty()) { | |||
| MS_LOG(EXCEPTION) << "Invalid inputs"; | |||
| } | |||
| auto context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| AbstractBasePtrList args_spec_list; | |||
| auto primitive = GetValueNode<PrimitivePtr>(inputs[0]); | |||
| auto input_size = AnfAlgo::GetInputTensorNum(cnode); | |||
| std::vector<AnfNodePtr> input_nodes; | |||
| for (size_t i = 0; i < input_size; i++) { | |||
| auto input_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode, i); | |||
| auto real_input = input_node_with_index.first; | |||
| MS_EXCEPTION_IF_NULL(real_input); | |||
| auto cnode_input = cnode->input(i + 1); | |||
| MS_EXCEPTION_IF_NULL(cnode_input); | |||
| InferShapeForNopNode(&real_input); | |||
| if (depend_list_.find(i) != depend_list_.end()) { | |||
| auto pre_node_with_index = AnfAlgo::GetPrevNodeOutput(cnode, i); | |||
| bool skip_nop_node = !context->get_param<bool>(MS_CTX_ENABLE_MINDRT); | |||
| auto output_addr = AnfAlgo::GetPrevNodeMutableOutputAddr(cnode, i, skip_nop_node); | |||
| std::vector<int64_t> shapes = | |||
| trans::GetRuntimePaddingShape(pre_node_with_index.first, pre_node_with_index.second); | |||
| auto host_type = AnfAlgo::GetOutputInferDataType(pre_node_with_index.first, pre_node_with_index.second); | |||
| auto out_tensor = std::make_shared<tensor::Tensor>(host_type, shapes); | |||
| MS_EXCEPTION_IF_NULL(out_tensor); | |||
| // The second parameter must be false, otherwise the device address cannot be released and allocated, and the | |||
| // address size will be wrong in the dynamic shape scenario. | |||
| out_tensor->set_device_address(output_addr, false); | |||
| auto ret2 = depend_tensor_map_.try_emplace(i, out_tensor); | |||
| if (!ret2.second) { | |||
| MS_LOG(EXCEPTION) << "Insert map failed"; | |||
| } | |||
| out_tensor->data_sync(); | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| MS_EXCEPTION_IF_NULL(real_input->abstract()); | |||
| auto real_abs = real_input->abstract()->Clone(); | |||
| if (real_abs->isa<abstract::AbstractTensor>()) { | |||
| real_abs->set_value(out_tensor); | |||
| } else if (real_abs->isa<abstract::AbstractTuple>()) { | |||
| auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>()); | |||
| auto abstract_tuple = real_abs->cast<abstract::AbstractTuplePtr>(); | |||
| MS_EXCEPTION_IF_NULL(abstract_tuple); | |||
| auto tuple_elements = abstract_tuple->elements()[tuple_get_item_index]; | |||
| tuple_elements->set_value(out_tensor); | |||
| } | |||
| real_input->set_abstract(real_abs); | |||
| } | |||
| bool is_cnode_input = AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| if (is_cnode_input) { | |||
| input_nodes.push_back(cnode_input); | |||
| } else { | |||
| input_nodes.push_back(real_input); | |||
| } | |||
| } | |||
| std::vector<AbstractScope> locks; | |||
| std::transform(input_nodes.begin(), input_nodes.end(), std::back_inserter(locks), | |||
| [](const AnfNodePtr &input) { return AnfUtils::GetAbstractLock(input.get()); }); | |||
| auto eval_result = opt::CppInferShape(primitive, args_spec_list); | |||
| locks.clear(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(cnode.get()); | |||
| cnode->set_abstract(eval_result); | |||
| } | |||
| bool KernelMod::InferShapeForDefiniteOutputNode(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::CheckPrimitiveType(cnode, prim::kPrimShape)) { | |||
| return false; | |||
| } | |||
| auto input_size = AnfAlgo::GetInputTensorNum(cnode); | |||
| if (input_size != 1) { | |||
| MS_LOG(EXCEPTION) << "Node only has one input: " << cnode->fullname_with_scope(); | |||
| } | |||
| auto cur_shape = dynamic_cast<mindspore::abstract::Shape *>(cnode->Shape().get())->shape(); | |||
| if (std::any_of(cur_shape.begin(), cur_shape.end(), [](int64_t x) { return x == kInvalidShape; })) { | |||
| return false; | |||
| } | |||
| std::vector<int64_t> output_shape = {static_cast<int64_t>(cur_shape.size())}; | |||
| mindspore::abstract::BaseShapePtr shape = std::make_shared<mindspore::abstract::Shape>(output_shape); | |||
| auto lock = AnfUtils::GetAbstractLock(cnode.get()); | |||
| auto abstract = cnode->abstract()->Clone(); | |||
| MS_EXCEPTION_IF_NULL(abstract); | |||
| abstract->set_shape(shape); | |||
| cnode->set_abstract(abstract); | |||
| return true; | |||
| } | |||
// Walks backwards through a chain of nop nodes feeding *input_node and re-infers
// their shapes from the producer side down, so the nop chain's abstracts are up
// to date before the consumer's own inference runs.
// Side effect: *input_node is advanced to the first non-nop producer.
void KernelMod::InferShapeForNopNode(AnfNodePtr *input_node) {
  MS_EXCEPTION_IF_NULL(*input_node);
  if (!opt::IsNopNode(*input_node) || !AnfAlgo::IsDynamicShape(*input_node)) {
    MS_LOG(INFO) << "Input node is not a nop node, no need infer.";
    return;
  }
  MS_LOG(INFO) << "Infer shape for nop node.";
  // Collect the nop chain on a stack so inference below runs producer-first.
  std::stack<AnfNodePtr> nop_road;
  nop_road.push(*input_node);
  /*lint -e716*/
  while (true) {
    auto input_node_with_idx = AnfAlgo::GetPrevNodeOutput(*input_node, 0);
    auto in_node = input_node_with_idx.first;
    MS_EXCEPTION_IF_NULL(in_node);
    if (opt::IsNopNode(in_node)) {
      nop_road.push(in_node);
      *input_node = in_node;
    } else {
      break;
    }
  }
  /*lint +e716*/
  // Popping the stack visits the chain from the deepest producer outward.
  while (!nop_road.empty()) {
    auto nop_node = nop_road.top();
    MS_EXCEPTION_IF_NULL(nop_node);
    AnfAlgo::InferShape(nop_node->cast<CNodePtr>());
    nop_road.pop();
  }
}
| void KernelMod::GetDepndLists(const CNodePtr &cnode) { | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (depend_list_.size() != 0) { | |||
| return; | |||
| } | |||
| auto ret = abstract::GetDependsFormMap(cnode); | |||
| if (ret.empty()) { | |||
| MS_LOG(DEBUG) << "No dynamic_shape_depends found"; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Have depends"; | |||
| (void)std::transform(ret.begin(), ret.end(), std::inserter(depend_list_, depend_list_.begin()), | |||
| [](const int64_t &value) { return static_cast<int>(value); }); | |||
| MS_LOG(INFO) << "Init End"; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -18,6 +18,8 @@ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <memory> | |||
| #include <map> | |||
| #include <set> | |||
| #include "nlohmann/json.hpp" | |||
| #include "ir/anf.h" | |||
| #include "ir/dtype.h" | |||
| @@ -180,6 +182,8 @@ struct KernelLaunchInfo { | |||
| class KernelMod { | |||
| public: | |||
| KernelMod() {} | |||
| explicit KernelMod(const AnfNodePtr &anf_node_ptr) : anf_node_(anf_node_ptr) {} | |||
| virtual const std::vector<size_t> &GetInputSizeList() const = 0; | |||
| virtual const std::vector<size_t> &GetOutputSizeList() const = 0; | |||
| virtual const std::vector<size_t> &GetWorkspaceSizeList() const = 0; | |||
| @@ -193,6 +197,10 @@ class KernelMod { | |||
| virtual std::vector<size_t> GenParameters() { return {}; } | |||
| virtual void ReleaseResource() {} | |||
| virtual void InferOp() {} | |||
| virtual void InitOp() {} | |||
| virtual void UpdateOp() {} | |||
| virtual ~KernelMod() = default; | |||
| void set_unique_name(const std::string &unique_name) { unique_name_ = unique_name; } | |||
| void set_fullname(const std::string &fullname) { fullname_ = fullname; } | |||
| @@ -205,18 +213,29 @@ class KernelMod { | |||
| const std::vector<AddressPtr> &GetOutputsAddr() { return outputs_addr_; } | |||
| void SetStream(void *stream) { stream_ = stream; } | |||
| void *GetStream() const { return stream_; } | |||
| void SetAtomicCleanNodes(const std::vector<CNodePtr> &atomic_clean_node) { atomic_clean_nodes_ = atomic_clean_node; } | |||
| protected: | |||
| void InferShape(); | |||
| std::string kernel_name_; | |||
| std::string unique_name_; | |||
| std::string fullname_; | |||
| bool is_monad_{false}; | |||
| void *stream_{nullptr}; | |||
| AnfNodeWeakPtr anf_node_; | |||
| std::map<uint32_t, tensor::TensorPtr> depend_tensor_map_; | |||
| std::vector<CNodePtr> atomic_clean_nodes_; | |||
| private: | |||
| void InferShapeForNopNode(AnfNodePtr *input_node); | |||
| void GetDepndLists(const CNodePtr &cnode); | |||
| bool InferShapeForDefiniteOutputNode(const CNodePtr &cnode); | |||
| std::vector<AddressPtr> inputs_addr_; | |||
| std::vector<AddressPtr> workspaces_addr_; | |||
| std::vector<AddressPtr> outputs_addr_; | |||
| std::set<uint32_t> depend_list_; | |||
| }; | |||
| using KernelModPtr = std::shared_ptr<KernelMod>; | |||
| } // namespace kernel | |||
| @@ -0,0 +1,298 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/tbe/dynamic_tbe_kernel_mod.h" | |||
| #include <algorithm> | |||
| #include <stack> | |||
| #include "acl/acl_rt.h" | |||
| #include "utils/ms_context.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "runtime/device/kernel_runtime.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "framework/common/debug/log.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/convert_utils_base.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "runtime/kernel.h" | |||
| #include "runtime/mem.h" | |||
| #include "pipeline/jit/static_analysis/static_analysis.h" | |||
| #include "runtime/device/ascend/executor/tiling/op_tiling_adapter.h" | |||
| #include "utils/ms_device_shape_transfer.h" | |||
| #include "utils/utils.h" | |||
| #include "register/op_tiling.h" | |||
| #include "nlohmann/json.hpp" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| using TbeTaskInfoPtr = std::shared_ptr<mindspore::ge::model_runner::TbeTaskInfo>; | |||
| using tbe::KernelManager; | |||
| using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>; | |||
// Binds the kernel mod to anf_node_ptr and eagerly loads the op's compile info
// from the kernel_meta json; InitOp() needs it later for tiling calculation.
DynamicTbeKernelMod::DynamicTbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr)
    : TbeKernelMod(std::move(kernel_pack), anf_node_ptr) {
  MS_EXCEPTION_IF_NULL(anf_node_ptr);
  // Non-CNode (e.g. Parameter) bindings simply have no compile info to parse.
  auto cnode = anf_node_ptr->cast<CNodePtr>();
  if (cnode != nullptr) {
    op_compile_info_ = ParseCompileJson(cnode);
  }
}
// Frees the device-side tiling buffer lazily allocated by InitTilingDataPtr().
DynamicTbeKernelMod::~DynamicTbeKernelMod() {
  if (tiling_data_ptr_ != nullptr) {
    (void)rtFree(tiling_data_ptr_);
  }
}
| void DynamicTbeKernelMod::InferOp() { | |||
| if (AnfAlgo::IsDynamicShape(anf_node_.lock())) { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| need_skip_execute_ = NeedSkipExecute(cnode); | |||
| if (need_skip_execute_) { | |||
| std::vector<TypeId> dtypes{AnfAlgo::GetOutputInferDataType(cnode, 0)}; | |||
| AnfAlgo::SetOutputInferTypeAndShape(dtypes, {AnfAlgo::GetInputDeviceShape(cnode, 0)}, cnode.get()); | |||
| } else { | |||
| KernelMod::InferShape(); | |||
| } | |||
| } | |||
| } | |||
| void DynamicTbeKernelMod::InitOp() { | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The node is not dynamic shape: " << cnode->fullname_with_scope(); | |||
| } | |||
| if (!atomic_clean_nodes_.empty()) { | |||
| for (const auto &atomic_clean_node : atomic_clean_nodes_) { | |||
| AnfAlgo::GetKernelMod(atomic_clean_node)->InitOp(); | |||
| } | |||
| } | |||
| if (need_skip_execute_) { | |||
| return; | |||
| } | |||
| // gen FuncStub | |||
| if (handle_ == nullptr) { | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim_, true, &handle_, &origin_key_); | |||
| if (func_stub != 1) { | |||
| MS_LOG(EXCEPTION) << "GenFuncStub failed."; | |||
| } | |||
| } | |||
| // start compute tiling | |||
| MS_LOG(INFO) << "Start compute tiling of: " << cnode->fullname_with_scope(); | |||
| optiling::utils::OpRunInfo op_run_info_v2(-1, true, 0); | |||
| device::tiling::OpTilingCalculateAdapter converter; | |||
| ::ge::ComputeGraphPtr ge_graph = std::make_shared<::ge::ComputeGraph>("default"); | |||
| auto ge_node = converter.AnfNodeToGeNodeAdapter(cnode, &ge_graph, depend_tensor_map_, op_compile_info_); | |||
| (void)optiling::OpParaCalculateV2(ge_node, op_run_info_v2); | |||
| block_dim_ = op_run_info_v2.GetBlockDim(); | |||
| std::vector<int64_t> workspace_size_list; | |||
| op_run_info_v2.GetAllWorkspaces(workspace_size_list); | |||
| tiling_data_ = op_run_info_v2.GetAllTilingData().str(); | |||
| tiling_key_ = op_run_info_v2.GetTilingKey(); | |||
| workspace_size_list_.clear(); | |||
| workspace_size_list_.resize(workspace_size_list.size()); | |||
| std::transform(workspace_size_list.begin(), workspace_size_list.end(), workspace_size_list_.begin(), | |||
| [](int64_t size) { return static_cast<size_t>(size); }); | |||
| } | |||
// Loads the op's compile info string from the kernel_meta json produced at
// build time. Throws if the entry is missing (e.g. the file was deleted).
std::string DynamicTbeKernelMod::ParseCompileJson(const CNodePtr &cnode) {
  MS_EXCEPTION_IF_NULL(cnode);
  bool get_flag = true;
  std::string op_compile_info = "";
  TbeUtils::GetCompileInfo(cnode, &op_compile_info, &get_flag);
  if (!get_flag) {
    MS_LOG(EXCEPTION) << "Get compile_info failed. The compile result of [" << cnode->fullname_with_scope()
                      << "] maybe not in the json file(kernel_meta/) or the file had been deleted.";
  }
  MS_LOG(INFO) << "Node: " << cnode->fullname_with_scope() << " get compile_info: " << op_compile_info;
  return op_compile_info;
}
| void DynamicTbeKernelMod::InitTilingDataPtr() { | |||
| if (tiling_data_ptr_ != nullptr) { | |||
| return; | |||
| } | |||
| auto kernel_json_info = kernel_pack_->kernel_json_info(); | |||
| auto op_para_size = kernel_json_info.op_para_size; | |||
| if (op_para_size > 0) { | |||
| auto ret = rtMalloc(&tiling_data_ptr_, op_para_size, RT_MEMORY_HBM); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "rtMalloc tiling data failed"; | |||
| } | |||
| } | |||
| } | |||
// Asynchronously copies the tiling data computed in InitOp() into the device
// buffer on the given stream. Returns true when nothing needs copying
// (empty tiling); throws if the data exceeds the buffer or the copy fails.
bool DynamicTbeKernelMod::CopyTilingToDevice(void *stream_ptr) {
  InitTilingDataPtr();
  MS_EXCEPTION_IF_NULL(kernel_pack_);
  auto kernel_json_info = kernel_pack_->kernel_json_info();
  auto op_para_size = kernel_json_info.op_para_size;
  // The device buffer was sized with op_para_size; overflow means the tiling
  // function produced more data than the kernel was built to accept.
  if (tiling_data_.size() > op_para_size) {
    MS_LOG(EXCEPTION) << "Compute tiling size:" << tiling_data_.size()
                      << " larger than tbe build op_para_size:" << op_para_size;
  }
  if (tiling_data_.empty() || tiling_data_ptr_ == nullptr) {
    MS_LOG(INFO) << "Tiling size is 0, skip aclrtMemcpyAsync";
    return true;
  }
  // Serialize runtime access with other kernel mods for the async copy.
  // cppcheck-suppress unreadVariable
  auto lock = AscendKernelMod::LockRuntime();
  auto ret = aclrtMemcpyAsync(tiling_data_ptr_, op_para_size, tiling_data_.c_str(), tiling_data_.size(),
                              ACL_MEMCPY_HOST_TO_DEVICE, stream_ptr);
  if (ret != RT_ERROR_NONE) {
    MS_LOG(EXCEPTION) << "Tiling aclrtMemcpyAsync failed, ret:" << ret;
  }
  return true;
}
// Skip run ReduceSum when axis is a Empty Tensor
// Returns true only for a ReduceSum whose axis input is a tensor with a
// zero-sized dimension; Launch() then degenerates to a device-to-device copy.
bool DynamicTbeKernelMod::NeedSkipExecute(const CNodePtr &cnode) {
  MS_EXCEPTION_IF_NULL(cnode);
  auto op_name = AnfAlgo::GetCNodeName(cnode);
  if (op_name != kReduceSumOpName) {
    return false;
  }
  // Input 1 of ReduceSum is the axis; +1 skips the primitive at input(0).
  const size_t axes_index = 1;
  if (cnode->inputs().size() <= axes_index + 1) {
    return false;
  }
  auto input_axes = cnode->input(axes_index + 1);
  // Take the abstract lock and work on a clone so concurrent inference on the
  // axis node cannot race with this read.
  // cppcheck-suppress unreadVariable
  auto lock = AnfUtils::GetAbstractLock(input_axes.get());
  auto axes_abs = input_axes->abstract()->Clone();
  MS_EXCEPTION_IF_NULL(axes_abs);
  auto axes_shape = AnfAlgo::GetInputDeviceShape(cnode, axes_index);
  if (axes_abs->isa<abstract::AbstractTensor>()) {
    // Any zero dimension means the axis tensor holds no elements.
    if (std::any_of(axes_shape.begin(), axes_shape.end(), [](ssize_t shape) { return shape == 0; })) {
      return true;
    }
  }
  return false;
}
| bool DynamicTbeKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| if (stream_ptr == nullptr) { | |||
| MS_LOG(ERROR) << "stream_ptr should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (kernel_pack_ == nullptr) { | |||
| MS_LOG(ERROR) << "kernel pack should not be nullptr."; | |||
| return false; | |||
| } | |||
| if (stream_ == nullptr) { | |||
| stream_ = stream_ptr; | |||
| } | |||
| auto node = anf_node_.lock(); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| MS_LOG(EXCEPTION) << "anfnode is not a cnode"; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| // is dynamic shape | |||
| if (!AnfAlgo::IsDynamicShape(cnode)) { | |||
| MS_LOG(EXCEPTION) << "The cnode is not dynamic shape:" << cnode->fullname_with_scope(); | |||
| } | |||
| if (!atomic_clean_nodes_.empty()) { | |||
| for (auto atomic_clean_node : atomic_clean_nodes_) { | |||
| KernelLaunchInfo kernel_launch_info; | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| device::KernelRuntime::GenLaunchArgs(*kernel_mod, atomic_clean_node, &kernel_launch_info); | |||
| auto atomic_inputs = kernel_launch_info.inputs_; | |||
| std::vector<AddressPtr> atomic_outputs; | |||
| std::vector<AddressPtr> atomic_workspace; | |||
| kernel_mod->Launch(atomic_inputs, atomic_workspace, atomic_outputs, stream_ptr); | |||
| } | |||
| } | |||
| // need skip, for reducesum empty input axis | |||
| if (need_skip_execute_) { | |||
| // Skip reduce if axis is a empty Tensor (shape = 0) | |||
| MS_LOG(INFO) << "The node " << cnode->fullname_with_scope() << "Need Skip."; | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| rtError_t status = aclrtMemcpyAsync(outputs[0]->addr, inputs[0]->size, inputs[0]->addr, inputs[0]->size, | |||
| ACL_MEMCPY_DEVICE_TO_DEVICE, stream_ptr); | |||
| if (status != RT_ERROR_NONE) { | |||
| MS_LOG(EXCEPTION) << "aclrtMemcpyAsync failed for " << cnode->fullname_with_scope(); | |||
| } | |||
| MS_LOG(INFO) << "Execute node:" << cnode->fullname_with_scope() << " success."; | |||
| return true; | |||
| } | |||
| // copy tiling to device | |||
| if (!CopyTilingToDevice(stream_ptr)) { | |||
| MS_LOG(EXCEPTION) << "Copy tiling to device failed. op name: " << cnode->fullname_with_scope(); | |||
| } | |||
| // pack all addresses into a vector. | |||
| std::vector<void *> runtimeargs; | |||
| (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &input) -> void * { return input->addr; }); | |||
| (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &output) -> void * { return output->addr; }); | |||
| if (!workspace.empty()) { | |||
| (void)std::transform(std::begin(workspace), std::end(workspace), std::back_inserter(runtimeargs), | |||
| [](const AddressPtr &addr) -> void * { return addr->addr; }); | |||
| } | |||
| if (!tiling_data_.empty() && tiling_data_ptr_ != nullptr) { | |||
| runtimeargs.push_back(tiling_data_ptr_); | |||
| } | |||
| rtL2Ctrl_t *l2ctrl = nullptr; | |||
| auto args_size = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size()); | |||
| auto node_info = cnode->fullname_with_scope(); | |||
| const auto dev_func = | |||
| origin_key_.find("kernel0") != origin_key_.npos ? origin_key_ : origin_key_ + "_" + std::to_string(tiling_key_); | |||
| const auto kernel_info = node_info + "/" + std::to_string(tiling_key_); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = rtKernelLaunchWithHandle(handle_, dev_func.c_str(), block_dim_, runtimeargs.data(), args_size, l2ctrl, | |||
| stream_ptr, kernel_info.c_str()); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Call runtime rtKernelLaunchWithHandle error. Node info: " << node_info; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,65 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include <utility> | |||
| #include <map> | |||
| #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h" | |||
| #include "backend/kernel_compiler/tbe/tbe_utils.h" | |||
| #include "runtime/device/device_address.h" | |||
| #include "ir/tensor.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
// Kernel mod for dynamic-shape TBE (Ascend) kernels. Per launch it re-infers
// shapes (InferOp), recomputes tiling (InitOp), copies the tiling data to
// device and launches the compiled kernel by handle (Launch).
class DynamicTbeKernelMod : public TbeKernelMod {
 public:
  explicit DynamicTbeKernelMod(KernelPackPtr kernel_pack) : TbeKernelMod(kernel_pack) {}  // maybe delete later
  DynamicTbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr);
  ~DynamicTbeKernelMod() override;

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override;

  void InferOp() override;
  void InitOp() override;

 private:
  void InferShapeRecursive();
  void InferShapeForNopNode(AnfNodePtr *input_node);
  // Reads the op's compile info from the kernel_meta json; throws if absent.
  std::string ParseCompileJson(const CNodePtr &cnode);
  // Lazily allocates the device tiling buffer (freed in the destructor).
  void InitTilingDataPtr();
  // Async-copies tiling_data_ into tiling_data_ptr_ on the given stream.
  bool CopyTilingToDevice(void *stream_ptr);
  // True for ReduceSum with an empty axis tensor; Launch becomes a plain copy.
  bool NeedSkipExecute(const CNodePtr &cnode);

  uint32_t block_dim_ = 1;          // launch block dim from GenFuncStub/tiling
  std::string tiling_data_;         // host-side tiling bytes from InitOp()
  void *tiling_data_ptr_ = nullptr; // device buffer for tiling data
  uint32_t tiling_key_{0};          // selects the kernel variant at launch
  void *handle_ = nullptr;          // kernel binary handle, resolved once
  std::string origin_key_{""};      // base function name from GenFuncStub
  std::string op_compile_info_{};   // compile info parsed in the constructor
  bool need_skip_execute_ = false;  // set by InferOp(), consumed by InitOp/Launch
};
using DynamicTbeKernelModPtr = std::shared_ptr<DynamicTbeKernelMod>;
| using DynamicTbeKernelModPtr = std::shared_ptr<DynamicTbeKernelMod>; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_TBE_DYNAMIC_TBE_KERNEL_MOD_H_
| @@ -15,6 +15,8 @@ | |||
| */ | |||
| #include "backend/kernel_compiler/tbe/tbe_kernel_mod.h" | |||
| #include <algorithm> | |||
| #include "runtime/rt.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/ascend/ge_runtime/task_info.h" | |||
| @@ -41,6 +43,20 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu | |||
| if (stream_ == nullptr) { | |||
| stream_ = stream_ptr; | |||
| } | |||
| // launch atomic_cleans first | |||
| if (!atomic_clean_nodes_.empty()) { | |||
| for (const auto &atomic_clean_node : atomic_clean_nodes_) { | |||
| KernelLaunchInfo kernel_launch_info; | |||
| auto kernel_mod = AnfAlgo::GetKernelMod(atomic_clean_node); | |||
| MS_EXCEPTION_IF_NULL(kernel_mod); | |||
| device::KernelRuntime::GenLaunchArgs(*kernel_mod, atomic_clean_node, &kernel_launch_info); | |||
| auto atomic_inputs = kernel_launch_info.inputs_; | |||
| std::vector<AddressPtr> atomic_outputs; | |||
| std::vector<AddressPtr> atomic_workspace; | |||
| kernel_mod->Launch(atomic_inputs, atomic_workspace, atomic_outputs, stream_ptr); | |||
| } | |||
| } | |||
| uint32_t blockdim = 1; // default blockdim equal to 1. | |||
| auto func_stub = KernelManager::GenFuncStub(*kernel_pack_, false, &blockdim); | |||
| if (func_stub == 0) { | |||
| @@ -61,6 +77,7 @@ bool TbeKernelMod::Launch(const std::vector<mindspore::kernel::AddressPtr> &inpu | |||
| rtL2Ctrl_t *l2ctrl = nullptr; | |||
| const void *stubFunc = reinterpret_cast<void *>(func_stub); | |||
| auto argsSize = static_cast<uint32_t>(UlongToUint(sizeof(void *)) * runtimeargs.size()); | |||
| auto lock = AscendKernelMod::LockRuntime(); | |||
| auto ret = rtKernelLaunch(stubFunc, blockdim, runtimeargs.data(), argsSize, l2ctrl, stream_); | |||
| if (ret != RT_ERROR_NONE) { | |||
| MS_LOG(ERROR) << "Call runtime rtKernelLaunch error."; | |||
| @@ -29,6 +29,8 @@ namespace kernel { | |||
| class TbeKernelMod : public AscendKernelMod { | |||
| public: | |||
| explicit TbeKernelMod(KernelPackPtr kernel_pack) : kernel_pack_(std::move(kernel_pack)) {} | |||
| TbeKernelMod(KernelPackPtr kernel_pack, const AnfNodePtr &anf_node_ptr) | |||
| : AscendKernelMod(anf_node_ptr), kernel_pack_(std::move(kernel_pack)) {} | |||
| ~TbeKernelMod() override = default; | |||
| void SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; } | |||
| @@ -45,7 +47,7 @@ class TbeKernelMod : public AscendKernelMod { | |||
| device::DynamicKernelPtr GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) override; | |||
| std::vector<size_t> GenParameters() override; | |||
| private: | |||
| protected: | |||
| KernelPackPtr kernel_pack_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| @@ -729,9 +729,8 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod | |||
| auto kernel_info = anf_node->kernel_info(); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| auto output = runtime_cache->get_prev_node_output(input_idx); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| auto output = runtime_cache.runtime_cache().get_prev_node_output(input_idx); | |||
| if (output.first != nullptr) { | |||
| return output; | |||
| } | |||
| @@ -747,9 +746,8 @@ KernelWithIndex AnfRuntimeAlgorithm::GetPrevNodeOutput(const AnfNodePtr &anf_nod | |||
| } | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| runtime_cache->set_prev_node_output(input_idx, res); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| runtime_cache.runtime_cache().set_prev_node_output(input_idx, res); | |||
| } | |||
| } | |||
| return res; | |||
| @@ -2065,7 +2063,7 @@ std::vector<int64_t> AnfRuntimeAlgorithm::GetOutputMinShape(const AnfNodePtr &an | |||
| } | |||
| } | |||
| bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) { | |||
| bool AnfRuntimeAlgorithm::IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) { | |||
| MS_EXCEPTION_IF_NULL(anf_node_ptr); | |||
| auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr); | |||
| for (size_t i = 0; i < input_num; ++i) { | |||
| @@ -2274,6 +2272,7 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te | |||
| AbstractBasePtrList args_spec_list; | |||
| auto primitive = GetValueNode<PrimitivePtr>(inputs[0]); | |||
| auto input_size = AnfAlgo::GetInputTensorNum(node); | |||
| std::vector<AnfNodePtr> input_nodes; | |||
| for (size_t i = 0; i < input_size; ++i) { | |||
| auto input_with_index = AnfAlgo::GetPrevNodeOutput(node, i); | |||
| auto real_input = input_with_index.first; | |||
| @@ -2289,9 +2288,12 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te | |||
| // sync data from device to host | |||
| tensor_ptr->data_sync(); | |||
| } | |||
| auto real_abs = real_input->abstract(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| MS_EXCEPTION_IF_NULL(real_input->abstract()); | |||
| auto real_abs = real_input->abstract()->Clone(); | |||
| if (real_abs->isa<abstract::AbstractTensor>()) { | |||
| real_input->abstract()->set_value(tensor_ptr); | |||
| real_abs->set_value(tensor_ptr); | |||
| } else if (real_abs->isa<abstract::AbstractTuple>()) { | |||
| auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>()); | |||
| auto abstract_tuple = real_abs->cast<abstract::AbstractTuplePtr>(); | |||
| @@ -2299,15 +2301,27 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te | |||
| auto tuple_elements = abstract_tuple->elements()[tuple_get_item_index]; | |||
| tuple_elements->set_value(tensor_ptr); | |||
| } | |||
| real_input->set_abstract(real_abs); | |||
| } | |||
| } | |||
| AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| bool is_cnode_input = AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| if (is_cnode_input) { | |||
| input_nodes.push_back(cnode_input); | |||
| } else { | |||
| input_nodes.push_back(real_input); | |||
| } | |||
| } | |||
| std::vector<AbstractScope> locks; | |||
| std::transform(input_nodes.begin(), input_nodes.end(), std::back_inserter(locks), | |||
| [](const AnfNodePtr &input) { return AnfUtils::GetAbstractLock(input.get()); }); | |||
| auto eval_result = opt::CppInferShape(primitive, args_spec_list); | |||
| locks.clear(); | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| node->set_abstract(eval_result); | |||
| } | |||
| void AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| bool AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| const AnfNodePtr &real_input, size_t index) { | |||
| if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) { | |||
| auto base_shape = real_input->Shape(); | |||
| @@ -2315,15 +2329,24 @@ void AnfRuntimeAlgorithm::AddArgList(AbstractBasePtrList *args_spec_list, const | |||
| MS_LOG(EXCEPTION) << "Node input is a tuple_get_item but real input node shape is not a TupleShape. trace: " | |||
| << trace::DumpSourceLines(real_input); | |||
| } | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| auto abs = real_input->abstract()->cast<abstract::AbstractTuplePtr>(); | |||
| MS_EXCEPTION_IF_NULL(abs); | |||
| auto tuple_get_item_indexk = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>()); | |||
| auto abs_i = abs->elements()[tuple_get_item_indexk]; | |||
| (void)args_spec_list->emplace_back(abs_i); | |||
| return false; | |||
| } else if (cnode_input->isa<CNode>() && AnfAlgo::GetCNodeName(cnode_input) == prim::kPrimReshape->name()) { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(cnode_input.get()); | |||
| (void)args_spec_list->emplace_back(cnode_input->abstract()); | |||
| return true; | |||
| } else { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(real_input.get()); | |||
| (void)args_spec_list->emplace_back(real_input->abstract()); | |||
| return false; | |||
| } | |||
| } | |||
| @@ -288,6 +288,7 @@ class AnfRuntimeAlgorithm { | |||
| static TypeId GetCNodeOutputPrecision(const AnfNodePtr &node); | |||
| // get fix output precision from prev node, input_idx is the input index of current node related to prev node. | |||
| static TypeId GetPrevNodeOutputPrecision(const AnfNodePtr &node, size_t input_idx); | |||
| static bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr); | |||
| static bool IsDynamicShape(const AnfNodePtr &node); | |||
| static bool HasDynamicShapeFlag(const PrimitivePtr &prim); | |||
| static bool IsCondControlKernel(const CNodePtr &node); | |||
| @@ -302,7 +303,8 @@ class AnfRuntimeAlgorithm { | |||
| static bool IsNodeDynamicShape(const AnfNodePtr &node); | |||
| static bool IsHostKernel(const CNodePtr &node); | |||
| static void InferShape(const CNodePtr &node, std::map<uint32_t, tensor::TensorPtr> *depend_tensors = nullptr); | |||
| static void AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| // return true if use cnode_input's abstract, false if use real_input's abstract | |||
| static bool AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input, | |||
| const AnfNodePtr &real_input, size_t index); | |||
| static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); | |||
| static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index); | |||
| @@ -123,8 +123,7 @@ void AscendEnableDynamicRuntimeCache(const KernelGraph *graph) { | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| runtime_cache->set_valid(); | |||
| runtime_cache.runtime_cache().set_valid(); | |||
| } | |||
| } | |||
| } // namespace | |||
| @@ -37,21 +37,21 @@ class OpTilingCalculateAdapter { | |||
| OpTilingCalculateAdapter() = default; | |||
| ~OpTilingCalculateAdapter() = default; | |||
| ge::Operator AnfNodeToGeNodeAdapter(const CNodePtr &node, ge::ComputeGraphPtr *ge_graph, | |||
| const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, | |||
| const std::string &op_compile_info); | |||
| ::ge::Operator AnfNodeToGeNodeAdapter(const CNodePtr &node, ::ge::ComputeGraphPtr *ge_graph, | |||
| const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, | |||
| const std::string &op_compile_info); | |||
| private: | |||
| void ConvertInputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| void ConvertOutputShapeAndType(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| void ConvertCompileInfo(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| void ConvertAttrs(const CNodePtr &node, ge::OpDescPtr *op_desc); | |||
| std::vector<std::tuple<std::size_t, ge::NodePtr>> ConvertDepends( | |||
| const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, ge::OpDescPtr *op_desc, | |||
| ge::ComputeGraphPtr *ge_graph); | |||
| ge::NodePtr NewConstantOp(const CNodePtr &node, const std::string &name, const tensor::TensorPtr &tensor_data, | |||
| ge::ComputeGraphPtr *ge_graph, size_t index); | |||
| void AddEdge(const ge::NodePtr &ge_node, const std::vector<std::tuple<std::size_t, ge::NodePtr>> &constant_ops); | |||
| void ConvertInputShapeAndType(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| void ConvertOutputShapeAndType(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| void ConvertCompileInfo(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| void ConvertAttrs(const CNodePtr &node, ::ge::OpDescPtr *op_desc); | |||
| std::vector<std::tuple<std::size_t, ::ge::NodePtr>> ConvertDepends( | |||
| const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensor_map, ::ge::OpDescPtr *op_desc, | |||
| ::ge::ComputeGraphPtr *ge_graph); | |||
| ::ge::NodePtr NewConstantOp(const CNodePtr &node, const std::string &name, const tensor::TensorPtr &tensor_data, | |||
| ::ge::ComputeGraphPtr *ge_graph, size_t index); | |||
| void AddEdge(const ::ge::NodePtr &ge_node, const std::vector<std::tuple<std::size_t, ::ge::NodePtr>> &constant_ops); | |||
| std::string GetRealOpType(const std::string &op_type); | |||
| std::string GetInputName(const CNodePtr &node, size_t index); | |||
| std::string GetOutputName(const CNodePtr &node, size_t index); | |||
| @@ -103,7 +103,7 @@ void DynamicKernel::InferShape() { | |||
| tuple_elements->set_value(out_tensor); | |||
| } | |||
| } | |||
| AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| (void)AnfAlgo::AddArgList(&args_spec_list, cnode_input, real_input, i); | |||
| } | |||
| auto eval_result = opt::CppInferShape(primitive, args_spec_list); | |||
| cnode->set_abstract(eval_result); | |||
| @@ -164,8 +164,7 @@ class DeviceContext { | |||
| } | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| runtime_cache->set_valid(); | |||
| runtime_cache.runtime_cache().set_valid(); | |||
| } | |||
| } | |||
| @@ -28,8 +28,21 @@ | |||
| #include "ir/func_graph.h" | |||
| #include "ir/primitive.h" | |||
| #include "utils/ms_context.h" | |||
| #include "utils/anf_utils.h" | |||
| namespace mindspore { | |||
| const AbstractBasePtr &AnfNode::abstract() const { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| return abstract_; | |||
| } | |||
| void AnfNode::set_abstract(const AbstractBasePtr &abs) { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| abstract_ = abs; | |||
| } | |||
| // namespace to support intermediate representation definition | |||
| CNode::CNode(const std::vector<AnfNodePtr> &inputs, const FuncGraphPtr &func_graph) | |||
| : AnfNode(func_graph), | |||
| @@ -574,9 +587,8 @@ std::string GetCNodeTarget(const AnfNodePtr &node) { | |||
| auto kernel_info = node->kernel_info(); | |||
| if (kernel_info != nullptr) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| auto tmp_target = runtime_cache->device_target(); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| auto tmp_target = runtime_cache.runtime_cache().device_target(); | |||
| if (!tmp_target.empty()) { | |||
| return tmp_target; | |||
| } | |||
| @@ -595,9 +607,8 @@ std::string GetCNodeTarget(const AnfNodePtr &node) { | |||
| if (kernel_info != nullptr) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_valid()) { | |||
| runtime_cache->set_device_target(target); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| runtime_cache.runtime_cache().set_device_target(target); | |||
| } | |||
| } | |||
| return target; | |||
| @@ -178,12 +178,12 @@ class MS_CORE_API AnfNode : public Base { | |||
| /// \brief Obtain the inferred abstract value of this AnfNode. | |||
| /// | |||
| /// \return The inferred abstract value. | |||
| const AbstractBasePtr &abstract() const { return abstract_; } | |||
| const AbstractBasePtr &abstract() const; | |||
| /// \brief Set the abstract value of this AnfNode. | |||
| /// | |||
| /// \param[in] abs New abstract value. | |||
| void set_abstract(const AbstractBasePtr &abs) { abstract_ = abs; } | |||
| void set_abstract(const AbstractBasePtr &abs); | |||
| /// \brief Obtain the intermediate abstract value of this AnfNode. | |||
| /// | |||
| @@ -24,12 +24,21 @@ | |||
| #include "ir/visitor.h" | |||
| #include "ir/func_graph.h" | |||
| #include "base/core_ops.h" | |||
| #include "utils/anf_utils.h" | |||
| namespace mindspore { | |||
| // namespace to support intermediate representation definition | |||
| // Methods of AnfNode | |||
| TypePtr AnfNode::Type() const { return (abstract_ == nullptr) ? nullptr : abstract_->BuildType(); } | |||
| BaseShapePtr AnfNode::Shape() const { return (abstract_ == nullptr) ? nullptr : abstract_->BuildShape(); } | |||
| TypePtr AnfNode::Type() const { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| return (abstract_ == nullptr) ? nullptr : abstract_->BuildType(); | |||
| } | |||
| BaseShapePtr AnfNode::Shape() const { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(this); | |||
| return (abstract_ == nullptr) ? nullptr : abstract_->BuildShape(); | |||
| } | |||
| std::string AnfNode::ToString() const { | |||
| return mindspore::label_manage::Label(const_cast<AnfNode *>(this)->shared_from_base<AnfNode>()->debug_info()); | |||
| @@ -68,13 +68,26 @@ class RuntimeCache { | |||
| // Interface for device kernel program information. | |||
| class KernelInfoDevice { | |||
| public: | |||
| class RuntimeCacheScope { | |||
| public: | |||
| RuntimeCacheScope(RuntimeCache &base, std::mutex &mu) : runtime_cache_(base), mu_(mu) { mu_.lock(); } | |||
| RuntimeCacheScope(const RuntimeCacheScope &other) = delete; | |||
| RuntimeCacheScope operator=(const RuntimeCacheScope &other) = delete; | |||
| ~RuntimeCacheScope() { mu_.unlock(); } | |||
| RuntimeCache &runtime_cache() { return runtime_cache_; } | |||
| private: | |||
| RuntimeCache &runtime_cache_; | |||
| std::mutex &mu_; | |||
| }; | |||
| // If kernel program was built and build info is set. | |||
| virtual bool has_build_info() const = 0; | |||
| RuntimeCache *runtime_cache() { return &runtime_cache_; } | |||
| RuntimeCacheScope runtime_cache() { return RuntimeCacheScope(runtime_cache_, mu_); } | |||
| private: | |||
| RuntimeCache runtime_cache_; | |||
| std::mutex mu_; | |||
| }; | |||
| using KernelInfoDevicePtr = std::shared_ptr<KernelInfoDevice>; | |||
| } // namespace mindspore | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "utils/anf_utils.h" | |||
| #include <map> | |||
| #include <string> | |||
| #include "base/core_ops.h" | |||
| #include "utils/trace_base.h" | |||
| @@ -23,8 +24,52 @@ | |||
| namespace mindspore { | |||
| namespace { | |||
| const PrimitiveSet follow_first_input_prims = {prim::kPrimDepend, prim::kPrimLoad}; | |||
| class AbstractMutexManager { | |||
| public: | |||
| static AbstractMutexManager &GetInstance() { | |||
| static AbstractMutexManager instance; | |||
| return instance; | |||
| } | |||
| AbstractScope GetAbstractLock(const AnfNode *node) { | |||
| std::lock_guard<std::recursive_mutex> lock(mu_); | |||
| return AbstractScope(&mu_for_nodes_[node]); | |||
| } | |||
| private: | |||
| std::map<const AnfNode *, std::recursive_mutex> mu_for_nodes_; | |||
| std::recursive_mutex mu_; | |||
| }; | |||
| } // namespace | |||
| AbstractScope::AbstractScope(std::recursive_mutex *mu) { | |||
| MS_EXCEPTION_IF_NULL(mu); | |||
| mu_ = mu; | |||
| mu_->lock(); | |||
| } | |||
| AbstractScope::AbstractScope(AbstractScope &&other) { | |||
| mu_ = other.mu_; | |||
| other.mu_ = nullptr; | |||
| } | |||
| AbstractScope &AbstractScope::operator=(AbstractScope &&other) { | |||
| mu_ = other.mu_; | |||
| other.mu_ = nullptr; | |||
| return *this; | |||
| } | |||
| AbstractScope::~AbstractScope() { | |||
| if (mu_ != nullptr) { | |||
| mu_->unlock(); | |||
| } | |||
| } | |||
| AbstractScope AnfUtils::GetAbstractLock(const AnfNode *node) { | |||
| return AbstractMutexManager::GetInstance().GetAbstractLock(node); | |||
| } | |||
| bool AnfUtils::IsDimUnknown(const abstract::ShapePtr &shape) { | |||
| MS_EXCEPTION_IF_NULL(shape); | |||
| return std::any_of(shape->shape().begin(), shape->shape().end(), [](int64_t s) { return s < -1; }); | |||
| @@ -112,20 +157,18 @@ bool AnfUtils::IsRealKernel(const AnfNodePtr &node) { | |||
| auto kernel_info = cnode->kernel_info(); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (runtime_cache->is_real_kernel() != CacheBool::UNCACHED) { | |||
| return (runtime_cache->is_real_kernel() == CacheBool::TRUE); | |||
| if (runtime_cache.runtime_cache().is_real_kernel() != CacheBool::UNCACHED) { | |||
| return (runtime_cache.runtime_cache().is_real_kernel() == CacheBool::TRUE); | |||
| } | |||
| } | |||
| bool res = !IsOneOfPrimitive(cnode->input(kAnfPrimitiveIndex), virtual_prims); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| MS_EXCEPTION_IF_NULL(runtime_cache); | |||
| if (res) { | |||
| runtime_cache->set_real_kernel(CacheBool::TRUE); | |||
| runtime_cache.runtime_cache().set_real_kernel(CacheBool::TRUE); | |||
| } else { | |||
| runtime_cache->set_real_kernel(CacheBool::FALSE); | |||
| runtime_cache.runtime_cache().set_real_kernel(CacheBool::FALSE); | |||
| } | |||
| } | |||
| @@ -175,10 +218,15 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) { | |||
| MS_LOG(EXCEPTION) << "Only cnode has real input, but this anf is " << node->DebugString() | |||
| << trace::DumpSourceLines(node); | |||
| } | |||
| ssize_t input_tensor_num = cnode->input_tensor_num(); | |||
| if (input_tensor_num >= 0) { | |||
| return static_cast<size_t>(input_tensor_num); | |||
| { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| ssize_t input_tensor_num = cnode->input_tensor_num(); | |||
| if (input_tensor_num >= 0) { | |||
| return static_cast<size_t>(input_tensor_num); | |||
| } | |||
| } | |||
| size_t input_num = cnode->inputs().size(); | |||
| if (input_num == 0) { | |||
| MS_LOG(EXCEPTION) << "Cnode inputs size can't be zero" << trace::DumpSourceLines(node); | |||
| @@ -191,6 +239,8 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) { | |||
| auto &inputs = cnode->inputs(); | |||
| // Search monad inputs, backward. | |||
| for (auto iter = inputs.rbegin(); iter != inputs.rend(); ++iter) { | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| if (!HasAbstractMonad(*iter)) { | |||
| // Stop count if we encounter a non-monad input. | |||
| break; | |||
| @@ -198,6 +248,8 @@ size_t AnfUtils::GetInputTensorNum(const AnfNodePtr &node) { | |||
| --input_num; | |||
| } | |||
| } | |||
| // cppcheck-suppress unreadVariable | |||
| auto lock = AnfUtils::GetAbstractLock(node.get()); | |||
| cnode->set_input_tensor_num(static_cast<ssize_t>(input_num)); | |||
| return input_num; | |||
| } | |||
| @@ -207,8 +259,8 @@ size_t AnfUtils::GetOutputTensorNum(const AnfNodePtr &node) { | |||
| auto kernel_info = node->kernel_info(); | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| if (runtime_cache->is_valid()) { | |||
| ssize_t output_tensor_num = runtime_cache->output_tensor_num(); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| ssize_t output_tensor_num = runtime_cache.runtime_cache().output_tensor_num(); | |||
| if (output_tensor_num >= 0) { | |||
| return static_cast<size_t>(output_tensor_num); | |||
| } | |||
| @@ -231,8 +283,8 @@ size_t AnfUtils::GetOutputTensorNum(const AnfNodePtr &node) { | |||
| if (kernel_info) { | |||
| auto runtime_cache = kernel_info->runtime_cache(); | |||
| if (runtime_cache->is_valid()) { | |||
| runtime_cache->set_output_tensor_num(static_cast<ssize_t>(res)); | |||
| if (runtime_cache.runtime_cache().is_valid()) { | |||
| runtime_cache.runtime_cache().set_output_tensor_num(static_cast<ssize_t>(res)); | |||
| } | |||
| } | |||
| return res; | |||
| @@ -25,6 +25,19 @@ | |||
| #include "ir/primitive.h" | |||
| namespace mindspore { | |||
| class AbstractScope { | |||
| public: | |||
| explicit AbstractScope(std::recursive_mutex *mu); | |||
| AbstractScope(const AbstractScope &other) = delete; | |||
| AbstractScope operator=(const AbstractScope &other) = delete; | |||
| AbstractScope(AbstractScope &&other); | |||
| AbstractScope &operator=(AbstractScope &&other); | |||
| ~AbstractScope(); | |||
| private: | |||
| std::recursive_mutex *mu_; | |||
| }; | |||
| class AnfUtils { | |||
| public: | |||
| static bool IsDimUnknown(const abstract::ShapePtr &shape); | |||
| @@ -52,6 +65,7 @@ class AnfUtils { | |||
| static void SetDumpFlag(const AnfNodePtr &node); | |||
| // Get dump flag from CNode's primitive. | |||
| static bool GetDumpFlag(const AnfNodePtr &node); | |||
| static AbstractScope GetAbstractLock(const AnfNode *node); | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CORE_UTILS_ANF_UTILS_H_ | |||
| @@ -182,6 +182,13 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/profiler/device/ascend/*.cc" | |||
| "../../../mindspore/ccsrc/profiler/device/profiling.cc" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/adam_fp32.c" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/kernel.cc" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/ascend_kernel_mod.cc" | |||
| "../../../mindspore/ccsrc/backend/optimizer/common/helper.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/executor/tiling/op_tiling_adapter.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/executor/aicpu_ext_info_handle.cc" | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/ge_types_convert.cc" | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/aicpu/aicpu_util.cc" | |||
| ) | |||
| if(ENABLE_SECURITY) | |||
| @@ -230,6 +237,24 @@ add_dependencies(_ut_ut_obj engine-cache-server graph) | |||
| add_executable(ut_tests $<TARGET_OBJECTS:_ut_ut_obj> | |||
| $<TARGET_OBJECTS:_ut_mindspore_obj>) | |||
| include_directories("${CMAKE_BINARY_DIR}/backend/kernel_compiler/aicpu") | |||
| file(GLOB_RECURSE PROTO_IN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/backend/kernel_compiler/aicpu/proto/*.proto") | |||
| ms_protobuf_generate(PROTOSRCS PROTOHDRS ${PROTO_IN}) | |||
| file(GLOB_RECURSE PROTO_DUMP RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||
| "../../../mindspore/ccsrc/runtime/device/ascend/dump/proto/*.proto") | |||
| ms_protobuf_generate(DUMP_PROTOSRCS PROTOHDRS ${PROTO_DUMP}) | |||
| list(APPEND MINDSPORE_PROTO_LIST ${PROTOSRCS}) | |||
| list(APPEND MINDSPORE_PROTO_LIST ${PREDICT_PROTOSRCS}) | |||
| list(APPEND MINDSPORE_PROTO_LIST ${DUMP_PROTOSRCS}) | |||
| if(MINDSPORE_PROTO_LIST) | |||
| add_library(proto_input_ut STATIC ${MINDSPORE_PROTO_LIST}) | |||
| set_target_properties(proto_input_ut PROPERTIES COMPILE_FLAGS "-Wno-unused-variable") | |||
| endif() | |||
| if(ENABLE_GE) | |||
| if(ENABLE_TRAIN) | |||
| target_link_libraries(ut_tests PRIVATE graph ge_runner) | |||
| @@ -0,0 +1,75 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "register/op_tiling_info.h" | |||
| #include "register/op_tiling.h" | |||
| namespace optiling { | |||
| using std::make_shared; | |||
| extern "C" ge::graphStatus OpParaCalculateV2(const ge::Operator &op, OpRunInfoV2 &run_info) { | |||
| return ge::GRAPH_SUCCESS; | |||
| } | |||
| namespace utils { | |||
| OpRunInfo::OpRunInfo() {} | |||
| OpRunInfo::OpRunInfo(const uint32_t &block_dim, const bool &clear_atomic, const uint64_t &tiling_key) {} | |||
| OpRunInfo::OpRunInfo(const OpRunInfo &runinfo) {} | |||
| OpRunInfo::OpRunInfo(OpRunInfo &&runinfo) {} | |||
| OpRunInfo &OpRunInfo::operator=(const OpRunInfo &runinfo) { return *this; } | |||
| OpRunInfo &OpRunInfo::operator=(OpRunInfo &&runinfo) { return *this; } | |||
| void OpRunInfo::SetBlockDim(const uint32_t &block_dim) { return; } | |||
| uint32_t OpRunInfo::GetBlockDim() const { return 0; } | |||
| void OpRunInfo::AddWorkspace(const int64_t &workspace) { return; } | |||
| size_t OpRunInfo::GetWorkspaceNum() const { return 0; } | |||
| ge::graphStatus OpRunInfo::GetWorkspace(const size_t &idx, int64_t &workspace) const { return ge::GRAPH_SUCCESS; } | |||
| void OpRunInfo::GetAllWorkspaces(std::vector<int64_t> &workspaces) const { return; } | |||
| void OpRunInfo::SetWorkspaces(const std::vector<int64_t> &workspaces) { return; } | |||
| void OpRunInfo::InternelSetTiling(const ByteBuffer &value) { return; } | |||
| void OpRunInfo::AddTilingData(const char *_value, size_t _size) { return; } | |||
| ByteBuffer &OpRunInfo::GetAllTilingData() { | |||
| std::shared_ptr<ByteBuffer> tiling_data = std::make_shared<ByteBuffer>(); | |||
| return *tiling_data; | |||
| } | |||
| const ByteBuffer &OpRunInfo::GetAllTilingData() const { | |||
| std::shared_ptr<ByteBuffer> tiling_data = std::make_shared<ByteBuffer>(); | |||
| return *tiling_data; | |||
| } | |||
| void OpRunInfo::SetClearAtomic(bool clear_atomic_input) { return; } | |||
| bool OpRunInfo::GetClearAtomic() const { return true; } | |||
| void OpRunInfo::SetTilingKey(const uint64_t &new_tiling_key) { return; } | |||
| uint64_t OpRunInfo::GetTilingKey() const { return 0; } | |||
| } // namespace utils | |||
| } // namespace optiling | |||
| @@ -211,3 +211,9 @@ RTS_API rtError_t rtMemGetInfoEx(rtMemInfoType_t memInfoType, size_t *free, size | |||
| RTS_API rtError_t rtProfRegisterCtrlCallback(uint32_t moduleId, rtProfCtrlHandle callback) { return RT_ERROR_NONE; } | |||
| RTS_API rtError_t rtGetRtCapability(rtFeatureType_t, int32_t, int64_t *) { return RT_ERROR_NONE; } | |||
| RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, | |||
| uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream, | |||
| const void *kernelInfo) { | |||
| return RT_ERROR_NONE; | |||
| } | |||