@@ -289,14 +289,14 @@ bool CreateNodeDefBytes(const std::shared_ptr<AnfNode> &anf_node,
   return true;
 }
 
-uint64_t SetExtInfoShapeType(char *ext_info_buf, uint64_t ext_info_offset) {
+uint64_t SetExtInfoShapeType(char *ext_info_buf, uint64_t ext_info_offset, UnknowShapeOpType type) {
   // deal1: unknown shape type
   auto *info = reinterpret_cast<ExtInfo *>(ext_info_buf + ext_info_offset);
   info->infoType = FWK_ADPT_EXT_SHAPE_TYPE;
   info->infoLen = sizeof(int32_t);
   ext_info_offset += kExtInfoHeadSize;
   auto *shape_type = reinterpret_cast<int32_t *>(ext_info_buf + ext_info_offset);
-  *shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
+  *shape_type = type;
   ext_info_offset += info->infoLen;
   return ext_info_offset;
 }
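For context, the ext-info blob consumed by the AICPU adapter is a sequence of (infoType, infoLen, payload) records, which is why every Set* helper above advances ext_info_offset by kExtInfoHeadSize plus the payload length. A minimal self-contained sketch of that record layout, assuming a plain two-int32 header (the real ExtInfo struct and its packing are not shown in this patch):

#include <cstdint>
#include <cstring>

// Illustrative TLV-style record header; the field packing is an assumption.
struct ExtInfoRecordSketch {
  int32_t infoType;  // tag, e.g. FWK_ADPT_EXT_SHAPE_TYPE
  int32_t infoLen;   // payload length in bytes
};

// Append one int32-payload record and return the new write offset, mirroring how
// SetExtInfoShapeType advances ext_info_offset by the head size plus infoLen.
uint64_t AppendInt32Record(char *buf, uint64_t offset, int32_t tag, int32_t value) {
  ExtInfoRecordSketch head{tag, static_cast<int32_t>(sizeof(int32_t))};
  std::memcpy(buf + offset, &head, sizeof(head));
  offset += sizeof(head);
  std::memcpy(buf + offset, &value, sizeof(value));
  return offset + sizeof(value);
}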
@@ -401,7 +401,11 @@ bool CreateExtInfo(const std::shared_ptr<AnfNode> &anf_node, const std::shared_p
   ext_info.resize(ext_info_len, 0);
   char *ext_info_buf = ext_info.data();
 
-  ext_info_offset = SetExtInfoShapeType(ext_info_buf, ext_info_offset);
+  UnknowShapeOpType shape_type = UnknowShapeOpType::DEPEND_IN_SHAPE;
+  if (AnfAlgo::GetCNodeName(anf_node) == "Unique") {
+    shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
+  }
+  ext_info_offset = SetExtInfoShapeType(ext_info_buf, ext_info_offset, shape_type);
   ext_info_offset = SetExtInfoInputShapeType(ext_info_buf, ext_info_offset, anf_node, input_num);
   ext_info_offset = SetExtInfoOutputShapeType(ext_info_buf, ext_info_offset, anf_node, output_num);
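The shape type is now chosen per operator: most dynamic ops can size their outputs from input shapes before launch, while Unique only knows its output length after it has run. A hedged sketch of the selection; only the two enumerator names appear in this patch, the values below are assumptions:

#include <cstdint>
#include <string>

// Sketch of the dispatch; enumerator meanings inferred from this patch, values assumed.
enum ShapeTypeSketch : int32_t {
  kDependInShape = 1,  // output shape computable from input shapes before launch
  kDependCompute = 4,  // output shape known only after the kernel has run
};

ShapeTypeSketch SelectShapeType(const std::string &op_name) {
  // Unique's first output holds the distinct input elements, so its length
  // cannot be derived from the input shape alone.
  return op_name == "Unique" ? kDependCompute : kDependInShape;
}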
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <map>
 #include <set>
+#include <stack>
 #include "ir/anf.h"
 #include "ir/func_graph.h"
 #include "base/core_ops.h"
@@ -30,6 +31,7 @@
 #include "backend/kernel_compiler/kernel_build_info.h"
 #include "common/trans.h"
 #include "abstract/param_validator.h"
+#include "abstract/primitive_infer_map.h"
 #include "pipeline/jit/static_analysis/static_analysis.h"
 #include "utils/trace_base.h"
@@ -820,6 +822,8 @@ DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableWorkspaceAddr(const AnfNodePtr &
 void AnfRuntimeAlgorithm::SetOutputInferTypeAndShape(const std::vector<TypeId> &types,
                                                      const std::vector<std::vector<size_t>> &shapes, AnfNode *node) {
   MS_EXCEPTION_IF_NULL(node);
+  auto node_ptr = node->cast<AnfNodePtr>();
+  MS_EXCEPTION_IF_NULL(node_ptr);
   if (types.size() != shapes.size()) {
     MS_LOG(EXCEPTION) << "Types size " << types.size() << " should be same with shapes size " << shapes.size()
                       << " trace: " << trace::DumpSourceLines(node);
@@ -829,16 +833,23 @@ void AnfRuntimeAlgorithm::SetOutputInferTypeAndShape(const std::vector<TypeId> &
   } else if (shapes.size() == 1) {
     // single output handle
     ShapeVector shape_int;
+    auto max_shape = GetOutputMaxShape(node_ptr, 0);
+    auto min_shape = GetOutputMinShape(node_ptr, 0);
     std::transform(shapes[0].begin(), shapes[0].end(), std::back_inserter(shape_int), SizeToLong);
-    auto abstract = std::make_shared<AbstractTensor>(TypeIdToType(types[0]), shape_int);
+    auto abstract = std::make_shared<AbstractTensor>(
+      TypeIdToType(types[0]), std::make_shared<abstract::Shape>(shape_int, min_shape, max_shape));
     node->set_abstract(abstract);
   } else {
     // multiple output handle
     std::vector<AbstractBasePtr> abstract_list;
     for (size_t i = 0; i < types.size(); ++i) {
       ShapeVector shape_int;
+      auto max_shape = GetOutputMaxShape(node_ptr, i);
+      auto min_shape = GetOutputMinShape(node_ptr, i);
       std::transform(shapes[i].begin(), shapes[i].end(), std::back_inserter(shape_int), SizeToLong);
-      abstract_list.emplace_back(std::make_shared<AbstractTensor>(TypeIdToType(types[i]), shape_int));
+      auto abstract = std::make_shared<AbstractTensor>(
+        TypeIdToType(types[i]), std::make_shared<abstract::Shape>(shape_int, min_shape, max_shape));
+      abstract_list.emplace_back(abstract);
     }
     auto abstract_tuple = std::make_shared<AbstractTuple>(abstract_list);
     node->set_abstract(abstract_tuple);
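With dynamic shapes, an output's abstract now carries min/max bounds alongside the (possibly -1-valued) inferred dims, via the abstract::Shape(shape, min_shape, max_shape) constructor used above. A simplified stand-in for what such a bounded shape holds:

#include <cstdint>
#include <vector>

// Simplified stand-in for abstract::Shape with bounds (not the real class).
struct BoundedShapeSketch {
  std::vector<int64_t> shape;      // -1 marks a dynamic dimension
  std::vector<int64_t> min_shape;  // per-dimension lower bound
  std::vector<int64_t> max_shape;  // per-dimension upper bound, usable for worst-case allocation
};

// Unique over a length-6 vector: the first output has between 1 and 6 elements.
const BoundedShapeSketch kUniqueOutput{{-1}, {1}, {6}};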
@@ -1409,7 +1420,7 @@ std::vector<int64_t> AnfRuntimeAlgorithm::GetOutputMinShape(const AnfNodePtr &an
   }
 }
 
-bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
+bool IsNodeOutputDynamicShape(const CNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);
   auto base_shape = node->Shape();
   if (base_shape == nullptr) {
@@ -1436,6 +1447,66 @@ bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
   return false;
 }
 
+bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) {
+  MS_EXCEPTION_IF_NULL(anf_node_ptr);
+  auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr);
+  for (size_t i = 0; i < input_num; ++i) {
+    auto input_with_index = AnfAlgo::GetPrevNodeOutput(anf_node_ptr, i);
+    auto input = input_with_index.first;
+    auto index = input_with_index.second;
+    MS_EXCEPTION_IF_NULL(input);
+    auto base_shape = input->Shape();
+    if (base_shape == nullptr) {
+      MS_LOG(INFO) << "Invalid shape ptr, node:" << input->fullname_with_scope();
+      continue;
+    }
+    if (base_shape->isa<abstract::Shape>()) {
+      if (IsShapeDynamic(base_shape->cast<abstract::ShapePtr>())) {
+        return true;
+      }
+    } else if (base_shape->isa<abstract::TupleShape>()) {
+      auto tuple_shape = base_shape->cast<abstract::TupleShapePtr>();
+      MS_EXCEPTION_IF_NULL(tuple_shape);
+      if (index >= tuple_shape->size()) {
+        MS_LOG(INFO) << "Node:" << anf_node_ptr->fullname_with_scope() << " Invalid index:" << index
+                     << " and tuple_shape size:" << tuple_shape->size();
+        continue;
+      }
+      auto b_shp = (*tuple_shape)[index];
+      if (!b_shp->isa<abstract::Shape>()) {
+        continue;
+      }
+      if (IsShapeDynamic(b_shp->cast<abstract::ShapePtr>())) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AnfRuntimeAlgorithm::IsNodeDynamicShape(const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  if (!node->isa<CNode>()) {
+    MS_LOG(WARNING) << "Node is not a cnode";
+    return false;
+  }
+  auto cnode = node->cast<CNodePtr>();
+  auto in_dynamic = IsNodeInputDynamicShape(cnode);
+  auto out_dynamic = IsNodeOutputDynamicShape(cnode);
+  if (in_dynamic && !AnfAlgo::HasNodeAttr(kAttrInputIsDynamicShape, cnode)) {
+    AnfAlgo::SetNodeAttr(kAttrInputIsDynamicShape, MakeValue(true), cnode);
+    MS_LOG(INFO) << "Set Input Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
+  }
+  if (out_dynamic && !AnfAlgo::HasNodeAttr(kAttrOutputIsDynamicShape, cnode)) {
+    AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(true), cnode);
+    MS_LOG(INFO) << "Set Output Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
+  }
+  return in_dynamic || out_dynamic;
+}
 
 std::vector<size_t> AnfRuntimeAlgorithm::GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index) {
   auto device_shape = GetInputDeviceShape(anf_node, index);
   // Initialize GPUKernel with max shape to fit 'InitDynamicOutputKernelRef()' for memory reuse.
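IsShapeDynamic itself is not part of this patch; judging by the session-side IsDynamicShape that this patch deletes further down, it presumably just scans for a negative dimension. A sketch under that assumption:

#include <algorithm>
#include <cstdint>
#include <vector>

// Assumed behavior of IsShapeDynamic: any negative dimension means dynamic.
bool IsShapeDynamicSketch(const std::vector<int64_t> &shape) {
  return std::any_of(shape.begin(), shape.end(), [](int64_t s) { return s < 0; });
}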
@@ -1500,5 +1571,50 @@ void AnfRuntimeAlgorithm::GetAllFatherRealNode(const AnfNodePtr &anf_node, std::
     GetAllFatherRealNode(cnode->input(kDependAttachNodeIndex), result, visited);
   }
 }
 
+void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  MS_LOG(INFO) << "InferShape start, node:" << node->DebugString();
+  auto inputs = node->inputs();
+  if (inputs.empty()) {
+    MS_LOG(EXCEPTION) << "Invalid inputs";
+  }
+  AbstractBasePtrList args_spec_list;
+  auto primitive = GetValueNode<PrimitivePtr>(inputs[0]);
+  auto input_size = AnfAlgo::GetInputTensorNum(node);
+  for (size_t i = 0; i < input_size; ++i) {
+    auto input_with_index = AnfAlgo::GetPrevNodeOutput(node, i);
+    auto real_input = input_with_index.first;
+    MS_EXCEPTION_IF_NULL(real_input);
+    auto cnode_input = node->input(i + 1);
+    MS_EXCEPTION_IF_NULL(cnode_input);
+    if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) {
+      auto base_shape = real_input->Shape();
+      if (!base_shape->isa<abstract::TupleShape>()) {
+        MS_LOG(EXCEPTION) << "Node:" << node->DebugString()
+                          << " input is a tuple_get_item but real input node shape is not a TupleShape";
+      }
+      auto tuple_ptr = base_shape->cast<abstract::TupleShapePtr>();
+      MS_EXCEPTION_IF_NULL(tuple_ptr);
+      auto tuple_get_item_index = AnfAlgo::GetTupleGetItemOutIndex(cnode_input->cast<CNodePtr>());
+      auto real_shape = tuple_ptr->shape().at(tuple_get_item_index);
+      auto abstract_tensor = cnode_input->abstract()->cast<abstract::AbstractTensorPtr>();
+      MS_EXCEPTION_IF_NULL(abstract_tensor);
+      args_spec_list.emplace_back(std::make_shared<abstract::AbstractTensor>(abstract_tensor->element(), real_shape));
+    } else if (cnode_input->isa<CNode>() && AnfAlgo::GetCNodeName(cnode_input) == prim::kPrimReshape->name()) {
+      args_spec_list.emplace_back(cnode_input->abstract());
+    } else {
+      args_spec_list.emplace_back(real_input->abstract());
+    }
+  }
+  auto &prim_eval_implement_map = abstract::GetPrimitiveToEvalImplMap();
+  auto ret = prim_eval_implement_map.find(primitive);
+  if (ret == prim_eval_implement_map.end()) {
+    MS_LOG(EXCEPTION) << "Get infer shape function failed, primitive name:" << primitive->name()
+                      << " primitive type:" << primitive->type_name();
+  }
+  auto eval_result = ret->second.impl_(nullptr, primitive, args_spec_list);
+  node->set_abstract(eval_result);
+}
 }  // namespace session
 }  // namespace mindspore
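AnfRuntimeAlgorithm::InferShape re-runs frontend shape inference at execution time: it rebuilds the argument abstracts (resolving a tuple_get_item input to the right element of its producer's TupleShape) and then dispatches to the primitive's registered infer function. A toy model of that registry-dispatch pattern; the real map is keyed by PrimitivePtr and stores a struct whose impl_ member is invoked, so the names below are illustrative:

#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

using ShapeVec = std::vector<long long>;
using InferFn = std::function<ShapeVec(const std::vector<ShapeVec> &)>;

// Toy registry: primitive name -> shape inference function.
std::map<std::string, InferFn> &InferRegistry() {
  static std::map<std::string, InferFn> registry;
  return registry;
}

ShapeVec InferShapeFor(const std::string &prim, const std::vector<ShapeVec> &input_shapes) {
  auto it = InferRegistry().find(prim);
  if (it == InferRegistry().end()) {
    // Mirrors the MS_LOG(EXCEPTION) path when no infer function is registered.
    throw std::runtime_error("no infer function registered for " + prim);
  }
  // Mirrors ret->second.impl_(nullptr, primitive, args_spec_list).
  return it->second(input_shapes);
}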
@@ -230,6 +230,7 @@ class AnfRuntimeAlgorithm {
   static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index);
   static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index);
   static bool IsNodeDynamicShape(const AnfNodePtr &node);
+  static void InferShape(const CNodePtr &node);
   static std::vector<size_t> GetInputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
   static std::vector<size_t> GetOutputRealDeviceShapeIfExist(const AnfNodePtr &anf_node, size_t index);
   // Find control_depend real input nodes.
@@ -65,6 +65,8 @@ GraphId CPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtr
   auto graph_id = graph_sum_;
   auto graph = ConstructKernelGraph(lst, outputs);
   MS_EXCEPTION_IF_NULL(graph);
+  UpdateGraphDynamicShapeAttr(NOT_NULL(graph));
+  graph->UpdateGraphDynamicAttr();
   MS_LOG(INFO) << "Set kernel info";
   SetKernelInfo(graph.get());
 #if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
@@ -87,7 +89,7 @@ void CPUSession::CreateOutputTensors(const GraphId &graph_id, const std::vector<
                                      std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
   auto kernel_graph = GetGraph(graph_id);
   MS_EXCEPTION_IF_NULL(kernel_graph);
-  runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs);
+  runtime_.CreateOutputTensors(kernel_graph.get(), input_tensors, outputs, tensor_to_node);
 }
 
 void CPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
@@ -47,6 +47,41 @@ static std::shared_ptr<std::map<ValuePtr, ParameterPtr>> python_paras;
 void ClearPythonParasMap() { python_paras = nullptr; }
 namespace {
 const int kSummaryGetItem = 2;
+
+bool IsUsedByRealKernel(const FuncGraphManagerPtr &manager, const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(manager);
+  MS_EXCEPTION_IF_NULL(node);
+  auto node_users = manager->node_users()[node];
+  for (auto item : node_users) {
+    if (AnfAlgo::IsRealKernel(item.first)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsUsedByDynamicKernel(const FuncGraphManagerPtr &manager, const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(manager);
+  MS_EXCEPTION_IF_NULL(node);
+  auto node_users = manager->node_users()[node];
+  for (auto item : node_users) {
+    if (item.first->isa<CNode>() && AnfAlgo::IsNodeDynamicShape(item.first->cast<CNodePtr>())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CheckIfNeedCreateOutputTensor(const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  if (node->isa<Parameter>()) {
+    auto node_ptr = node->cast<ParameterPtr>();
+    MS_EXCEPTION_IF_NULL(node_ptr);
+    if (!node_ptr->is_used_by_real_kernel()) {
+      return true;
+    }
+  }
+  return false;
+}
 
 ValuePtr GetParamDefaultValue(const AnfNodePtr &node) {
   if (node == nullptr) {
@@ -114,6 +149,8 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair,
   MS_EXCEPTION_IF_NULL(node);
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(tensor_to_node);
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
   MS_LOG(INFO) << "Create tensor for output[" << node->DebugString() << "] index[" << node_output_pair.second << "]";
   // if node is a value node, no need sync addr from device to host
   if (node->isa<ValueNode>()) {
@@ -121,7 +158,8 @@ BaseRef CreateNodeOutputTensor(const session::KernelWithIndex &node_output_pair,
     MS_EXCEPTION_IF_NULL(value_node);
     return value_node->value();
   }
-  if (!AnfAlgo::OutputAddrExist(node, output_index)) {
+  if (!AnfAlgo::OutputAddrExist(node, output_index) ||
+      (CheckIfNeedCreateOutputTensor(node) && ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode)) {
     if (node->isa<Parameter>()) {
       for (size_t input_idx = 0; input_idx < graph->inputs().size(); input_idx++) {
         if (input_idx >= input_tensors.size()) {
@@ -875,9 +913,21 @@ KernelGraphPtr SessionBasic::ConstructKernelGraph(const AnfNodePtrList &lst, con
   // Update Graph Dynamic Shape Attr
   UpdateGraphDynamicShapeAttr(NOT_NULL(graph));
   opt::BackendCommonOptimization(graph);
+  graph->SetInputNodes();
+  auto input_nodes = graph->input_nodes();
+  for (auto input_node : input_nodes) {
+    if (input_node->isa<Parameter>()) {
+      auto node_ptr = input_node->cast<ParameterPtr>();
+      MS_EXCEPTION_IF_NULL(node_ptr);
+      if (!IsUsedByRealKernel(manager, input_node)) {
+        node_ptr->set_used_by_real_kernel();
+      }
+      if (IsUsedByDynamicKernel(manager, input_node)) {
+        node_ptr->set_used_by_dynamic_kernel();
+      }
+    }
+  }
   graph->SetOptimizerFlag();
   return graph;
 }
@@ -950,7 +1000,22 @@ std::shared_ptr<KernelGraph> SessionBasic::ConstructKernelGraph(const FuncGraphP
       MS_LOG_EXCEPTION << "construct func graph " << func_graph->ToString() << " fail!";
     }
   }
   AddParameterToGraphInputs(func_graph->parameters(), graph.get());
+  FuncGraphManagerPtr manager = MakeManager({graph});
+  auto input_nodes = graph->inputs();
+  for (auto input_node : input_nodes) {
+    if (input_node->isa<Parameter>()) {
+      auto node_ptr = input_node->cast<ParameterPtr>();
+      MS_EXCEPTION_IF_NULL(node_ptr);
+      if (!IsUsedByRealKernel(manager, input_node)) {
+        node_ptr->set_used_by_real_kernel();
+      }
+      if (IsUsedByDynamicKernel(manager, input_node)) {
+        node_ptr->set_used_by_dynamic_kernel();
+      }
+    }
+  }
   graph->SetExecOrderByDefault();
   if (ExistSummaryNode(graph.get())) {
     graph->set_summary_node_exist(true);
@@ -1021,14 +1086,23 @@ void SessionBasic::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_grap
     MS_EXCEPTION_IF_NULL(tensor);
     auto input_node = input_nodes[i];
     MS_EXCEPTION_IF_NULL(input_node);
+    auto size = LongToSize(tensor->data().nbytes());
+    if (input_node->isa<Parameter>() && input_node->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
+      auto tensor_shape = tensor->shape();
+      std::vector<size_t> shape_tmp;
+      (void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
+      AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
+                                          input_node.get());
+      size = trans::ShapeSize(shape_tmp) * trans::TypeIdSize(tensor->data_type());
+    }
     if (input_node->isa<Parameter>() && AnfAlgo::OutputAddrExist(input_node, 0) && TensorNeedSync(input_node, tensor)) {
       auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
       MS_EXCEPTION_IF_NULL(device_address);
-      if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0),
-                                            LongToSize(tensor->data().nbytes()), tensor->data_type(),
-                                            tensor->data_c())) {
+      if (size != 0 && !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size,
+                                                         tensor->data_type(), tensor->data_c())) {
         MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
       }
       if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
           AnfAlgo::IsParameterWeight(input_node->cast<ParameterPtr>())) {
         tensor->set_device_address(device_address);
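For a parameter feeding a dynamic kernel, the graph's compile-time shape may be a placeholder, so LoadInputData refreshes the parameter's abstract from the actual host tensor and recomputes the copy size as element count times element width instead of trusting tensor->data().nbytes(). Presumably the size != 0 guard exists because TypeIdSize reports unsupported dtypes as 0, turning them into a skipped sync rather than a hard failure. A condensed sketch with stand-in helpers:

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Stand-in for trans::ShapeSize: product of dims, 1 for a scalar.
size_t ShapeSizeSketch(const std::vector<size_t> &shape) {
  return std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());
}

// Bytes to sync for a (possibly reshaped) dynamic parameter; elem_size plays the
// role of trans::TypeIdSize(dtype) and is 0 for unsupported dtypes.
size_t SyncBytesSketch(const std::vector<size_t> &actual_shape, size_t elem_size) {
  return ShapeSizeSketch(actual_shape) * elem_size;  // 0 means "skip the copy"
}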
@@ -1543,55 +1617,6 @@ void SessionBasic::RunGraphAsync(const GraphId &graph_id, const std::vector<tens
   executor_->RunGraphAsync(shared_from_this(), graph_id, inputs, outputs);
 }
 
-bool IsDynamicShape(const NotNull<abstract::ShapePtr> &shape) {
-  return std::any_of(shape->shape().begin(), shape->shape().end(), [](int64_t s) { return s < 0; });
-}
-
-bool IsNodeOutputDynamicShape(const CNodePtr &anf_node_ptr) {
-  MS_EXCEPTION_IF_NULL(anf_node_ptr);
-  return AnfAlgo::IsNodeDynamicShape(anf_node_ptr);
-}
-
-bool IsNodeInputDynamicShape(const CNodePtr &anf_node_ptr) {
-  MS_EXCEPTION_IF_NULL(anf_node_ptr);
-  auto input_num = AnfAlgo::GetInputTensorNum(anf_node_ptr);
-  for (size_t i = 0; i < input_num; ++i) {
-    auto input_with_index = AnfAlgo::GetPrevNodeOutput(anf_node_ptr, i);
-    auto input = input_with_index.first;
-    auto index = input_with_index.second;
-    MS_EXCEPTION_IF_NULL(input);
-    auto base_shape = input->Shape();
-    if (base_shape == nullptr) {
-      MS_LOG(INFO) << "Invalid shape ptr, node:" << input->fullname_with_scope();
-      continue;
-    }
-    if (base_shape->isa<abstract::Shape>()) {
-      if (IsDynamicShape(NOT_NULL(base_shape->cast<abstract::ShapePtr>()))) {
-        return true;
-      }
-    } else if (base_shape->isa<abstract::TupleShape>()) {
-      auto tuple_shape = base_shape->cast<abstract::TupleShapePtr>();
-      MS_EXCEPTION_IF_NULL(tuple_shape);
-      if (index >= tuple_shape->size()) {
-        MS_LOG(INFO) << "Node:" << anf_node_ptr->fullname_with_scope() << "Invalid index:" << index
-                     << " and tuple_shape size:" << tuple_shape->size();
-        continue;
-      }
-      auto b_shp = (*tuple_shape)[index];
-      if (!b_shp->isa<abstract::Shape>()) {
-        continue;
-      }
-      if (IsDynamicShape(NOT_NULL(b_shp->cast<abstract::ShapePtr>()))) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 void SessionBasic::UpdateAllGraphDynamicShapeAttr(const std::vector<KernelGraphPtr> &all_graphs) {
   bool is_dynamic = false;
   for (const auto &graph : all_graphs) {
@@ -1605,20 +1630,10 @@ void SessionBasic::UpdateAllGraphDynamicShapeAttr(const std::vector<KernelGraphP
 void SessionBasic::UpdateGraphDynamicShapeAttr(const NotNull<KernelGraphPtr> &root_graph) {
   for (const auto &cnode : root_graph->execution_order()) {
-    auto output_dynamic = IsNodeOutputDynamicShape(NOT_NULL(cnode));
-    auto input_dynamic = IsNodeInputDynamicShape(NOT_NULL(cnode));
-    if (output_dynamic || input_dynamic) {
+    if (AnfAlgo::IsNodeDynamicShape(cnode)) {
       AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), cnode);
       MS_LOG(INFO) << "Set Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
     }
-    if (output_dynamic) {
-      AnfAlgo::SetNodeAttr(kAttrOutputIsDynamicShape, MakeValue(true), cnode);
-      MS_LOG(INFO) << "Set Output Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
-    }
-    if (input_dynamic) {
-      AnfAlgo::SetNodeAttr(kAttrInputIsDynamicShape, MakeValue(true), cnode);
-      MS_LOG(INFO) << "Set Input Dynamic Shape Attr to Node:" << cnode->fullname_with_scope();
-    }
   }
   root_graph->UpdateGraphDynamicAttr();
 }
@@ -532,7 +532,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
   }
   if (format_ == kOpFormat_NCHW || format_ == kOpFormat_DEFAULT || format_ == kOpFormat_NDHWC) {
     if (type_id_ == type) {
-      SyncMemory(ptr_, host_ptr, size_, RT_MEMCPY_HOST_TO_DEVICE);
+      SyncMemory(ptr_, host_ptr, size, RT_MEMCPY_HOST_TO_DEVICE);
       sync_ok = true;
     } else if (type_id_ == kNumberTypeFloat32 && type == kNumberTypeFloat64) {
       sync_ok = Float64ToFloatAndSyncHostToDevice(ptr_, size_, host_ptr, size);
@@ -66,11 +66,15 @@ void AiCpuDynamicKernel::Initialize() {
   input_num_ = AnfAlgo::GetInputTensorNum(cnode_ptr_);
   output_num_ = AnfAlgo::GetOutputTensorNum(cnode_ptr_);
 
+  UnknowShapeOpType shape_type = UnknowShapeOpType::DEPEND_IN_SHAPE;
+  if (AnfAlgo::GetCNodeName(cnode_ptr_) == "Unique") {
+    shape_type = UnknowShapeOpType::DEPEND_COMPUTE;
+  }
+
   // Parse aicpu ext info
   if (is_dynamic_shape_) {
     MS_EXCEPTION_IF_NULL(cnode_ptr_);
     ext_info_handler_ =
-      std::make_shared<AicpuExtInfoHandler>(cnode_ptr_->fullname_with_scope(), input_num_, output_num_, DEPEND_COMPUTE);
+      std::make_shared<AicpuExtInfoHandler>(cnode_ptr_->fullname_with_scope(), input_num_, output_num_, shape_type);
     ext_info_handler_->Parse(ext_info_data_);
   }
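Note that this runtime-side selection duplicates the build-time logic in CreateExtInfo above (see the sketch after that hunk); the two sites must agree, since the ext-info blob that AicpuExtInfoHandler parses here was serialized with the build-time choice.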
@@ -19,6 +19,7 @@
 #include <memory>
 #include <numeric>
 #include <utility>
+#include <algorithm>
 #include <functional>
 #include "backend/kernel_compiler/kernel.h"
 #include "runtime/device/cpu/cpu_device_address.h"
@@ -129,9 +130,11 @@ DeviceAddressPtr CPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t
   return std::make_shared<CPUDeviceAddress>(device_ptr, device_size, format, type_id);
 }
 
-tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_graph, const CNodePtr &node,
-                                                         size_t index) {
+tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(
+  session::KernelGraph *kernel_graph, const CNodePtr &node, size_t index,
+  std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
   MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(tensor_to_node);
   size_t output_size = AnfAlgo::GetOutputTensorNum(node);
   if (index >= output_size) {
     MS_LOG(EXCEPTION) << "Invalid input index " << index;
@@ -166,13 +169,16 @@ tensor::TensorPtr CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *k
     }
     (void)bound_addresses_.insert(address);
   }
+  session::KernelWithIndex node_index(node, index);
   tensor->SetNeedWait(true);
   tensor->SetIsGraphOutput();
+  (*tensor_to_node)[tensor] = node_index;
   return tensor;
 }
 
-BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_graph,
-                                               const session::KernelWithIndex &kernel_with_index) {
+BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_graph,
+                                               const session::KernelWithIndex &kernel_with_index,
+                                               std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
   auto &input_node = kernel_with_index.first;
   auto index = kernel_with_index.second;
   MS_EXCEPTION_IF_NULL(input_node);
@@ -183,12 +189,12 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_grap
       VectorRef ret;
       for (size_t i = 1; i < node->inputs().size(); i++) {
         auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node->input(i), 0);
-        auto out = CreatTensorForOutput(kernel_graph, item_with_index);
+        auto out = CreatTensorForOutput(kernel_graph, item_with_index, tensor_to_node);
         ret.push_back(out);
       }
       return ret;
     }
-    return CreatTensorForOutput(kernel_graph, node, index);
+    return CreatTensorForOutput(kernel_graph, node, index, tensor_to_node);
   } else if (input_node->isa<Parameter>()) {
     auto iter = input_param_tensor_map_.find(input_node);
     if (iter != input_param_tensor_map_.end()) {
@@ -203,9 +209,11 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(session::KernelGraph *kernel_grap
 }
 
 void CPUKernelRuntime::CreateOutputTensors(session::KernelGraph *kernel_graph,
-                                           const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
+                                           const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs,
+                                           std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node) {
   MS_EXCEPTION_IF_NULL(kernel_graph);
   MS_EXCEPTION_IF_NULL(outputs);
+  MS_EXCEPTION_IF_NULL(tensor_to_node);
   auto &input_nodes = kernel_graph->inputs();
   if (input_nodes.size() != inputs.size()) {
     MS_LOG(EXCEPTION) << "Input size not equal to input node size!";
@@ -222,7 +230,7 @@ void CPUKernelRuntime::CreateOutputTensors(session::KernelGraph *kernel_graph,
   auto output_nodes = kernel_graph->outputs();
   for (const auto &item : output_nodes) {
     auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true);
-    auto out = CreatTensorForOutput(kernel_graph, item_with_index);
+    auto out = CreatTensorForOutput(kernel_graph, item_with_index, tensor_to_node);
     outputs->push_back(std::move(out));
   }
 }
@@ -258,6 +266,12 @@ void CPUKernelRuntime::BindInputTensorAddressPtr(const session::KernelGraph &ker
         MS_LOG(EXCEPTION) << "Parameter node sync host to device failed!";
       }
     }
+    if (item->cast<ParameterPtr>()->is_used_by_dynamic_kernel()) {
+      auto tensor_shape = tensor->shape();
+      std::vector<size_t> shape_tmp;
+      (void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
+      AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(item, 0)}, {shape_tmp}, item.get());
+    }
     address->ref_count_ = INIT_NODE_REF;
     tensor->set_device_address(address);
   }
@@ -325,6 +339,9 @@ bool CPUKernelRuntime::Run(session::KernelGraph *kernel_graph, bool is_task_sink
 #ifdef ENABLE_PROFILE
     double start_time = GetTime();
 #endif
+    if (AnfAlgo::IsDynamicShape(kernel)) {
+      AnfAlgo::InferShape(kernel);
+    }
     std::vector<kernel::AddressPtr> kernel_inputs;
     std::vector<kernel::AddressPtr> kernel_workspaces;
     std::vector<kernel::AddressPtr> kernel_outputs;
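The CPU runtime now re-infers shapes immediately before launching any dynamic-shape kernel, so the input/workspace/output addresses gathered afterwards reflect run-time sizes. A toy model of that execute loop, not the real KernelRuntime API:

#include <functional>
#include <vector>

// Toy per-kernel dynamic execution: infer first, then launch.
struct ToyKernel {
  bool is_dynamic = false;
  std::function<void()> infer_shape;  // stands in for AnfAlgo::InferShape(kernel)
  std::function<void()> launch;       // stands in for the kernel launch that follows
};

void RunToyGraph(const std::vector<ToyKernel> &execution_order) {
  for (const auto &kernel : execution_order) {
    if (kernel.is_dynamic) {
      kernel.infer_shape();  // refresh output abstracts from current input shapes
    }
    kernel.launch();
  }
}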
@@ -39,7 +39,7 @@ class CPUKernelRuntime : public KernelRuntime {
   bool Run(session::KernelGraph *graph, bool is_task_sink) override;
   void AssignKernelAddress(session::KernelGraph *kernel_graph);
   void CreateOutputTensors(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
-                           VectorRef *outputs);
+                           VectorRef *outputs, std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
   void BindInputOutput(session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
                        VectorRef *outputs);
   void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
@@ -53,8 +53,10 @@ class CPUKernelRuntime : public KernelRuntime {
                                TypeId type_id) override;
 
  private:
-  tensor::TensorPtr CreatTensorForOutput(session::KernelGraph *kernel_graph, const CNodePtr &node, size_t index);
-  BaseRef CreatTensorForOutput(session::KernelGraph *kernel_graph, const session::KernelWithIndex &kernel_with_index);
+  tensor::TensorPtr CreatTensorForOutput(session::KernelGraph *kernel_graph, const CNodePtr &node, size_t index,
+                                         std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
+  BaseRef CreatTensorForOutput(session::KernelGraph *kernel_graph, const session::KernelWithIndex &kernel_with_index,
+                               std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node);
   void BindInputTensorAddressPtr(const session::KernelGraph &graph, const std::vector<tensor::TensorPtr> &inputs);
   void BindOutputTensorAddressPtr(const VectorRef *outputs);
   void AssignValueNodeAddress(session::KernelGraph *kernel_graph);
@@ -51,17 +51,6 @@ void DynamicKernel::Initialize() {
 
 int DynamicKernel::GetKernelType() { return AnfAlgo::GetKernelType(cnode_ptr_); }
 
-bool IsTupleGetItem(const AnfNodePtr &anf_node) {
-  MS_EXCEPTION_IF_NULL(anf_node);
-  if (!anf_node->isa<CNode>()) {
-    return false;
-  }
-  auto cnode = anf_node->cast<CNodePtr>();
-  MS_EXCEPTION_IF_NULL(cnode);
-  auto input0 = cnode->input(0);
-  return IsPrimitive(input0, prim::kPrimTupleGetItem);
-}
-
 void DynamicKernel::RebuildDependTensor() {
   depend_tensor_map_.clear();
   for (auto depend : depend_list_) {
@@ -112,7 +101,7 @@ void DynamicKernel::InferShape() {
     auto cnode_input = cnode_ptr_->input(i + 1);
     MS_EXCEPTION_IF_NULL(cnode_input);
-    if (IsTupleGetItem(cnode_input)) {
+    if (AnfAlgo::CheckPrimitiveType(cnode_input, prim::kPrimTupleGetItem)) {
       auto base_shape = real_input->Shape();
       if (!base_shape->isa<abstract::TupleShape>()) {
         MS_LOG(EXCEPTION) << "Node:" << cnode_ptr_->fullname_with_scope()
@@ -259,6 +259,13 @@ class AbstractUndetermined : public AbstractBase {
     }
     set_shape(std::make_shared<Shape>(shape));
   }
+  explicit AbstractUndetermined(const TypePtr &element_type, const BaseShapePtr &shape = std::make_shared<Shape>())
+      : AbstractBase(kAnyValue), element_(std::make_shared<AbstractScalar>(kAnyValue, element_type)) {
+    if (element_type == nullptr) {
+      MS_LOG(EXCEPTION) << "element_type is nullptr";
+    }
+    set_shape(shape);
+  }
   ~AbstractUndetermined() override = default;
   MS_DECLARE_PARENT(AbstractUndetermined, AbstractBase)
   TypePtr BuildType() const override { return std::make_shared<UndeterminedType>(); }
@@ -277,6 +284,8 @@ class AbstractTensor : public AbstractUndetermined {
       : AbstractUndetermined(element, shape) {}
   AbstractTensor(const TypePtr &element_type, const ShapeVector &shape) : AbstractUndetermined(element_type, shape) {}
   explicit AbstractTensor(const tensor::TensorPtr &tensor) : AbstractUndetermined(tensor->Dtype(), tensor->shape()) {}
+  explicit AbstractTensor(const TypePtr &element_type, const BaseShapePtr &shape = std::make_shared<Shape>())
+      : AbstractUndetermined(element_type, shape) {}
   ~AbstractTensor() override = default;
   MS_DECLARE_PARENT(AbstractTensor, AbstractUndetermined)
@@ -26,6 +26,12 @@
 namespace mindspore {
 namespace abstract {
+const std::map<TypeId, size_t> type_map = {{kNumberTypeBool, 1},    {kNumberTypeInt, 4},     {kNumberTypeInt8, 1},
+                                           {kNumberTypeInt16, 2},   {kNumberTypeInt32, 4},   {kNumberTypeInt64, 8},
+                                           {kNumberTypeUInt, 4},    {kNumberTypeUInt8, 1},   {kNumberTypeUInt16, 2},
+                                           {kNumberTypeUInt32, 4},  {kNumberTypeUInt64, 8},  {kNumberTypeFloat, 4},
+                                           {kNumberTypeFloat16, 2}, {kNumberTypeFloat32, 4}, {kNumberTypeFloat64, 8}};
+
 ValuePtr ValueJoin(const ValuePtr &value1, const ValuePtr &value2) {
   MS_EXCEPTION_IF_NULL(value1);
   MS_EXCEPTION_IF_NULL(value2);
@@ -291,5 +297,18 @@ ShapePtr GetBroadcastShape(const std::string &op, const AbstractTensorPtr &tenso
   auto y_shape = tensor_y_shape->shape();
   return std::make_shared<Shape>(RealBroadcast(op, x_shape, y_shape));
 }
 
+size_t TypeIdSize(const TypeId data_type) {
+  const size_t unsupported_type_error = 0;
+  auto iter = type_map.find(data_type);
+  if (iter != type_map.end()) {
+    return iter->second;
+  }
+  return unsupported_type_error;
+}
+
+size_t ShapeSize(const std::vector<size_t> &shape) {
+  return std::accumulate(shape.begin(), shape.end(), IntToSize(1), std::multiplies<size_t>());
+}
 }  // namespace abstract
 }  // namespace mindspore
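Two edge cases of these helpers matter to the callers added elsewhere in this patch: ShapeSize folds with an initial value of 1, so a scalar (empty shape) counts as one element, and TypeIdSize signals an unsupported dtype with 0 rather than throwing. A small self-contained check of the same arithmetic:

#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Mirror of abstract::ShapeSize above, spelling out its edge cases.
size_t MirrorShapeSize(const std::vector<size_t> &shape) {
  return std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());
}

int main() {
  assert(MirrorShapeSize({}) == 1);          // scalar: one element, not zero
  assert(MirrorShapeSize({2, 3, 4}) == 24);  // product of dimensions
  // TypeIdSize returns 0 for a TypeId missing from type_map; callers treat a
  // resulting byte count of 0 as "skip the copy", not as an error.
  return 0;
}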
@@ -51,6 +51,9 @@ int64_t GetPositiveAxis(int64_t axis_value, size_t increment);
 ShapeVector BroadcastShape(ShapeVector shpx, ShapeVector shpy);
 
+size_t TypeIdSize(const TypeId data_type);
+size_t ShapeSize(const std::vector<size_t> &shape);
+
 // Get broadcasted shape for binary element-wise operation
 ShapePtr GetBroadcastShape(const std::string &op, const AbstractTensorPtr &tensor_x, const AbstractTensorPtr &tensor_y);
 }  // namespace abstract
@@ -322,9 +322,17 @@ class Parameter : public ANode {
     return shared_from_this() == other.shared_from_this();
   }
 
+  void set_used_by_real_kernel() { is_real_kernel_used_ = false; }
+  bool is_used_by_real_kernel() { return is_real_kernel_used_; }
+  void set_used_by_dynamic_kernel() { is_used_by_dynamic_kernel_ = true; }
+  bool is_used_by_dynamic_kernel() { return is_used_by_dynamic_kernel_; }
+
  private:
   std::string name_;
   bool has_default_;
+  bool is_real_kernel_used_ = true;
+  bool is_used_by_dynamic_kernel_ = false;
   ValuePtr default_param_;
   // The count of graphs using the parameter.
   int used_graph_count_;
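One naming pitfall worth flagging: set_used_by_real_kernel() records the negative case, flipping is_real_kernel_used_ from its default true to false, and the call sites invoke it only when IsUsedByRealKernel(...) returns false. A restatement of the two flags' semantics (not a proposed rename):

// Restatement of the two Parameter flags added above:
//  - is_real_kernel_used_ defaults to true; set_used_by_real_kernel() flips it
//    to false when NO real kernel consumes the parameter.
//  - is_used_by_dynamic_kernel_ defaults to false; it is set when some consumer
//    is a dynamic-shape kernel, so the host shape must be refreshed at input load.
struct ParameterFlagsSketch {
  bool is_real_kernel_used = true;
  bool is_used_by_dynamic_kernel = false;
};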
@@ -29,6 +29,7 @@
 #include <type_traits>
 #include <typeinfo>
 #include "abstract/utils.h"
+#include "abstract/abstract_value.h"
 
 namespace mindspore {
@@ -581,8 +582,11 @@ void Tensor::data_sync(bool need_wait) const {
   if (device_sync_ == nullptr) {
     return;
   }
+  std::vector<size_t> shape_tmp;
+  (void)std::transform(shape().begin(), shape().end(), std::back_inserter(shape_tmp), IntToSize);
+  auto size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(data_type());
   auto address = device_sync_;
-  if (!address->SyncDeviceToHost(shape(), static_cast<size_t>(data().nbytes()), data_type(), data_c())) {
+  if (size != 0 && !address->SyncDeviceToHost(shape(), size, data_type(), data_c())) {
     MS_LOG(EXCEPTION) << "SyncDeviceToHost failed.";
   }
   sync_status_ = kNeedSyncHostToDevice;
@@ -0,0 +1,70 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+import mindspore.common.dtype as mstype
+from mindspore.ops import operations as P
+
+context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
+
+
+class Net(nn.Cell):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.unique = P.Unique().add_prim_attr("primitive_target", "CPU")
+
+    def construct(self, x):
+        x, y = self.unique(x)
+        return (x, y)
+
+
+class UniqueSquare(nn.Cell):
+    def __init__(self):
+        super(UniqueSquare, self).__init__()
+        self.unique = P.Unique().add_prim_attr("primitive_target", "CPU")
+        self.square = P.Square()
+
+    def construct(self, x):
+        x, _ = self.unique(x)
+        return self.square(x)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_unique_ascend():
+    x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
+    unique = Net()
+    output = unique(x)
+    expect1 = np.array([1, 2, 3])
+    expect2 = np.array([0, 0, 1, 1, 2, 2])
+    assert (output[0].asnumpy() == expect1).all()
+    assert (output[1].asnumpy() == expect2).all()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_unique_square():
+    x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
+    net = UniqueSquare()
+    output = net(x)
+    expect1 = np.array([1, 4, 9])
+    assert (output.asnumpy() == expect1).all()
@@ -0,0 +1,69 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+import mindspore.common.dtype as mstype
+from mindspore.ops import operations as P
+
+context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+
+
+class Net(nn.Cell):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.unique = P.Unique()
+
+    def construct(self, x):
+        return self.unique(x)
+
+
+class UniqueSquare(nn.Cell):
+    def __init__(self):
+        super(UniqueSquare, self).__init__()
+        self.unique = P.Unique()
+        self.square = P.Square()
+
+    def construct(self, x):
+        x, _ = self.unique(x)
+        return self.square(x)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_unique_cpu():
+    x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
+    unique = Net()
+    output = unique(x)
+    expect1 = np.array([1, 2, 3])
+    expect2 = np.array([0, 0, 1, 1, 2, 2])
+    assert (output[0].asnumpy() == expect1).all()
+    assert (output[1].asnumpy() == expect2).all()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_arm_ascend_training
+@pytest.mark.platform_x86_ascend_training
+@pytest.mark.env_onecard
+def test_unique_square():
+    x = Tensor(np.array([1, 1, 2, 2, 3, 3]), mstype.int32)
+    net = UniqueSquare()
+    output = net(x)
+    expect1 = np.array([1, 4, 9])
+    assert (output.asnumpy() == expect1).all()