Merge pull request !30589 from hanhuifeng/dcn_dyn_ops_2
@@ -56,5 +56,22 @@ MS_REG_GPU_KERNEL_ONE(ReduceProd, KernelAttr().AddInputAttr(kNumberTypeFloat32).
ArrayReduceGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(ReduceProd, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
ArrayReduceGpuKernelMod, double)
// dynamic
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
ArrayReduceGpuKernelMod, double, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
ArrayReduceGpuKernelMod, float, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
ArrayReduceGpuKernelMod, half, int64_t)
MS_REG_GPU_KERNEL_TWO(
ReduceSum, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
ArrayReduceGpuKernelMod, bool, int64_t)
} // namespace kernel
} // namespace mindspore
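The four MS_REG_GPU_KERNEL_TWO entries above register a second int64 input that carries the reduction axis as a tensor. A minimal usage sketch (shapes and values are illustrative, not taken from the PR; the same call pattern is exercised by test_dynamic_reduce_sum at the end of this change set):

    import numpy as np
    import mindspore as ms
    import mindspore.ops as ops

    x = ms.Tensor(np.random.rand(4, 256).astype(np.float32))
    axis = ms.Tensor(np.array([1], dtype=np.int64))  # axis arrives as a tensor input
    out = ops.ReduceSum()(x, axis)                   # matches the two-input KernelAttr above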
@@ -32,7 +32,7 @@ const std::map<std::string, cudnnReduceTensorOp_t> kReduceTypeMap = {
{"ReduceAny", CUDNN_REDUCE_TENSOR_MAX}, {"ReduceAll", CUDNN_REDUCE_TENSOR_MUL},
{"ReduceProd", CUDNN_REDUCE_TENSOR_MUL},
};
template <typename T>
template <typename T, typename S = int64_t>
class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
public:
ArrayReduceGpuKernelMod() { ResetResource(); }
@@ -43,6 +43,9 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
if (is_null_input_) {
return true;
}
if (is_dynamic_axis_ && !get_dynamic_axis_value_) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', fail to get value of the axis when axis is dynamic!";
}
T *input_addr = GetDeviceAddress<T>(inputs, 0);
T *output_addr = GetDeviceAddress<T>(outputs, 0);
T *workspace_addr = GetPossiblyNullDeviceAddress<T>(workspace, 0);
@@ -87,8 +90,13 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
}
data_type_ = GetCudnnDataType(type_name);
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != 1) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1, but got " << input_num;
constexpr size_t kDynamicAxisInputNum = 2;
if (input_num != 1 && input_num != kDynamicAxisInputNum) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the number of inputs should be 1 or " << kDynamicAxisInputNum
<< ", but got " << input_num;
}
if (input_num == kDynamicAxisInputNum) {
is_dynamic_axis_ = true;
}
size_t output_num = common::AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != 1) {
@@ -96,29 +104,28 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
}
int input_dim_length = SizeToInt(AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0).size());
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
if (prim->GetAttr("axis")->isa<ValueTuple>() || prim->GetAttr("axis")->isa<ValueList>()) {
std::vector<int> attr_axis;
std::vector<int64_t> attr_axis_me = GetAttr<std::vector<int64_t>>(kernel_node, "axis");
(void)std::transform(attr_axis_me.begin(), attr_axis_me.end(), std::back_inserter(attr_axis),
[](const int64_t &value) { return static_cast<int>(value); });
if (attr_axis.empty()) {
axis_.push_back(-1);
} else {
for (auto axis : attr_axis) {
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
}
std::sort(axis_.begin(), axis_.end());
auto multiple_pos = std::unique(axis_.begin(), axis_.end());
axis_.erase(multiple_pos, axis_.end());
std::vector<int64_t> attr_axis;
if (is_dynamic_axis_) {
get_dynamic_axis_value_ = GetDynamicAttrIntValue(kernel_node, kAxisIndex_, &attr_axis);
if (!get_dynamic_axis_value_) {
InitSizeLists();
return true;
}
} else if (prim->GetAttr("axis")->isa<Int64Imm>()) {
int axis = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
dynamic_axis_size_ = attr_axis.size();
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute 'axis' type is invalid.";
attr_axis = GetAxisValue(kernel_node);
}
if (attr_axis.empty()) {
axis_.push_back(-1);
} else {
for (auto axis : attr_axis) {
axis < 0 ? axis_.push_back(axis + input_dim_length) : axis_.push_back(axis);
}
std::sort(axis_.begin(), axis_.end());
auto multiple_pos = std::unique(axis_.begin(), axis_.end());
axis_.erase(multiple_pos, axis_.end());
}
keep_dims_ = GetAttr<bool>(kernel_node, "keep_dims");
auto inputA_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
@@ -156,6 +163,9 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
dynamic_axis_size_ = 0;
is_dynamic_axis_ = false;
get_dynamic_axis_value_ = false;
}
void DestroyResource() noexcept override {
@@ -182,6 +192,10 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
"cudnnGetTensorSizeInBytes failed.");
input_size_list_.push_back(input_size_);
if (is_dynamic_axis_) {
input_size_list_.push_back(dynamic_axis_size_ * sizeof(S));
}
CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_),
"cudnnGetTensorSizeInBytes failed.");
output_size_list_.push_back(output_size_);
@@ -279,6 +293,21 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
return;
}
std::vector<int64_t> GetAxisValue(const CNodePtr &kernel_node) {
auto prim = common::AnfAlgo::GetCNodePrimitive(kernel_node);
MS_EXCEPTION_IF_NULL(prim);
std::vector<int64_t> attr_axis_me;
if (prim->GetAttr("axis")->isa<ValueTuple>() || prim->GetAttr("axis")->isa<ValueList>()) {
attr_axis_me = GetAttr<std::vector<int64_t>>(kernel_node, "axis");
} else if (prim->GetAttr("axis")->isa<Int64Imm>()) {
auto axis = GetAttr<int64_t>(kernel_node, "axis");
attr_axis_me.push_back(axis);
} else {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', attribute 'axis' type is invalid.";
}
return attr_axis_me;
}
cudnnHandle_t cudnn_handle_;
cudnnReduceTensorOp_t reduce_tensor_op_;
cudnnDataType_t data_type_;
@@ -295,7 +324,10 @@ class ArrayReduceGpuKernelMod : public NativeGpuKernelMod {
size_t input_size_;
size_t output_size_;
size_t workspace_size_;
std::string kernel_name_;
size_t dynamic_axis_size_;
bool is_dynamic_axis_;
bool get_dynamic_axis_value_;
static constexpr size_t kAxisIndex_{1};
};
} // namespace kernel
} // namespace mindspore
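Once the dynamic axis value is available, it flows into the same normalization path as the attribute case: an empty axis list means "reduce all dimensions" (encoded as -1), negative axes wrap around, and the result is sorted and deduplicated. A minimal Python sketch of that normalization, with hypothetical names:

    def normalize_axes(attr_axis, ndim):
        # Empty axis list -> reduce everything, encoded as -1 like the C++ code.
        if not attr_axis:
            return [-1]
        # Wrap negative axes, drop duplicates, and sort, mirroring the
        # std::sort / std::unique sequence above.
        return sorted({a + ndim if a < 0 else a for a in attr_axis})

    assert normalize_axes([], 3) == [-1]
    assert normalize_axes([-1, 1, 1], 3) == [1, 2]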
@@ -33,18 +33,18 @@ namespace kernel {
REG_SLICE_GPU_DTYPES(REG_SLICE_GPU)
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE) \
MS_REG_GPU_KERNEL_ONE(Slice, \
KernelAttr() \
.AddInputAttr(T0_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddOutputAttr(T0_MS_DTYPE), \
SliceFwdGpuKernelMod, T0_DTYPE)
#define REG_DYNAMIC_SLICE_GPU_ATTR(T0_MS_DTYPE, T0_DTYPE, T1_MS_DTYPE, T1_DTYPE) \
MS_REG_GPU_KERNEL_TWO(Slice, \
KernelAttr() \
.AddInputAttr(T0_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddInputAttr(T1_MS_DTYPE) \
.AddOutputAttr(T0_MS_DTYPE), \
SliceFwdGpuKernelMod, T0_DTYPE, T1_DTYPE)
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64)
#define REG_DYNAMIC_SLICE_GPU(MS_DTYPE, DTYPE) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt32, int32_t) \
REG_DYNAMIC_SLICE_GPU_ATTR(MS_DTYPE, DTYPE, kNumberTypeInt64, int64_t)
REG_SLICE_GPU_DTYPES(REG_DYNAMIC_SLICE_GPU)
} // namespace kernel
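With the added index type parameter, begin and size can be passed to Slice as int32 or int64 tensors. A minimal usage sketch (illustrative values; assumes the front end accepts tensor begin/size, which is what these registrations serve):

    import numpy as np
    import mindspore as ms
    import mindspore.ops as ops

    x = ms.Tensor(np.arange(12, dtype=np.float32).reshape(3, 4))
    begin = ms.Tensor(np.array([1, 0], dtype=np.int64))
    size = ms.Tensor(np.array([2, 4], dtype=np.int64))
    out = ops.Slice()(x, begin, size)  # rows 1..2, all 4 columns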
@@ -40,7 +40,7 @@ constexpr auto kIdx4 = 4;
constexpr auto kIdx5 = 5;
constexpr auto kIdx6 = 6;
template <typename T>
template <typename T, typename S = int64_t>
class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
public:
SliceFwdGpuKernelMod() { kernel_name_ = "Slice"; }
@@ -106,7 +106,10 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
(void)CheckParam(kernel_node);
if (is_dynamic_attr_ && !get_dynamic_attr_value_) {
InitSizeLists();
return true;
}
auto input_shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
auto out_shape = AnfAlgo::GetOutputDeviceShapeAdaptively(kernel_node, 0);
is_null_input_ =
@@ -118,27 +121,15 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
(void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(input_shape_),
[](const int64_t &e) { return static_cast<int32_t>(e); });
size_t input_size = sizeof(T);
input_size_ = sizeof(T);
for (size_t x : input_shape) {
input_size *= x;
}
input_size_list_.push_back(input_size);
if (is_dynamic_attr_) {
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
for (size_t index : dynamic_attr_indexs) {
input_size = sizeof(T);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
input_size_ *= x;
}
size_t output_size = sizeof(T);
output_size_ = sizeof(T);
for (size_t x : out_shape) {
output_size *= x;
output_size_ *= x;
}
output_size_list_.push_back(output_size);
// transpose begin and size for NHWC data
auto data_format = AnfAlgo::GetInputFormat(kernel_node, 0);
@@ -166,6 +157,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
begin_.clear();
size_.clear();
input_shape_.clear();
input_size_ = 0;
output_size_ = 0;
is_null_input_ = false;
kernel_name_ = "Slice";
is_dynamic_attr_ = false;
@@ -173,7 +166,29 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
}
protected:
void InitSizeLists() override {}
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(output_size_);
if (is_dynamic_attr_) {
InitDynamicAttrSizeLists(kernel_node_.lock());
}
}
inline void InitDynamicAttrSizeLists(const CNodePtr &kernel_node) {
if (get_dynamic_attr_value_) {
std::vector<size_t> dynamic_attr_indexs = {kBeginIndex_, kSizeIndex_};
for (size_t index : dynamic_attr_indexs) {
size_t input_size = sizeof(S);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
} else {
input_size_list_.push_back(0);
input_size_list_.push_back(0);
}
}
private:
void CheckParam(const CNodePtr &kernel_node) {
@@ -205,13 +220,13 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
size = GetAttr<std::vector<int64_t>>(kernel_node, "size");
begin = GetAttr<std::vector<int64_t>>(kernel_node, "begin");
} else {
// The value of dynamic attr can only be obtained after the InferShape() of dynamic kernel is executed
if (depend_tensor_map_.empty()) {
if (GetDynamicAttrIntValue(kernel_node, kBeginIndex_, &begin) &&
GetDynamicAttrIntValue(kernel_node, kSizeIndex_, &size)) {
get_dynamic_attr_value_ = true;
}
if (!get_dynamic_attr_value_) {
return;
}
begin = GetDynamicAttrIntValue(kernel_node, kBeginIndex_);
size = GetDynamicAttrIntValue(kernel_node, kSizeIndex_);
get_dynamic_attr_value_ = true;
}
if (size.size() != input_shape.size() || begin.size() != input_shape.size()) {
@@ -220,9 +235,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
<< "of size: " << size.size() << ", the dimension of begin: " << begin.size()
<< ", the dimension of input_x: " << input_shape.size();
}
const int64_t NEG_ONE = -1;
for (size_t i = 0; i < input_shape.size(); i++) {
if (size[i] == NEG_ONE) {
if (size[i] == -1) {
size[i] = input_shape[i] - begin[i];
}
if (input_shape[i] <= 0 || size[i] <= 0) {
@@ -245,6 +259,8 @@ class SliceFwdGpuKernelMod : public NativeGpuKernelMod {
std::vector<int32_t> size_;
std::vector<int32_t> input_shape_;
size_t input_size_{0};
size_t output_size_{0};
bool is_null_input_{false};
bool is_dynamic_attr_{false};
bool get_dynamic_attr_value_{false};
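CheckParam keeps the usual Slice convention that size[i] == -1 means "from begin[i] to the end of dimension i". A minimal sketch of that resolution (hypothetical helper, illustrative values):

    def resolve_slice_size(input_shape, begin, size):
        # size[i] == -1 is resolved to everything from begin[i] to the end,
        # mirroring the check inside CheckParam above.
        return [input_shape[i] - begin[i] if s == -1 else s
                for i, s in enumerate(size)]

    assert resolve_slice_size([3, 4], [1, 0], [-1, 4]) == [2, 4]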
@@ -346,9 +346,12 @@ class NativeGpuKernelMod : public GpuKernelMod {
return std::equal(s1.begin(), s1.end(), s2_trans.begin(), s2_trans.end());
}
inline std::vector<int64_t> GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index) {
inline bool GetDynamicAttrIntValue(const CNodePtr &kernel_node, const size_t input_index,
std::vector<int64_t> *attr_value) {
// The value of dynamic attr can only be obtained after the InferShape() is executed
if (depend_tensor_map_.empty()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the depend_tensor_map is empty!";
MS_LOG(DEBUG) << "For '" << kernel_name_ << "', the depend_tensor_map is currently empty";
return false;
}
auto depend_iter = depend_tensor_map_.find(input_index);
if (depend_iter == depend_tensor_map_.end()) {
@@ -366,7 +369,8 @@ class NativeGpuKernelMod : public GpuKernelMod {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the format of the " << input_index
<< "th input currently should be the default format and does not support " << data_format;
}
return GetTensorIntValue(input_tensor, input_index);
*attr_value = GetTensorIntValue(input_tensor, input_index);
return true;
}
};
} // namespace kernel
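The signature change turns the throwing getter into a try-get: callers receive false while depend_tensor_map_ is still empty (i.e. before InferShape has run) and can defer initialization instead of aborting. A minimal Python sketch of the same pattern, with hypothetical names:

    def try_get_dynamic_int_value(depend_tensor_map, input_index):
        # Returns (ok, value); ok stays False until InferShape has filled the map.
        if not depend_tensor_map or input_index not in depend_tensor_map:
            return False, None
        return True, list(depend_tensor_map[input_index])

    ok, axis = try_get_dynamic_int_value({}, 1)
    assert not ok                      # value not ready yet: caller defers and retries
    ok, axis = try_get_dynamic_int_value({1: (1,)}, 1)
    assert ok and axis == [1]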
@@ -33,30 +33,7 @@ constexpr size_t NO_CUDNN_BATCHNORM_OPS_BN_INPUT_NUM = 8;
template <typename T>
class BatchNormGradGpuKernelMod : public NativeGpuKernelMod {
public:
BatchNormGradGpuKernelMod()
: batch_(0),
channel_(0),
height_(0),
width_(0),
x_size_(0),
para_size_(0),
workspace_size_(0),
reserve_size_(0),
mode_(CUDNN_BATCHNORM_SPATIAL),
bn_ops_(CUDNN_BATCHNORM_OPS_BN),
epsilon_(10e-5),
is_train_(false),
is_null_input_(false),
x_desc_(nullptr),
y_desc_(nullptr),
dy_desc_(nullptr),
dx_desc_(nullptr),
dz_desc_(nullptr),
scale_bias_diff_desc_(nullptr),
activation_desc_(nullptr),
handle_(nullptr),
cudnn_data_type_(CUDNN_DATA_FLOAT),
beta_data_diff_(0) {}
BatchNormGradGpuKernelMod() { ResetResource(); }
~BatchNormGradGpuKernelMod() override { DestroyResource(); }
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
@@ -171,6 +148,33 @@ class BatchNormGradGpuKernelMod : public NativeGpuKernelMod {
return true;
}
void ResetResource() noexcept override {
ResetSizeLists();
batch_ = 0;
channel_ = 0;
height_ = 0;
width_ = 0;
x_size_ = 0;
para_size_ = 0;
workspace_size_ = 0;
reserve_size_ = 0;
mode_ = CUDNN_BATCHNORM_SPATIAL;
bn_ops_ = CUDNN_BATCHNORM_OPS_BN;
epsilon_ = 10e-5;
is_train_ = false;
is_null_input_ = false;
x_desc_ = nullptr;
y_desc_ = nullptr;
dy_desc_ = nullptr;
dx_desc_ = nullptr;
dz_desc_ = nullptr;
scale_bias_diff_desc_ = nullptr;
activation_desc_ = nullptr;
handle_ = nullptr;
cudnn_data_type_ = CUDNN_DATA_FLOAT;
beta_data_diff_ = 0;
}
protected:
void InitResource() override {
handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
@@ -18,6 +18,7 @@
namespace mindspore {
namespace kernel {
namespace {
// float64
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double)
@@ -25,6 +26,7 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat64).Add
FlattenFwdGpuKernelMod, double)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double)
// float32
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float)
@@ -32,6 +34,7 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat32).Add
FlattenFwdGpuKernelMod, float)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float)
// float16
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half)
@@ -39,6 +42,7 @@ MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat16).
FlattenFwdGpuKernelMod, half)
MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half)
// int64
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t)
@@ -46,13 +50,15 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOu
FlattenFwdGpuKernelMod, int64_t)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t)
// int32
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int)
FlattenFwdGpuKernelMod, int32_t)
// int16
MS_REG_GPU_KERNEL_ONE(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
FlattenFwdGpuKernelMod, int16_t)
@@ -107,5 +113,71 @@ MS_REG_GPU_KERNEL_ONE(Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddOut
FlattenFwdGpuKernelMod, bool)
MS_REG_GPU_KERNEL_ONE(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
FlattenFwdGpuKernelMod, bool)
} // namespace
// dynamic kernel:
namespace {
// float64
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat64),
FlattenFwdGpuKernelMod, double, int64_t)
// float32
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat32),
FlattenFwdGpuKernelMod, float, int64_t)
// float16
MS_REG_GPU_KERNEL_TWO(
Reshape,
KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeFloat16),
FlattenFwdGpuKernelMod, half, int64_t)
// int64
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
FlattenFwdGpuKernelMod, int64_t, int64_t)
// int32
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt32),
FlattenFwdGpuKernelMod, int32_t, int64_t)
// int16
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt16),
FlattenFwdGpuKernelMod, int16_t, int64_t)
// int8
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt8),
FlattenFwdGpuKernelMod, char, int64_t)
// uint64
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt64),
FlattenFwdGpuKernelMod, uint64_t, int64_t)
// uint32
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt32),
FlattenFwdGpuKernelMod, uint, int64_t)
// uint16
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt16),
FlattenFwdGpuKernelMod, uint16_t, int64_t)
// uint8
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeUInt8),
FlattenFwdGpuKernelMod, uchar, int64_t)
// bool
MS_REG_GPU_KERNEL_TWO(
Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
FlattenFwdGpuKernelMod, bool, int64_t)
} // namespace
} // namespace kernel
} // namespace mindspore
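These two-input registrations let Reshape take its target shape as an int64 tensor. A minimal usage sketch (illustrative values; the same pattern is exercised by test_dynamic_reshape below, which feeds Reshape the output of DynamicShape):

    import numpy as np
    import mindspore as ms
    import mindspore.ops as ops

    x = ms.Tensor(np.arange(6, dtype=np.float32).reshape(2, 3))
    shape = ms.Tensor(np.array([3, 2], dtype=np.int64))  # the second, int64 input
    out = ops.Reshape()(x, shape)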
@@ -25,7 +25,7 @@
namespace mindspore {
namespace kernel {
template <typename T>
template <typename T, typename S = int64_t>
class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
public:
FlattenFwdGpuKernelMod() : input_size_(0), is_null_input_(false) {}
@@ -48,6 +48,7 @@ class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
}
bool Init(const CNodePtr &kernel_node) override {
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
kernel_node_ = kernel_node;
auto shape = AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, 0);
kernel_node_ = kernel_node;
is_null_input_ = CHECK_SHAPE_NULL(shape, kernel_name_, "input");
@@ -75,6 +76,21 @@ class FlattenFwdGpuKernelMod : public NativeGpuKernelMod {
void InitSizeLists() override {
input_size_list_.push_back(input_size_);
output_size_list_.push_back(input_size_);
InitDynamicAttrSizeLists(kernel_node_.lock());
}
inline void InitDynamicAttrSizeLists(const CNodePtr &kernel_node) {
size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num == 1) {
return;
}
for (size_t index = 1; index < input_num; ++index) {
size_t input_size = sizeof(S);
for (size_t x : AnfAlgo::GetInputDeviceShapeAdaptively(kernel_node, index)) {
input_size *= x;
}
input_size_list_.push_back(input_size);
}
}
private:
@@ -49,6 +49,8 @@ class DynamicBroadcastGradientArgsGpuKernelMod : public NativeGpuKernelMod {
CHECK_CUDA_RET_WITH_EXCEPT(
kernel_node_, cudaMemcpyAsync(&x1_value[0], s1_addr, input_size_list_[1], cudaMemcpyDeviceToHost, cuda_stream),
"DynamicBroadcastGradientArgs copy s1 value failed");
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(cuda_stream),
"DynamicBroadcastGradientArgs cudaStreamSynchronized failed");
auto grad_reduce_idx = CalOut({x0_value, x1_value});
r0_size_ = SetOuputValue(r0_addr, grad_reduce_idx[0], x0_value.size(), cuda_stream);
r1_size_ = SetOuputValue(r1_addr, grad_reduce_idx[1], x1_value.size(), cuda_stream);
@@ -84,7 +84,8 @@ std::set<int64_t> GetDependsFormMap(const CNodePtr &cnode) {
{kReshape, ShapeSet{1}},
{kSlice, ShapeSet{1, 2}},
{kSliceGrad, ShapeSet{2, 3}},
{kDynamicBroadcastTo, ShapeSet{1}}};
{kDynamicBroadcastTo, ShapeSet{1}},
{kReduceSum, ShapeSet{1}}};
MS_EXCEPTION_IF_NULL(cnode);
if (cnode->inputs().empty()) {
@@ -98,7 +99,7 @@ std::set<int64_t> GetDependsFormMap(const CNodePtr &cnode) {
MS_EXCEPTION_IF_NULL(ms_context);
auto device = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
// Special dynamic shape depends for Ascend.
if (device == kAscendDevice && (prim_name == kReduceSum || prim_name == kTranspose)) {
if (device == kAscendDevice && prim_name == kTranspose) {
return {1};
}
@@ -45,27 +45,34 @@ bool BatchNormGrad::get_is_training() const {
return GetValue<bool>(value_ptr);
}
AbstractBasePtr BatchNormGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
constexpr auto kInputNum = 6;
abstract::TupleShapePtr BatchNormGradInferShape(const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
for (auto item : input_args) {
MS_EXCEPTION_IF_NULL(item);
}
const int64_t input_num = 5;
(void)CheckAndConvertUtils::CheckInteger("BatchNormGrad infer", SizeToLong(input_args.size()), kGreaterEqual,
input_num, primitive->name());
auto y_backprop_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape];
auto x_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[1]->BuildShape())[kShape];
CheckAndConvertUtils::Check("BatchNorm y_backprop_shape", y_backprop_shape, kEqual, x_shape);
auto prim_name = primitive->name();
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
auto x_shape_ptr = input_args[kInputIndex1]->BuildShape();
auto scale_shape_ptr = input_args[kInputIndex2]->BuildShape();
return std::make_shared<abstract::TupleShape>(
std::vector<abstract::BaseShapePtr>{x_shape_ptr, scale_shape_ptr, scale_shape_ptr});
}
auto dx = input_args[kInputIndex1]->Broaden();
auto dscale = input_args[kInputIndex2]->Broaden();
auto reserve_1 = input_args[kInputIndex3]->Broaden();
auto reserve_2 = input_args[kInputIndex4]->Broaden();
TuplePtr BatchNormGradInferType(const PrimitivePtr &primitive, const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
auto prim_name = primitive->name();
(void)CheckAndConvertUtils::CheckInputArgs(input_args, kGreaterEqual, kInputNum, prim_name);
auto x_type_ptr = input_args[kInputIndex1]->BuildType();
auto scale_type_ptr = input_args[kInputIndex2]->BuildType();
return std::make_shared<Tuple>(std::vector<TypePtr>{x_type_ptr, scale_type_ptr, scale_type_ptr});
}
AbstractBasePtrList rets = {dx, dscale, dscale, reserve_1, reserve_2};
return std::make_shared<abstract::AbstractTuple>(rets);
AbstractBasePtr BatchNormGradInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive,
const std::vector<AbstractBasePtr> &input_args) {
MS_EXCEPTION_IF_NULL(primitive);
return abstract::MakeAbstract(BatchNormGradInferShape(primitive, input_args),
BatchNormGradInferType(primitive, input_args));
}
REGISTER_PRIMITIVE_C(kNameBatchNormGrad, BatchNormGrad);
REGISTER_PRIMITIVE_EVAL_IMPL(BatchNormGrad, prim::kPrimBatchNormGrad, BatchNormGradInfer, nullptr, true);
} // namespace ops
} // namespace mindspore
@@ -118,7 +118,7 @@ class SqrtGrad(PrimitiveWithInfer):
        return x_dtype


class BatchNormGrad(PrimitiveWithInfer):
class BatchNormGrad(Primitive):
    """Performs grad of BatchNorm operation."""

    @prim_attr_register
@@ -127,13 +127,6 @@ class BatchNormGrad(PrimitiveWithInfer):
        self.epsilon = validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
        self.data_format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.name)

    def infer_shape(self, y_backprop_shape, x_shape, scale_shape, save_mean_shape, save_variance_shape, reserve):
        validator.check("BatchNorm y_backprop_shape", y_backprop_shape, "BatchNorm x_shape", x_shape)
        return (x_shape, scale_shape, scale_shape)

    def infer_dtype(self, y_backprop_type, x_type, scale_type, save_mean_shape, save_variance_shape, reserve):
        return (x_type, scale_type, scale_type)


class SyncBatchNormGrad(PrimitiveWithInfer):
    """Performs grad of SyncBatchNorm operation."""
@@ -0,0 +1,52 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest

import mindspore.nn as nn
import mindspore.context as context
from mindspore.ops.operations import _inner_ops

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")


class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.args = _inner_ops.DynamicBroadcastGradientArgs()

    def construct(self, s0, s1):
        return self.args(s0, s1)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_net():
    """
    Feature: DynamicBroadcastGradientArgs op.
    Description: test cases for DynamicBroadcastGradientArgs op.
    Expectation: the results match the expected arrays.
    """
    shape0 = (4, 2, 1)
    shape1 = (2, 7)
    net = Net()
    r0, r1 = net(shape0, shape1)
    r0_expected = [2]
    r1_expected = [0]

    assert np.array_equal(r0_expected, r0.asnumpy())
    assert np.array_equal(r1_expected, r1.asnumpy())
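For reference, the expected indices follow the usual broadcast-gradient rule: each input must be reduced over every output axis where its left-padded dimension is 1 while the broadcast dimension is larger. A minimal sketch reproducing the expectation above (hypothetical helper, illustrative only):

    def broadcast_gradient_args(s0, s1):
        # Left-pad the shorter shape with 1s, broadcast, then collect the axes
        # where each input's dimension is 1 while the output dimension is not.
        n = max(len(s0), len(s1))
        p0 = (1,) * (n - len(s0)) + tuple(s0)
        p1 = (1,) * (n - len(s1)) + tuple(s1)
        out = tuple(max(a, b) for a, b in zip(p0, p1))
        r0 = [i for i, (a, o) in enumerate(zip(p0, out)) if a == 1 and o > 1]
        r1 = [i for i, (b, o) in enumerate(zip(p1, out)) if b == 1 and o > 1]
        return r0, r1

    assert broadcast_gradient_args((4, 2, 1), (2, 7)) == ([2], [0])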
@@ -102,31 +102,159 @@ class ConcatNet(nn.Cell):
        return self.op((x1, x2))


def dynamic_concat_run(is_grad):
    axis = 1
    dtype = np.float32
    data_list = []
    for i in [2, 64]:
        data = []
        data.append(np.random.rand(i, 16).astype(dtype))
        data.append(np.random.rand(i, 32).astype(dtype))
        if is_grad:
            data.append(np.random.rand(i, 48).astype(dtype))
        data_list.append(tuple(data))
    column_names = get_columns(len(data_list[0]))
    dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
    dynamic_columns = {column_names[0]: [
        None, 16], column_names[1]: [None, 32]}
    if is_grad:
        dynamic_columns[column_names[-1]] = [None, 48]
    dataset.set_dynamic_columns(columns=dynamic_columns)
    net = ConcatNet(axis)
    if is_grad:
        net = GradNetWrtX(net)
    output = dynamic_shape_sink_process(net, dataset)
    output_cmp = fixed_shape_process(net, dataset)
    assert compare(output, output_cmp)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat():
def test_dynamic_concat_forward():
    """
    Feature: Test Concat and its backward.
    Feature: Test Concat.
    Description: The shape of inputs is dynamic.
    Expectation: Assert that results are consistent with fixed shape.
    """
    axis = 1
    dynamic_concat_run(False)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_concat_backward():
    """
    Feature: Test backward of Concat.
    Description: The shape of inputs is dynamic.
    Expectation: Assert that results are consistent with fixed shape.
    """
    dynamic_concat_run(True)
class BatchNormNet(nn.Cell):
    def __init__(self, c):
        super(BatchNormNet, self).__init__()
        self.bn = nn.BatchNorm1d(c)

    def construct(self, input_data):
        x = self.bn(input_data)
        return x


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_batchnorm():
    """
    Feature: Test BatchNorm and its backward.
    Description: The shape of inputs is dynamic.
    Expectation: Assert that results are consistent with fixed shape.
    """
    c = 256
    dtype = np.float32
    data_list = []
    for i in [2, 64]:
        data = []
        data.append(np.random.rand(i, 16).astype(dtype))
        data.append(np.random.rand(i, 32).astype(dtype))
        data.append(np.random.rand(i, 48).astype(dtype))
        data.append(np.random.rand(i, c).astype(dtype))
        data.append(np.random.rand(i, c).astype(dtype))
        data_list.append(tuple(data))
    column_names = get_columns(len(data_list[0]))
    dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
    dataset.set_dynamic_columns(
        columns={column_names[0]: [None, 16], column_names[1]: [None, 32], column_names[2]: [None, 48]})
    net = GradNetWrtX(ConcatNet(axis))
    dynamic_columns = {column_names[0]: [None, c], column_names[1]: [None, c]}
    dataset.set_dynamic_columns(columns=dynamic_columns)
    net = GradNetWrtX(BatchNormNet(c))
    gradients = dynamic_shape_sink_process(net, dataset)
    gradients_cmp = fixed_shape_process(net, dataset)
    assert compare(gradients, gradients_cmp)
class ReshapeNet(nn.Cell):
    def construct(self, x, y):
        shape_of_y = ops.DynamicShape()(y)
        return ops.Reshape()(x, shape_of_y)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_reshape():
    """
    Feature: Test Reshape.
    Description: The shape of inputs is dynamic.
    Expectation: Assert that results are consistent with fixed shape.
    """
    dtype = np.float32
    data_list = []
    for i in [2, 96]:
        data = []
        data.append(np.random.rand(i, 64, 1).astype(dtype))
        data.append(np.random.rand(i, 64).astype(dtype))
        data_list.append(tuple(data))
    column_names = get_columns(len(data_list[0]))
    dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
    dynamic_columns = {column_names[0]: [
        None, 64, 1], column_names[1]: [None, 64]}
    dataset.set_dynamic_columns(columns=dynamic_columns)
    net = ReshapeNet()
    output = dynamic_shape_sink_process(net, dataset)
    output_cmp = fixed_shape_process(net, dataset)
    assert compare(output, output_cmp)
class ReduceSumNet(nn.Cell):
    def __init__(self):
        super(ReduceSumNet, self).__init__()
        self.reduce = ops.ReduceSum()

    def construct(self, x, y):
        return self.reduce(x, y)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_dynamic_reduce_sum():
    """
    Feature: Test ReduceSum.
    Description: The shape of inputs is dynamic.
    Expectation: Assert that results are consistent with the result computed by numpy.
    """
    dtype = np.float32
    data_list = []
    for i in [2, 96]:
        data = []
        data.append(np.random.rand(i, 256).astype(dtype))
        data.append(np.array([1], dtype=np.int64))
        data_list.append(tuple(data))
    column_names = get_columns(len(data_list[0]))
    dataset = ds.GeneratorDataset(data_list, column_names, shuffle=False)
    dynamic_columns = {column_names[0]: [None, 256]}
    dataset.set_dynamic_columns(columns=dynamic_columns)
    net = ReduceSumNet()
    output = dynamic_shape_sink_process(net, dataset)
    # A tensor (dynamic) axis for ReduceSum is not yet supported under fixed
    # shape, so the result is compared against numpy instead.
    inputs = data_list[0]
    output_cmp = np.sum(inputs[0], inputs[1][0])
    assert np.allclose(output.asnumpy(), output_cmp, rtol=1.0e-4, atol=1.0e-4)