add new op instancenorm2d

5 years ago · 26f6daa850
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cu
@@ -0,0 +1,90 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/util.cuh"

 // Broadcasts up to four per-channel vectors (C floats each) into workspace buffers of N * C floats.
 //
 // The flat index space [0, thread_num) is cut into consecutive segments of N * C positions:
 //   segment 0 -> gamma, segment 1 -> beta, segment 2 -> running mean, any later segment -> running variance.
 // Inside a segment, destination element i receives source element i % C, i.e. the C-element source is
 // repeated N times. A source/destination pair is skipped when either of its pointers is null.
 //
 // The loop strides by the total number of launched threads, so any 1D launch configuration covers
 // all thread_num positions. N * C must be non-zero whenever thread_num is non-zero.
 __global__ void CopyMemKernel(const size_t thread_num, const size_t N, const size_t C,
                               float *gamma_addr, float *beta_addr,
                               float *runing_mean_addr, float *runnig_variance_addr,
                               float *ws_gamma, float *ws_beta, float *ws_mean, float *ws_var) {
  // Widen to size_t before multiplying: the built-in index variables are 32-bit unsigned and
  // their product would otherwise wrap before being stored in a size_t.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  const size_t segment_len = N * C;
  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; pos < thread_num; pos += stride) {
    const size_t segment = pos / segment_len;  // which parameter buffer this position belongs to
    const size_t dst_index = pos % segment_len;  // element inside the N * C destination buffer
    const size_t src_index = dst_index % C;      // matching element inside the C-element source
    switch (segment) {
      case 0:
        if (!(gamma_addr && ws_gamma)) break;
        ws_gamma[dst_index] = gamma_addr[src_index];
        break;
      case 1:
        if (!(beta_addr && ws_beta)) break;
        ws_beta[dst_index] = beta_addr[src_index];
        break;
      case 2:
        if (!(runing_mean_addr && ws_mean)) break;
        ws_mean[dst_index] = runing_mean_addr[src_index];
        break;
      default:
        if (!(runnig_variance_addr && ws_var)) break;
        ws_var[dst_index] = runnig_variance_addr[src_index];
    }
  }
 }

 // Host-side launcher for CopyMemKernel.
 // Enqueues, on cuda_stream, the expansion of each C-element source vector into its N * C-element
 // workspace buffer. Pairs whose source or destination pointer is null are left untouched by the kernel.
 // The launch is asynchronous; no synchronization or launch-error check is performed here.
 void CopyMemDevice2Device(const size_t N, const size_t C, float *gamma_addr, float *beta_addr,
                           float *runing_mean_addr, float *runnig_variance_addr,
                           float *ws_gamma, float *ws_beta, float *ws_mean, float *ws_var,
                           cudaStream_t cuda_stream) {
  // Four destination buffers of N * C elements each share a single flat index space.
  const size_t total_positions = 4 * (N * C);
  CopyMemKernel<<<GET_BLOCKS(total_positions), GET_THREADS, 0, cuda_stream>>>(
          total_positions, N, C,
          gamma_addr, beta_addr, runing_mean_addr, runnig_variance_addr,
          ws_gamma, ws_beta, ws_mean, ws_var);
 }

 // Reduces two N x C buffers to C per-channel averages, in place.
 //
 // Flat positions [0, C) operate on save_mean_addr and positions [C, thread_num) operate on
 // save_var_addr. For channel c the N values at elements c, C + c, 2 * C + c, ... are summed in that
 // order, divided by N, and written back to element c of the same buffer. Each position only reads
 // and writes elements congruent to its own channel modulo C, so positions do not interfere.
 //
 // The loop strides by the total number of launched threads, so any 1D launch configuration covers
 // all thread_num positions.
 __global__ void ComputeMeanKernel(const size_t thread_num, const size_t N, const size_t C,
                               float *save_mean_addr, float *save_var_addr) {
  // Widen to size_t before multiplying: the built-in index variables are 32-bit unsigned and
  // their product would otherwise wrap before being stored in a size_t.
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; pos < thread_num; pos += stride) {
    // The first C positions reduce the "mean" buffer, the remaining ones the "var" buffer.
    float *buffer = (pos / C) ? save_var_addr : save_mean_addr;
    const size_t channel = pos % C;
    float sum = 0;
    for (size_t i = 0; i < N; i++) {
      sum += buffer[i * C + channel];
    }
    buffer[channel] = sum / N;
  }
 }

 // Host-side launcher for ComputeMeanKernel.
 // Enqueues, on cuda_stream, the in-place reduction of both N x C buffers to their C per-channel
 // averages (stored in the first C elements of each buffer). The launch is asynchronous; no
 // synchronization or launch-error check is performed here.
 void ComputeMean(const size_t N, const size_t C,
                  float *save_mean_addr, float *save_var_addr,
                  cudaStream_t cuda_stream) {
  // C positions for the first buffer followed by C positions for the second one.
  const size_t total_positions = 2 * C;
  ComputeMeanKernel<<<GET_BLOCKS(total_positions), GET_THREADS, 0, cuda_stream>>>(
          total_positions, N, C,
          save_mean_addr, save_var_addr);
 }
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cuh
@@ -0,0 +1,27 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_
 #define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_

 #include "runtime/device/gpu/cuda_common.h"
 // Enqueues a kernel on cuda_stream that fills each ws_* buffer (N * C floats) with its C-element
 // source vector repeated N times: ws_x[i] = x_addr[i % C]. A source/destination pair is skipped
 // when either of its two pointers is null, so callers may pass nullptr for parameters they do not need.
 void CopyMemDevice2Device(const size_t N, const size_t C,
                           float *gamma_addr, float *beta_addr, float *runing_mean_addr, float *runnig_variance_addr,
                           float *ws_gamma, float *ws_beta, float *ws_mean, float *ws_var,
                           cudaStream_t cuda_stream);
 // Enqueues a kernel on cuda_stream that, for both buffers and every channel c in [0, C), averages the
 // N values stored at elements i * C + c and writes the result to element c of the same buffer (in place).
 void ComputeMean(const size_t N, const size_t C, float *save_mean_addr, float *save_var_addr,
                  cudaStream_t cuda_stream);
 #endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_INSTANCE_NORM_IMPL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_gpu_kernel.cc
@@ -0,0 +1,44 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/nn/instance_norm_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 // float32 data; the four parameter inputs and the two statistic outputs are float32.
 MS_REG_GPU_KERNEL_ONE(InstanceNorm,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)    // x
                         .AddInputAttr(kNumberTypeFloat32)    // gamma
                         .AddInputAttr(kNumberTypeFloat32)    // beta
                         .AddInputAttr(kNumberTypeFloat32)    // running mean
                         .AddInputAttr(kNumberTypeFloat32)    // running variance
                         .AddOutputAttr(kNumberTypeFloat32)   // y
                         .AddOutputAttr(kNumberTypeFloat32)   // save mean
                         .AddOutputAttr(kNumberTypeFloat32),  // save variance
                       InstanceNormGpuKernel, float)
 // float16 data (x and y); the four parameter inputs and the two statistic outputs stay float32.
 MS_REG_GPU_KERNEL_ONE(InstanceNorm,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat16)    // x
                         .AddInputAttr(kNumberTypeFloat32)    // gamma
                         .AddInputAttr(kNumberTypeFloat32)    // beta
                         .AddInputAttr(kNumberTypeFloat32)    // running mean
                         .AddInputAttr(kNumberTypeFloat32)    // running variance
                         .AddOutputAttr(kNumberTypeFloat16)   // y
                         .AddOutputAttr(kNumberTypeFloat32)   // save mean
                         .AddOutputAttr(kNumberTypeFloat32),  // save variance
                       InstanceNormGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_gpu_kernel.h
@@ -0,0 +1,240 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_INSTANCE_NORM_GPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_INSTANCE_NORM_GPU_KERNEL_H_

 #include <string>
 #include <vector>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "utils/utils.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cuh"

 namespace mindspore {
 namespace kernel {
 template <typename T>
 class InstanceNormGpuKernel : public GpuKernel {
 public:
  InstanceNormGpuKernel()
      : input_x_size_(0),
        input_z_size_(0),
        para_size_(0),
        output_size_(0),
        workspace_size_(0),
        mode_(CUDNN_BATCHNORM_SPATIAL),
        bn_ops_(CUDNN_BATCHNORM_OPS_BN),
        is_training_(true),
        epsilon_(10e-5),
        exp_avg_factor_(0.1),
        is_null_input_(false),
        x_desc_(nullptr),
        y_desc_(nullptr),
        z_desc_(nullptr),
        scale_bias_mean_var_desc_(nullptr),
        handle_(nullptr),
        cudnn_data_type_(CUDNN_DATA_FLOAT) {}
  ~InstanceNormGpuKernel() override { DestroyResource(); }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    VARIABLE_NOT_USED(workspace);
    VARIABLE_NOT_USED(stream_ptr);
    if (is_null_input_) {
      return true;
    }
    auto x_addr = GetDeviceAddress<T>(inputs, 0);
    auto gamma_addr = GetDeviceAddress<float>(inputs, 1);
    auto beta_addr = GetDeviceAddress<float>(inputs, 2);
    auto runing_mean_addr = GetDeviceAddress<float>(inputs, 3);
    auto runnig_variance_addr = GetDeviceAddress<float>(inputs, 4);
    T *z = nullptr;

    auto y_addr = GetDeviceAddress<T>(outputs, 0);
    auto save_mean_addr = GetDeviceAddress<float>(outputs, 1);
    auto save_variance_addr = GetDeviceAddress<float>(outputs, 2);

    float *ws_gamma = GetDeviceAddress<float>(workspace, 0);
    float *ws_beta = GetDeviceAddress<float>(workspace, 1);
    float *ws_mean = GetDeviceAddress<float>(workspace, 2);
    float *ws_var = GetDeviceAddress<float>(workspace, 3);
    T *workspace_addr = nullptr;
    if (workspace_size_ != 0) {
      workspace_addr = GetDeviceAddress<T>(workspace, 4);
    }

    size_t N = input_shape_[0];
    size_t C = input_shape_[1];
    CopyMemDevice2Device(N, C, gamma_addr, beta_addr, runing_mean_addr, runnig_variance_addr, ws_gamma, ws_beta,
                         ws_mean, ws_var, reinterpret_cast<cudaStream_t>(stream_ptr));

    const float alpha = 1;
    const float beta = 0;
    float *reserve_addr = nullptr;
    if (is_training_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        kernel_node_,
        cudnnBatchNormalizationForwardTrainingEx(
          handle_, mode_, bn_ops_, &alpha, &beta, x_desc_, x_addr, z_desc_, z, y_desc_, y_addr,
          scale_bias_mean_var_desc_, ws_gamma, ws_beta, exp_avg_factor_, ws_mean, ws_var, epsilon_, save_mean_addr,
          save_variance_addr, nullptr, workspace_addr, workspace_size_, reserve_addr, 0),
        "Kernel launch failed");
    } else {
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
                                  cudnnBatchNormalizationForwardInference(
                                    handle_, mode_, &alpha, &beta, x_desc_, x_addr, y_desc_, y_addr,
                                    scale_bias_mean_var_desc_, ws_gamma, ws_beta, ws_mean, ws_var, epsilon_),
                                  "Kernel launch failed");
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    MS_EXCEPTION_IF_NULL(kernel_node);
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    bn_ops_ = CUDNN_BATCHNORM_OPS_BN;

    InitResource();
    is_training_ = GetAttr<bool>(kernel_node, "is_training");
    mode_ = is_training_ ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_SPATIAL;
    epsilon_ = GetAttr<float>(kernel_node, "epsilon");
    exp_avg_factor_ = GetAttr<float>(kernel_node, "momentum");

    cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 5) {
      MS_LOG(EXCEPTION) << "input tensor size is " << input_num << ", " << kernel_name << "  should be 5";
    }
    input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    if (input_shape_.size() != 4) {
      MS_LOG(EXCEPTION) << "tensor shape is " << input_shape_.size() << ", InstanceNormGpuKernel should be 4";
    }
    is_null_input_ = CHECK_NULL_INPUT(input_shape_);
    if (is_null_input_) {
      MS_LOG(WARNING) << "InstanceNormGpuKernel input is null";
      InitSizeLists();
      return true;
    }
    SetTensorDescriptor();
    InitSizeLists();
    return true;
  }

  void DestroyResource() noexcept override {
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(y_desc_), "Destroy y desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_mean_var_desc_),
                               "Destroy para desc failed");
  }

 protected:
  void InitResource() override {
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&y_desc_), "Create y desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_mean_var_desc_),
                                "Create para desc failed");
  }

  void InitSizeLists() override {
    if (!is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &input_x_size_),
                                  "Get input x size failed");
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_mean_var_desc_, &para_size_),
                                  "Get para size failed");
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(y_desc_, &output_size_),
                                  "Get output size failed");

      CHECK_CUDNN_RET_WITH_EXCEPT(
        kernel_node_,
        cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(handle_, mode_, bn_ops_, x_desc_, z_desc_, y_desc_,
                                                                 scale_bias_mean_var_desc_, nullptr, &workspace_size_),
        "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize failed");
    }

    input_size_list_.push_back(input_x_size_);    // input x
    input_size_list_.push_back(input_shape_[1]);  // gamma
    input_size_list_.push_back(input_shape_[1]);  // beta
    input_size_list_.push_back(input_shape_[1]);  // mean
    input_size_list_.push_back(input_shape_[1]);  // variance

    output_size_list_.push_back(output_size_);  // output
    output_size_list_.push_back(para_size_);    // save mean
    output_size_list_.push_back(para_size_);    // save variance

    workspace_size_list_.push_back(para_size_);  // ws gamma
    workspace_size_list_.push_back(para_size_);  // ws beta
    workspace_size_list_.push_back(para_size_);  // ws mean
    workspace_size_list_.push_back(para_size_);  // ws variance
    workspace_size_list_.push_back(workspace_size_);
  }

 private:
  void SetTensorDescriptor() {
    cudnnTensorFormat_t cudnn_format;
    int batch, channel, height, width;
    batch = 1;
    channel = SizeToInt(input_shape_[0]) * SizeToInt(input_shape_[1]);
    height = SizeToInt(input_shape_[2]);
    width = SizeToInt(input_shape_[3]);
    cudnn_format = CUDNN_TENSOR_NCHW;

    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
      "Set x desc failed");

    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_, cudnnSetTensor4dDescriptor(y_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
      "Set y desc failed");

    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_,
      cudnnSetTensor4dDescriptor(scale_bias_mean_var_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel, 1, 1),
      "Set para desc failed");
  }

  size_t input_x_size_;
  size_t input_z_size_;
  size_t para_size_;
  size_t output_size_;
  size_t workspace_size_;
  cudnnBatchNormMode_t mode_;
  cudnnBatchNormOps_t bn_ops_;
  bool is_training_;
  double epsilon_;
  double exp_avg_factor_;
  bool is_null_input_;
  cudnnTensorDescriptor_t x_desc_;
  cudnnTensorDescriptor_t y_desc_;
  cudnnTensorDescriptor_t z_desc_;
  cudnnTensorDescriptor_t scale_bias_mean_var_desc_;

  cudnnHandle_t handle_;
  cudnnDataType_t cudnn_data_type_;
  std::vector<size_t> input_shape_;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_INSTANCE_NORM_GPU_KERNEL_H_
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_grad_gpu_kernel.cc
@@ -0,0 +1,44 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "backend/kernel_compiler/gpu/nn/instance_norm_grad_gpu_kernel.h"

 namespace mindspore {
 namespace kernel {
 // float32 data; scale, saved statistics and parameter gradients are float32.
 MS_REG_GPU_KERNEL_ONE(InstanceNormGrad,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)    // dy
                         .AddInputAttr(kNumberTypeFloat32)    // x
                         .AddInputAttr(kNumberTypeFloat32)    // scale
                         .AddInputAttr(kNumberTypeFloat32)    // save_mean
                         .AddInputAttr(kNumberTypeFloat32)    // save_variance
                         .AddOutputAttr(kNumberTypeFloat32)   // dx
                         .AddOutputAttr(kNumberTypeFloat32)   // dscale
                         .AddOutputAttr(kNumberTypeFloat32),  // dbias
                       InstanceNormGradGpuKernel, float)
 // float16 data (dy, x, dx); scale, saved statistics and parameter gradients stay float32.
 MS_REG_GPU_KERNEL_ONE(InstanceNormGrad,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat16)    // dy
                         .AddInputAttr(kNumberTypeFloat16)    // x
                         .AddInputAttr(kNumberTypeFloat32)    // scale
                         .AddInputAttr(kNumberTypeFloat32)    // save_mean
                         .AddInputAttr(kNumberTypeFloat32)    // save_variance
                         .AddOutputAttr(kNumberTypeFloat16)   // dx
                         .AddOutputAttr(kNumberTypeFloat32)   // dscale
                         .AddOutputAttr(kNumberTypeFloat32),  // dbias
                       InstanceNormGradGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/instance_norm_grad_gpu_kernel.h
@@ -0,0 +1,238 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_INSTANCE_NORM_GRAD_GPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_INSTANCE_NORM_GRAD_GPU_KERNEL_H_

 #include <string>
 #include <vector>
 #include "utils/utils.h"

 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/kernel_constants.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/instance_norm_impl.cuh"

 namespace mindspore {
 namespace kernel {
 template <typename T>
 class InstanceNormGradGpuKernel : public GpuKernel {
 public:
  InstanceNormGradGpuKernel()
      : x_size_(0),
        para_size_(0),
        workspace_size_(0),
        mode_(CUDNN_BATCHNORM_SPATIAL),
        bn_ops_(CUDNN_BATCHNORM_OPS_BN),
        epsilon_(10e-5),
        is_training_(true),
        is_null_input_(false),
        x_desc_(nullptr),
        y_desc_(nullptr),
        dy_desc_(nullptr),
        dx_desc_(nullptr),
        dz_desc_(nullptr),
        scale_bias_diff_desc_(nullptr),
        activation_desc_(nullptr),
        handle_(nullptr),
        cudnn_data_type_(CUDNN_DATA_FLOAT),
        beta_data_diff_(0) {}
  ~InstanceNormGradGpuKernel() override { DestroyResource(); }

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }

  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    VARIABLE_NOT_USED(workspace);
    VARIABLE_NOT_USED(stream_ptr);
    if (is_null_input_) {
      return true;
    }
    auto dy = GetDeviceAddress<T>(inputs, 0);
    auto x = GetDeviceAddress<T>(inputs, 1);
    auto gamma = GetDeviceAddress<float>(inputs, 2);
    auto save_mean = GetDeviceAddress<float>(inputs, 3);
    auto save_variance = GetDeviceAddress<float>(inputs, 4);
    void *beta = nullptr;
    T *y = nullptr;

    auto dx = GetDeviceAddress<T>(outputs, 0);
    auto dgamma = GetDeviceAddress<float>(outputs, 1);
    auto dbeta = GetDeviceAddress<float>(outputs, 2);
    T *dz = nullptr;

    float *ws_gamma = GetDeviceAddress<float>(workspace, 0);
    void *workspace_addr = nullptr;
    if (workspace_size_ != 0) {
      workspace_addr = GetDeviceAddress<T>(workspace, 3);
    }

    size_t N = input_shape_[0];
    size_t C = input_shape_[1];
    CopyMemDevice2Device(N, C, gamma, nullptr, nullptr, nullptr, ws_gamma, nullptr, nullptr, nullptr,
                         reinterpret_cast<cudaStream_t>(stream_ptr));
    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream_ptr)),
                               "cudaStreamSynchronized failed");

    const float alpha_data_diff = 1;
    const float alpha_param_diff = 1;
    const float beta_param_diff = 0;
    float *reserve_addr = nullptr;
    if (is_training_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(
        kernel_node_,
        cudnnBatchNormalizationBackwardEx(
          handle_, mode_, bn_ops_, &alpha_data_diff, &beta_data_diff_, &alpha_param_diff, &beta_param_diff, x_desc_, x,
          y_desc_, y, dy_desc_, dy, dz_desc_, dz, dx_desc_, dx, scale_bias_diff_desc_, ws_gamma, beta, dgamma, dbeta,
          epsilon_, save_mean, save_variance, activation_desc_, workspace_addr, workspace_size_, reserve_addr, 0),
        "Kernel launch failed");
      ComputeMean(N, C, dgamma, dbeta, reinterpret_cast<cudaStream_t>(stream_ptr));
    } else {
      MS_LOG(EXCEPTION) << "The backward of InstanceNorm operator in evaluation mode is not implemented yet.";
    }
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    kernel_node_ = kernel_node;
    MS_EXCEPTION_IF_NULL(kernel_node);
    std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
    bn_ops_ = CUDNN_BATCHNORM_OPS_BN;

    InitResource();
    is_training_ = GetAttr<bool>(kernel_node, "is_training");
    mode_ = is_training_ ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_SPATIAL;
    epsilon_ = GetAttr<float>(kernel_node, "epsilon");

    cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 5) {
      MS_LOG(EXCEPTION) << "input tensor size is " << input_num << ", " << kernel_name << " should be 5";
    }

    input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
    if (input_shape_.size() != 4) {
      MS_LOG(EXCEPTION) << "tensor shape is " << input_shape_.size() << ", InstanceNormGradGpuKernel should be 4";
    }
    is_null_input_ = CHECK_NULL_INPUT(input_shape_);
    if (is_null_input_) {
      MS_LOG(WARNING) << "InstanceNormGradGpuKernel input is null";
      InitSizeLists();
      return true;
    }
    beta_data_diff_ = GetAttrWithDefault(kernel_node, "inplace_algo", std::string("cover")) == "cover" ? 0 : 1;
    SetTensorDescriptor();
    InitSizeLists();
    return true;
  }

 protected:
  void InitResource() override {
    handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&x_desc_), "Create x desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dy_desc_), "Create dy desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&dx_desc_), "Create dx desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnCreateTensorDescriptor(&scale_bias_diff_desc_),
                                "Create para desc failed");
  }

  void InitSizeLists() override {
    if (!is_null_input_) {
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(x_desc_, &x_size_), "Get x size failed");
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(scale_bias_diff_desc_, &para_size_),
                                  "Get para size failed");
      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
                                  cudnnGetBatchNormalizationBackwardExWorkspaceSize(
                                    handle_, mode_, bn_ops_, x_desc_, y_desc_, dy_desc_, dz_desc_, dx_desc_,
                                    scale_bias_diff_desc_, activation_desc_, &workspace_size_),
                                  "cudnnGetBatchNormalizationBackwardExWorkspaceSize failed");
    }

    input_size_list_.push_back(x_size_);
    input_size_list_.push_back(x_size_);
    input_size_list_.push_back(input_shape_[1]);
    input_size_list_.push_back(para_size_);
    input_size_list_.push_back(para_size_);

    output_size_list_.push_back(x_size_);
    output_size_list_.push_back(para_size_);
    output_size_list_.push_back(para_size_);

    workspace_size_list_.push_back(para_size_);  // ws gamma
    workspace_size_list_.push_back(workspace_size_);
  }
  void DestroyResource() noexcept override {
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(x_desc_), "Destroy x desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dy_desc_), "Destroy dy desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(dx_desc_), "Destroy dx desc failed");
    CHECK_CUDNN_RET_WITH_ERROR(kernel_node_, cudnnDestroyTensorDescriptor(scale_bias_diff_desc_),
                               "Destroy para desc failed");
  }

 private:
  void SetTensorDescriptor() {
    int batch, channel, height, width;
    batch = 1;
    channel = SizeToInt(input_shape_[0]) * SizeToInt(input_shape_[1]);
    height = SizeToInt(input_shape_[2]);
    width = SizeToInt(input_shape_[3]);
    cudnnTensorFormat_t cudnn_format = CUDNN_TENSOR_NCHW;

    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_, cudnnSetTensor4dDescriptor(x_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
      "Set x desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_, cudnnSetTensor4dDescriptor(dy_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
      "Set dy desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_, cudnnSetTensor4dDescriptor(dx_desc_, cudnn_format, cudnn_data_type_, batch, channel, height, width),
      "Set dx desc failed");
    CHECK_CUDNN_RET_WITH_EXCEPT(
      kernel_node_,
      cudnnSetTensor4dDescriptor(scale_bias_diff_desc_, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, channel, 1, 1),
      "Set para desc failed");
  }

  size_t x_size_;
  size_t para_size_;
  size_t workspace_size_;
  cudnnBatchNormMode_t mode_;
  cudnnBatchNormOps_t bn_ops_;
  double epsilon_;
  bool is_training_;
  bool is_null_input_;

  cudnnTensorDescriptor_t x_desc_;
  cudnnTensorDescriptor_t y_desc_;
  cudnnTensorDescriptor_t dy_desc_;
  cudnnTensorDescriptor_t dx_desc_;
  cudnnTensorDescriptor_t dz_desc_;
  cudnnTensorDescriptor_t scale_bias_diff_desc_;
  cudnnActivationDescriptor_t activation_desc_;

  cudnnHandle_t handle_;
  cudnnDataType_t cudnn_data_type_;
  float beta_data_diff_;
  std::vector<size_t> input_shape_;
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
 };
 }  // namespace kernel
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_INSTANCE_NORM_GRAD_GPU_KERNEL_H_
--- a/mindspore/ccsrc/utils/utils.h
+++ b/mindspore/ccsrc/utils/utils.h
@@ -47,6 +47,7 @@ constexpr auto kBNGrad1OpName = "BNGrad1";
 constexpr auto kBNGrad2OpName = "BNGrad2";
 constexpr auto kBNGrad3OpName = "BNGrad3";
 constexpr auto kFusedBatchNormEx = "FusedBatchNormEx";
 constexpr auto kInstanceNorm = "InstanceNorm";
 constexpr auto kFusedBatchNormExWithActivation = "FusedBatchNormExWithActivation";
 constexpr auto kFusedBatchNormExWithAddAndActivation = "FusedBatchNormExWithAddAndActivation";
 constexpr auto kFusedBatchNormGradEx = "FusedBatchNormGradEx";
--- a/mindspore/nn/layer/normalization.py
+++ b/mindspore/nn/layer/normalization.py
@@ -1,4 +1,4 @@
 # Copyright 2020 Huawei Technologies Co., Ltd
 # Copyright 2020-2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ from mindspore.communication import management
 from mindspore.ops import _selected_ops
 from ..cell import Cell

 __all__ = ['BatchNorm1d', 'BatchNorm2d', 'LayerNorm', 'GroupNorm', 'GlobalBatchNorm']
 __all__ = ['BatchNorm1d', 'BatchNorm2d', 'LayerNorm', 'GroupNorm', 'GlobalBatchNorm', 'InstanceNorm2d']


 class _BatchNorm(Cell):
@@ -705,6 +705,119 @@ class LayerNorm(Cell):
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)


 class InstanceNorm2d(Cell):
    r"""
    Instance normalization layer over a 4D input.

    This layer applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with
    additional channel dimension) as described in the paper `Instance Normalization: The Missing Ingredient for
    Fast Stylization <https://arxiv.org/abs/1607.08022>`_. It rescales and recenters the feature using a mini-batch
    of data and the learned parameters which can be described in the following formula.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Note:
        Note that the formula for updating the running_mean and running_var is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times x_t + \text{momentum} \times \hat{x}`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the new observed value.

    Args:
        num_features (int): `C` from an expected input of size (N, C, H, W).
        eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
        momentum (float): A floating hyperparameter of the momentum for the
            running_mean and running_var computation. Must be in the range [0, 1]. Default: 0.1.
        affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_mean_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving mean.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        moving_var_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the moving variance.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false,
            use the mean value and variance value of specified value. Default: True.
        input_dims (str): Dimension tag handed to the input shape check performed in `construct`.
            Default: '2d'.

    Inputs:
        - **input** (Tensor) - Tensor of shape :math:`(N, C, H, W)`. Data type: float16 or float32.

    Outputs:
        Tensor, the normalized, scaled, offset tensor, of shape :math:`(N, C, H, W)`. Same type and
        shape as the `input`.

    Supported Platforms:
        ``GPU``

    Raises:
        ValueError: If num_features is less than 1 or momentum is not in the range [0, 1].

    Examples:
        >>> net = nn.InstanceNorm2d(3)
        >>> np.random.seed(0)
        >>> input = Tensor(np.random.randint(0, 255, [2, 3, 2, 2]), mindspore.float32)
        >>> output = net(input)
        >>> print(output.shape)
        (2, 3, 2, 2)
    """

    @cell_attr_register
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.1,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=True,
                 input_dims='2d'):
        super(InstanceNorm2d, self).__init__()
        # Argument validation: both bounds of the momentum range are accepted.
        if num_features < 1:
            raise ValueError("num_features must be at least 1")

        if momentum < 0 or momentum > 1:
            raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))
        self.use_batch_statistics = use_batch_statistics
        self.num_features = num_features
        self.eps = eps
        self.input_dims = input_dims
        # The moving statistics are never trainable; gamma and beta are trainable only when `affine` is True.
        self.moving_mean = Parameter(initializer(
            moving_mean_init, num_features), name="mean", requires_grad=False)
        self.moving_variance = Parameter(initializer(
            moving_var_init, num_features), name="variance", requires_grad=False)
        self.gamma = Parameter(initializer(
            gamma_init, num_features), name="gamma", requires_grad=affine)
        self.beta = Parameter(initializer(
            beta_init, num_features), name="beta", requires_grad=affine)

        self.shape = P.Shape()
        self.momentum = momentum
        # `use_batch_statistics` is forwarded to the primitive as its `is_training` attribute.
        self.instance_bn = P.InstanceNorm(is_training=self.use_batch_statistics,
                                          epsilon=self.eps,
                                          momentum=self.momentum)

    def _check_data_dim(self, x):
        # Placeholder that always raises; `construct` validates the shape through `_shape_check_bn` instead.
        raise NotImplementedError

    def construct(self, x):
        # Validate the input shape against `input_dims`, then run the primitive.
        _shape_check_bn(self.shape(x), self.input_dims)
        # The primitive returns a tuple; only its first element is returned to the caller.
        return self.instance_bn(x,
                                self.gamma,
                                self.beta,
                                self.moving_mean,
                                self.moving_variance)[0]

    def extend_repr(self):
        # Text summary of the layer configuration and its four parameters.
        return 'num_features={}, eps={}, momentum={}, gamma={}, beta={}, moving_mean={}, moving_variance={}'.format(
            self.num_features, self.eps, self.momentum, self.gamma, self.beta, self.moving_mean, self.moving_variance)


 class GroupNorm(Cell):
    r"""
    Group Normalization over a mini-batch of inputs.
--- a/mindspore/ops/_grad/grad_nn_ops.py
+++ b/mindspore/ops/_grad/grad_nn_ops.py
@@ -688,6 +688,24 @@ def get_bprop_fused_batch_norm_ex(self):
    return bprop


@bprop_getters.register(P.InstanceNorm)
def get_bprop_instance_norm(self):
    """Build the backward function for the `InstanceNorm` primitive."""
    # The backward primitive receives the forward primitive's is_training, epsilon and momentum.
    input_grad = G.InstanceNormGrad(self.is_training, self.epsilon, self.momentum)

    def bprop(x, gamma, beta, mean, variance, out, dout):
        # out[1] and out[2] are the second and third forward outputs (the saved statistics);
        # dout[0] is the incoming gradient for the first forward output.
        grads = input_grad(dout[0], x, gamma, out[1], out[2])
        # mean and variance receive zero gradients.
        return grads[0], grads[1], grads[2], zeros_like(mean), zeros_like(variance)

    return bprop


@bprop_getters.register(P.BatchNorm)
 def get_bprop_batch_norm(self):
    """Grad definition for `BatchNorm` operation."""
--- a/mindspore/ops/operations/init.py
+++ b/mindspore/ops/operations/init.py
@@ -63,8 +63,8 @@ from .random_ops import (RandomChoiceWithMask, StandardNormal, Gamma, Poisson, U
 from .nn_ops import (LSTM, SGD, Adam, FusedSparseAdam, FusedSparseLazyAdam, AdamNoUpdateParam, ApplyMomentum, BatchNorm,
                     BiasAdd, Conv2D,
                     DepthwiseConv2dNative,
                     DropoutDoMask, Dropout,
                     DropoutGenMask, Flatten, FusedBatchNorm, FusedBatchNormEx, BNTrainingReduce, BNTrainingUpdate,
                     DropoutDoMask, Dropout, DropoutGenMask, Flatten,
                     FusedBatchNorm, FusedBatchNormEx, InstanceNorm, BNTrainingReduce, BNTrainingUpdate,
                     Gelu, FastGelu, Elu,
                     GetNext, L2Normalize, LayerNorm, L2Loss, CTCLoss, CTCGreedyDecoder,
                     LogSoftmax,
@@ -130,6 +130,7 @@ __all__ = [
    'MaxPoolWithArgmax',
    'FusedBatchNorm',
    'FusedBatchNormEx',
    'InstanceNorm',
    'BNTrainingReduce',
    'BNTrainingUpdate',
    'BatchNorm',
--- a/mindspore/ops/operations/_grad_ops.py
+++ b/mindspore/ops/operations/_grad_ops.py
@@ -714,6 +714,21 @@ class FusedBatchNormGradEx(PrimitiveWithInfer):
        return (x_type, scale_type, scale_type)


 class InstanceNormGrad(PrimitiveWithInfer):
    """Backward primitive paired with the InstanceNorm operation."""

    @prim_attr_register
    def __init__(self, is_training=True, epsilon=0.0, momentum=0.1):
        # Only the input/output names are declared here; the arguments are not checked in this method.
        input_names = ['dy', 'x', 'gamma', 'save_mean', 'save_variance']
        output_names = ['dx', 'bn_gamma', 'bn_beta']
        self.init_prim_io_names(inputs=input_names, outputs=output_names)

    def infer_shape(self, y_backprop_shape, x_shape, gamma_shape, save_mean_shape, save_variance_shape):
        # First output takes the shape of x; the other two take the shape of gamma.
        dx_shape = x_shape
        dgamma_shape = dbeta_shape = gamma_shape
        return (dx_shape, dgamma_shape, dbeta_shape)

    def infer_dtype(self, y_backprop_type, x_type, gamma_type, save_mean_type, save_variance_type):
        # First output takes the dtype of x; the other two take the dtype of gamma.
        dx_type = x_type
        dgamma_type = dbeta_type = gamma_type
        return (dx_type, dgamma_type, dbeta_type)


 class UniqueGrad(Primitive):
    """Gradients of Unique operation."""

--- a/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/ops/operations/nn_ops.py
@@ -859,6 +859,119 @@ class FusedBatchNormEx(PrimitiveWithInfer):
        return (input_x, scale, scale, scale, scale, scale)


 class InstanceNorm(PrimitiveWithInfer):
    r"""
    Instance normalization over a 4D input.

    This operator applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with
    additional channel dimension) as described in the paper `Instance Normalization: The Missing Ingredient for
    Fast Stylization <https://arxiv.org/abs/1607.08022>`_. It rescales and recenters the feature using a mini-batch
    of data and the learned parameters which can be described in the following formula.

    .. math::
        y = \frac{x - mean}{\sqrt{variance + \epsilon}} * \gamma + \beta

    where :math:`\gamma` is scale, :math:`\beta` is bias, :math:`\epsilon` is epsilon.

    Args:
        is_training (bool): Is training or inference. Default: True.
        epsilon (float): A small value added for numerical stability. Must be in (0, 1]. Default: 1e-5.
        momentum (float): The hyper parameter to compute moving average for running_mean and running_var
            (e.g. :math:`new\_running\_mean = momentum * running\_mean + (1 - momentum) * current\_mean`).
            Momentum value must be [0, 1]. Default: 0.1.

    Inputs:
        - **input_x** (Tensor) - The input of InstanceNorm, Tensor of shape :math:`(N, C, H, W)`,
          data type: float16 or float32.
        - **gamma** (Parameter) - scale, Tensor of shape :math:`(C,)`,
          data type: float32.
        - **beta** (Parameter) - bias, Tensor of shape :math:`(C,)`,
          data type: float32.
        - **mean** (Parameter) - mean value, Tensor of shape :math:`(C,)`, data type: float32.
        - **variance** (Parameter) - variance value, Tensor of shape :math:`(C,)`, data type: float32.

    Outputs:
        Tuple of 3 Tensors, the normalized input, the updated parameters.

        - **output_x** (Tensor) - The output of InstanceNorm, same type and shape as the `input_x`.
        - **updated_moving_mean** (Tensor) - Updated mean value, Tensor of shape :math:`(N * C,)`,
          data type: float32.
        - **updated_moving_variance** (Tensor) - Updated variance value, Tensor of shape :math:`(N * C,)`,
          data type: float32.

    Supported Platforms:
        ``GPU``

    Raises:
        TypeError: If any validator check fails.

    Examples:
        >>> import mindspore
        >>> import mindspore.nn as nn
        >>> import numpy as np
        >>> from mindspore import Parameter
        >>> from mindspore import Tensor
        >>> from mindspore.ops import operations as ops
        >>> class InstanceNormNet(nn.Cell):
        >>>     def __init__(self):
        >>>         super(InstanceNormNet, self).__init__()
        >>>         self.instance_norm = ops.InstanceNorm()
        >>>         self.gamma = Parameter(Tensor(np.ones([64]), mindspore.float32), name="gamma")
        >>>         self.beta = Parameter(Tensor(np.ones([64]), mindspore.float32), name="beta")
        >>>         self.mean = Parameter(Tensor(np.ones([64]), mindspore.float32), name="mean")
        >>>         self.variance = Parameter(Tensor(np.ones([64]), mindspore.float32), name="variance")
        >>>
        >>>     def construct(self, input_x):
        >>>         out = self.instance_norm(input_x, self.gamma, self.beta, self.mean, self.variance)
        >>>         return out
        >>>
        >>> input_x = Tensor(np.ones([128, 64, 32, 64]), mindspore.float32)
        >>> net = InstanceNormNet()
        >>> output = net(input_x)
        >>> result = output[0].shape
        >>> print(result)
        (128, 64, 32, 64)
    """
    __mindspore_signature__ = (
        sig.make_sig('input_x', dtype=sig.sig_dtype.T2),
        sig.make_sig('gamma', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
        sig.make_sig('beta', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
        sig.make_sig('mean', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
        sig.make_sig('variance', sig.sig_rw.RW_WRITE, dtype=sig.sig_dtype.T),
    )

    @prim_attr_register
    def __init__(self, is_training=True, epsilon=1e-5, momentum=0.1):
        self.init_prim_io_names(inputs=['x', 'gamma', 'beta', 'mean', 'variance'],
                                outputs=['y', 'save_mean', 'save_variance'])
        self.is_training = validator.check_bool(is_training, self.name)
        self.epsilon = validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
        self.momentum = validator.check_float_range(momentum, 0, 1, Rel.INC_BOTH, 'momentum', self.name)
        self._update_parameter = True

    def infer_shape(self, input_x, gamma, beta, mean, variance):
        """Validate the parameter shapes and return the shapes of (y, save_mean, save_variance)."""
        input_shape_norm = input_x
        validator.check_equal_int(len(gamma), 1, "gamma rank", self.name)
        validator.check("gamma shape", gamma, "beta shape", beta, Rel.EQ, self.name)
        validator.check("gamma shape[0]", gamma[0], "input channel", input_shape_norm[1], Rel.EQ, self.name)
        validator.check_equal_int(len(mean), 1, "mean rank", self.name)

        validator.check("mean shape", mean, "variance shape", variance, Rel.EQ, self.name)
        validator.check("mean shape", mean, "gamma shape", gamma, Rel.EQ, self.name)
        # Build a fresh list instead of writing through `gamma`: assigning into the caller's
        # shape list would silently turn gamma's recorded shape from [C] into [N * C].
        save_mean_shape = [gamma[0] * input_shape_norm[0]]
        # Each output gets its own list so that later edits to one cannot leak into the other.
        return (input_x, save_mean_shape, list(save_mean_shape))

    def infer_dtype(self, input_x, gamma, beta, mean, variance):
        """Validate the input dtypes and return the dtypes of (y, save_mean, save_variance)."""
        validator.check_tensor_dtype_valid("input_x", input_x, [mstype.float16, mstype.float32], self.name)
        args = {"gamma": gamma, "beta": beta}
        validator.check_tensors_dtypes_same_and_valid(args, [mstype.float32], self.name)
        args_moving = {"mean": mean, "variance": variance}
        valid_dtypes = [mstype.tensor_type(mstype.float32)]
        validator.check_types_same_and_valid(args_moving, valid_dtypes, self.name)
        return (input_x, gamma, gamma)


 class BNTrainingReduce(PrimitiveWithInfer):
    """
    For the BatchNorm operation this operator update the moving averages for training and is used in conjunction with
--- a/tests/st/ops/gpu/test_instancenorm2d.py
+++ b/tests/st/ops/gpu/test_instancenorm2d.py
@@ -0,0 +1,62 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================

 import numpy as np
 import pytest

 import mindspore.context as context
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common.api import ms_function
 from mindspore.ops import functional as F
 from mindspore.ops.composite import GradOperation
 context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

 class Grad(nn.Cell):
    """Test helper cell that applies a GradOperation (get_all=True, sens_param=True) to a wrapped network."""

    def __init__(self, network):
        super().__init__()
        self.grad = GradOperation(get_all=True, sens_param=True)
        self.network = network

    @ms_function
    def construct(self, input_x, grad):
        # Build the gradient function for the wrapped network, then call it with the input and `grad`.
        grad_fn = self.grad(self.network)
        return grad_fn(input_x, grad)

 class Net(nn.Cell):
    """Reference network: runs BatchNorm2d on the input reshaped to a batch of one."""

    def __init__(self, n):
        super().__init__()
        self.ops = nn.BatchNorm2d(n, use_batch_statistics=True, gamma_init=0.5, beta_init=0.5)

    def construct(self, x):
        original_shape = F.shape(x)
        # Merge the first two axes into the channel axis of a batch of size one.
        folded = F.reshape(x, (1, -1, original_shape[2], original_shape[3]))
        normalized = self.ops(folded)
        # Restore the caller's original layout.
        return F.reshape(normalized, original_shape)

@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_InstanceNorm2d_fp32():
    """Compare nn.InstanceNorm2d with a BatchNorm2d-based reference, forward and backward, in float32."""
    # Fixed seed so that a tolerance failure can be reproduced with the exact same inputs.
    np.random.seed(0)

    # Forward: the reference folds the first two axes together, hence 3 * 3 features.
    x_np = np.random.randn(3, 3, 2, 2).astype(np.float32)
    bn_instance_comp = Net(3 * 3)
    bn_instance_op = nn.InstanceNorm2d(3, use_batch_statistics=True, gamma_init=0.5, beta_init=0.5)
    comp_out = bn_instance_comp(Tensor(x_np))
    op_out = bn_instance_op(Tensor(x_np))
    assert np.allclose(comp_out.asnumpy(), op_out.asnumpy())

    # Backward: compare the first returned gradient of both networks for the same sensitivity.
    sens = np.random.randn(3, 3, 2, 2).astype(np.float32)
    bn_comp_backward_net = Grad(bn_instance_comp)
    bn_op_backward_net = Grad(bn_instance_op)
    output1 = bn_comp_backward_net(Tensor(x_np), Tensor(sens))
    output2 = bn_op_backward_net(Tensor(x_np), Tensor(sens))
    assert np.allclose(output1[0].asnumpy(), output2[0].asnumpy())