!11446 fix argmax op for cpu and gpu

From: @xcnick Reviewed-by: Signed-off-by:
4 years ago · 9ed9d950e2
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.cc
@@ -18,48 +18,82 @@

 namespace mindspore {
 namespace kernel {
 void ArgmaxCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  if (shape.size() != 2) {
    MS_LOG(EXCEPTION) << "argmax kernel dims invalid " << shape.size();
 namespace {
 size_t get_element_num(const std::vector<size_t> &shape) {
  size_t size = 1;
  for (size_t i = 0; i < shape.size(); i++) {
    size *= shape[i];
  }
  batch_size_ = shape[0];
  class_num_ = shape[1];
  return size;
 }

  int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
  if (axis != -1 && axis != 1) {
    MS_LOG(EXCEPTION) << "argmax kernel not support axis " << axis;
 template <typename T>
 bool check_validation(const std::vector<size_t> &shape, const size_t num_before_axis, const size_t num_after_axis,
                      const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) {
  if (inputs.size() != 1 || outputs.size() != 1) {
    MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!";
    return false;
  }
  size_t data_size = sizeof(T);
  size_t input_size = get_element_num(shape) * data_size;
  size_t output_num = num_before_axis * num_after_axis;
  size_t output_size = output_num * sizeof(int);
  if (inputs[0]->size != input_size || outputs[0]->size != output_size) {
    MS_LOG(EXCEPTION) << "invalid input or output data size!";
    return false;
  }
  return true;
 }
 }  // namespace

 bool ArgmaxCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                             const std::vector<kernel::AddressPtr> & /*workspaces*/,
                             const std::vector<kernel::AddressPtr> &outputs) {
  if (inputs.empty() || outputs.empty()) {
    MS_LOG(EXCEPTION) << "input or output empty!";
 template <typename T>
 void ArgmaxCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  size_t shape_len = shape_.size();
  int64_t axis = AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS);
  axis += shape_len;
  if (axis < 0) {
    MS_LOG(EXCEPTION) << "Invalid axis:" << axis << ", should in range [-1, " << shape_len - 1 << "]";
  }
  axis = axis % static_cast<int64_t>(shape_len);
  num_before_axis_ = 1;
  num_after_axis_ = 1;
  for (size_t i = 0; i < shape_len; i++) {
    if (static_cast<int64_t>(i) < axis) {
      num_before_axis_ *= shape_[i];
    } else if (static_cast<int64_t>(i) > axis) {
      num_after_axis_ *= shape_[i];
    }
  }
  dim_axis_ = shape_[axis];
 }

  size_t batch_float_size = batch_size_ * sizeof(float);
  size_t batch_class_float_size = class_num_ * batch_float_size;
  if (inputs[0]->size != batch_class_float_size || outputs[0]->size != batch_float_size) {
    MS_LOG(EXCEPTION) << "invalid input or output data size!";
 template <typename T>
 bool ArgmaxCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                const std::vector<kernel::AddressPtr> & /*workspaces*/,
                                const std::vector<kernel::AddressPtr> &outputs) {
  if (!check_validation<T>(shape_, num_before_axis_, num_after_axis_, inputs, outputs)) {
    return false;
  }
  auto input = reinterpret_cast<float *>(inputs[0]->addr);
  auto output = reinterpret_cast<int *>(outputs[0]->addr);
  size_t row_start = 0;
  for (size_t i = 0; i < batch_size_; ++i) {
    size_t max_index = 0;
    float max_value = input[row_start];
    for (size_t j = 1; j < class_num_; ++j) {
      size_t index = row_start + j;
      if (input[index] > max_value) {
        max_value = input[index];
        max_index = j;

  auto input = reinterpret_cast<T *>(inputs[0]->addr);
  auto output = reinterpret_cast<int32_t *>(outputs[0]->addr);

  for (size_t i = 0; i < num_before_axis_; i++) {
    size_t src_index_i = i * dim_axis_ * num_after_axis_;
    for (size_t j = 0; j < num_after_axis_; j++) {
      std::vector<float> array_axis;
      size_t src_index_j = src_index_i + j;
      for (size_t k = 0; k < dim_axis_; k++) {
        size_t src_index_k = k * num_after_axis_ + src_index_j;
        array_axis.push_back(static_cast<float>(input[src_index_k]));
      }
      auto max_ops = std::max_element(array_axis.begin(), array_axis.end());
      auto max_index = static_cast<int32_t>(std::distance(array_axis.begin(), max_ops));
      auto dst_index = i * num_after_axis_ + j;
      output[dst_index] = max_index;
    }
    output[i] = SizeToInt(max_index);
    row_start += class_num_;
  }
  return true;
 }
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/argmax_cpu_kernel.h
@@ -22,6 +22,7 @@

 namespace mindspore {
 namespace kernel {
 template <typename T>
 class ArgmaxCPUKernel : public CPUKernel {
 public:
  ArgmaxCPUKernel() = default;
@@ -33,12 +34,16 @@ class ArgmaxCPUKernel : public CPUKernel {
              const std::vector<AddressPtr> &outputs) override;

 private:
  size_t class_num_{0};
  size_t batch_size_{0};
  std::vector<size_t> shape_;
  size_t num_before_axis_;
  size_t num_after_axis_;
  size_t dim_axis_;
 };

 MS_REG_CPU_KERNEL(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
                  ArgmaxCPUKernel);
 MS_REG_CPU_KERNEL_T(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
                    ArgmaxCPUKernel, float);
 MS_REG_CPU_KERNEL_T(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32),
                    ArgmaxCPUKernel, float16);
 }  // namespace kernel
 }  // namespace mindspore

--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/argmax_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/argmax_gpu_kernel.cc
@@ -18,9 +18,9 @@

 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_ONE(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
                      ArgmaxGpuKernel, float)
 MS_REG_GPU_KERNEL_ONE(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32),
                      ArgmaxGpuKernel, half)
 MS_REG_GPU_KERNEL_TWO(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32),
                      ArgmaxGpuKernel, float, int)
 MS_REG_GPU_KERNEL_TWO(Argmax, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32),
                      ArgmaxGpuKernel, half, int)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/argmax_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/argmax_gpu_kernel.h
@@ -23,11 +23,10 @@
 #include "backend/kernel_compiler/gpu/cuda_impl/argmax_impl.cuh"
 namespace mindspore {
 namespace kernel {
 #define ARGMAX_MAX_DIMENSION 2
 template <typename T>
 template <typename T, typename S>
 class ArgmaxGpuKernel : public GpuKernel {
 public:
  ArgmaxGpuKernel() : input_size_(0), output_size_(0), workspace_size_(0), batch_size_(0), channel_size_(0), axis_(0) {}
  ArgmaxGpuKernel() : input_size_(0), output_size_(0), workspace_size_(0), bound_(0), outer_size_(0), inner_size_(0) {}
  ~ArgmaxGpuKernel() override = default;

  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
@@ -37,47 +36,38 @@ class ArgmaxGpuKernel : public GpuKernel {
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *input = GetDeviceAddress<T>(inputs, 0);
    int *output = GetDeviceAddress<int>(outputs, 0);
    CalArgmax(input, SizeToInt(batch_size_), SizeToInt(channel_size_), axis_, output,
              reinterpret_cast<cudaStream_t>(stream_ptr));
    S *output = GetDeviceAddress<S>(outputs, 0);
    CalArgmax(input, bound_, outer_size_, inner_size_, output, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }

  bool Init(const CNodePtr &kernel_node) override {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    if (input_num != 1) {
      MS_LOG(ERROR) << "Input number is " << input_num << ", but argmax needs 1 input.";
      return false;
    auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
    int64_t dims = shape.size();
    int64_t axis = GetAttr<int64_t>(kernel_node, "axis");
    if (axis < 0) {
      axis += dims;
    }
    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
    if (output_num != 1) {
      MS_LOG(ERROR) << "Output number is " << output_num << ", but argmax needs 1 output.";
      return false;
    input_size_ = sizeof(T);
    for (auto x : shape) {
      input_size_ *= x;
    }
    auto output_type = GetValue<TypePtr>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("output_type"));
    if (output_type->type_id() != TypeId::kNumberTypeInt32) {
      MS_LOG(EXCEPTION) << "Argmax only supports int32 output type.";
    output_size_ = sizeof(S);
    for (auto x : output_shape) {
      output_size_ *= x;
    }
    auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    if (input_shape.size() > ARGMAX_MAX_DIMENSION) {
      MS_LOG(EXCEPTION) << "Input is " << input_shape.size() << "-D, but Argmax supports max " << ARGMAX_MAX_DIMENSION
                        << "-D inputs.";
    bound_ = static_cast<S>(shape[axis]);
    if (shape[axis] != static_cast<size_t>(bound_)) {
      MS_LOG(EXCEPTION) << "Bound's shape is larger than index type and overflows when casting.";
    }

    axis_ = GetAttr<int64_t>(kernel_node, "axis");
    if (axis_ < 0) {
      axis_ += static_cast<int64_t>(input_shape.size());
    outer_size_ = 1;
    for (int64_t i = axis - 1; i >= 0; i--) {
      outer_size_ *= shape[i];
    }
    if (input_shape.size() == 1) {
      batch_size_ = 0;
      channel_size_ = input_shape[0];
      input_size_ = sizeof(T) * channel_size_;
      output_size_ = sizeof(int);
    } else {
      batch_size_ = input_shape[0];
      channel_size_ = input_shape[1];
      input_size_ = sizeof(T) * batch_size_ * channel_size_;
      output_size_ = (axis_ == 1) ? sizeof(int) * batch_size_ : sizeof(int) * channel_size_;
    inner_size_ = 1;
    for (int64_t i = axis + 1; i < dims; i++) {
      inner_size_ *= shape[i];
    }
    InitSizeLists();
    return true;
@@ -96,9 +86,9 @@ class ArgmaxGpuKernel : public GpuKernel {
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  size_t batch_size_;
  size_t channel_size_;
  int64_t axis_;
  S bound_;
  size_t outer_size_;
  size_t inner_size_;
 };
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmax_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmax_impl.cu
@@ -17,72 +17,36 @@
 #include "argmax_impl.cuh"
 #include "runtime/device/gpu/cuda_common.h"
 #include "include/cuda_fp16.h"
 template <typename T>
 __global__ void Argmax1D(const T *input, const int channel_size, int *output) {
  int max_index = 0;
  T max = input[0];
  for (int pos = 1; pos < channel_size; pos++) {
    if (max < input[pos]) {
      max = input[pos];
      max_index = pos;
 template <typename T, typename S>
 __global__ void Argmax(const T *input, const S bound, const size_t outer_size,
                       const size_t inner_size, S *output) {
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < outer_size * inner_size;
       pos += gridDim.x * blockDim.x) {
    size_t x = pos / inner_size % outer_size;
    size_t y = pos % inner_size;
    S idx = 0;
    size_t input_offset = x * bound * inner_size + 0 * inner_size + y;
    T max_data = input[input_offset];
    for (S i = 1; i < bound; i++) {
      input_offset = x * bound * inner_size + i * inner_size + y;
      auto input_data = input[input_offset];
      idx = input_data > max_data ? i : idx;
      max_data = input_data > max_data ? input_data : max_data;
    }
    output[pos] = idx;
  }
  output[0] = max_index;
  return;
 }
 template <typename T>
 __global__ void ArgmaxDefault2D(const T *input, const int batch_size, const int channel_size, int *output) {
  int pos;
  int max_index;
  T max;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; i += blockDim.x * gridDim.x) {
    max = input[i * channel_size];
    max_index = 0;
    for (int j = 1; j < channel_size; j++) {
      pos = i * channel_size + j;
      if (max < input[pos]) {
        max = input[pos];
        max_index = j;
      }
    }

    output[i] = max_index;
  }
  return;
 }
 template <typename T>
 __global__ void ArgmaxAxis2D(const T *input, const int batch_size, const int channel_size, int *output) {
  int pos;
  int max_index;
  T max;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < channel_size; i += blockDim.x * gridDim.x) {
    max = input[i];
    max_index = 0;
    for (int j = 1; j < batch_size; j++) {
      pos = j * channel_size + i;
      if (max < input[pos]) {
        max = input[pos];
        max_index = j;
      }
    }
    output[i] = max_index;
  }
  return;
 }
 template <typename T>
 void CalArgmax(const T *input, const int batch_size, const int channel_size, const int64_t axis, int *output,
               cudaStream_t cuda_stream) {
  if (batch_size == 0) {
    Argmax1D<<<1, 1, 0, cuda_stream>>>(input, channel_size, output);
  } else if (axis == 1) {
    ArgmaxDefault2D<<<GET_BLOCKS(batch_size), GET_THREADS, 0, cuda_stream>>>(input, batch_size, channel_size, output);
  } else {
    ArgmaxAxis2D<<<GET_BLOCKS(channel_size), GET_THREADS, 0, cuda_stream>>>(input, batch_size, channel_size, output);
  }
 template <typename T, typename S>
 void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size,
               S *output, cudaStream_t cuda_stream) {
  Argmax<<<GET_BLOCKS(outer_size), GET_THREADS, 0, cuda_stream>>>(input, bound, outer_size, inner_size,
                                                                  output);
  return;
 }

 template void CalArgmax<float>(const float *input, const int batch_size, const int channel_size, const int64_t axis,
                               int *output, cudaStream_t cuda_stream);
 template void CalArgmax<half>(const half *input, const int batch_size, const int channel_size, const int64_t axis,
                              int *output, cudaStream_t cuda_stream);
 template void CalArgmax<float, int>(const float *input, const int bound, const size_t outer_size,
                                    const size_t inner_size, int *output, cudaStream_t cuda_stream);
 template void CalArgmax<half, int>(const half *input, const int bound, const size_t outer_size,
                                   const size_t inner_size, int *output, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmax_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmax_impl.cuh
@@ -16,8 +16,8 @@

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
 template <typename T>
 void CalArgmax(const T *input, const int batch_size, const int channel_size, const int64_t axis, int *output,
 template <typename T, typename S>
 void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, S *output,
               cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_ARGMAX_IMPL_CUH_
--- a/tests/st/ops/cpu/test_argmax_op.py
+++ b/tests/st/ops/cpu/test_argmax_op.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 # ============================================================================

 import random
 from functools import reduce
 import numpy as np
 import pytest

@@ -20,33 +22,59 @@ import mindspore.context as context
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common import dtype as mstype
 from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter
 from mindspore.ops import operations as P
 import mindspore.ops as ops

 context.set_context(mode=context.GRAPH_MODE, device_target="CPU")


 class NetArgmax(nn.Cell):
    def __init__(self):
    def __init__(self, axis=0):
        super(NetArgmax, self).__init__()
        self.argmax = P.Argmax(output_type=mstype.int32)
        x = Tensor(np.array([[1., 20., 5.],
                             [67., 8., 9.],
                             [130., 24., 15.]]).astype(np.float32))
        self.x = Parameter(initializer(x, x.shape), name='x')
        self.argmax = ops.Argmax(axis=axis, output_type=mstype.int32)

    def construct(self):
        return self.argmax(self.x)
    def construct(self, x):
        return self.argmax(x)


@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
 def test_argmax():
    Argmax = NetArgmax()
    output = Argmax()
    print("================================")
 def test_argmax_1d():
    x = Tensor(np.array([1., 20., 5.]).astype(np.float32))
    Argmax = NetArgmax(axis=0)
    output = Argmax(x)
    expect = np.array([1]).astype(np.float32)
    assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
 def test_argmax_2d():
    x = Tensor(np.array([[1., 20., 5.],
                         [67., 8., 9.],
                         [130., 24., 15.]]).astype(np.float32))
    Argmax_axis_0 = NetArgmax(axis=0)
    output = Argmax_axis_0(x)
    expect = np.array([2, 2, 2]).astype(np.float32)
    assert (output.asnumpy() == expect).all()
    Argmax_axis_1 = NetArgmax(axis=1)
    output = Argmax_axis_1(x)
    expect = np.array([1, 0, 0]).astype(np.float32)
    print(output)
    assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
 def test_argmax_high_dims():
    for dim in range(3, 10):
        shape = np.random.randint(1, 10, size=dim)
        x = np.random.randn(reduce(lambda x, y: x * y, shape)).astype(np.float32)
        x = x.reshape(shape)

        rnd_axis = random.randint(-dim + 1, dim - 1)
        Argmax = NetArgmax(axis=rnd_axis)
        ms_output = Argmax(Tensor(x))
        np_output = np.argmax(x, axis=rnd_axis)
        assert (ms_output.asnumpy() == np_output).all()
--- a/tests/st/ops/gpu/test_argmax_op.py
+++ b/tests/st/ops/gpu/test_argmax_op.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 # ============================================================================

 import random
 from functools import reduce
 import numpy as np
 import pytest

@@ -20,43 +22,67 @@ import mindspore.context as context
 import mindspore.nn as nn
 from mindspore import Tensor
 from mindspore.common import dtype as mstype
 from mindspore.ops import operations as P
 import mindspore.ops as ops


 class NetArgmax(nn.Cell):
    def __init__(self):
    def __init__(self, axis=0):
        super(NetArgmax, self).__init__()
        axis1 = 0
        axis2 = -1
        self.argmax1 = P.Argmax(axis1, output_type=mstype.int32)
        self.argmax2 = P.Argmax(axis2, output_type=mstype.int32)
        self.argmax3 = P.Argmax(output_type=mstype.int32)
        self.argmax = ops.Argmax(axis, output_type=mstype.int32)

    def construct(self, x):
        return (self.argmax1(x), self.argmax2(x), self.argmax3(x))
        return self.argmax(x)


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_argmax():
 def test_argmax_1d():
    for mode in [context.PYNATIVE_MODE, context.GRAPH_MODE]:
        context.set_context(mode=mode, device_target="GPU")

        x = Tensor(np.array([1., 20., 5.]).astype(np.float32))
        Argmax = NetArgmax(axis=0)
        output = Argmax(x)
        expect = np.array([1]).astype(np.float32)
        assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_argmax_2d():
    for mode in [context.PYNATIVE_MODE, context.GRAPH_MODE]:
        context.set_context(mode=mode, device_target="GPU")

    x = Tensor(np.array([[1., 20., 5.],
                         [67., 8., 9.],
                         [130., 24., 15.],
                         [0.3, -0.4, -15.]]).astype(np.float32))
    expect1 = np.array([2, 2, 2]).astype(np.int32)
    expect2 = np.array([1, 0, 0, 0]).astype(np.int32)

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    argmax = NetArgmax()
    output = argmax(x)
    assert (output[0].asnumpy() == expect1).all()
    assert (output[1].asnumpy() == expect2).all()
    assert (output[2].asnumpy() == expect2).all()

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    argmax1 = NetArgmax()
    output1 = argmax1(x)
    assert (output1[0].asnumpy() == expect1).all()
    assert (output1[1].asnumpy() == expect2).all()
    assert (output1[2].asnumpy() == expect2).all()
    Argmax_axis_0 = NetArgmax(axis=0)
    output = Argmax_axis_0(x)
    expect = np.array([2, 2, 2]).astype(np.int32)
    assert (output.asnumpy() == expect).all()

    Argmax_axis_1 = NetArgmax(axis=1)
    output = Argmax_axis_1(x)
    expect = np.array([1, 0, 0, 0]).astype(np.int32)
    assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
 def test_argmax_high_dims():
    for mode in [context.PYNATIVE_MODE, context.GRAPH_MODE]:
        context.set_context(mode=mode, device_target="GPU")
        for dim in range(3, 10):
            shape = np.random.randint(1, 10, size=dim)
            x = np.random.randn(reduce(lambda x, y: x * y, shape)).astype(np.float32)
            x = x.reshape(shape)

            rnd_axis = random.randint(-dim + 1, dim - 1)
            Argmax = NetArgmax(axis=rnd_axis)
            ms_output = Argmax(Tensor(x))
            np_output = np.argmax(x, axis=rnd_axis)
            assert (ms_output.asnumpy() == np_output).all()