Merge pull request !6292 from wangzhe/master
@@ -10,6 +10,8 @@ file(GLOB KERNEL_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/arithmetic_self.c
    ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/arithmetic.c
    ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/matmul.c
    ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/reduce.c
    ${CMAKE_CURRENT_SOURCE_DIR}/../nnacl/fp32/arithmetic.c
    ${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/fp32/*.cc
    ${CMAKE_CURRENT_SOURCE_DIR}/src/kernel/common/*.cc
    )
@@ -0,0 +1,238 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "internal/src/kernel/fp32/arithmetic.h"
#include "internal/src/lite_log.h"
#include "internal/include/errorcode.h"
#include "internal/include/model.h"
#include "internal/include/ms_tensor.h"
#include "internal/include/lite_utils.h"
#include "src/runtime/allocator.h"
#include "nnacl/arithmetic_common.h"
#include "nnacl/fp32/arithmetic.h"
#include "schema/ops_generated.h"

typedef int (*ArithmeticRun)(float *input0, float *input1, float *output, int element_size);
typedef int (*ArithmeticOptRun)(float *input0, float *input1, float *output, int element_size,
                                ArithmeticParameter *param);
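// Recursively walks the broadcast dimensions: for each dimension up to break_pos the input
// whose size is 1 stays at offset 0 while the other advances by its stride; past break_pos
// the shapes match, so the element-wise kernel runs over the remaining out_count elements.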
int BroadcastRun(float *input0, float *input1, float *output, int dim, int out_count, int break_pos,
                 ArithmeticRun arithmetic_run, ArithmeticParameter *params) {
  if (dim > break_pos) {
    return arithmetic_run(input0, input1, output, out_count);
  }
  for (int i = 0; i < params->out_shape_[dim]; ++i) {
    int pos0_ = params->in_shape0_[dim] == 1 ? 0 : i;
    int pos1_ = params->in_shape1_[dim] == 1 ? 0 : i;
    int error_code =
      BroadcastRun(input0 + pos0_ * params->in_strides0_[dim], input1 + pos1_ * params->in_strides1_[dim],
                   output + i * params->out_strides_[dim], dim + 1, out_count, break_pos, arithmetic_run, params);
    if (error_code != RET_OK) {
      return error_code;
    }
  }
  return RET_OK;
}
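// Computes the broadcast output shape and flags whether broadcasting is needed. When it is,
// break_pos is set to the last dimension where the input shapes differ, outside becomes the
// product of the trailing matching dimensions, and the input/output strides are precomputed.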
int CalBroadCasting(const TensorPtrVector &in_tensors, int *outside, int *break_pos, ArithmeticParameter *params) {
  params->broadcasting_ = false;
  for (int i = 0; i < params->ndim_; i++) {
    if (params->in_shape0_[i] != params->in_shape1_[i]) {
      if (params->in_shape0_[i] == 1) {
        params->out_shape_[i] = params->in_shape1_[i];
      } else if (params->in_shape1_[i] == 1) {
        params->out_shape_[i] = params->in_shape0_[i];
      } else {
        LITE_ERROR_LOG("shapes of input tensors can not be broadcasted");
        return RET_INPUT_TENSOR_ERROR;
      }
      params->broadcasting_ = true;
    } else {
      params->out_shape_[i] = params->in_shape0_[i];
    }
  }
  if (params->broadcasting_) {
    *outside = 1;
    for (int i = static_cast<int>(params->ndim_) - 1; i >= 0; --i) {
      if (params->in_shape0_[i] != params->in_shape1_[i]) {
        *break_pos = i;
        break;
      }
      (*outside) *= params->out_shape_[i];
    }
    ComputeStrides(params->in_shape0_, params->in_strides0_, params->ndim_);
    ComputeStrides(params->in_shape1_, params->in_strides1_, params->ndim_);
    ComputeStrides(params->out_shape_, params->out_strides_, params->ndim_);
  }
  return RET_OK;
}
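// Dispatches to the broadcast path, the optimized kernel (when one input is a single
// element), or the plain element-wise kernel, based on how the parameters were prepared.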
int RunArithmetic(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, ArithmeticRun arithmetic_run,
                  ArithmeticOptRun arithmetic_opt_run, int outside, int break_pos, ArithmeticParameter *params) {
  int error_code = RET_OK;
  int count = out_tensors[0]->ElementsNum();
  float *input0_data = reinterpret_cast<float *>(in_tensors[0]->data_);
  float *input1_data = reinterpret_cast<float *>(in_tensors[1]->data_);
  float *output_data = reinterpret_cast<float *>(out_tensors[0]->data_);
  if (params->broadcasting_) {
    error_code = BroadcastRun(input0_data, input1_data, output_data, 0, outside, break_pos, arithmetic_run, params);
  } else if (arithmetic_opt_run != NULL) {
    error_code = arithmetic_opt_run(input0_data, input1_data, output_data, count, params);
  } else {
    error_code = arithmetic_run(input0_data, input1_data, output_data, count);
  }
  return error_code;
}
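// Aligns the two input shapes to a common rank by left-padding the shorter one with 1s,
// then derives the broadcast output shape and copies the data type and format of the
// first input to the output tensor.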
int DoArithmeticInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param) {
  if (in_tensors.size() != 2 || in_tensors[0]->data_ == NULL || in_tensors[1]->data_ == NULL) {
    LITE_ERROR_LOG("input tensors num not correct or input data is NULL!")
    return RET_INPUT_TENSOR_ERROR;
  }
  if (out_tensors.size() != 1) {
    LITE_ERROR_LOG("output tensors num not correct!")
    return RET_ERROR;
  }
  ShapeVector in_shape0 = in_tensors[0]->shape_;
  ShapeVector in_shape1 = in_tensors[1]->shape_;
  int ndim0 = in_shape0.size();
  int ndim1 = in_shape1.size();
  ArithmeticParameter *arithmetic_param = reinterpret_cast<ArithmeticParameter *>(param);
  if (ndim0 < ndim1) {
    arithmetic_param->ndim_ = ndim1;
    int fill_dim_num = ndim1 - ndim0;
    int j = 0;
    for (int i = 0; i < ndim1; i++) {
      if (i < fill_dim_num) {
        arithmetic_param->in_shape0_[i] = 1;
      } else {
        arithmetic_param->in_shape0_[i] = in_shape0[j++];
      }
      arithmetic_param->in_shape1_[i] = in_shape1[i];
    }
  } else if (ndim0 > ndim1) {
    arithmetic_param->ndim_ = ndim0;
    int fill_dim_num = ndim0 - ndim1;
    int j = 0;
    for (int i = 0; i < ndim0; i++) {
      if (i < fill_dim_num) {
        arithmetic_param->in_shape1_[i] = 1;
      } else {
        arithmetic_param->in_shape1_[i] = in_shape1[j++];
      }
      arithmetic_param->in_shape0_[i] = in_shape0[i];
    }
  } else {
    arithmetic_param->ndim_ = ndim0;
    for (int i = 0; i < ndim0; i++) {
      arithmetic_param->in_shape0_[i] = in_shape0[i];
      arithmetic_param->in_shape1_[i] = in_shape1[i];
    }
  }
  ShapeVector out_shape;
  for (int i = 0; i < arithmetic_param->ndim_; i++) {
    if (arithmetic_param->in_shape0_[i] != arithmetic_param->in_shape1_[i]) {
      if (arithmetic_param->in_shape0_[i] == 1) {
        out_shape.push_back(arithmetic_param->in_shape1_[i]);
      } else if (arithmetic_param->in_shape1_[i] == 1) {
        out_shape.push_back(arithmetic_param->in_shape0_[i]);
      } else {
        LITE_ERROR_LOG("shapes of input tensors can not be broadcasted!")
        return RET_INPUT_TENSOR_ERROR;
      }
    } else {
      out_shape.push_back(arithmetic_param->in_shape0_[i]);
    }
  }
  out_tensors[0]->shape_ = out_shape;
  out_tensors[0]->data_type_ = in_tensors[0]->data_type_;
  out_tensors[0]->format_ = in_tensors[0]->format_;
  return RET_OK;
}
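// Maps the operator type to an element-wise kernel; only Mul is supported here, optionally
// fused with ReLU or ReLU6 activation.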
int ChooseKernel(const int kernel_type, ArithmeticRun *arithmetic_run, ArithmeticParameter *params) {
  if (kernel_type == KernelType::Mul) {
    if (params->activation_type_ == mindspore::schema::ActivationType_RELU) {
      *arithmetic_run = ElementMulRelu;
    } else if (params->activation_type_ == mindspore::schema::ActivationType_RELU6) {
      *arithmetic_run = ElementMulRelu6;
    } else {
      *arithmetic_run = ElementMul;
    }
  } else {
    LITE_ERROR_LOG("unsupported operator type");
    return RET_ERROR;
  }
  return RET_OK;
}
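// Selects the optimized kernel used when one input is a single element; if the operator has
// no optimized version, an info message is logged and arithmetic_opt_run stays NULL.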
int ChooseOptKernel(const int kernel_type, ArithmeticOptRun *arithmetic_opt_run, ArithmeticParameter *params) {
  if (kernel_type == KernelType::Mul) {
    if (params->activation_type_ == mindspore::schema::ActivationType_RELU) {
      *arithmetic_opt_run = ElementOptMulRelu;
    } else if (params->activation_type_ == mindspore::schema::ActivationType_RELU6) {
      *arithmetic_opt_run = ElementOptMulRelu6;
    } else {
      *arithmetic_opt_run = ElementOptMul;
    }
  } else {
    LITE_INFO_LOG("kernel does not have an optimized version");
  }
  return RET_OK;
}
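// Entry point for element-wise arithmetic: validates the tensors, picks the kernel from the
// node parameters, and either takes the single-element fast path or sets up broadcasting
// before running the computation.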
int DoArithmetic(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                 mindspore::lite::Allocator *allocator) {
  if (in_tensors.size() != 2 || in_tensors[0]->data_ == NULL || in_tensors[1]->data_ == NULL) {
    LITE_ERROR_LOG("input tensors num not correct or input data is NULL!")
    return RET_INPUT_TENSOR_ERROR;
  }
  if (out_tensors.size() != 1 || out_tensors[0]->data_ == NULL) {
    LITE_ERROR_LOG("output tensors num not correct or output data is NULL!")
    return RET_ERROR;
  }
  if (allocator == NULL) {
    LITE_ERROR_LOG("allocator is NULL!")
    return RET_ERROR;
  }
  ArithmeticParameter *params = reinterpret_cast<ArithmeticParameter *>(node->primitive_);
  ArithmeticRun arithmetic_run = NULL;
  int kernel_type = params->op_parameter_.type_;
  int status = ChooseKernel(kernel_type, &arithmetic_run, params);
  if (status != RET_OK) {
    return status;
  }
  int outside = 0;
  int break_pos = 0;
  params->in_elements_num0_ = in_tensors[0]->ElementsNum();
  params->in_elements_num1_ = in_tensors[1]->ElementsNum();
  params->out_elements_num_ = out_tensors[0]->ElementsNum();
  ArithmeticOptRun arithmetic_opt_run = NULL;
  // If one of the inputs has only a single element, skip broadcasting and use the
  // optimized element-wise kernel when one is available.
  if (params->in_elements_num0_ == 1 || params->in_elements_num1_ == 1) {
    params->broadcasting_ = false;
    ChooseOptKernel(kernel_type, &arithmetic_opt_run, params);
  } else {
    int ret = CalBroadCasting(in_tensors, &outside, &break_pos, params);
    if (ret != RET_OK) {
      return ret;
    }
  }
  return RunArithmetic(in_tensors, out_tensors, arithmetic_run, arithmetic_opt_run, outside, break_pos, params);
}
@@ -0,0 +1,29 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef INTERNAL_SRC_RUNTIME_KERNEL_MUL_H_
#define INTERNAL_SRC_RUNTIME_KERNEL_MUL_H_

#include "internal/include/model.h"
#include "internal/include/lite_utils.h"
#include "src/runtime/allocator.h"
#include "nnacl/arithmetic_common.h"

int DoArithmeticInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param);

int DoArithmetic(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
                 mindspore::lite::Allocator *allocator);

#endif  // INTERNAL_SRC_RUNTIME_KERNEL_MUL_H_
@@ -0,0 +1,82 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "internal/src/kernel/fp32/bias_add.h"
#include "internal/include/model.h"
#include "internal/include/ms_tensor.h"
#include "internal/include/lite_utils.h"
#include "src/runtime/allocator.h"
#include "internal/src/lite_log.h"
#include "internal/include/errorcode.h"
#include "nnacl/arithmetic_common.h"
#include "nnacl/fp32/arithmetic.h"
int DoBiasAddInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param) {
  if (in_tensors.size() != 2 || in_tensors[0]->data_ == NULL || in_tensors[1]->data_ == NULL) {
    LITE_ERROR_LOG("input tensors num not correct or input data is NULL!")
    return RET_INPUT_TENSOR_ERROR;
  }
  if (out_tensors.size() != 1) {
    LITE_ERROR_LOG("output tensors num not correct!")
    return RET_ERROR;
  }
  out_tensors[0]->shape_ = in_tensors[0]->shape_;
  out_tensors[0]->data_type_ = in_tensors[0]->data_type_;
  out_tensors[0]->format_ = in_tensors[0]->format_;
  return RET_OK;
}
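// Adds the bias along the last dimension by describing it as a [1, ..., 1, C] tensor and
// broadcasting it against the input; the tile buffers required by BroadcastAdd are taken
// from the allocator and released before returning.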
int DoBiasAdd(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
              mindspore::lite::Allocator *allocator) {
  if (in_tensors.size() != 2 || in_tensors[0]->data_ == NULL || in_tensors[1]->data_ == NULL) {
    LITE_ERROR_LOG("input tensors num not correct or input data is NULL!")
    return RET_INPUT_TENSOR_ERROR;
  }
  if (out_tensors.size() != 1 || out_tensors[0]->data_ == NULL) {
    LITE_ERROR_LOG("output tensors num not correct or output data is NULL!")
    return RET_ERROR;
  }
  if (allocator == NULL) {
    LITE_ERROR_LOG("allocator is NULL!")
    return RET_ERROR;
  }
  ArithmeticParameter *params = reinterpret_cast<ArithmeticParameter *>(node->primitive_);
  ShapeVector dims = in_tensors[0]->shape_;
  params->ndim_ = dims.size();
  for (size_t i = 0; i < params->ndim_; i++) {
    params->in_shape0_[i] = dims[i];
    params->in_shape1_[i] = 1;
    params->out_shape_[i] = dims[i];
  }
  params->in_shape1_[params->ndim_ - 1] = dims[params->ndim_ - 1];
  float *in = reinterpret_cast<float *>(in_tensors[0]->data_);
  float *bias = reinterpret_cast<float *>(in_tensors[1]->data_);
  float *out = reinterpret_cast<float *>(out_tensors[0]->data_);
  size_t data_size = in_tensors[0]->ElementsNum();
  float *tile_in = reinterpret_cast<float *>(allocator->Malloc(data_size * sizeof(float)));
  float *tile_bias = reinterpret_cast<float *>(allocator->Malloc(data_size * sizeof(float)));
  if (tile_in == NULL || tile_bias == NULL) {
    LITE_ERROR_LOG("Memory allocation failed!")
    allocator->Free(tile_in);
    allocator->Free(tile_bias);
    return RET_ERROR;
  }
  BroadcastAdd(in, bias, tile_in, tile_bias, out, data_size, params);
  allocator->Free(tile_in);
  allocator->Free(tile_bias);
  return RET_OK;
}
@@ -0,0 +1,28 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef INTERNAL_SRC_RUNTIME_KERNEL_BIAS_H_
#define INTERNAL_SRC_RUNTIME_KERNEL_BIAS_H_

#include "internal/include/model.h"
#include "internal/include/lite_utils.h"
#include "src/runtime/allocator.h"

int DoBiasAddInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param);

int DoBiasAdd(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
              mindspore::lite::Allocator *allocator);

#endif  // INTERNAL_SRC_RUNTIME_KERNEL_BIAS_H_
@@ -0,0 +1,233 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "internal/src/kernel/fp32/reduce.h"
#include <vector>
#include "internal/include/model.h"
#include "internal/include/lite_utils.h"
#include "src/runtime/allocator.h"
#include "internal/src/lite_log.h"
#include "internal/include/errorcode.h"
#include "nnacl/reduce_parameter.h"
#include "nnacl/fp32/reduce.h"
#include "schema/ops_generated.h"

typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                       float *dst_data, const int tid, const int thread_num);
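// Reducing over N axes needs N - 1 intermediate buffers: buffer i holds the element count
// of the tensor after axes 0..i have been reduced (each reduced axis collapses to 1), and
// the final reduction writes directly into the output tensor.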
int MallocTmpBuffer(std::vector<float *> *data_buffers, const ShapeVector &shape, const int *axes, const int num_axes,
                    mindspore::lite::Allocator *allocator) {
  for (size_t i = 0; i < data_buffers->size(); ++i) {
    if (data_buffers->at(i) != NULL) {
      allocator->Free(data_buffers->at(i));
      data_buffers->at(i) = NULL;
    }
  }
  data_buffers->clear();
  ShapeVector input_shape = shape;
  const int rank = input_shape.size();
  for (int i = 0; i < num_axes - 1; i++) {
    int axis = axes[i];
    size_t size = 1;
    for (int j = 0; j < rank; j++) {
      if (axis != j) {
        size *= input_shape[j];
      }
    }
    float *buffer = reinterpret_cast<float *>(allocator->Malloc(size * sizeof(float)));
    if (buffer == NULL) {
      LITE_ERROR_LOG("Memory allocation failed!")
      return RET_ERROR;
    }
    data_buffers->emplace_back(buffer);
    input_shape[axis] = 1;
  }
  return RET_OK;
}
int FreeTmpBuffer(std::vector<float *> *data_buffers, mindspore::lite::Allocator *allocator) {
  for (size_t i = 0; i < data_buffers->size(); ++i) {
    allocator->Free(data_buffers->at(i));
  }
  data_buffers->clear();
  return RET_OK;
}
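// Reduces one axis at a time: outer_size and inner_size are the element counts before and
// after the current axis, intermediate results go to data_buffers, and the last axis is
// reduced directly into out_data. The reducer is called single-threaded (tid 0 of 1).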
int RunReduce(Reducer reducer, std::vector<float *> data_buffers, float *in_data, float *out_data, Int32Vector axes,
              ShapeVector shape) {
  int rank = shape.size();
  float *dst_data = NULL;
  float *src_data = in_data;
  ShapeVector tmp_shape = shape;
  for (size_t i = 0; i < axes.size(); ++i) {
    if (i != axes.size() - 1) {
      dst_data = data_buffers[i];
    } else {
      dst_data = out_data;
    }
    int axis = axes[i];
    int outer_size = 1;
    for (int j = 0; j < axis; j++) {
      outer_size *= tmp_shape[j];
    }
    int inner_size = 1;
    for (int k = axis + 1; k < rank; k++) {
      inner_size *= tmp_shape[k];
    }
    int axis_size = tmp_shape[axis];
    int error_code = reducer(outer_size, inner_size, axis_size, src_data, dst_data, 0, 1);
    if (error_code != RET_OK) {
      LITE_ERROR_LOG("Reduce run error!")
      return RET_ERROR;
    }
    tmp_shape[axis] = 1;
    src_data = dst_data;
  }
  return RET_OK;
}
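// Normalizes negative axes, expands reduce_to_end into an explicit axis list, and builds the
// output shape: reduced dimensions become 1 when keep_dims is set and are dropped otherwise;
// an empty axis list means reducing over all dimensions.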
int DoReduceInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param) {
  if (in_tensors.size() != 1 || in_tensors[0]->data_ == NULL) {
    LITE_ERROR_LOG("input tensors num not correct or input data is NULL!")
    return RET_INPUT_TENSOR_ERROR;
  }
  if (out_tensors.size() != 1) {
    LITE_ERROR_LOG("output tensors num not correct!")
    return RET_ERROR;
  }
  ReduceParameter *reduce_param = reinterpret_cast<ReduceParameter *>(param);
  bool keep_dims = reduce_param->keep_dims_;
  int num_axes = reduce_param->num_axes_;
  ShapeVector in_shape = in_tensors[0]->shape_;
  int rank = in_shape.size();
  Int32Vector out_shape;
  Int32Vector axes;
  for (int i = 0; i < num_axes; ++i) {
    if (reduce_param->axes_[i] < -rank || reduce_param->axes_[i] >= rank) {
      LITE_ERROR_LOG("Reduce got an invalid axis!")
      return RET_ERROR;
    }
    if (reduce_param->axes_[i] < 0) {
      axes.push_back(reduce_param->axes_[i] + rank);
    } else {
      axes.push_back(reduce_param->axes_[i]);
    }
  }
  if (reduce_param->reduce_to_end_) {
    if (num_axes != 1) {
      LITE_ERROR_LOG("when reduce_to_end is set, the number of axes should be 1!")
      return RET_ERROR;
    }
    int begin_axis = axes[0];
    num_axes = rank - begin_axis;
    for (int i = begin_axis + 1; i < rank; ++i) {
      axes.push_back(i);
    }
  }
  if (num_axes == 0) {
    axes.resize(rank);
    for (int i = 0; i < rank; i++) {
      axes[i] = i;
      if (keep_dims) {
        out_shape.push_back(1);
      }
    }
    reduce_param->num_axes_ = axes.size();
    for (size_t i = 0; i < axes.size(); ++i) {
      reduce_param->axes_[i] = axes[i];
    }
    out_tensors[0]->shape_ = out_shape;
    out_tensors[0]->data_type_ = in_tensors[0]->data_type_;
    out_tensors[0]->format_ = in_tensors[0]->format_;
    return RET_OK;
  }
  // Reduce on the selected axes only.
  for (int i = 0; i < rank; i++) {
    bool reduce_axis = false;
    for (int idx = 0; idx < num_axes; ++idx) {
      if (axes[idx] == i) {
        reduce_axis = true;
        break;
      }
    }
    if (reduce_axis) {
      if (keep_dims) {
        out_shape.push_back(1);
      }
    } else {
      out_shape.push_back(in_shape[i]);
    }
  }
  reduce_param->num_axes_ = axes.size();
  for (size_t i = 0; i < axes.size(); ++i) {
    reduce_param->axes_[i] = axes[i];
  }
  out_tensors[0]->shape_ = out_shape;
  out_tensors[0]->data_type_ = in_tensors[0]->data_type_;
  out_tensors[0]->format_ = in_tensors[0]->format_;
  return RET_OK;
}
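// Validates the tensors, maps the reduce mode to a ReduceSum or ReduceMean kernel, allocates
// the intermediate buffers, runs the per-axis reduction, and frees the buffers.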
int DoReduce(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
             mindspore::lite::Allocator *allocator) {
  if (in_tensors.size() != 1 || in_tensors[0]->data_ == NULL) {
    LITE_ERROR_LOG("input tensors num not correct or input data is NULL!")
    return RET_INPUT_TENSOR_ERROR;
  }
  if (out_tensors.size() != 1 || out_tensors[0]->data_ == NULL) {
    LITE_ERROR_LOG("output tensors num not correct or output data is NULL!")
    return RET_ERROR;
  }
  if (allocator == NULL) {
    LITE_ERROR_LOG("allocator is NULL!")
    return RET_ERROR;
  }
  ReduceParameter *params = reinterpret_cast<ReduceParameter *>(node->primitive_);
  Reducer reducer = NULL;
  if (params->mode_ == mindspore::schema::ReduceMode::ReduceMode_ReduceSum) {
    reducer = ReduceSum;
  } else if (params->mode_ == mindspore::schema::ReduceMode::ReduceMode_ReduceMean) {
    reducer = ReduceMean;
  } else {
    LITE_ERROR_LOG("unsupported reduce mode!")
    return RET_ERROR;
  }
  std::vector<float *> data_buffers;
  int status = MallocTmpBuffer(&data_buffers, in_tensors[0]->shape_, params->axes_, params->num_axes_, allocator);
  if (status != RET_OK) {
    FreeTmpBuffer(&data_buffers, allocator);
    return status;
  }
  Int32Vector axes;
  for (int i = 0; i < params->num_axes_; ++i) {
    axes.push_back(params->axes_[i]);
  }
  status = RunReduce(reducer, data_buffers, reinterpret_cast<float *>(in_tensors[0]->data_),
                     reinterpret_cast<float *>(out_tensors[0]->data_), axes, in_tensors[0]->shape_);
  FreeTmpBuffer(&data_buffers, allocator);
  if (status != RET_OK) {
    return status;
  }
  return RET_OK;
}
@@ -0,0 +1,29 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef INTERNAL_SRC_KERNEL_FP32_REDUCE_COMMON_H_
#define INTERNAL_SRC_KERNEL_FP32_REDUCE_COMMON_H_

#include "internal/include/model.h"
#include "internal/include/ms_tensor.h"
#include "internal/include/lite_utils.h"
#include "src/runtime/allocator.h"

int DoReduceInferShape(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, OpParameter *param);

int DoReduce(const TensorPtrVector &in_tensors, const TensorPtrVector &out_tensors, Node *node,
             mindspore::lite::Allocator *allocator);

#endif  // INTERNAL_SRC_KERNEL_FP32_REDUCE_COMMON_H_
@@ -33,6 +33,7 @@ class InferTest : public mindspore::CommonTest {
TEST_F(InferTest, TestSession) {
  Model model;
  Node node;
  node.name_ = String("node");
  model.nodes_.push_back(&node);
  node.node_type_ = NodeType::NodeType_CNode;
@@ -64,7 +65,7 @@ TEST_F(InferTest, TestSession) {
  TensorPtrVector outvec = session.GetOutputs();
  ASSERT_EQ(outvec.size(), 1);
  for (int i = 0; i < kOutSize; ++i) {
    std::cout << *(reinterpret_cast<float *>(outvec.at(0)->data_)+ i) << " ";
    std::cout << *(reinterpret_cast<float *>(outvec.at(0)->data_) + i) << " ";
  }
  std::cout << "\n";
  CompareOutputData(reinterpret_cast<float *>(outvec.at(0)->data_), expect_out, kOutSize, 0.000001);