!3943 add arm cpu fp32 op: lstm

Merge pull request !3943 from yangruoqi713/lite
5 years ago · 4b2e26f16b
--- a/mindspore/lite/src/ops/lstm.cc
+++ b/mindspore/lite/src/ops/lstm.cc
@@ -0,0 +1,69 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/ops/ops.h"
 #include "include/errorcode.h"
 #include "utils/log_adapter.h"
 #include "src/ir/tensor.h"

 namespace mindspore::lite {
 const int kLstmInputNum = 6;
 const int kLstmOutputNum = 3;
 int Lstm::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor::Tensor *> outputs_) {
  MS_ASSERT(this->primitive != nullptr);
  if (inputs_.size() != kLstmInputNum || outputs_.size() != kLstmOutputNum) {
    MS_LOG(ERROR) << "OpLstm inputs or outputs size error.";
    return RET_INPUT_TENSOR_ERROR;
  }
  auto input = inputs_.front();
  MS_ASSERT(input != nullptr);
  auto weight_i = inputs_.front();
  MS_ASSERT(input0 != nullptr);
  auto output = outputs_.front();
  MS_ASSERT(output != nullptr);

  std::vector<int> in_shape = input->shape();
  std::vector<int> w_shape = weight_i->shape();  // layer, hidden_size * 4, input_size
  if (in_shape.size() != 3 || w_shape.size() != 3) {
    MS_LOG(ERROR) << "OpLstm input dims should be 3.";
    return RET_ERROR;
  }

  auto lstm_prim = this->primitive->value_as_Lstm();
  int hidden_size = w_shape[1] / 4;

  // set output
  std::vector<int> out_shape(in_shape);
  out_shape[2] = hidden_size;
  if (lstm_prim->bidirection()) {
    out_shape.insert(out_shape.begin() + 1, 2);
  }
  output->set_shape(out_shape);

  // set hidden state, cell state
  std::vector<int> state_shape(in_shape);
  state_shape[0] = lstm_prim->bidirection() ? 2 : 1;
  state_shape[2] = hidden_size;
  outputs_[1]->set_shape(state_shape);
  outputs_[2]->set_shape(state_shape);

  for (int i = 0; i < kLstmOutputNum; i++) {
    outputs_[i]->set_data_type(input->data_type());
    outputs_[i]->SetFormat(input->GetFormat());
  }
  return RET_OK;
 }
 }  // namespace mindspore::lite
--- a/mindspore/lite/src/ops/ops.h
+++ b/mindspore/lite/src/ops/ops.h
@@ -770,6 +770,13 @@ class QuantDTypeCast : public Primitive {
  const schema::QuantDTypeCast *GetAttribute() const { return this->primitive->value_as_QuantDTypeCast(); }
  int InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) override;
 };

 class Lstm : public Primitive {
 public:
  explicit Lstm(schema::Primitive *primitive) : Primitive(primitive) {}
  const schema::Lstm *GetAttribute() const { return this->primitive->value_as_Lstm(); }
  int InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) override;
 };
 }  // namespace lite
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_SRC_OPS_OPS_H_
--- a/mindspore/lite/src/populate_parameter.cc
+++ b/mindspore/lite/src/populate_parameter.cc
@@ -67,6 +67,7 @@
 #include "src/runtime/kernel/arm/opclib/fp32/space_to_depth.h"
 #include "src/runtime/kernel/arm/opclib/fp32/space_to_batch.h"
 #include "src/runtime/kernel/arm/opclib/int8/quant_dtype_cast.h"
 #include "src/runtime/kernel/arm/opclib/fp32/lstm.h"

 namespace mindspore::kernel {
 OpParameter *PopulateFillParameter(const lite::Primitive *primitive) {
@@ -1169,6 +1170,23 @@ OpParameter *PopulatePriorBoxParameter(const lite::Primitive *primitive) {
  return reinterpret_cast<OpParameter *>(prior_box_param);
 }

 OpParameter *PopulateLstmParameter(const lite::Primitive *primitive) {
  LstmParameter *lstm_param = new (std::nothrow) LstmParameter();
  if (lstm_param == nullptr) {
    MS_LOG(ERROR) << "new LstmParameter fail!";
    return nullptr;
  }
  lstm_param->op_parameter_.type_ = primitive->Type();
  auto param = primitive->Value()->value_as_Lstm();
  if (param == nullptr) {
    delete (lstm_param);
    MS_LOG(ERROR) << "get Lstm param nullptr.";
    return nullptr;
  }
  lstm_param->bidirectional_ = param->bidirection();
  return reinterpret_cast<OpParameter *>(lstm_param);
 }

 PopulateParameterRegistry::PopulateParameterRegistry() {
  populate_parameter_funcs_[schema::PrimitiveType_SoftMax] = PopulateSoftmaxParameter;
  populate_parameter_funcs_[schema::PrimitiveType_Activation] = PopulateActivationParameter;
@@ -1244,6 +1262,7 @@ PopulateParameterRegistry::PopulateParameterRegistry() {
  populate_parameter_funcs_[schema::PrimitiveType_Split] = PopulateSplitParameter;
  populate_parameter_funcs_[schema::PrimitiveType_PriorBox] = PopulatePriorBoxParameter;
  populate_parameter_funcs_[schema::PrimitiveType_QuantDTypeCast] = PopulateQuantDTypeCastParameter;
  populate_parameter_funcs_[schema::PrimitiveType_Lstm] = PopulateLstmParameter;
 }

 PopulateParameterRegistry *PopulateParameterRegistry::GetInstance() {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/lstm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/lstm.cc
@@ -0,0 +1,185 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/lstm.h"
 #include <vector>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Lstm;

 namespace mindspore::kernel {
 int LstmCPUKernel::InitParam() {
  auto input = inputs_.front();
  MS_ASSERT(input != nullptr);
  std::vector<int> in_shape = input->shape();
  lstm_parm_->seq_len_ = in_shape[0];
  lstm_parm_->batch_ = in_shape[1];
  lstm_parm_->input_size_ = in_shape[2];

  auto weight_i = inputs_[1];
  MS_ASSERT(weight_i != nullptr);
  std::vector<int> w_shape = weight_i->shape();
  lstm_parm_->hidden_size_ = w_shape[1] / 4;

  lstm_parm_->input_step_ = lstm_parm_->batch_ * lstm_parm_->input_size_;
  lstm_parm_->output_step_ = lstm_parm_->bidirectional_ ? 2 * lstm_parm_->batch_ * lstm_parm_->hidden_size_
                                                        : lstm_parm_->batch_ * lstm_parm_->hidden_size_;
  return RET_OK;
 }

 int LstmCPUKernel::InitBuffer() {
  gate_buffer_ = reinterpret_cast<float *>(malloc(4 * lstm_parm_->batch_ * lstm_parm_->hidden_size_ * sizeof(float)));
  if (gate_buffer_ == nullptr) {
    MS_LOG(ERROR) << "LstmCPUKernel malloc gate_buffer error.";
    return RET_ERROR;
  }
  return RET_OK;
 }

 int LstmCPUKernel::InitWeightBias() {
  // copy weight_i and weight_h
  auto weight_i = inputs_.at(1);
  MS_ASSERT(weight_i != nullptr);
  weight_i_ptr_ = reinterpret_cast<float *>(malloc(weight_i->ElementsNum() * sizeof(float)));
  if (weight_i_ptr_ == nullptr) {
    MS_LOG(ERROR) << "LstmCPUKernel malloc weight_i_ptr_ error.";
    return RET_ERROR;
  }
  memcpy(weight_i_ptr_, weight_i->Data(), weight_i->ElementsNum() * sizeof(float));

  auto weight_h = inputs_.at(2);
  MS_ASSERT(weight_h != nullptr);
  weight_h_ptr_ = reinterpret_cast<float *>(malloc(weight_h->ElementsNum() * sizeof(float)));
  if (weight_h_ptr_ == nullptr) {
    MS_LOG(ERROR) << "LstmCPUKernel malloc weight_h_ error.";
    return RET_ERROR;
  }
  memcpy(weight_h_ptr_, weight_h->Data(), weight_h->ElementsNum() * sizeof(float));

  // init bias
  int bias_num = lstm_parm_->bidirectional_ ? 2 * 4 * lstm_parm_->hidden_size_ : 4 * lstm_parm_->hidden_size_;
  bias_ptr_ = reinterpret_cast<float *>(malloc(bias_num * sizeof(float)));
  if (bias_ptr_ == nullptr) {
    MS_LOG(ERROR) << "LstmCPUKernel malloc bias_ptr_ error.";
    return RET_ERROR;
  }

  auto bias_data = reinterpret_cast<float *>(inputs_.at(3)->Data());
  int state_bias_offset = 4 * lstm_parm_->hidden_size_;
  for (int i = 0; i < state_bias_offset; i++) {
    bias_ptr_[i] = bias_data[i] + bias_data[i + state_bias_offset];
  }
  if (lstm_parm_->bidirectional_) {
    bias_data += 4 * lstm_parm_->hidden_size_ * 2;
    auto backward_bias = bias_ptr_ + 4 * lstm_parm_->hidden_size_;
    for (int i = 0; i < state_bias_offset; i++) {
      backward_bias[i] = bias_data[i] + bias_data[i + state_bias_offset];
    }
  }
  return RET_OK;
 }

 int LstmCPUKernel::Init() {
  auto ret = InitParam();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LstmCPUKernel InitParam error.";
    return RET_ERROR;
  }

  ret = InitWeightBias();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LstmCPUKernel InitWeightBias error.";
    return RET_ERROR;
  }

  ret = InitBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LstmCPUKernel InitBuffer error.";
    return RET_ERROR;
  }
  return RET_OK;
 }

 int LstmCPUKernel::ReSize() {
  free(gate_buffer_);

  auto ret = InitParam();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LstmCPUKernel InitParam error.";
    return RET_ERROR;
  }

  ret = InitBuffer();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "LstmCPUKernel InitBuffer error.";
    return RET_ERROR;
  }
  return RET_OK;
 }

 int LstmCPUKernel::Run() {
  auto input = inputs_.at(kInputIndex);
  MS_ASSERT(input != nullptr);
  auto hidden_state = inputs_.at(4);
  MS_ASSERT(hidden_state != nullptr);
  auto cell_state = inputs_.at(5);
  MS_ASSERT(cell_state != nullptr);
  auto output = outputs_.at(0);
  MS_ASSERT(output != nullptr);

  auto input_ptr = reinterpret_cast<float *>(input->Data());
  auto output_ptr = reinterpret_cast<float *>(output->Data());

  auto output_hidden_state = outputs_[1];
  memcpy(output_hidden_state->Data(), hidden_state->Data(), hidden_state->ElementsNum() * sizeof(float));
  auto output_cell_state = outputs_[2];
  memcpy(output_cell_state->Data(), cell_state->Data(), cell_state->ElementsNum() * sizeof(float));

  Lstm(output_ptr, input_ptr, weight_i_ptr_, weight_h_ptr_, bias_ptr_,
       reinterpret_cast<float *>(output_hidden_state->Data()), reinterpret_cast<float *>(output_cell_state->Data()),
       gate_buffer_, lstm_parm_);
  return RET_OK;
 }

 kernel::LiteKernel *CpuLstmKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                         const std::vector<lite::tensor::Tensor *> &outputs, OpParameter *opParameter,
                                         const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Lstm);

  auto *kernel = new (std::nothrow) LstmCPUKernel(opParameter, inputs, outputs, ctx);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    delete kernel;
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  }
  return kernel;
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Lstm, CpuLstmKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/lstm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/lstm.h
@@ -0,0 +1,57 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_LSTM_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_LSTM_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/fp32/lstm.h"

 namespace mindspore::kernel {
 class LstmCPUKernel : public LiteKernel {
 public:
  LstmCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
      : LiteKernel(parameter, inputs, outputs) {
    lstm_parm_ = reinterpret_cast<LstmParameter *>(opParameter);
  }

  ~LstmCPUKernel() override {
    free(gate_buffer_);
    free(weight_i_ptr_);
    free(weight_h_ptr_);
    free(bias_ptr_);
  }

  int Init() override;
  int ReSize() override;
  int Run() override;

  int InitParam();
  int InitBuffer();
  int InitWeightBias();

 private:
  float *gate_buffer_;
  float *weight_i_ptr_;
  float *weight_h_ptr_;
  float *bias_ptr_;
  LstmParameter *lstm_parm_;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_LSTM_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/lstm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/lstm.cc
@@ -0,0 +1,158 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/opclib/fp32/lstm.h"
 #include <string.h>
 #include "src/runtime/kernel/arm/opclib/fp32/activation.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"

 void InitGate(float *gate_buffer, const float *bias, LstmParameter *lstm_parm) {
  int gate_offest = 0;
  for (int l = 0; l < 4; l++) {
    int batch_offest = gate_offest;
    int bias_offest = l * lstm_parm->hidden_size_;
    for (int b = 0; b < lstm_parm->batch_; b++) {
      memcpy(gate_buffer + batch_offest, bias + bias_offest, lstm_parm->hidden_size_ * sizeof(float));
      batch_offest += lstm_parm->hidden_size_;
    }
    gate_offest += lstm_parm->batch_ * lstm_parm->hidden_size_;
  }
 }

 // input: [row, inner_size]; weight: [col, inner_size]; output: [row, col]
 void MatMulAcc(float *output, const float *input, const float *weight, int rows, int cols, int inner_size) {
  for (int r = 0; r < rows; r++) {
    for (int c = 0; c < cols; c++) {
      float res = 0;
      for (int i = 0; i < inner_size; i++) {
        res += input[r * inner_size + i] * weight[c * inner_size + i];
      }
      output[r * cols + c] += res;
    }
  }
 }

 void ElementMulAcc(float *input0, float *input1, float *output, int element_size) {
  for (int index = 0; index < element_size; index++) {
    output[index] += input0[index] * input1[index];
  }
 }

 void UpdataState(float *cell_state, float *forget_gate, float *input_gate, float *cell_gate, int batch,
                 int hidden_size) {
  ElementMul(forget_gate, cell_state, cell_state, batch * hidden_size);
  ElementMulAcc(input_gate, cell_gate, cell_state, batch * hidden_size);
 }

 void UpdataOutput(float *cell_state, float *output_gate, float *hidden_state, int batch, int hidden_size) {
  Tanh(cell_state, batch * hidden_size, hidden_state);
  ElementMul(hidden_state, output_gate, hidden_state, batch * hidden_size);
 }

 void LstmStepUnit(float *output, const float *input, const float *input_input_weight, const float *input_forget_weight,
                  const float *input_cell_weight, const float *input_output_weight, const float *state_input_weight,
                  const float *state_forget_weight, const float *state_cell_weight, const float *state_output_weight,
                  const float *bias, float *hidden_state, float *cell_state, float *gate_buffer,
                  LstmParameter *lstm_parm) {
  InitGate(gate_buffer, bias, lstm_parm);

  float *input_gate = gate_buffer;
  float *forget_gate = gate_buffer + lstm_parm->batch_ * lstm_parm->hidden_size_ * 2;
  float *cell_gate = gate_buffer + lstm_parm->batch_ * lstm_parm->hidden_size_ * 3;
  float *output_gate = gate_buffer + lstm_parm->batch_ * lstm_parm->hidden_size_ * 1;

  // input * weight
  MatMulAcc(input_gate, input, input_input_weight, lstm_parm->batch_, lstm_parm->hidden_size_, lstm_parm->input_size_);
  MatMulAcc(forget_gate, input, input_forget_weight, lstm_parm->batch_, lstm_parm->hidden_size_,
            lstm_parm->input_size_);
  MatMulAcc(cell_gate, input, input_cell_weight, lstm_parm->batch_, lstm_parm->hidden_size_, lstm_parm->input_size_);
  MatMulAcc(output_gate, input, input_output_weight, lstm_parm->batch_, lstm_parm->hidden_size_,
            lstm_parm->input_size_);

  // state * weight
  MatMulAcc(input_gate, hidden_state, state_input_weight, lstm_parm->batch_, lstm_parm->hidden_size_,
            lstm_parm->hidden_size_);
  MatMulAcc(forget_gate, hidden_state, state_forget_weight, lstm_parm->batch_, lstm_parm->hidden_size_,
            lstm_parm->hidden_size_);
  MatMulAcc(cell_gate, hidden_state, state_cell_weight, lstm_parm->batch_, lstm_parm->hidden_size_,
            lstm_parm->hidden_size_);
  MatMulAcc(output_gate, hidden_state, state_output_weight, lstm_parm->batch_, lstm_parm->hidden_size_,
            lstm_parm->hidden_size_);

  // update input_gate
  Sigmoid(input_gate, lstm_parm->batch_ * lstm_parm->hidden_size_, input_gate);

  // update forget_gate
  Sigmoid(forget_gate, lstm_parm->batch_ * lstm_parm->hidden_size_, forget_gate);

  // update cell_gate
  Tanh(cell_gate, lstm_parm->batch_ * lstm_parm->hidden_size_, cell_gate);
  // update cell state
  UpdataState(cell_state, forget_gate, input_gate, cell_gate, lstm_parm->batch_, lstm_parm->hidden_size_);

  // update output_gate
  Sigmoid(output_gate, lstm_parm->batch_ * lstm_parm->hidden_size_, output_gate);
  // update output
  UpdataOutput(cell_state, output_gate, hidden_state, lstm_parm->batch_, lstm_parm->hidden_size_);
  memcpy(output, hidden_state, lstm_parm->batch_ * lstm_parm->hidden_size_ * sizeof(float));
 }

 void Lstm(float *output, const float *input, const float *weight_i, const float *weight_h, const float *bias,
          float *hidden_state, float *cell_state, float *gate_buffer, LstmParameter *lstm_parm) {
  // forward
  const float *input_input_weight = weight_i;
  const float *input_forget_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 2;
  const float *input_cell_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 3;
  const float *input_output_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 1;

  const float *state_input_weight = weight_h;
  const float *state_forget_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 2;
  const float *state_cell_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 3;
  const float *state_output_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 1;

  for (int t = 0; t < lstm_parm->seq_len_; t++) {
    const float *input_ptr = input + t * lstm_parm->input_step_;
    float *output_ptr = output + t * lstm_parm->output_step_;
    LstmStepUnit(output_ptr, input_ptr, input_input_weight, input_forget_weight, input_cell_weight, input_output_weight,
                 state_input_weight, state_forget_weight, state_cell_weight, state_output_weight, bias, hidden_state,
                 cell_state, gate_buffer, lstm_parm);
  }

  // backward
  if (lstm_parm->bidirectional_) {
    input_input_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 4;
    input_forget_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 6;
    input_cell_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 7;
    input_output_weight = weight_i + lstm_parm->input_size_ * lstm_parm->hidden_size_ * 5;

    state_input_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 4;
    state_forget_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 6;
    state_cell_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 7;
    state_output_weight = weight_h + lstm_parm->hidden_size_ * lstm_parm->hidden_size_ * 5;

    float *backward_output = output + lstm_parm->batch_ * lstm_parm->hidden_size_;
    const float *backward_bias = bias + 4 * lstm_parm->hidden_size_;
    float *backward_cell_state = cell_state + lstm_parm->batch_ * lstm_parm->hidden_size_;
    float *backward_hidden_state = hidden_state + lstm_parm->batch_ * lstm_parm->hidden_size_;
    for (int t = lstm_parm->seq_len_ - 1; t >= 0; t--) {
      const float *input_ptr = input + t * lstm_parm->input_step_;
      float *output_ptr = backward_output + t * lstm_parm->output_step_;
      LstmStepUnit(output_ptr, input_ptr, input_input_weight, input_forget_weight, input_cell_weight,
                   input_output_weight, state_input_weight, state_forget_weight, state_cell_weight, state_output_weight,
                   backward_bias, backward_hidden_state, backward_cell_state, gate_buffer, lstm_parm);
    }
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/lstm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/lstm.h
@@ -0,0 +1,36 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_LSTM_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_LSTM_H_

 #include "src/runtime/kernel/arm/opclib/op_base.h"

 struct LstmParameter {
  OpParameter op_parameter_;
  int input_size_;
  int hidden_size_;  // output_size
  int seq_len_;
  int batch_;
  int input_step_;
  int output_step_;
  bool bidirectional_;
 };

 void Lstm(float *output, const float *input, const float *weight_i, const float *weight_h, const float *bias,
          float *hidden_state, float *cell_state, float *gate_buffer, LstmParameter *lstm_parm);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_LSTM_H_
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lstm_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/lstm_fp32_tests.cc
@@ -0,0 +1,330 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <iostream>
 #include <memory>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/lstm.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/ops/ops.h"

 namespace mindspore {
 class LstmFp32 : public mindspore::Common {
 public:
  LstmFp32() {}
 };

 void InitLstmParam(LstmParameter *lstm_param) {
  lstm_param->seq_len_ = 4;
  lstm_param->batch_ = 1;
  lstm_param->input_size_ = 2;
  lstm_param->hidden_size_ = 3;
  lstm_param->bidirectional_ = false;
 }

 void InitLstmForwardCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs,
                            const LstmParameter *lstm_param) {
  // prepare input
  std::vector<float> input_data = {1.3889, -0.3006, -0.1787, 2.1504, -0.3181, 0.4945, -0.4758, -0.8187};
  auto *input = new lite::tensor::Tensor;
  input->set_data_type(kNumberTypeFloat32);
  input->set_shape({lstm_param->seq_len_, lstm_param->batch_, lstm_param->input_size_});
  input->MallocData();
  memcpy(input->Data(), input_data.data(), input_data.size() * sizeof(float));

  // prepare weight_i
  std::vector<float> weight_i_data = {0.21368974,  -0.3778776,  0.05025542, 0.09011161,  0.18355745,  0.5491228,
                                      -0.14186832, -0.4655916,  0.49541366, -0.44039622, 0.5625571,   0.23325664,
                                      0.3449825,   -0.42750397, 0.01911497, -0.4125802,  -0.56690466, 0.50593233,
                                      -0.29129684, -0.27841482, 0.01964372, -0.42543447, 0.41720617,  -0.30054367};
  auto *weight_i = new lite::tensor::Tensor;
  weight_i->set_data_type(kNumberTypeFloat32);
  weight_i->SetFormat(schema::Format_NHWC);
  weight_i->set_shape({1, lstm_param->hidden_size_ * 4, lstm_param->input_size_});
  weight_i->MallocData();
  memcpy(weight_i->Data(), weight_i_data.data(), weight_i_data.size() * sizeof(float));

  // prepare weight_r
  std::vector<float> weight_h_data = {
    -0.03424168, 0.00643545,  0.36867607, -0.08598137, 0.19804275,  -0.11319417, -0.0244593,  -0.16440144, -0.07268238,
    0.09828371,  0.33358777,  0.53381383, -0.39431244, -0.06005383, -0.3520246,  0.42687547,  0.5772828,   0.5380008,
    -0.16130409, -0.24737108, 0.42409766, -0.50648475, 0.48223662,  -0.5221103,  -0.49216837, -0.29084128, 0.3408438,
    0.34080023,  0.49467337,  0.23473483, 0.01759732,  0.04691631,  0.45574808,  -0.29481018, 0.29442167,  -0.36718};
  auto *weight_h = new lite::tensor::Tensor;
  weight_h->set_data_type(kNumberTypeFloat32);
  weight_h->SetFormat(schema::Format_NHWC);
  weight_h->set_shape({1, lstm_param->hidden_size_ * 4, lstm_param->hidden_size_});
  weight_h->MallocData();
  memcpy(weight_h->Data(), weight_h_data.data(), weight_h_data.size() * sizeof(float));

  // prepare bias
  std::vector<float> bias_data = {-0.00207639, 0.16391152,  -0.00069344, -0.32945693, -0.367423,   0.28301108,
                                  -0.17930457, 0.5278388,   0.12598747,  -0.53130764, 0.1479364,   0.16695255,
                                  -0.00708795, -0.46417096, -0.23966661, -0.17496741, -0.19166365, -0.50466555,
                                  -0.23593256, -0.3911457,  0.51128435,  0.5128727,   0.253451,    -0.51891875};
  auto *bias = new lite::tensor::Tensor;
  bias->set_data_type(kNumberTypeFloat32);
  bias->SetFormat(schema::Format_NHWC);
  bias->set_shape({1, lstm_param->hidden_size_ * 4 * 2});
  bias->MallocData();
  memcpy(bias->Data(), bias_data.data(), bias_data.size() * sizeof(float));

  // prepare state
  std::vector<float> state_data = {0, 0, 0};
  auto *state = new lite::tensor::Tensor;
  state->set_data_type(kNumberTypeFloat32);
  state->SetFormat(schema::Format_NHWC);
  state->set_shape({1, lstm_param->batch_, lstm_param->hidden_size_});
  state->MallocData();
  memcpy(state->Data(), state_data.data(), state_data.size() * sizeof(float));

  inputs->push_back(input);
  inputs->push_back(weight_i);
  inputs->push_back(weight_h);
  inputs->push_back(bias);
  inputs->push_back(state);
  inputs->push_back(state);

  // malloc output buffer, for arm cpu, format: N C4 H W 4
  auto *output = new lite::tensor::Tensor;
  output->set_data_type(kNumberTypeFloat32);
  output->set_shape({lstm_param->seq_len_, lstm_param->batch_, lstm_param->hidden_size_});
  output->SetFormat(schema::Format_NHWC);
  output->MallocData();
  memset(output->Data(), 0, output->ElementsNum() * sizeof(float));

  auto *cell_state = new lite::tensor::Tensor;
  cell_state->set_data_type(kNumberTypeFloat32);
  cell_state->set_shape({1, lstm_param->batch_, lstm_param->hidden_size_});
  cell_state->SetFormat(schema::Format_NHWC);
  cell_state->MallocData();
  memset(cell_state->Data(), 0, cell_state->ElementsNum() * sizeof(float));

  auto *hidden_state = new lite::tensor::Tensor;
  hidden_state->set_data_type(kNumberTypeFloat32);
  hidden_state->set_shape({1, lstm_param->batch_, lstm_param->hidden_size_});
  hidden_state->SetFormat(schema::Format_NHWC);
  hidden_state->MallocData();
  memset(hidden_state->Data(), 0, hidden_state->ElementsNum() * sizeof(float));

  outputs->push_back(output);
  outputs->push_back(cell_state);
  outputs->push_back(hidden_state);
 }

 void CompareOutput(lite::tensor::Tensor *output, std::vector<float> data) {
  for (int i = 0; i < output->ElementsNum(); i++) {
    std::cout << reinterpret_cast<float *>(output->Data())[i] << ", ";
  }
  std::cout << std::endl;

  Common::CompareOutputData(reinterpret_cast<float *>(output->Data()), data.data(), output->ElementsNum(), 0.0001);
 }

 TEST_F(LstmFp32, LstmForwardFp32Accuracy) {
  // prepare stage
  auto lstm_param = new LstmParameter();
  InitLstmParam(lstm_param);

  // init ctx
  auto ctx = new lite::Context();
  ctx->thread_num_ = 1;

  // init tensor
  std::vector<lite::tensor::Tensor *> inputs;
  std::vector<lite::tensor::Tensor *> outputs;
  InitLstmForwardCreator(&inputs, &outputs, lstm_param);

  // register op
  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, mindspore::schema::PrimitiveType_Lstm};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  ASSERT_NE(creator, nullptr);
  kernel::LiteKernel *kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(lstm_param), ctx, desc);
  ASSERT_NE(kernel, nullptr);
  // op run
  kernel->Run();

  std::cout << "==================output data=================" << std::endl;
  std::vector<float> output0_data = {-0.0702, 0.1225,  0.0876,  -0.0357, -0.0227, -0.2294,
                                     -0.0345, -0.0108, -0.2002, 0.0451,  0.0853,  -0.1205};
  CompareOutput(outputs[0], output0_data);

  std::vector<float> output1_data = {0.0451, 0.0853, -0.1205};
  CompareOutput(outputs[1], output1_data);

  std::vector<float> output2_data = {0.0989, 0.2094, -0.4132};
  CompareOutput(outputs[2], output2_data);

  delete lstm_param;
  for (int i = 0; i < inputs.size() - 1; i++) {
    delete inputs[i];
  }
  for (int i = 0; i < outputs.size(); i++) {
    delete outputs[i];
  }
  delete kernel;
  MS_LOG(INFO) << "LstmFp32 forward accuracy passed";
 }

 void InitLstmBackwardCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs,
                             const LstmParameter *lstm_param) {
  // prepare input
  std::vector<float> input_data = {1.4305, 0.5342, -0.9221, 0.0527, 2.3770, -0.3697, -0.2833, -2.1285};
  auto *input = new lite::tensor::Tensor;
  input->set_data_type(kNumberTypeFloat32);
  input->set_shape({lstm_param->seq_len_, lstm_param->batch_, lstm_param->input_size_});
  input->MallocData();
  memcpy(input->Data(), input_data.data(), input_data.size() * sizeof(float));

  // prepare weight_i
  std::vector<float> weight_i_data = {
    -0.19253477, -0.007966279, -0.06039094, 0.27697134,  -0.5071223,  0.18996351,  0.20472168,  -0.1007814,
    0.04282999,  0.20836472,   -0.4654655,  0.050321221, -0.3431457,  0.22256428,  0.29294532,  0.45042896,
    0.20468240,  0.13078391,   -0.20987969, -0.3173505,  -0.3813517,  0.10205835,  0.21858131,  -0.0386473,
    0.5512280,   -0.2763766,   -0.3593936,  -0.5181975,  0.3469863,   -0.38533931, 0.010202527, -0.46598294,
    -0.5740513,  0.06127524,   -0.03960543, 0.2478809,   -0.17296993, 0.19159525,  -0.4976995,  0.05985528,
    0.3653409,   0.386924,     0.3170289,   -0.08830952, -0.31105759, 0.3110240,   0.15174299,  0.287579894};
  auto *weight_i = new lite::tensor::Tensor;
  weight_i->set_data_type(kNumberTypeFloat32);
  weight_i->SetFormat(schema::Format_NHWC);
  weight_i->set_shape({2, lstm_param->hidden_size_ * 4, lstm_param->input_size_});
  weight_i->MallocData();
  memcpy(weight_i->Data(), weight_i_data.data(), weight_i_data.size() * sizeof(float));

  // prepare weight_r
  std::vector<float> weight_h_data = {
    0.106934666,  -0.50430017,  0.33296257,   -0.288117021, -0.38019785,  -0.147071093, 0.422707557,  0.41497004,
    -0.5329730,   -0.430150926, -0.032713949, 0.35401260,   0.179495036,  -0.14158579,  0.380428612,  -0.175597071,
    0.54088723,   -0.403292059, -0.287720531, -0.51250511,  -0.15405902,  -0.440592586, 0.16726928,   -0.0163397789,
    0.51673841,   0.5094323,    -0.137105107, -0.181070089, -0.47221425,  -0.38046866,  -0.206725060, 0.248537719,
    -0.23961094,  -0.117781728, 0.426800847,  0.0266208052, -0.197408229, 0.54831492,   -0.280048757, -0.125062286,
    -0.29929456,  0.42354834,   -0.401066303, 0.356340110,  0.54629492,   -0.15852552,  0.131406366,  -0.101815432,
    0.0121276974, -0.53553336,  0.121099889,  0.060554087,  0.46259057,   -0.49666053,  0.090806663,  0.20542401,
    -0.38674920,  -0.23874849,  -0.5222138,   0.57537007,   0.113343358,  -0.35233467,  -0.25532332,  0.159506142,
    0.35996592,   -0.201961308, -0.16323345,  0.119177639,  -0.12677872,  -0.175229549, -0.160024613, -0.21058899};
  auto *weight_h = new lite::tensor::Tensor;
  weight_h->set_data_type(kNumberTypeFloat32);
  weight_h->SetFormat(schema::Format_NHWC);
  weight_h->set_shape({2, lstm_param->hidden_size_ * 4, lstm_param->hidden_size_});
  weight_h->MallocData();
  memcpy(weight_h->Data(), weight_h_data.data(), weight_h_data.size() * sizeof(float));

  // prepare bias
  std::vector<float> bias_data = {
    0.57061123,   -0.25357073,  -0.146834075, 0.412972748,  -0.27809411,  -0.0542128682, -0.45384609,  -0.53261917,
    0.222133636,  -0.18093895,  -0.045559883, 0.09109061,   0.080319643,  0.455167174,   0.36235427,   -0.00164419412,
    -0.135566502, 0.41905909,   -0.450117409, 0.50565385,   -0.077815443, -0.47051778,   -0.141349375, -0.338519752,
    0.48683023,   0.282384872,  0.13399660,   -0.382526844, -0.23370727,  -0.184681564,  0.45679104,   -0.339453905,
    0.452010273,  0.0552094578, 0.328843057,  0.127738714,  -0.127084732, -0.334061294,  -0.46742400,  -0.401568055,
    0.23712641,   -0.052937567, 0.272351622,  0.42767739,   0.303884744,  -0.46025499,   -0.43985402,  0.256422877};
  auto *bias = new lite::tensor::Tensor;
  bias->set_data_type(kNumberTypeFloat32);
  bias->SetFormat(schema::Format_NHWC);
  bias->set_shape({2, lstm_param->hidden_size_ * 4 * 2});
  bias->MallocData();
  memcpy(bias->Data(), bias_data.data(), bias_data.size() * sizeof(float));

  // prepare state
  std::vector<float> state_data = {0, 0, 0, 0, 0, 0};
  auto *state = new lite::tensor::Tensor;
  state->set_data_type(kNumberTypeFloat32);
  state->SetFormat(schema::Format_NHWC);
  state->set_shape({2, lstm_param->batch_, lstm_param->hidden_size_});
  state->MallocData();
  memcpy(state->Data(), state_data.data(), state_data.size() * sizeof(float));

  inputs->push_back(input);
  inputs->push_back(weight_i);
  inputs->push_back(weight_h);
  inputs->push_back(bias);
  inputs->push_back(state);
  inputs->push_back(state);

  // malloc output buffer, for arm cpu, format: N C4 H W 4
  auto *output = new lite::tensor::Tensor;
  output->set_data_type(kNumberTypeFloat32);
  output->set_shape({lstm_param->seq_len_, 2, lstm_param->batch_, lstm_param->hidden_size_});
  output->SetFormat(schema::Format_NHWC);
  output->MallocData();
  memset(output->Data(), 0, output->ElementsNum() * sizeof(float));

  auto *cell_state = new lite::tensor::Tensor;
  cell_state->set_data_type(kNumberTypeFloat32);
  cell_state->set_shape({2, lstm_param->batch_, lstm_param->hidden_size_});
  cell_state->SetFormat(schema::Format_NHWC);
  cell_state->MallocData();
  memset(cell_state->Data(), 0, cell_state->ElementsNum() * sizeof(float));

  auto *hidden_state = new lite::tensor::Tensor;
  hidden_state->set_data_type(kNumberTypeFloat32);
  hidden_state->set_shape({2, lstm_param->batch_, lstm_param->hidden_size_});
  hidden_state->SetFormat(schema::Format_NHWC);
  hidden_state->MallocData();
  memset(hidden_state->Data(), 0, hidden_state->ElementsNum() * sizeof(float));

  outputs->push_back(output);
  outputs->push_back(cell_state);
  outputs->push_back(hidden_state);
 }

 TEST_F(LstmFp32, LstmBackwardFp32Accuracy) {
  // prepare stage
  auto lstm_param = new LstmParameter();
  InitLstmParam(lstm_param);
  lstm_param->bidirectional_ = true;

  // init ctx
  auto ctx = new lite::Context();
  ctx->thread_num_ = 1;

  // init tensor
  std::vector<lite::tensor::Tensor *> inputs;
  std::vector<lite::tensor::Tensor *> outputs;
  InitLstmBackwardCreator(&inputs, &outputs, lstm_param);

  // register op
  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, mindspore::schema::PrimitiveType_Lstm};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  ASSERT_NE(creator, nullptr);
  kernel::LiteKernel *kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(lstm_param), ctx, desc);
  ASSERT_NE(kernel, nullptr);
  // op run
  kernel->Run();

  std::cout << "==================output data=================" << std::endl;
  std::vector<float> output0_data = {-0.2922, -0.1416, 0.0077,  -0.0422, -0.0585, 0.2061,  -0.2385, -0.0146,
                                     -0.1796, -0.0554, -0.0973, 0.1013,  -0.3062, -0.1516, -0.0310, 0.0459,
                                     -0.0784, 0.0949,  0.0249,  -0.0653, -0.0869, -0.1113, -0.2155, -0.0500};
  CompareOutput(outputs[0], output0_data);

  std::vector<float> output1_data = {0.0249, -0.0653, -0.0869, -0.0422, -0.0585, 0.2061};
  CompareOutput(outputs[1], output1_data);

  std::vector<float> output2_data = {0.0373, -0.2322, -0.1477, -0.1621, -0.1808, 0.5146};
  CompareOutput(outputs[2], output2_data);

  delete lstm_param;
  for (int i = 0; i < inputs.size() - 1; i++) {
    delete inputs[i];
  }
  for (int i = 0; i < outputs.size(); i++) {
    delete outputs[i];
  }
  delete kernel;
  MS_LOG(INFO) << "LstmFp32 backward accuracy passed";
 }

 }  // namespace mindspore