diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c
index 02cc6e48eb..982fc39232 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -443,17 +443,20 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si
 }
 
 void RowMajor2Col16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
-  for (int r = 0; r < row; r++) {
-    for (int c = 0; c < col; c++) {
-      int r_div16 = r / 16;
-      int r_mod16 = r % 16;
-      if (is_fp32_src) {
-        dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(((const float *)src)[r * col + c]);
-      } else {
-        dst[r_div16 * 16 * col + c * 16 + r_mod16] = ((const float16_t *)src)[r * col + c];
+  if (is_fp32_src) {
+    const float *fp32_src = (const float *)src;
+    for (int r = 0; r < row; r++) {
+      for (int c = 0; c < col; c++) {
+        int r_div16 = r / 16;
+        int r_mod16 = r % 16;
+        dst[r_div16 * 16 * col + c * 16 + r_mod16] = (float16_t)(fp32_src[r * col + c]);
       }
     }
+  } else {
+    const float16_t *fp16_src = (const float16_t *)src;
+    RowMajor2Col16MajorFp16Opt(fp16_src, dst, row, col);
   }
+  return;
 }
 
 void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
@@ -484,6 +487,18 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b
   }
 }
 
+void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
+  for (int r = 0; r < row; ++r) {
+    for (int c = 0; c < col; ++c) {
+      if (is_fp32_src) {
+        dst[c * row + r] = (float16_t)(((const float *)src)[r * col + c]);
+      } else {
+        dst[c * row + r] = ((const float16_t *)src)[r * col + c];
+      }
+    }
+  }
+}
+
 void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src) {
   for (int r = 0; r < row; r++) {
     for (int c = 0; c < col; c++) {
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h
index 086a1b973d..e2e7a80cc2 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -59,6 +59,8 @@ void RowMajor2Row8MajorFp16(const void *src, float16_t *dst, int row, int col, b
 
 void RowMajor2Col8MajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);
 
+void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src);
+
 #ifdef __cplusplus
 }
 #endif
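Note: the new RowMajor2ColMajorFp16 above is a plain, untiled transpose. Unlike the Col8/Col16 packers it writes exactly col * row elements with no zero padding, which is what the vector-matmul path of the new base kernel relies on. A minimal off-target sketch of the same transform (float stands in for float16_t so it compiles without ARM fp16 support; the helper is hypothetical, not part of the patch):

#include <cstdio>

// Plain row-major -> column-major transpose, no tiling, no padding.
static void RowMajor2ColMajorSketch(const float *src, float *dst, int row, int col) {
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c) {
      dst[c * row + r] = src[r * col + c];
    }
  }
}

int main() {
  const float src[2 * 3] = {1, 2, 3, 4, 5, 6};  // 2x3 row-major
  float dst[3 * 2] = {0};
  RowMajor2ColMajorSketch(src, dst, 2, 3);
  for (int i = 0; i < 6; ++i) printf("%g ", dst[i]);  // prints: 1 4 2 5 3 6
  printf("\n");
  return 0;
}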
diff --git a/mindspore/lite/nnacl/fp32/matmul_fp32.c b/mindspore/lite/nnacl/fp32/matmul_fp32.c
index cc0d492b3b..f4cfaccdc3 100644
--- a/mindspore/lite/nnacl/fp32/matmul_fp32.c
+++ b/mindspore/lite/nnacl/fp32/matmul_fp32.c
@@ -27,11 +27,17 @@ void RowMajor2ColMajor(const float *src_ptr, float *dst_ptr, int row, int col) {
 void RowMajor2Row4Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd4 = c / C4NUM;
       int cm4 = c % C4NUM;
       dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = src[c];
     }
+    for (; c < UP_ROUND(col, C4NUM); c++) {
+      int cd4 = c / C4NUM;
+      int cm4 = c % C4NUM;
+      dst_ptr[cd4 * C4NUM * row + r * C4NUM + cm4] = 0;
+    }
   }
   return;
 }
@@ -39,11 +45,17 @@ void RowMajor2Row6Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd6 = c / C6NUM;
       int cm6 = c % C6NUM;
       dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = src[c];
     }
+    for (; c < UP_ROUND(col, C6NUM); c++) {
+      int cd6 = c / C6NUM;
+      int cm6 = c % C6NUM;
+      dst_ptr[cd6 * C6NUM * row + r * C6NUM + cm6] = 0;
+    }
   }
   return;
 }
@@ -51,11 +63,17 @@ void RowMajor2Row8Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd8 = c / C8NUM;
       int cm8 = c % C8NUM;
       dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = src[c];
     }
+    for (; c < UP_ROUND(col, C8NUM); c++) {
+      int cd8 = c / C8NUM;
+      int cm8 = c % C8NUM;
+      dst_ptr[cd8 * C8NUM * row + r * C8NUM + cm8] = 0;
+    }
   }
   return;
 }
@@ -63,11 +81,17 @@ void RowMajor2Row12Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd12 = c / C12NUM;
       int cm12 = c % C12NUM;
       dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = src[c];
     }
+    for (; c < UP_ROUND(col, C12NUM); c++) {
+      int cd12 = c / C12NUM;
+      int cm12 = c % C12NUM;
+      dst_ptr[cd12 * C12NUM * row + r * C12NUM + cm12] = 0;
+    }
   }
   return;
 }
@@ -75,11 +99,17 @@ void RowMajor2Row16Major(const float *src_ptr, float *dst_ptr, int row, int col) {
   for (int r = 0; r < row; r++) {
     const float *src = src_ptr + r * col;
-    for (int c = 0; c < col; c++) {
+    int c = 0;
+    for (; c < col; c++) {
       int cd16 = c / C16NUM;
       int cm16 = c % C16NUM;
       dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = src[c];
     }
+    for (; c < UP_ROUND(col, C16NUM); c++) {
+      int cd16 = c / C16NUM;
+      int cm16 = c % C16NUM;
+      dst_ptr[cd16 * C16NUM * row + r * C16NUM + cm16] = 0;
+    }
   }
   return;
 }
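Note: each fp32 packer now zero-fills the tail of every row block up to the tile boundary instead of leaving it untouched, so the tiled kernels never read stale values from the padded lanes. A sketch of the pattern for the C4NUM case, assuming UP_ROUND carries its usual nnacl definition of rounding up to a multiple:

#include <cstdio>

// Assumed to match nnacl's macro: round x up to the next multiple of y.
#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
#define C4NUM 4

// Pack one row-major row into 4-wide column blocks, zero-filling the tail.
static void PackRow4(const float *src, float *dst, int r, int row, int col) {
  int c = 0;
  for (; c < col; c++) {
    dst[(c / C4NUM) * C4NUM * row + r * C4NUM + (c % C4NUM)] = src[c];
  }
  for (; c < UP_ROUND(col, C4NUM); c++) {
    dst[(c / C4NUM) * C4NUM * row + r * C4NUM + (c % C4NUM)] = 0;  // padded lanes stay zero
  }
}

int main() {
  const float src[3] = {1, 2, 3};   // col = 3, padded up to 4
  float dst[4] = {-1, -1, -1, -1};  // row = 1 keeps the layout trivial
  PackRow4(src, dst, 0, 1, 3);
  for (int i = 0; i < 4; ++i) printf("%g ", dst[i]);  // prints: 1 2 3 0
  printf("\n");
  return 0;
}

The index arithmetic is identical in both loops; only the source of the value changes.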
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
index 7df1608a53..b1185b1c86 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
@@ -19,236 +19,61 @@
 #include "src/kernel_registry.h"
 
 using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_INPUT_TENSOR_ERROR;
-using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_FullConnection;
 
 namespace mindspore::kernel {
-FullconnectionFP16CPUKernel::~FullconnectionFP16CPUKernel() { FreeTmpBuffer(); }
+void FullconnectionFP16CPUKernel::InitAShape() {
+  auto a_shape = in_tensors_.at(0)->shape();
+  params_->row_ = a_shape[0];
+  params_->deep_ = a_shape[1];
+}
 
-void FullconnectionFP16CPUKernel::FreeTmpBuffer() {
-  if (a_pack_ptr_ != nullptr) {
-    context_->allocator->Free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (b_pack_ptr_ != nullptr) {
-    context_->allocator->Free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-  if (bias_ptr_ != nullptr) {
-    context_->allocator->Free(bias_ptr_);
-    bias_ptr_ = nullptr;
-  }
-  if (output_fp16_ != nullptr) {
-    context_->allocator->Free(output_fp16_);
-    output_fp16_ = nullptr;
-  }
+void FullconnectionFP16CPUKernel::InitBShape() {
+  auto b_shape = in_tensors_.at(1)->shape();
+  params_->col_ = b_shape[0];
+  params_->deep_ = b_shape[1];
 }
 
 int FullconnectionFP16CPUKernel::ReSize() {
-  FreeTmpBuffer();
-  int row = 1;
-  for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) row *= (out_tensors_.at(0)->shape())[i];
-  fc_param_->row_ = row;
-  fc_param_->col_ = out_tensors_.at(0)->shape().back();
-  fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
-  fc_param_->row_16_ = UP_ROUND(fc_param_->row_, C16NUM);
-  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_, C8NUM));
-  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_, C8NUM), thread_count_) * C8NUM;
+  InitAShape();
+  InitBShape();
+  return MatmulBaseFP16CPUKernel::ReSize();
+}
 
-  if (row == 1) is_vector_input_ = true;
-  int a_pack_row = 0;
-  int b_pack_col = 0;
-  if (is_vector_input_) {
-    a_pack_row = 1;
-    b_pack_col = fc_param_->col_;
-  } else {
-    a_pack_row = fc_param_->row_16_;
-    b_pack_col = fc_param_->col_8_;
-  }
-  a_pack_ptr_ =
-    reinterpret_cast<float16_t *>(context_->allocator->Malloc(a_pack_row * fc_param_->deep_ * sizeof(float16_t)));
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, a_pack_row * fc_param_->deep_ * sizeof(float16_t));
+int FullconnectionFP16CPUKernel::Init() {
+  params_->batch = 1;
+  params_->a_transpose_ = false;
+  params_->b_transpose_ = true;
 
-  b_pack_ptr_ =
-    reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * fc_param_->deep_ * sizeof(float16_t)));
-  if (b_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(b_pack_ptr_, 0, b_pack_col * fc_param_->deep_ * sizeof(float16_t));
+  MatmulBaseFP16CPUKernel::InitParameter();
 
-  fc_param_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
-  if (fc_param_->b_const_) {
-    if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
-      if (is_vector_input_) {
-        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_,
-                         fc_param_->col_ * fc_param_->deep_);
-      } else {
-        InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-      }
-    } else {
-      if (is_vector_input_) {
-        memcpy(b_pack_ptr_, reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()),
-               fc_param_->col_ * fc_param_->deep_ * sizeof(float16_t));
-      } else {
-        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-      }
-    }
-    b_ptr_ = b_pack_ptr_;
+  if (params_->a_const_ == true) {
+    InitAShape();
   }
-  if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<float16_t *>(context_->allocator->Malloc(b_pack_col * sizeof(float16_t)));
-    if (bias_ptr_ == nullptr) {
-      FreeTmpBuffer();
-      return RET_MEMORY_FAILED;
-    }
-    memset(bias_ptr_, 0, b_pack_col * sizeof(float16_t));
-    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->data_c()), bias_ptr_, fc_param_->col_);
+  if (params_->b_const_ == true) {
+    InitBShape();
   }
-  if (out_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    output_fp16_ =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(fc_param_->row_ * fc_param_->col_ * sizeof(float16_t)));
-    if (output_fp16_ == nullptr) {
-      FreeTmpBuffer();
-      return RET_MEMORY_FAILED;
-    }
+  auto ret = MatmulBaseFP16CPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
   }
-  return RET_OK;
-}  // namespace mindspore::kernel
-
-void FullconnectionFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
-  RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, true);
-}
-
-void FullconnectionFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
-  RowMajor2Col16MajorFp16(reinterpret_cast<void *>(a_ptr), a_pack_ptr, fc_param_->row_, fc_param_->deep_, false);
-}
-
-void FullconnectionFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
-  RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, true);
-}
-
-void FullconnectionFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
-  RowMajor2Col8MajorFp16(reinterpret_cast<void *>(b_ptr), b_pack_ptr, fc_param_->col_, fc_param_->deep_, false);
-}
 
-int FullconnectionFP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
-  int cur_stride = fc_param_->col_ - task_id * thread_stride_;
-  int cur_oc = MSMIN(thread_stride_, cur_stride);
-  if (cur_oc <= 0) {
-    return RET_OK;
-  }
-  auto b = b_ptr_ + task_id * thread_stride_ * fc_param_->deep_;
-  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
-  auto c = output_ptr_ + task_id * thread_stride_;
-  if (is_vector_input_) {
-    MatVecMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, cur_oc);
-  } else {
-    MatMulFp16(a_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-               OutType_Nhwc);
-  }
-
-  return RET_OK;
-}
-
-int FcFP16Run(void *cdata, int task_id) {
-  auto op = reinterpret_cast<FullconnectionFP16CPUKernel *>(cdata);
-  auto error_code = op->RunImpl(task_id);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int FullconnectionFP16CPUKernel::Run() {
-  auto out_tensor = out_tensors_.at(0);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    output_ptr_ = output_fp16_;
-  } else {
-    output_ptr_ = reinterpret_cast<float16_t *>(out_tensor->data_c());
-  }
-
-  if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-    if (is_vector_input_) {
-      Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_, fc_param_->deep_);
-    } else {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-    }
-    a_ptr_ = a_pack_ptr_;
-  } else {
-    if (is_vector_input_) {
-      a_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-      a_ptr_ = a_pack_ptr_;
-    }
-  }
-
-  if (!fc_param_->b_const_) {
-    if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
-      if (is_vector_input_) {
-        Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_,
-                         fc_param_->col_ * fc_param_->deep_);
-      } else {
-        InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-      }
-      b_ptr_ = b_pack_ptr_;
-    } else {
-      if (is_vector_input_) {
-        b_ptr_ = reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c());
-      } else {
-        InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-        b_ptr_ = b_pack_ptr_;
-      }
-    }
-  }
-  ParallelLaunch(this->context_->thread_pool_, FcFP16Run, this, thread_count_);
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    auto size = out_tensor->ElementsNum();
-    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
-    Float16ToFloat32(output_fp16_, out_tensor_data, size);
-  }
-  return RET_OK;
-}
-
-kernel::LiteKernel *CpuFullConnectionFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                       const std::vector<lite::Tensor *> &outputs,
-                                                       OpParameter *opParameter, const lite::InnerContext *ctx,
-                                                       const kernel::KernelKey &desc,
-                                                       const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) FullconnectionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
+  auto ret = MatmulBaseFP16CPUKernel::Run();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    delete kernel;
-    return nullptr;
+    MS_LOG(ERROR) << "FullconnectionFP16CPUKernel run failed";
   }
-  return kernel;
+  return ret;
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, CpuFullConnectionFp16KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionFP16CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
index 146bd34604..38bbce8ce5 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.h
@@ -19,46 +19,24 @@
 #include <arm_neon.h>
 #include <vector>
-#include "include/errorcode.h"
-#include "nnacl/matmul_parameter.h"
-#include "nnacl/fp16/matmul_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
-#include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
 
 namespace mindspore::kernel {
-class FullconnectionFP16CPUKernel : public LiteKernel {
+class FullconnectionFP16CPUKernel : public MatmulBaseFP16CPUKernel {
  public:
   explicit FullconnectionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                        const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                        const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  }
-  ~FullconnectionFP16CPUKernel() override;
+      : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~FullconnectionFP16CPUKernel() override = default;
 
+  int Init() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
-
- private:
-  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
-  void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr);
-  void FreeTmpBuffer();
 
  private:
-  MatMulParameter *fc_param_ = nullptr;
-  float16_t *a_pack_ptr_ = nullptr;
-  float16_t *b_pack_ptr_ = nullptr;
-  float16_t *bias_ptr_ = nullptr;
-  float16_t *output_fp16_ = nullptr;
-  float16_t *output_ptr_ = nullptr;
-  float16_t *a_ptr_ = nullptr;
-  float16_t *b_ptr_ = nullptr;
-  bool is_vector_input_ = false;
-  int thread_count_ = 1;
-  int thread_stride_ = 0;
+  void InitAShape();
+  void InitBShape();
 };
 }  // namespace mindspore::kernel
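Note: after this refactor the fully-connected kernel is just a batch-1 matmul with a transposed weight matrix. Init() pins batch = 1, a_transpose_ = false and b_transpose_ = true, and InitBShape() reads the weight tensor as col x deep. A plain C++ reference of that contract (fp32 for brevity; the helper is hypothetical, not the kernel's actual code path):

#include <cstdio>

// Reference semantics: A is row x deep, B (weights) is col x deep,
// i.e. b_transpose_ = true, batch = 1, out[r][c] = sum_d A[r][d] * B[c][d] + bias[c].
static void FullConnectionRef(const float *a, const float *b, const float *bias,
                              float *out, int row, int col, int deep) {
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c) {
      float acc = bias != nullptr ? bias[c] : 0.0f;
      for (int d = 0; d < deep; ++d) {
        acc += a[r * deep + d] * b[c * deep + d];  // B indexed as transposed
      }
      out[r * col + c] = acc;
    }
  }
}

int main() {
  const float a[1 * 2] = {1, 2};              // row = 1, deep = 2
  const float b[3 * 2] = {1, 0, 0, 1, 1, 1};  // col = 3, deep = 2
  const float bias[3] = {10, 20, 30};
  float out[3];
  FullConnectionRef(a, b, bias, out, 1, 3, 2);
  for (int i = 0; i < 3; ++i) printf("%g ", out[i]);  // prints: 11 22 33
  printf("\n");
  return 0;
}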
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
new file mode 100644
index 0000000000..89a432d17b
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.cc
@@ -0,0 +1,298 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
+#include "nnacl/fp16/matmul_fp16.h"
+#include "nnacl/fp16/cast_fp16.h"
+#include "src/runtime/runtime_api.h"
+#include "include/errorcode.h"
+
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_INPUT_TENSOR_ERROR;
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+int MatmulBaseFP16Run(void *cdata, int task_id) {
+  auto op = reinterpret_cast<MatmulBaseFP16CPUKernel *>(cdata);
+  auto error_code = op->RunImpl(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+MatmulBaseFP16CPUKernel::~MatmulBaseFP16CPUKernel() {
+  if (bias_ptr_ != nullptr) {
+    free(bias_ptr_);
+    bias_ptr_ = nullptr;
+  }
+  FreeResizeBufA();
+  FreeResizeBufB();
+}
+
+void MatmulBaseFP16CPUKernel::FreeResizeBufA() {
+  if (a_pack_ptr_ != nullptr) {
+    context_->allocator->Free(a_pack_ptr_);
+    a_pack_ptr_ = nullptr;
+  }
+  return;
+}
+
+void MatmulBaseFP16CPUKernel::FreeResizeBufB() {
+  if (b_pack_ptr_ != nullptr) {
+    context_->allocator->Free(b_pack_ptr_);
+    b_pack_ptr_ = nullptr;
+  }
+  return;
+}
+
+void MatmulBaseFP16CPUKernel::InitParameter() {
+  params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
+  params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+  return;
+}
+
+int MatmulBaseFP16CPUKernel::InitBias() {
+  if (in_tensors_.size() == 3) {
+    auto bias_tensor = in_tensors_[2];
+    int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C8NUM);
+    bias_ptr_ = reinterpret_cast<float16_t *>(malloc(max_bias_data * sizeof(float16_t)));
+    if (bias_ptr_ == nullptr) {
+      MS_LOG(ERROR) << "malloc bias_ptr_ failed";
+      return RET_ERROR;
+    }
+    memset(bias_ptr_, 0, max_bias_data * sizeof(float16_t));
+    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, bias_tensor->ElementsNum());
+  }
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::ReSize() {
+  ResizeParameter();
+
+  if (params_->b_const_ == true && src_b_ != nullptr) {
+    InitBufferB();
+    InitMatrixB(src_b_, kNumberTypeFloat16);
+    free(src_b_);
+    src_b_ = nullptr;
+  }
+
+  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
+  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
+  return RET_OK;
+}
+
+void MatmulBaseFP16CPUKernel::ResizeParameter() {
+  if (params_->row_ == 1) {
+    vec_matmul_ = true;
+  }
+
+  if (vec_matmul_) {
+    params_->row_align_ = 1;
+    params_->col_align_ = params_->col_;
+  } else {
+    params_->row_align_ = UP_ROUND(params_->row_, C16NUM);
+    params_->col_align_ = UP_ROUND(params_->col_, C8NUM);
+  }
+  return;
+}
+
+int MatmulBaseFP16CPUKernel::InitBufferA() {
+  a_pack_ptr_ = reinterpret_cast<float16_t *>(
+    context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t)));
+  if (a_pack_ptr_ == nullptr) {
+    return RET_MEMORY_FAILED;
+  }
+
+  memset(a_pack_ptr_, 0, params_->batch * params_->row_align_ * params_->deep_ * sizeof(float16_t));
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::InitBufferB() {
+  if (b_pack_ptr_ != nullptr) {
+    return RET_OK;
+  }
+
+  b_pack_ptr_ = reinterpret_cast<float16_t *>(
+    context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t)));
+  if (b_pack_ptr_ == nullptr) {
+    return RET_MEMORY_FAILED;
+  }
+
+  memset(b_pack_ptr_, 0, params_->batch * params_->col_align_ * params_->deep_ * sizeof(float16_t));
+  return RET_OK;
+}
+
+void MatmulBaseFP16CPUKernel::InitMatrixA(void *src_ptr) {
+  auto src_data_type = in_tensors_[0]->data_type();
+
+  if (vec_matmul_) {
+    if (src_data_type == kNumberTypeFloat32) {
+      Float32ToFloat16(reinterpret_cast<float *>(src_ptr), a_pack_ptr_, params_->batch * params_->deep_);
+    } else {
+      memcpy(a_pack_ptr_, src_ptr, params_->batch * params_->deep_ * sizeof(float16_t));
+    }
+    return;
+  }
+
+  int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
+  for (int i = 0; i < params_->batch; i++) {
+    int8_t *src = int8_src + i * params_->deep_ * params_->row_ * lite::DataTypeSize(src_data_type);
+    float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_;
+    if (params_->a_transpose_) {
+      RowMajor2Row16MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32);
+    } else {
+      RowMajor2Col16MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32);
+    }
+  }
+  return;
+}
+
+void MatmulBaseFP16CPUKernel::InitMatrixB(void *src_ptr, TypeId src_data_type) {
+  int8_t *int8_src = reinterpret_cast<int8_t *>(src_ptr);
+
+  if (vec_matmul_) {
+    if (params_->b_transpose_) {
+      if (src_data_type == kNumberTypeFloat32) {
+        Float32ToFloat16(reinterpret_cast<float *>(src_ptr), b_pack_ptr_,
+                         params_->batch * params_->col_ * params_->deep_);
+      } else {
+        memcpy(b_pack_ptr_, src_ptr, params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
+      }
+    } else {
+      for (int i = 0; i < params_->batch; i++) {
+        const int8_t *batch_src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
+        float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_;
+        RowMajor2ColMajorFp16(batch_src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
+      }
+    }
+    return;
+  }
+
+  for (int i = 0; i < params_->batch; i++) {
+    int8_t *src = int8_src + i * params_->deep_ * params_->col_ * lite::DataTypeSize(src_data_type);
+    float16_t *dst = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
+    if (params_->b_transpose_) {
+      RowMajor2Col8MajorFp16(src, dst, params_->col_, params_->deep_, src_data_type == kNumberTypeFloat32);
+    } else {
+      RowMajor2Row8MajorFp16(src, dst, params_->deep_, params_->col_, src_data_type == kNumberTypeFloat32);
+    }
+  }
+  return;
+}
+
+int MatmulBaseFP16CPUKernel::Init() {
+  ResizeParameter();
+
+  if (params_->a_const_ == true) {
+    if (RET_OK != InitBufferA()) {
+      return RET_ERROR;
+    }
+    InitMatrixA(reinterpret_cast<void *>(in_tensors_[0]->data_c()));
+  }
+
+  if (params_->b_const_ == true) {
+    /* copy origin b data, pack in resize
+     * pack after infershape is done */
+    auto b_tensor = in_tensors_[1];
+    src_b_ = reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t)));
+    if (src_b_ == nullptr) {
+      MS_LOG(ERROR) << "Matmul fp16 malloc src_b_ failed";
+      return RET_ERROR;
+    }
+
+    if (b_tensor->data_type() == kNumberTypeFloat32) {
+      Float32ToFloat16(reinterpret_cast<float *>(b_tensor->data_c()), src_b_,
+                       params_->batch * params_->col_ * params_->deep_);
+    } else {
+      memcpy(src_b_, b_tensor->data_c(), params_->batch * params_->col_ * params_->deep_ * sizeof(float16_t));
+    }
+  }
+
+  auto ret = InitBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Matmul fp16 init bias failed";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::RunImpl(int task_id) {
+  int cur_stride = params_->col_ - task_id * thread_stride_;
+  int cur_oc = MSMIN(thread_stride_, cur_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+
+  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
+  auto b = batch_b_ptr_ + task_id * thread_stride_ * params_->deep_;
+  auto c = batch_c_ptr_ + task_id * thread_stride_;
+
+  if (vec_matmul_) {
+    MatVecMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, cur_oc);
+  } else {
+    MatMulFp16(batch_a_ptr_, b, c, bias, params_->act_type_, params_->deep_, params_->row_, cur_oc, params_->col_,
+               OutType_Nhwc);
+  }
+  return RET_OK;
+}
+
+int MatmulBaseFP16CPUKernel::Run() {
+  auto c_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+
+  if (params_->a_const_ == false) {
+    if (RET_OK != InitBufferA()) {
+      return RET_ERROR;
+    }
+    InitMatrixA(in_tensors_.at(0)->data_c());
+  }
+  if (params_->b_const_ == false) {
+    if (RET_OK != InitBufferB()) {
+      FreeResizeBufA();
+      return RET_ERROR;
+    }
+    InitMatrixB(in_tensors_.at(1)->data_c(), in_tensors_.at(1)->data_type());
+  }
+
+  for (int i = 0; i < params_->batch; ++i) {
+    if (vec_matmul_) {
+      batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
+      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
+    } else {
+      batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
+      batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
+    }
+    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFP16Run, this, thread_count_);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "MatmulBaseFP16Run failed";
+      return ret;
+    }
+  }
+
+  if (params_->a_const_ == false) {
+    FreeResizeBufA();
+  }
+
+  if (params_->b_const_ == false) {
+    FreeResizeBufB();
+  }
+  return RET_OK;
+}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
new file mode 100644
index 0000000000..92eb91b784
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_base_fp16.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
+
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
+#include <vector>
+#include "src/lite_kernel.h"
+#include "nnacl/matmul_parameter.h"
+
+namespace mindspore::kernel {
+class MatmulBaseFP16CPUKernel : public LiteKernel {
+ public:
+  explicit MatmulBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                                   const mindspore::lite::PrimitiveC *primitive)
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  }
+  ~MatmulBaseFP16CPUKernel() override;
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ public:
+  int RunImpl(int task_id);
+
+ protected:
+  void InitParameter();
+
+ private:
+  int InitBias();
+  void ResizeParameter();
+  int InitBufferA();
+  int InitBufferB();
+  void InitMatrixA(void *src_ptr);
+  void InitMatrixB(void *src_ptr, TypeId data_type);
+  void FreeResizeBufA();
+  void FreeResizeBufB();
+
+ protected:
+  MatMulParameter *params_ = nullptr;
+
+ private:
+  int thread_stride_ = 0;
+  int thread_count_ = 0;
+  bool vec_matmul_ = false;
+  float16_t *a_pack_ptr_ = nullptr;
+  float16_t *b_pack_ptr_ = nullptr;
+  float16_t *src_b_ = nullptr;
+  float16_t *bias_ptr_ = nullptr;
+  float16_t *batch_a_ptr_ = nullptr;
+  float16_t *batch_b_ptr_ = nullptr;
+  float16_t *batch_c_ptr_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_BASE_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
index f3f2a8f6b2..f2e9c4ca20 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
@@ -15,48 +15,20 @@
  */
 
 #include "src/runtime/kernel/arm/fp16/matmul_fp16.h"
-#include "nnacl/fp16/matmul_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"
 #include "src/kernel_registry.h"
 
 using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_INPUT_TENSOR_ERROR;
-using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
-MatmulFP16CPUKernel::~MatmulFP16CPUKernel() {
-  if (a_pack_ptr_ != nullptr) {
-    free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (b_pack_ptr_ != nullptr) {
-    free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-  if (bias_ptr_ != nullptr) {
-    free(bias_ptr_);
-    bias_ptr_ = nullptr;
-  }
-}
-
-void MatmulFP16CPUKernel::FreeTmpBuffer() {
-  if (a_pack_ptr_ != nullptr) {
-    params_->a_const_ ? free(a_pack_ptr_) : context_->allocator->Free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (b_pack_ptr_ != nullptr) {
-    params_->b_const_ ? free(b_pack_ptr_) : context_->allocator->Free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-}
-
-int MatmulFP16CPUKernel::MallocMatrixABuffer() {
+void MatmulFP16CPUKernel::InitAShape() {
   auto a_shape = in_tensors_[0]->shape();
+  if (a_shape.empty()) {
+    return;
+  }
   int batch = 1;
   for (size_t i = 0; i < a_shape.size() - 2; ++i) {
     batch *= a_shape[i];
@@ -65,25 +37,12 @@ int MatmulFP16CPUKernel::MallocMatrixABuffer() {
   params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
   params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
   params_->row_16_ = UP_ROUND(params_->row_, C16NUM);
-  if (params_->a_const_) {
-    a_pack_ptr_ =
-      reinterpret_cast<float16_t *>(malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
-  } else {
-    a_pack_ptr_ = reinterpret_cast<float16_t *>(
-      context_->allocator->Malloc(params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t)));
-  }
-  if (a_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(a_pack_ptr_, 0, params_->batch * params_->row_16_ * params_->deep_ * sizeof(float16_t));
-  return RET_OK;
 }
 
-int MatmulFP16CPUKernel::MallocMatrixBBuffer() {
+void MatmulFP16CPUKernel::InitBShape() {
   auto b_shape = in_tensors_[1]->shape();
   if (b_shape.empty()) {
-    return RET_OK;
+    return;
   }
   int batch = 1;
   for (size_t i = 0; i < b_shape.size() - 2; ++i) {
@@ -93,257 +52,42 @@ void MatmulFP16CPUKernel::InitBShape() {
   params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
   params_->col_8_ = UP_ROUND(params_->col_, 8);
   params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
-
-  if (params_->b_const_) {
-    b_pack_ptr_ =
-      reinterpret_cast<float16_t *>(malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
-  } else {
-    b_pack_ptr_ = reinterpret_cast<float16_t *>(
-      context_->allocator->Malloc(params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t)));
-  }
-  if (b_pack_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(b_pack_ptr_, 0, params_->batch * params_->col_8_ * params_->deep_ * sizeof(float16_t));
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_, C8NUM));
-  thread_stride_ = UP_DIV(UP_DIV(params_->col_, C8NUM), thread_count_) * C8NUM;
-  return RET_OK;
-}
-
-int MatmulFP16CPUKernel::InitBias() {
-  auto b_shape = in_tensors_[1]->shape();
-  auto c_shape = out_tensors_[0]->shape();
-  params_->col_ = params_->b_const_
-                    ? (params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1])
-                    : (c_shape[c_shape.size() - 1]);
-  params_->col_8_ = UP_ROUND(params_->col_, 8);
-  bias_ptr_ = reinterpret_cast<float16_t *>(malloc(params_->col_8_ * sizeof(float16_t)));
-  if (bias_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_MEMORY_FAILED;
-  }
-  memset(bias_ptr_, 0, params_->col_8_ * sizeof(float16_t));
-  if (in_tensors_.size() == 3) {
-    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->data_c()), bias_ptr_, params_->col_);
-  }
-  return RET_OK;
-}
-
-int MatmulFP16CPUKernel::ReSize() {
-  if (!params_->b_const_) {
-    auto ret = InitBias();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 init bias failed";
-      return RET_ERROR;
-    }
-  }
-  return RET_OK;
-}
-
-void MatmulFP16CPUKernel::InitMatrixA(float *a_ptr, float16_t *a_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float *src = a_ptr + i * params_->deep_ * params_->row_;
-    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
-    if (params_->a_transpose_) {
-      RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, true);
-    } else {
-      RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, true);
-    }
-  }
-}
-
-void MatmulFP16CPUKernel::InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float16_t *src = a_ptr + i * params_->deep_ * params_->row_;
-    float16_t *dst = a_pack_ptr + i * params_->deep_ * params_->row_16_;
-    if (params_->a_transpose_) {
-      RowMajor2Row16MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->row_, false);
-    } else {
-      RowMajor2Col16MajorFp16(reinterpret_cast<void *>(src), dst, params_->row_, params_->deep_, false);
-    }
-  }
-}
-
-void MatmulFP16CPUKernel::InitMatrixB(float *b_ptr, float16_t *b_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float *src = b_ptr + i * params_->deep_ * params_->col_;
-    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
-    if (params_->b_transpose_) {
-      RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, true);
-    } else {
-      RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, true);
-    }
-  }
-}
-
-void MatmulFP16CPUKernel::InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr) {
-  for (int i = 0; i < params_->batch; i++) {
-    float16_t *src = b_ptr + i * params_->deep_ * params_->col_;
-    float16_t *dst = b_pack_ptr + i * params_->deep_ * params_->col_8_;
-    if (params_->b_transpose_) {
-      RowMajor2Col8MajorFp16(reinterpret_cast<void *>(src), dst, params_->col_, params_->deep_, false);
-    } else {
-      RowMajor2Row8MajorFp16(reinterpret_cast<void *>(src), dst, params_->deep_, params_->col_, false);
-    }
-  }
 }
 
 int MatmulFP16CPUKernel::Init() {
-  params_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
-  params_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+  MatmulBaseFP16CPUKernel::InitParameter();
+
   if (params_->a_const_) {
-    auto ret = MallocMatrixABuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_[0]->data_c()), a_pack_ptr_);
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_[0]->data_c()), a_pack_ptr_);
-    }
+    InitAShape();
   }
   if (params_->b_const_) {
-    auto ret = MallocMatrixBBuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_[1]->data_c()), b_pack_ptr_);
-    } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_[1]->data_c()), b_pack_ptr_);
-    }
-    ret = InitBias();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 init bias failed";
-      return RET_ERROR;
-    }
+    InitBShape();
   }
-  return RET_OK;
-}
 
-int MatmulFP16CPUKernel::MallocFp16Output() {
-  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    output_ptr_ = reinterpret_cast<float16_t *>(
-      context_->allocator->Malloc(params_->batch * params_->row_ * params_->col_ * sizeof(float16_t)));
-    if (output_ptr_ == nullptr) {
-      MS_LOG(ERROR) << "malloc output_ptr_ failed.";
-      return RET_MEMORY_FAILED;
-    }
+  auto ret = MatmulBaseFP16CPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
   }
-  return RET_OK;
-}
 
-int MatmulFP16CPUKernel::RunImpl(int task_id) {
-  int cur_stride = params_->col_ - task_id * thread_stride_;
-  int cur_oc = MSMIN(thread_stride_, cur_stride);
-  if (cur_oc <= 0) {
+  if (!InferShapeDone()) {
     return RET_OK;
   }
-  auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
-  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
-  auto c = current_c_ + task_id * thread_stride_;
-  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
-
-  return RET_OK;
+  return ReSize();
 }
 
-int MatmulFP16Run(void *cdata, int task_id) {
-  auto op = reinterpret_cast<MatmulFP16CPUKernel *>(cdata);
-  auto error_code = op->RunImpl(task_id);
-  if (error_code != RET_OK) {
-    MS_LOG(ERROR) << "MatmulFp16Run error task_id[" << task_id << "] error_code[" << error_code << "]";
-    return RET_ERROR;
-  }
-  return RET_OK;
+int MatmulFP16CPUKernel::ReSize() {
+  InitAShape();
+  InitBShape();
+  return MatmulBaseFP16CPUKernel::ReSize();
 }
 
 int MatmulFP16CPUKernel::Run() {
-  auto out_tensor = out_tensors_.at(0);
-  auto ret = MallocFp16Output();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Matmul MallocFp16Output failed";
-    return RET_ERROR;
-  }
-  float16_t *c_ptr = nullptr;
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    c_ptr = output_ptr_;
-  } else {
-    c_ptr = reinterpret_cast<float16_t *>(out_tensor->data_c());
-  }
-  if (!params_->a_const_) {
-    ret = MallocMatrixABuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix A buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_.at(0)->data_type() == kNumberTypeFloat32) {
-      InitMatrixA(reinterpret_cast<float *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-    } else {
-      InitMatrixA(reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c()), a_pack_ptr_);
-    }
-  }
-  if (!params_->b_const_) {
-    ret = MallocMatrixBBuffer();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 malloc matrix B buffer failed";
-      return RET_ERROR;
-    }
-    if (in_tensors_.at(1)->data_type() == kNumberTypeFloat32) {
-      InitMatrixB(reinterpret_cast<float *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-    } else {
-      InitMatrixB(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), b_pack_ptr_);
-    }
-  }
-  for (int i = 0; i < params_->batch; ++i) {
-    current_a_ = a_pack_ptr_ + i * params_->row_16_ * params_->deep_;
-    current_b_ = b_pack_ptr_ + i * params_->deep_ * params_->col_8_;
-    current_c_ = c_ptr + i * params_->row_ * params_->col_;
-    ret = ParallelLaunch(this->context_->thread_pool_, MatmulFP16Run, this, thread_count_);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Matmul fp16 run function MatmulFP16Run failed";
-      FreeTmpBuffer();
-      return RET_ERROR;
-    }
-  }
-  if (out_tensor->data_type() == kNumberTypeFloat32) {
-    auto size = out_tensor->ElementsNum();
-    auto out_tensor_data = reinterpret_cast<float *>(out_tensor->data_c());
-    Float16ToFloat32(output_ptr_, out_tensor_data, size);
-    context_->allocator->Free(output_ptr_);
-  }
-  if (!params_->a_const_) {
-    context_->allocator->Free(a_pack_ptr_);
-    a_pack_ptr_ = nullptr;
-  }
-  if (!params_->b_const_) {
-    context_->allocator->Free(b_pack_ptr_);
-    b_pack_ptr_ = nullptr;
-  }
-  return RET_OK;
-}
-
-kernel::LiteKernel *CpuMatmulFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                               const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) MatmulFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel is nullptr.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
+  auto ret = MatmulBaseFP16CPUKernel::Run();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    delete kernel;
-    return nullptr;
+    MS_LOG(ERROR) << "MatmulFP16CPUKernel run failed";
   }
-  return kernel;
+  return ret;
 }
 
-REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, CpuMatmulFp16KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_MatMul, LiteKernelCreator<MatmulFP16CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
index ff1ace9398..7a0f8980de 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.h
@@ -17,50 +17,24 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_MATMUL_H_
 
-#ifdef ENABLE_NEON
-#include <arm_neon.h>
-#endif
 #include <vector>
-#include "src/lite_kernel.h"
-#include "nnacl/matmul_parameter.h"
+#include "src/runtime/kernel/arm/fp16/matmul_base_fp16.h"
 
 namespace mindspore::kernel {
-class MatmulFP16CPUKernel : public LiteKernel {
+class MatmulFP16CPUKernel : public MatmulBaseFP16CPUKernel {
  public:
   explicit MatmulFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  }
-  ~MatmulFP16CPUKernel() override;
+      : MatmulBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~MatmulFP16CPUKernel() override = default;
 
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
 
  private:
-  int MallocMatrixABuffer();
-  int MallocMatrixBBuffer();
-  int InitBias();
-  int MallocFp16Output();
-  void InitMatrixA(float *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixA(float16_t *a_ptr, float16_t *a_pack_ptr);
-  void InitMatrixB(float *b_ptr, float16_t *b_pack_ptr);
-  void InitMatrixB(float16_t *b_ptr, float16_t *b_pack_ptr);
-  void FreeTmpBuffer();
-
- private:
-  MatMulParameter *params_ = nullptr;
-  float16_t *a_pack_ptr_ = nullptr;
-  float16_t *b_pack_ptr_ = nullptr;
-  float16_t *bias_ptr_ = nullptr;
-  float16_t *output_ptr_ = nullptr;
-  float16_t *current_a_ = nullptr;
-  float16_t *current_b_ = nullptr;
-  float16_t *current_c_ = nullptr;
-  int thread_stride_ = 0;
-  int thread_count_ = 0;
+  void InitAShape();
+  void InitBShape();
 };
 }  // namespace mindspore::kernel
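Note: InitAShape/InitBShape in both the fp16 and fp32 kernels fold every dimension before the last two into batch, then pick row/deep (or col/deep) according to the transpose flag. A sketch of the A-side derivation, mirroring the kernels above (helper name hypothetical; the loop condition is written as i + 2 < size() to avoid the unsigned wrap the original form would hit on rank < 2 inputs):

#include <cstdio>
#include <vector>

// Fold leading dims into batch and pick row/deep according to a_transpose.
static void DeriveAShape(const std::vector<int> &a_shape, bool a_transpose,
                         int *batch, int *row, int *deep) {
  *batch = 1;
  for (size_t i = 0; i + 2 < a_shape.size(); ++i) {
    *batch *= a_shape[i];
  }
  *row = a_transpose ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
  *deep = a_transpose ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
}

int main() {
  int batch, row, deep;
  DeriveAShape({2, 4, 8, 16}, false, &batch, &row, &deep);
  printf("batch=%d row=%d deep=%d\n", batch, row, deep);  // prints: batch=8 row=8 deep=16
  return 0;
}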
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
index 0239e53fc9..319948dc34 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.cc
@@ -25,29 +25,37 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
+void MatmulCPUKernel::InitShapeA() {
+  auto a_shape = in_tensors_.at(0)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
+    batch *= a_shape[i];
+  }
+  params_->batch = batch;
+  params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
+  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+}
+
+void MatmulCPUKernel::InitShapeB() {
+  auto b_shape = in_tensors_.at(1)->shape();
+  int batch = 1;
+  for (size_t i = 0; i < b_shape.size() - 2; ++i) {
+    batch *= b_shape[i];
+  }
+  params_->batch = batch;
+  params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
+  params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
+}
+
 int MatmulCPUKernel::Init() {
   MatmulFp32BaseCPUKernel::InitParameter();
 
   if (params_->a_const_ == true) {
-    auto a_shape = in_tensors_.at(0)->shape();
-    int batch = 1;
-    for (size_t i = 0; i < a_shape.size() - 2; ++i) {
-      batch *= a_shape[i];
-    }
-    params_->batch = batch;
-    params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-    params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+    InitShapeA();
   }
   if (params_->b_const_ == true) {
-    auto b_shape = in_tensors_.at(1)->shape();
-    int batch = 1;
-    for (size_t i = 0; i < b_shape.size() - 2; ++i) {
-      batch *= b_shape[i];
-    }
-    params_->batch = batch;
-    params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
-    params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - 1] : b_shape[b_shape.size() - 2];
+    InitShapeB();
   }
 
   auto ret = MatmulFp32BaseCPUKernel::Init();
@@ -62,17 +70,8 @@ int MatmulCPUKernel::Init() {
 }
 
 int MatmulCPUKernel::ReSize() {
-  auto a_shape = in_tensors_.at(0)->shape();
-  auto b_shape = in_tensors_.at(1)->shape();
-  int batch = 1;
-  MS_ASSERT(a_shape.size() >= 2);
-  for (size_t i = 0; i < a_shape.size() - 2; ++i) {
-    batch *= a_shape[i];
-  }
-  params_->batch = batch;
-  params_->row_ = params_->a_transpose_ ? a_shape[a_shape.size() - 1] : a_shape[a_shape.size() - 2];
-  params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - 2] : b_shape[b_shape.size() - 1];
-  params_->deep_ = params_->a_transpose_ ? a_shape[a_shape.size() - 2] : a_shape[a_shape.size() - 1];
+  InitShapeA();
+  InitShapeB();
 
   return MatmulFp32BaseCPUKernel::ReSize();
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h
index 6a9bb0305a..716bef906a 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32.h
@@ -33,6 +33,10 @@ class MatmulCPUKernel : public MatmulFp32BaseCPUKernel {
   int ReSize() override;
   int Run() override;
   int Eval() override;
+
+ private:
+  void InitShapeA();
+  void InitShapeB();
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_
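Note: the rewritten FloatRun below names the two quantities being compared: each task owns a contiguous run of thread_stride_ * col_tile_ output columns, and the final task gets whatever remains, clamped at zero. A sketch of the split (hypothetical helper):

#include <algorithm>
#include <cstdio>

// Columns handled by one task, mirroring the cur_oc computation in FloatRun.
static int ColumnsForTask(int task_id, int col, int thread_stride, int col_tile) {
  int current_stride_oc = thread_stride * col_tile;
  int current_rest_oc = col - task_id * thread_stride * col_tile;
  return std::max(0, std::min(current_stride_oc, current_rest_oc));
}

int main() {
  // col = 20, 8-wide tiles, stride of one tile per task -> 8, 8, 4, 0 columns
  for (int task = 0; task < 4; ++task) {
    printf("task %d: %d cols\n", task, ColumnsForTask(task, 20, 1, 8));
  }
  return 0;
}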
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
index 77bf5b28f1..93869e6cb2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc
@@ -68,8 +68,8 @@ int MatmulFp32BaseCPUKernel::InitBufferA() {
   if (a_pack_ptr_ != nullptr) {
     return RET_OK;
   }
-  a_pack_ptr_ =
-    reinterpret_cast<float *>(malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float)));
+  a_pack_ptr_ = reinterpret_cast<float *>(
+    context_->allocator->Malloc(params_->batch * params_->row_align_ * params_->deep_ * sizeof(float)));
   if (a_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc a_pack_ptr_ failed";
     return RET_ERROR;
@@ -81,8 +81,8 @@ int MatmulFp32BaseCPUKernel::InitBufferB() {
   if (b_pack_ptr_ != nullptr) {
     return RET_OK;
   }
-  b_pack_ptr_ =
-    reinterpret_cast<float *>(malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float)));
+  b_pack_ptr_ = reinterpret_cast<float *>(
    context_->allocator->Malloc(params_->batch * params_->col_align_ * params_->deep_ * sizeof(float)));
   if (b_pack_ptr_ == nullptr) {
     MS_LOG(ERROR) << "malloc b_pack_ptr_ failed";
     return RET_ERROR;
@@ -99,6 +99,7 @@ int MatmulFp32BaseCPUKernel::InitBiasData() {
       MS_LOG(ERROR) << "malloc bias_ptr_ failed";
       return RET_ERROR;
     }
+    memset(bias_ptr_, 0, max_bias_data * sizeof(float));
     memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(float));
   }
   return RET_OK;
@@ -201,7 +202,9 @@ void MatmulFp32BaseCPUKernel::FreeResizeBufB() {
 }
 
 int MatmulFp32BaseCPUKernel::FloatRun(int task_id) {
-  int cur_oc = MSMIN(thread_stride_ * col_tile_, params_->col_ - task_id * thread_stride_ * col_tile_);
+  int current_stride_oc = thread_stride_ * col_tile_;
+  int current_rest_oc = params_->col_ - task_id * thread_stride_ * col_tile_;
+  int cur_oc = MSMIN(current_stride_oc, current_rest_oc);
   if (cur_oc <= 0) {
     return RET_OK;
   }
@@ -254,7 +257,7 @@ int MatmulFp32BaseCPUKernel::ReSize() {
 int MatmulFp32BaseCPUKernel::Run() {
   auto a_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->data_c());
   auto b_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->data_c());
-  c_ptr_ = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
+  auto c_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->data_c());
 
   if (params_->a_const_ == false) {
     if (RET_OK != InitBufferA()) {
@@ -274,11 +277,11 @@ int MatmulFp32BaseCPUKernel::Run() {
     if (vec_matmul_) {
       batch_a_ptr_ = a_pack_ptr_ + i * params_->deep_;
       batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_;
-      batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
     } else {
       batch_a_ptr_ = a_pack_ptr_ + i * params_->row_align_ * params_->deep_;
       batch_b_ptr_ = b_pack_ptr_ + i * params_->deep_ * params_->col_align_;
-      batch_c_ptr_ = c_ptr_ + i * params_->row_ * params_->col_;
+      batch_c_ptr_ = c_ptr + i * params_->row_ * params_->col_;
     }
     auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseFloatRun, this, thread_count_);
     if (ret != RET_OK) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h
index 188863a5e0..71b44d28cf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.h
@@ -62,16 +62,17 @@ class MatmulFp32BaseCPUKernel : public LiteKernel {
   MatMulParameter *params_ = nullptr;
   float *a_pack_ptr_ = nullptr;
   float *b_pack_ptr_ = nullptr;
-  float *c_ptr_ = nullptr;
-  float *bias_ptr_ = nullptr;
-  float *batch_a_ptr_ = nullptr;
-  float *batch_b_ptr_ = nullptr;
-  float *batch_c_ptr_ = nullptr;
+
+ private:
   int col_tile_ = 0;
   int row_tile_ = 0;
   int thread_stride_ = 0;
   int thread_count_ = 0;
   bool vec_matmul_ = false;
+  float *bias_ptr_ = nullptr;
+  float *batch_a_ptr_ = nullptr;
+  float *batch_b_ptr_ = nullptr;
+  float *batch_c_ptr_ = nullptr;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_FP32_BASE_H_
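Note: the memset added to InitBiasData is not cosmetic. bias_ptr_ holds UP_ROUND(col, C8NUM) floats but the memcpy fills only ElementsNum of them, so without the memset the padded tail lanes would be uninitialized and the tiled kernels could fold garbage into the padded output columns. A sketch of the pattern, assuming nnacl's usual UP_ROUND definition:

#include <cstdio>
#include <cstdlib>
#include <cstring>

#define UP_ROUND(x, y) (((x) + (y) - (1)) / (y) * (y))
#define C8NUM 8

int main() {
  const float bias[3] = {0.5f, 1.5f, 2.5f};            // real bias, col = 3
  int max_bias_data = UP_ROUND(3, C8NUM);              // padded to 8
  float *bias_ptr = reinterpret_cast<float *>(malloc(max_bias_data * sizeof(float)));
  if (bias_ptr == nullptr) return 1;
  memset(bias_ptr, 0, max_bias_data * sizeof(float));  // zero the padded lanes first
  memcpy(bias_ptr, bias, 3 * sizeof(float));           // then copy the real values
  for (int i = 0; i < max_bias_data; ++i) printf("%g ", bias_ptr[i]);  // 0.5 1.5 2.5 0 0 0 0 0
  printf("\n");
  free(bias_ptr);
  return 0;
}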