diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index 4b913ab3f2..ab236d4483 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -51,6 +51,14 @@ int DepthwiseConv2dOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
     return RET_ERROR;
   }
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "DepthwiseConv2d don't support non-constant weight yet.";
+    return RET_ERROR;
+  }
+  if (in_tensors_.size() == 3 && !in_tensors_.at(kBiasIndex)->IsConst()) {
+    MS_LOG(ERROR) << "DepthwiseConv2d don't support non-constant bias yet.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 int DepthwiseConv2dOpenCLKernel::Prepare() {
@@ -62,13 +70,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
   }
   kernel_name += "_NHWC4";
   auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
-  if (parameter->kernel_h_ == 1) {
+  if (parameter->kernel_h_ == 1 && parameter->kernel_w_ == 1) {
     kernel_name += "_1x1";
   }
-  kernel_name += "_b";
-  for (auto iv : block_size_) {
-    kernel_name += std::to_string(iv);
-  }
+  kernel_name += "_b" + std::to_string(block_size_.H) + std::to_string(block_size_.W) + std::to_string(block_size_.C);
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@@ -100,9 +105,10 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   auto allocator = ocl_runtime_->GetAllocator();
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
 
+  auto out_info = GpuTensorInfo(out_tensors_[0]);
   // weight: o, h, w, i; o == group, i == 1
   void *origin_weight = in_tensors_.at(kWeightIndex)->data_c();
-  int CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  int CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
   int pack_weight_size = C4NUM * CO4 * parameter->kernel_h_ * parameter->kernel_w_;
 
   int plane = parameter->kernel_h_ * parameter->kernel_w_;
@@ -111,13 +117,13 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
     packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
     if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) {
       std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
-      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype);
     } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
       std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
-      PackNCHWToNC4HW4<float, float16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+      PackNCHWToNC4HW4<float, float16_t>(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype);
     } else {  // int8 or int16
       std::function<int16_t(int16_t)> to_dtype = [](int16_t x) -> int16_t { return x; };
-      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+      PackNCHWToNC4HW4<int16_t, int16_t>(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype);
       FreeDequantedWeight();
     }
   } else {
@@ -125,51 +131,53 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
     packed_weight_ = allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
     if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat32) {
       std::function<float(float)> to_dtype = [](float x) -> float { return x; };
-      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype);
     } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeFloat16) {
       std::function<float(float16_t)> to_dtype = [](float16_t x) -> float { return static_cast<float>(x); };
-      PackNCHWToNC4HW4<float16_t, float>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+      PackNCHWToNC4HW4<float16_t, float>(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype);
     } else {  // int8 or int16
       std::function<float(float)> to_dtype = [](float x) -> float { return x; };
-      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane, out_tensors_[0]->Channel(), to_dtype);
+      PackNCHWToNC4HW4<float, float>(origin_weight, packed_weight_, 1, plane, out_info.C, to_dtype);
       FreeDequantedWeight();
     }
   }
-
   allocator->UnmapBuffer(packed_weight_);
 
+  size_t dtype_size = sizeof(float);
+  if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
+    dtype_size = sizeof(int16_t);
+  }
+  bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size);
+  bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true);
+  size_t up_co_size = C4NUM * CO4 * dtype_size;
+  memset(bias_data_, 0, up_co_size);
   if (in_tensors_.size() == kInputSize2) {
-    if (!in_tensors_.at(2)->IsConst()) {
-      MS_LOG(ERROR) << "DepthwiseConv2d don't support non-constant bias yet.";
-      return RET_ERROR;
-    }
-    size_t dtype_size = sizeof(float);
-    if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
-      dtype_size = sizeof(int16_t);
-    }
-    bias_data_ = allocator->Malloc(C4NUM * CO4 * dtype_size);
-    bias_data_ = allocator->MapBuffer(bias_data_, CL_MAP_WRITE, nullptr, true);
-    size_t up_co_size = C4NUM * CO4 * dtype_size;
-    memset(bias_data_, 0, up_co_size);
     auto ori_bias = in_tensors_.at(kBiasIndex)->data_c();
     if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat32) {
       float16_t *bias_ptr = static_cast<float16_t *>(bias_data_);
       for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) {
         bias_ptr[i] = static_cast<float16_t>(static_cast<float *>(ori_bias)[i]);
       }
+    } else if (!is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
+      float32_t *bias_ptr = static_cast<float32_t *>(bias_data_);
+      for (size_t i = 0; i < in_tensors_.at(kBiasIndex)->ElementsNum(); ++i) {
+        bias_ptr[i] = static_cast<float32_t>(static_cast<float16_t *>(ori_bias)[i]);
+      }
     } else {
-      memcpy(bias_data_, ori_bias, out_tensors_[0]->Channel() * dtype_size);
+      memcpy(bias_data_, ori_bias, out_info.C * dtype_size);
     }
-    allocator->UnmapBuffer(bias_data_);
   } else {
     MS_ASSERT(in_tensors_.size() == kInputSize1);
   }
+  allocator->UnmapBuffer(bias_data_);
   return mindspore::lite::RET_OK;
 }
 void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
   auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
+  auto in_info = GpuTensorInfo(in_tensors_[0]);
+  auto out_info = GpuTensorInfo(out_tensors_[0]);
+  size_t CO4 = UP_DIV(out_info.C, C4NUM);
+  size_t CI4 = UP_DIV(in_info.C, C4NUM);
 
   std::map<ActType, std::pair<float, float>> relu_clips{
     {ActType_No, {-FLT_MAX, FLT_MAX}}, {ActType_Relu, {0.0, FLT_MAX}}, {ActType_Relu6, {0, 6.0}}};
@@ -177,9 +185,8 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
   cl_int2 stride = {parameter->stride_h_, parameter->stride_w_};
   cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_};
   cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_};
-  cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
-  cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
-                      (cl_int)out_tensors_[0]->Batch()};
+  cl_int4 src_size = {(cl_int)in_info.W, (cl_int)in_info.H, (cl_int)CI4, (cl_int)in_info.N};
+  cl_int4 dst_size = {(cl_int)out_info.W, (cl_int)out_info.H, (cl_int)CO4, (cl_int)out_info.N};
 
   int arg_cnt = 2;
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF);
@@ -194,10 +201,11 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second);
 }
 void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() {
+  auto out_info = GpuTensorInfo(out_tensors_[0]);
   // set global
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]);
-  global_size_ = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]),
-                  (size_t)UP_DIV(out_tensors_[0]->Height() * out_tensors_[0]->Batch(), block_size_[0])};
+  size_t CO4 = UP_DIV(out_info.C, C4NUM * block_size_.C);
+  global_size_ = {CO4, (size_t)UP_DIV(out_info.W, block_size_.W),
+                  (size_t)UP_DIV(out_info.H * out_info.N, block_size_.H)};
   // set local
   const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
   int z = global_size_[0];
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
index 17bafbd094..fbaccaf535 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -42,7 +42,11 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
  private:
   void *packed_weight_{nullptr};
   void *bias_data_{nullptr};
-  std::vector<int> block_size_{2, 2, 1};
+  struct {
+    int H{2};
+    int W{2};
+    int C{1};
+  } block_size_;
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc
index f18f1e2b16..176cba173c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc
@@ -203,9 +203,9 @@ std::set<size_t> OpenCLKernel::GenerateLocalByGlobal(size_t global_i) {
 int OpenCLKernel::DequantWeight() {
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
   auto *weight_tensor = in_tensors_.at(kWeightIndex);
-  auto *restore_data = weight_tensor->data_c();
-  dequant_flag_ =
-    !weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
+  restore_quant_data_ = weight_tensor->data_c();
+  dequant_flag_ = !weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited &&
+                  restore_quant_data_ != nullptr;
   if (dequant_flag_) {
     void *dequant_weight{nullptr};
     bool set_flag{true};
@@ -242,6 +242,7 @@ void OpenCLKernel::FreeDequantedWeight() {
   auto *weight_tensor = in_tensors_.at(kWeightIndex);
   if (dequant_flag_) {
     free(weight_tensor->data_c());
+    weight_tensor->set_data(restore_quant_data_);
   }
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index ee9d927676..b822616c88 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -209,6 +209,7 @@ class OpenCLKernel : public LiteKernel {
   std::vector<size_t> local_size_;
   cl::Kernel kernel_;
   cl::Event event_;
+  void *restore_quant_data_{nullptr};
   bool dequant_flag_{false};
 
  private: