diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl index b67e19383a..22bd59396a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl @@ -13,8 +13,10 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag int rem_w = dst_w % stride.y; int ceil_w = dst_w / stride.y; dst_w = ceil_w * stride.y * 2 + rem_w; - int dst_c = get_global_id(2); - if (dst_h >= dst_size.x || dst_w >= dst_size.y || dst_c >= dst_size.z) return; + int dst_c = get_global_id(2); // n * c4 + int n = dst_c / dst_size.z; + dst_c = dst_c % dst_size.z; + if (dst_h >= dst_size.x || dst_w >= dst_size.y || dst_c >= dst_size.z || n >= dst_size.w) return; int weight_base = dst_c * src_size.z * kernel_size.x * kernel_size.y; FLT4 r0 = (FLT4)(0.f); FLT4 r1 = (FLT4)(0.f); @@ -40,10 +42,18 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag int kernel_w = kw_start - kw_copy; int weight_offset = weight_base + (kernel_h * kernel_size.y + kernel_w) * src_size.z; for (int ci = 0; ci < src_size.z; ++ci) { - FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, out0_src_h)); - FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, out1_src_h)); - FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, out0_src_h)); - FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, out1_src_h)); + FLT4 x0 = (FLT4)0.f; + FLT4 x2 = (FLT4)0.f; + if (out0_src_h < src_size.x) { + x0 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out0_src_h)); + x2 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out0_src_h)); + } + FLT4 x1 = (FLT4)0.f; + FLT4 x3 = (FLT4)0.f; + if (out1_src_h < src_size.x) { + x1 = READ_IMAGE(src_data, smp_zero, (int2)(out0_src_w * src_size.z + ci, n * src_size.x + out1_src_h)); + x3 = READ_IMAGE(src_data, smp_zero, (int2)(out1_src_w * src_size.z + ci, n * src_size.x + out1_src_h)); + } FLT16 weight_cache = weight[weight_offset++]; r0 += x0.x * weight_cache.s0123; r0 += x0.y * weight_cache.s4567; @@ -85,14 +95,14 @@ __kernel void conv2d_transpose(__read_only image2d_t src_data, __write_only imag r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f)); } - WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0); + WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, n * dst_size.x + dst_h), r0); if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) { - WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1); + WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, n * dst_size.x + dst_h + stride.x), r1); } if (dst_h < dst_size.x && dst_w + stride.y < dst_size.y) { - WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, dst_h), r2); + WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, n * dst_size.x + dst_h), r2); } if (dst_h + stride.x < dst_size.x && dst_w + stride.y < dst_size.y) { - WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, dst_h + stride.x), r3); + WRITE_IMAGE(dst_data, (int2)((dst_w + stride.y) * dst_size.z + dst_c, n * dst_size.x + dst_h + stride.x), r3); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index bfbd10e10e..6d367d744a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -83,10 +83,12 @@ void Conv2dTransposeOpenCLKernel::SetGlobalLocal() { int co4 = UP_DIV(co, C4NUM); int stride_h = param->stride_h_; int stride_w = param->stride_w_; + int n = out_tensors_[0]->shape()[0]; int oh = out_tensors_[0]->shape()[1]; int ow = out_tensors_[0]->shape()[2]; local_size_ = {16, 1, 16}; - global_size_ = {(size_t)UP_ROUND(UP_DIV(oh, 2), stride_h), (size_t)UP_ROUND(UP_DIV(ow, 2), stride_w), (size_t)co4}; + global_size_ = {(size_t)UP_ROUND(UP_DIV(oh, 2), stride_h), (size_t)UP_ROUND(UP_DIV(ow, 2), stride_w), + (size_t)co4 * (size_t)n}; AlignGlobalLocal(global_size_, local_size_); } @@ -103,13 +105,14 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() { int stride_w = param->stride_w_; int oh = out_tensors_[0]->shape()[1]; int ow = out_tensors_[0]->shape()[2]; + int n = in_tensors_[0]->shape()[0]; int h = in_tensors_[0]->shape()[1]; int w = in_tensors_[0]->shape()[2]; cl_int2 kernel_size = {kh, kw}; cl_int2 stride = {stride_h, stride_w}; cl_int2 padding = {pad_h, pad_w}; - cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1}; - cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1}; + cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), n}; + cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), n}; ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size); diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc index cc5c5a14c4..f3ae95e9d3 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc @@ -38,9 +38,10 @@ OpParameter *CreateParameter(int n, int h, int w, int ci, int co, int kh, int kw param->dilation_h_ = 1; param->dilation_w_ = 1; param->act_type_ = ActType_No; + param->group_ = 1; *input_shape = {n, h, w, ci}; - *weight_shape = {co, kh, kw, ci}; + *weight_shape = {ci, kh, kw, co}; *bias_shape = {co}; *output_shape = {1, oh, ow, co}; return reinterpret_cast(param); @@ -59,9 +60,9 @@ TEST_F(TestOpenCL_Conv2dTranspose, test0) { int kw = 2; std::vector pad = {0, 0, 0, 0}; float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7}; - float weight_data[] = {1, 2, 3, 4, 5, 6, 7, 8}; + float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7}; float bias_data[] = {0.5}; - float output_data[] = {5.5, 6.5, 17.5, 22.5, 7.5, 8.5, 27.5, 32.5, 29.5, 38.5, 41.5, 54.5, 47.5, 56.5, 67.5, 80.5}; + float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5}; for (auto fp16_enable : {false, true}) { std::vector input_shape, weight_shape, bias_shape, output_shape; @@ -78,19 +79,18 @@ TEST_F(TestOpenCL_Conv2dTranspose, test1) { int n = 1; int h = 3; int w = 3; - int oh = 6; - int ow = 6; + int oh = 5; + int ow = 5; int ci = 2; int co = 1; int kh = 2; int kw = 2; - std::vector pad = {0, 1, 0, 1}; + std::vector pad = {0, 0, 0, 0}; float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7}; float bias_data[] = {0.5}; - float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 23.5, 5.5, 7.5, 23.5, 33.5, 41.5, 59.5, - 7.5, 33.5, 9.5, 43.5, 11.5, 53.5, 59.5, 85.5, 77.5, 111.5, 95.5, 137.5, - 13.5, 63.5, 15.5, 73.5, 17.5, 83.5, 113.5, 163.5, 131.5, 189.5, 149.5, 215.5}; + float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5, + 43.5, 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5}; for (auto fp16_enable : {false, true}) { std::vector input_shape, weight_shape, bias_shape, output_shape; @@ -130,4 +130,93 @@ TEST_F(TestOpenCL_Conv2dTranspose, test2) { {output_shape, output_data}, param, fp16_enable); } } + +TEST_F(TestOpenCL_Conv2dTranspose, test0MultiBatch) { + int n = 2; + int h = 2; + int w = 2; + int oh = 4; + int ow = 4; + int ci = 2; + int co = 1; + int kh = 2; + int kw = 2; + std::vector pad = {0, 0, 0, 0}; + float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7}; + float bias_data[] = {0.5}; + float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5, + 1.5, 3.5, 3.5, 13.5, 5.5, 7.5, 23.5, 33.5, 5.5, 23.5, 7.5, 33.5, 41.5, 59.5, 59.5, 85.5}; + + for (auto fp16_enable : {false, true}) { + std::vector input_shape, weight_shape, bias_shape, output_shape; + auto *param = + CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape); + TestMain({{input_shape, input_data, VAR}, + {weight_shape, weight_data, CONST_TENSOR}, + {bias_shape, bias_data, CONST_TENSOR}}, + {output_shape, output_data}, param, fp16_enable); + } +} + +TEST_F(TestOpenCL_Conv2dTranspose, test1MultiBatch) { + int n = 2; + int h = 3; + int w = 3; + int oh = 5; + int ow = 5; + int ci = 2; + int co = 1; + int kh = 2; + int kw = 2; + std::vector pad = {0, 0, 0, 0}; + float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; + float weight_data[] = {0, 2, 4, 6, 1, 3, 5, 7}; + float bias_data[] = {0.5}; + float output_data[] = {1.5, 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5, + 43.5, 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5, 1.5, + 3.5, 3.5, 13.5, 5.5, 5.5, 7.5, 23.5, 33.5, 41.5, 7.5, 33.5, 9.5, 43.5, + 11.5, 59.5, 85.5, 77.5, 111.5, 95.5, 13.5, 63.5, 15.5, 73.5, 17.5}; + + for (auto fp16_enable : {false, true}) { + std::vector input_shape, weight_shape, bias_shape, output_shape; + auto *param = + CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape); + TestMain({{input_shape, input_data, VAR}, + {weight_shape, weight_data, CONST_TENSOR}, + {bias_shape, bias_data, CONST_TENSOR}}, + {output_shape, output_data}, param, fp16_enable); + } +} +TEST_F(TestOpenCL_Conv2dTranspose, test2MultiBatch) { + int n = 2; + int h = 2; + int w = 2; + int oh = 5; + int ow = 5; + int ci = 2; + int co = 1; + int kh = 3; + int kw = 3; + std::vector pad = {0, 0, 0, 0}; + float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + float weight_data[] = {0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, + 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0}; + float bias_data[] = {0.5}; + float output_data[] = {1.5, 3.5, 8.5, 13.5, 23.5, 7.5, 9.5, 44.5, 43.5, 53.5, 18.5, 38.5, 128.5, + 106.5, 142.5, 59.5, 77.5, 180.5, 111.5, 137.5, 113.5, 131.5, 312.5, 189.5, 215.5, 1.5, + 3.5, 8.5, 13.5, 23.5, 7.5, 9.5, 44.5, 43.5, 53.5, 18.5, 38.5, 128.5, 106.5, + 142.5, 59.5, 77.5, 180.5, 111.5, 137.5, 113.5, 131.5, 312.5, 189.5, 215.5}; + + for (auto fp16_enable : {false, true}) { + std::vector input_shape, weight_shape, bias_shape, output_shape; + auto *param = + CreateParameter(n, h, w, ci, co, kh, kw, pad, oh, ow, &input_shape, &weight_shape, &bias_shape, &output_shape); + TestMain({{input_shape, input_data, VAR}, + {weight_shape, weight_data, CONST_TENSOR}, + {bias_shape, bias_data, CONST_TENSOR}}, + {output_shape, output_data}, param, fp16_enable); + } +} } // namespace mindspore::lite::opencl::test