From 3ceeadf5c2d846fbf3e3f6a7e8dee933e18f3966 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Mon, 8 Feb 2021 22:06:18 +0800 Subject: [PATCH] fix bug: adreno softmax fp16 acc --- .../lite/src/runtime/kernel/opencl/cl/softmax.cl | 8 ++++---- .../lite/src/runtime/kernel/opencl/cl/to_format.cl | 4 ++-- .../lite/src/runtime/kernel/opencl/kernel/softmax.cc | 8 +++++++- .../src/runtime/kernel/opencl/kernel/to_format.cc | 8 +++++++- .../lite/src/runtime/kernel/opencl/opencl_subgraph.cc | 11 +++++++++++ 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl index 1dcb8ad38f..0fa9c06a72 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl @@ -44,7 +44,7 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); result = exp(min(result - input_max_f4, 0)) / sum; result = result * mask; - WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result)); + WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result)); } __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, @@ -65,7 +65,7 @@ __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image for (int d = 0; d < H; ++d) { float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d))); result = exp(result) / sum; - WRITE_IMAGE(output, (int2)(X * C4 + Y, d), TO_FLT4(result)); + WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result)); } } @@ -87,7 +87,7 @@ __kernel void SoftMaxAxis2_NHWC4(__read_only image2d_t input, __write_only image for (int d = 0; d < W; ++d) { float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X))); result = exp(result) / sum; - WRITE_IMAGE(output, (int2)(d * C4 + Y, X), TO_FLT4(result)); + WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result)); } } @@ -125,6 +125,6 @@ __kernel void SoftMax1x1_NHWC4(__read_only image2d_t input, __write_only image2d for (size_t i = tid; i < C4; i += 32) { float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); result = exp(result) * sum; - WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result)); + WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result)); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl index 574ffb8fd1..98099f1fe6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl @@ -236,7 +236,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, if (X >= size.x || Y >= size.y || Z >= size.z) { return; } - float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); + float4 data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); int offset = (X * shape.z + Y) * shape.w + Z * 4; __global float *dst_addr = (__global float *)dst_data; dst_addr += offset; @@ -320,7 +320,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, _ if (X >= size.x || Y >= size.y || Z >= size.z) { return; } - half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); + half4 data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); int offset = (X * shape.z + Y) * shape.w + Z * 4; __global half *dst_addr = (__global half *)dst_data; dst_addr += offset; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index f007feeb95..b3d7b8a817 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -87,7 +87,13 @@ int SoftmaxOpenCLKernel::Prepare() { #else std::string program_name = "SoftMax"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector ext_build_opt; + if (out_tensors_[0]->data_type() == kNumberTypeFloat32) { + ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef"); + } else { + ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh"); + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt); #endif SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index a7ef74e216..e0512be8e2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -81,7 +81,13 @@ int ToFormatOpenCLKernel::Prepare() { std::string program_name = "to_format"; std::string source = to_format_source; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector ext_build_opt; + if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { + ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef"); + } else { + ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh"); + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt); #endif auto output = GpuTensorInfo(out_tensors_.front()); diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc index 16805abeb8..778347c265 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc @@ -251,6 +251,17 @@ int OpenCLSubGraph::UpdateTensorDataTypePass() { for (auto iv : nodes_) { MS_ASSERT(iv); auto cur_outs = iv->out_tensors(); + // if softmax is last kernel, output fp32 tensor + if (iv->Type() == schema::PrimitiveType_SoftMax) { + bool last_kernel = true; + for (auto k : iv->out_kernels()) { + if (k->Type() != schema::PrimitiveType_ToFormat) { + last_kernel = false; + break; + } + } + if (last_kernel) continue; + } for (auto jv : cur_outs) { if (out_set.count(jv) == 0) { MS_ASSERT(jv);