!12267 [MS][LITE][GPU]adreno softmax fp16 acc error

From: @chenzupeng Reviewed-by: Signed-off-by:
5 years ago · f9a2b2004f
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
@@ -44,7 +44,7 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image
  float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X)));
  result = exp(min(result - input_max_f4, 0)) / sum;
  result = result * mask;
  WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result));
  WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result));
 }
 __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask,
@@ -65,7 +65,7 @@ __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image
  for (int d = 0; d < H; ++d) {
    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d)));
    result = exp(result) / sum;
    WRITE_IMAGE(output, (int2)(X * C4 + Y, d), TO_FLT4(result));
    WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result));
  }
 }
@@ -87,7 +87,7 @@ __kernel void SoftMaxAxis2_NHWC4(__read_only image2d_t input, __write_only image
  for (int d = 0; d < W; ++d) {
    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X)));
    result = exp(result) / sum;
    WRITE_IMAGE(output, (int2)(d * C4 + Y, X), TO_FLT4(result));
    WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result));
  }
 }
@@ -125,6 +125,6 @@ __kernel void SoftMax1x1_NHWC4(__read_only image2d_t input, __write_only image2d
  for (size_t i = tid; i < C4; i += 32) {
    float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0)));
    result = exp(result) * sum;
    WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result));
    WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result));
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -236,7 +236,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data,
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  float4 data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  int offset = (X * shape.z + Y) * shape.w + Z * 4;
  __global float *dst_addr = (__global float *)dst_data;
  dst_addr += offset;
@@ -320,7 +320,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, _
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  half4 data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  int offset = (X * shape.z + Y) * shape.w + Z * 4;
  __global half *dst_addr = (__global half *)dst_data;
  dst_addr += offset;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -87,7 +87,13 @@ int SoftmaxOpenCLKernel::Prepare() {
 #else
  std::string program_name = "SoftMax";
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
  std::vector<std::string> ext_build_opt;
  if (out_tensors_[0]->data_type() == kNumberTypeFloat32) {
    ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef");
  } else {
    ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh");
  }
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
 #endif
  SetConstArgs();
  SetGlobalLocal();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
@@ -81,7 +81,13 @@ int ToFormatOpenCLKernel::Prepare() {
  std::string program_name = "to_format";
  std::string source = to_format_source;
  ocl_runtime_->LoadSource(program_name, source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
  std::vector<std::string> ext_build_opt;
  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
  } else {
    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
  }
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
 #endif
  auto output = GpuTensorInfo(out_tensors_.front());
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -251,6 +251,17 @@ int OpenCLSubGraph::UpdateTensorDataTypePass() {
    for (auto iv : nodes_) {
      MS_ASSERT(iv);
      auto cur_outs = iv->out_tensors();
      // if softmax is last kernel, output fp32 tensor
      if (iv->Type() == schema::PrimitiveType_SoftMax) {
        bool last_kernel = true;
        for (auto k : iv->out_kernels()) {
          if (k->Type() != schema::PrimitiveType_ToFormat) {
            last_kernel = false;
            break;
          }
        }
        if (last_kernel) continue;
      }
      for (auto jv : cur_outs) {
        if (out_set.count(jv) == 0) {
          MS_ASSERT(jv);