From: @chenzupeng Reviewed-by: Signed-off-by:tags/v1.2.0-rc1
| @@ -44,7 +44,7 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image | |||||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); | float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); | ||||
| result = exp(min(result - input_max_f4, 0)) / sum; | result = exp(min(result - input_max_f4, 0)) / sum; | ||||
| result = result * mask; | result = result * mask; | ||||
| WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result)); | |||||
| WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result)); | |||||
| } | } | ||||
| __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, | __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, | ||||
| @@ -65,7 +65,7 @@ __kernel void SoftMaxAxis1_NHWC4(__read_only image2d_t input, __write_only image | |||||
| for (int d = 0; d < H; ++d) { | for (int d = 0; d < H; ++d) { | ||||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d))); | float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d))); | ||||
| result = exp(result) / sum; | result = exp(result) / sum; | ||||
| WRITE_IMAGE(output, (int2)(X * C4 + Y, d), TO_FLT4(result)); | |||||
| WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result)); | |||||
| } | } | ||||
| } | } | ||||
| @@ -87,7 +87,7 @@ __kernel void SoftMaxAxis2_NHWC4(__read_only image2d_t input, __write_only image | |||||
| for (int d = 0; d < W; ++d) { | for (int d = 0; d < W; ++d) { | ||||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X))); | float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X))); | ||||
| result = exp(result) / sum; | result = exp(result) / sum; | ||||
| WRITE_IMAGE(output, (int2)(d * C4 + Y, X), TO_FLT4(result)); | |||||
| WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result)); | |||||
| } | } | ||||
| } | } | ||||
| @@ -125,6 +125,6 @@ __kernel void SoftMax1x1_NHWC4(__read_only image2d_t input, __write_only image2d | |||||
| for (size_t i = tid; i < C4; i += 32) { | for (size_t i = tid; i < C4; i += 32) { | ||||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); | float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); | ||||
| result = exp(result) * sum; | result = exp(result) * sum; | ||||
| WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result)); | |||||
| WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result)); | |||||
| } | } | ||||
| } | } | ||||
| @@ -236,7 +236,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | if (X >= size.x || Y >= size.y || Z >= size.z) { | ||||
| return; | return; | ||||
| } | } | ||||
| float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| float4 data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| int offset = (X * shape.z + Y) * shape.w + Z * 4; | int offset = (X * shape.z + Y) * shape.w + Z * 4; | ||||
| __global float *dst_addr = (__global float *)dst_data; | __global float *dst_addr = (__global float *)dst_data; | ||||
| dst_addr += offset; | dst_addr += offset; | ||||
| @@ -320,7 +320,7 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, _ | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | if (X >= size.x || Y >= size.y || Z >= size.z) { | ||||
| return; | return; | ||||
| } | } | ||||
| half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| half4 data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| int offset = (X * shape.z + Y) * shape.w + Z * 4; | int offset = (X * shape.z + Y) * shape.w + Z * 4; | ||||
| __global half *dst_addr = (__global half *)dst_data; | __global half *dst_addr = (__global half *)dst_data; | ||||
| dst_addr += offset; | dst_addr += offset; | ||||
| @@ -87,7 +87,13 @@ int SoftmaxOpenCLKernel::Prepare() { | |||||
| #else | #else | ||||
| std::string program_name = "SoftMax"; | std::string program_name = "SoftMax"; | ||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||||
| std::vector<std::string> ext_build_opt; | |||||
| if (out_tensors_[0]->data_type() == kNumberTypeFloat32) { | |||||
| ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef"); | |||||
| } else { | |||||
| ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh"); | |||||
| } | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt); | |||||
| #endif | #endif | ||||
| SetConstArgs(); | SetConstArgs(); | ||||
| SetGlobalLocal(); | SetGlobalLocal(); | ||||
| @@ -81,7 +81,13 @@ int ToFormatOpenCLKernel::Prepare() { | |||||
| std::string program_name = "to_format"; | std::string program_name = "to_format"; | ||||
| std::string source = to_format_source; | std::string source = to_format_source; | ||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||||
| std::vector<std::string> ext_build_opt; | |||||
| if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { | |||||
| ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef"); | |||||
| } else { | |||||
| ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh"); | |||||
| } | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt); | |||||
| #endif | #endif | ||||
| auto output = GpuTensorInfo(out_tensors_.front()); | auto output = GpuTensorInfo(out_tensors_.front()); | ||||
| @@ -251,6 +251,17 @@ int OpenCLSubGraph::UpdateTensorDataTypePass() { | |||||
| for (auto iv : nodes_) { | for (auto iv : nodes_) { | ||||
| MS_ASSERT(iv); | MS_ASSERT(iv); | ||||
| auto cur_outs = iv->out_tensors(); | auto cur_outs = iv->out_tensors(); | ||||
| // if softmax is last kernel, output fp32 tensor | |||||
| if (iv->Type() == schema::PrimitiveType_SoftMax) { | |||||
| bool last_kernel = true; | |||||
| for (auto k : iv->out_kernels()) { | |||||
| if (k->Type() != schema::PrimitiveType_ToFormat) { | |||||
| last_kernel = false; | |||||
| break; | |||||
| } | |||||
| } | |||||
| if (last_kernel) continue; | |||||
| } | |||||
| for (auto jv : cur_outs) { | for (auto jv : cur_outs) { | ||||
| if (out_set.count(jv) == 0) { | if (out_set.count(jv) == 0) { | ||||
| MS_ASSERT(jv); | MS_ASSERT(jv); | ||||