diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl index f74197e7b6..672d56c08c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl @@ -48,3 +48,57 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t slices_count++; } while (slices_count < slices_x32); } + +__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *output, const float4 mask, const int slices, + const int slices_x32) { + int tid = get_local_id(0); + float sum = 0.0f; + for (size_t i = tid; i < slices - 1; i += 32) { + float4 src = read_imagef(input, smp_none, (int2)(i, 0)); + sum += dot((float4)(1.0f), exp(src)); + } + if ((slices - 1) % 32 == tid) { + float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0)); + sum += dot(mask, exp(src)); + } + + __local float4 tmp[8]; + __local float *tmpx1 = (__local float *)tmp; + tmpx1[tid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (tid == 0) { + sum = dot((float4)(1.0f), tmp[0]); + sum += dot((float4)(1.0f), tmp[1]); + sum += dot((float4)(1.0f), tmp[2]); + sum += dot((float4)(1.0f), tmp[3]); + sum += dot((float4)(1.0f), tmp[4]); + sum += dot((float4)(1.0f), tmp[5]); + sum += dot((float4)(1.0f), tmp[6]); + sum += dot((float4)(1.0f), tmp[7]); + tmpx1[0] = 1.0f / sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + sum = tmpx1[0]; + for (size_t i = tid; i < slices - 1; i += 32) { + float4 result = read_imagef(input, smp_none, (int2)(i, 0)); + result = exp(result) * sum; + output[i] = result; + } + if ((slices - 1) % 32 == tid) { + float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0)); + result = exp(result) * sum; + __global float4 *remain_ptr4 = output; + remain_ptr4 += slices - 1; + __global float *remain_ptr = (__global float *)remain_ptr4; + remain_ptr[0] = result.x; + if (mask.y > 0.f) { + remain_ptr[1] = result.y; + } + if (mask.z > 0.f) { + remain_ptr[2] = result.z; + } + if (mask.w > 0.f) { + remain_ptr[3] = result.w; + } + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index ed93a2a52f..bf4aa835b3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -62,7 +62,7 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) int h = shapex[1]; int w = shapex[2]; int c = shapex[3]; - im_dst_x = UP_DIV(w * c, C4NUM); + im_dst_x = w * UP_DIV(c, C4NUM); im_dst_y = h; #ifdef ENABLE_FP16 size_t img_dtype = CL_HALF_FLOAT; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 6345a08e66..c0fb0b13e9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -90,7 +90,8 @@ int SoftmaxOpenCLKernel::Init() { std::string program_name = "SoftMax"; std::string source = softmax_source_fp32; runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); - + // framework not set this param yet! just use default. + parameter_->axis_ = 1; if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) { // support 4d tensor onexone_flag_ = false; @@ -106,7 +107,10 @@ int SoftmaxOpenCLKernel::Init() { #ifdef PROGRAM_WITH_IL runtime_->CreateKernelFromIL(kernel_(), kernel_name); #else - if (mem_type_ == MEM_TYPE::BUF) { + if (!is_image_out_) { + out_mem_type_ = OpenCLMemType::BUF; + } + if (out_mem_type_ == OpenCLMemType::BUF) { kernel_name += "_BUF"; program_name += "_BUF"; } else { @@ -119,6 +123,10 @@ int SoftmaxOpenCLKernel::Init() { #endif ori_format_ = out_tensors_[0]->GetFormat(); out_tensors_[0]->SetFormat(schema::Format_NHWC4); + if (!is_image_out_) { + ori_format_ = schema::Format_NC; + out_tensors_[0]->SetFormat(schema::Format_NC); + } MS_LOG(DEBUG) << kernel_name << " Init Done!"; return lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h index 7497be4e8f..565aaf988d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h @@ -41,17 +41,17 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { int InitGlobalSize(); int SetWorkGroupSize1x1(); int SetWorkGroupSize(); - std::vector GetMaskForLastChannel(int channels); + std::vector GetMaskForLastChannel(int channels); private: cl::Kernel kernel_; SoftmaxParameter *parameter_; lite::opencl::OpenCLRuntime *runtime_; - enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG}; bool onexone_flag_{false}; std::vector local_size_; std::vector global_size_; + bool is_image_out_{false}; }; } // namespace mindspore::kernel