Merge pull request !4727 from chenzupeng/master-litetags/v0.7.0-beta
| @@ -48,3 +48,57 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t | |||||
| slices_count++; | slices_count++; | ||||
| } while (slices_count < slices_x32); | } while (slices_count < slices_x32); | ||||
| } | } | ||||
| __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *output, const float4 mask, const int slices, | |||||
| const int slices_x32) { | |||||
| int tid = get_local_id(0); | |||||
| float sum = 0.0f; | |||||
| for (size_t i = tid; i < slices - 1; i += 32) { | |||||
| float4 src = read_imagef(input, smp_none, (int2)(i, 0)); | |||||
| sum += dot((float4)(1.0f), exp(src)); | |||||
| } | |||||
| if ((slices - 1) % 32 == tid) { | |||||
| float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0)); | |||||
| sum += dot(mask, exp(src)); | |||||
| } | |||||
| __local float4 tmp[8]; | |||||
| __local float *tmpx1 = (__local float *)tmp; | |||||
| tmpx1[tid] = sum; | |||||
| barrier(CLK_LOCAL_MEM_FENCE); | |||||
| if (tid == 0) { | |||||
| sum = dot((float4)(1.0f), tmp[0]); | |||||
| sum += dot((float4)(1.0f), tmp[1]); | |||||
| sum += dot((float4)(1.0f), tmp[2]); | |||||
| sum += dot((float4)(1.0f), tmp[3]); | |||||
| sum += dot((float4)(1.0f), tmp[4]); | |||||
| sum += dot((float4)(1.0f), tmp[5]); | |||||
| sum += dot((float4)(1.0f), tmp[6]); | |||||
| sum += dot((float4)(1.0f), tmp[7]); | |||||
| tmpx1[0] = 1.0f / sum; | |||||
| } | |||||
| barrier(CLK_LOCAL_MEM_FENCE); | |||||
| sum = tmpx1[0]; | |||||
| for (size_t i = tid; i < slices - 1; i += 32) { | |||||
| float4 result = read_imagef(input, smp_none, (int2)(i, 0)); | |||||
| result = exp(result) * sum; | |||||
| output[i] = result; | |||||
| } | |||||
| if ((slices - 1) % 32 == tid) { | |||||
| float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0)); | |||||
| result = exp(result) * sum; | |||||
| __global float4 *remain_ptr4 = output; | |||||
| remain_ptr4 += slices - 1; | |||||
| __global float *remain_ptr = (__global float *)remain_ptr4; | |||||
| remain_ptr[0] = result.x; | |||||
| if (mask.y > 0.f) { | |||||
| remain_ptr[1] = result.y; | |||||
| } | |||||
| if (mask.z > 0.f) { | |||||
| remain_ptr[2] = result.z; | |||||
| } | |||||
| if (mask.w > 0.f) { | |||||
| remain_ptr[3] = result.w; | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -62,7 +62,7 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) | |||||
| int h = shapex[1]; | int h = shapex[1]; | ||||
| int w = shapex[2]; | int w = shapex[2]; | ||||
| int c = shapex[3]; | int c = shapex[3]; | ||||
| im_dst_x = UP_DIV(w * c, C4NUM); | |||||
| im_dst_x = w * UP_DIV(c, C4NUM); | |||||
| im_dst_y = h; | im_dst_y = h; | ||||
| #ifdef ENABLE_FP16 | #ifdef ENABLE_FP16 | ||||
| size_t img_dtype = CL_HALF_FLOAT; | size_t img_dtype = CL_HALF_FLOAT; | ||||
| @@ -90,7 +90,8 @@ int SoftmaxOpenCLKernel::Init() { | |||||
| std::string program_name = "SoftMax"; | std::string program_name = "SoftMax"; | ||||
| std::string source = softmax_source_fp32; | std::string source = softmax_source_fp32; | ||||
| runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); | runtime_ = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| // framework not set this param yet! just use default. | |||||
| parameter_->axis_ = 1; | |||||
| if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) { | if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) { | ||||
| // support 4d tensor | // support 4d tensor | ||||
| onexone_flag_ = false; | onexone_flag_ = false; | ||||
| @@ -106,7 +107,10 @@ int SoftmaxOpenCLKernel::Init() { | |||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| runtime_->CreateKernelFromIL(kernel_(), kernel_name); | runtime_->CreateKernelFromIL(kernel_(), kernel_name); | ||||
| #else | #else | ||||
| if (mem_type_ == MEM_TYPE::BUF) { | |||||
| if (!is_image_out_) { | |||||
| out_mem_type_ = OpenCLMemType::BUF; | |||||
| } | |||||
| if (out_mem_type_ == OpenCLMemType::BUF) { | |||||
| kernel_name += "_BUF"; | kernel_name += "_BUF"; | ||||
| program_name += "_BUF"; | program_name += "_BUF"; | ||||
| } else { | } else { | ||||
| @@ -119,6 +123,10 @@ int SoftmaxOpenCLKernel::Init() { | |||||
| #endif | #endif | ||||
| ori_format_ = out_tensors_[0]->GetFormat(); | ori_format_ = out_tensors_[0]->GetFormat(); | ||||
| out_tensors_[0]->SetFormat(schema::Format_NHWC4); | out_tensors_[0]->SetFormat(schema::Format_NHWC4); | ||||
| if (!is_image_out_) { | |||||
| ori_format_ = schema::Format_NC; | |||||
| out_tensors_[0]->SetFormat(schema::Format_NC); | |||||
| } | |||||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | MS_LOG(DEBUG) << kernel_name << " Init Done!"; | ||||
| return lite::RET_OK; | return lite::RET_OK; | ||||
| } | } | ||||
| @@ -41,17 +41,17 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||||
| int InitGlobalSize(); | int InitGlobalSize(); | ||||
| int SetWorkGroupSize1x1(); | int SetWorkGroupSize1x1(); | ||||
| int SetWorkGroupSize(); | int SetWorkGroupSize(); | ||||
| std::vector<float> GetMaskForLastChannel(int channels); | |||||
| std::vector<float> GetMaskForLastChannel(int channels); | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| SoftmaxParameter *parameter_; | SoftmaxParameter *parameter_; | ||||
| lite::opencl::OpenCLRuntime *runtime_; | lite::opencl::OpenCLRuntime *runtime_; | ||||
| enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG}; | |||||
| bool onexone_flag_{false}; | bool onexone_flag_{false}; | ||||
| std::vector<size_t> local_size_; | std::vector<size_t> local_size_; | ||||
| std::vector<size_t> global_size_; | std::vector<size_t> global_size_; | ||||
| bool is_image_out_{false}; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||