diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl index 6ae366a8bd..1dcb8ad38f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl @@ -14,20 +14,35 @@ __kernel void SoftMaxAxis3_NHWC4(__read_only image2d_t input, __write_only image if (X >= H || Y >= W) return; + // get max + float4 last = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); + float input_max = last.x; + if (mask.y > 0.5f) input_max = max(input_max, last.y); + if (mask.z > 0.5f) input_max = max(input_max, last.z); + if (mask.w > 0.5f) input_max = max(input_max, last.w); + for (int d = 0; d < C4 - 1; ++d) { + float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X))); + input_max = max(input_max, t.x); + input_max = max(input_max, t.y); + input_max = max(input_max, t.z); + input_max = max(input_max, t.w); + } + float4 input_max_f4 = (float4)(input_max, input_max, input_max, input_max); + float sum = 0.0f; for (int d = 0; d < C4 - 1; ++d) { float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X))); - sum += dot(exp(t), (float4)(1.f)); + sum += dot(exp(t - input_max_f4), (float4)(1.f)); } float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); - sum += dot(exp(t), mask); + sum += dot(exp(min(t - input_max_f4, 0)), mask); for (int d = 0; d < C4 - 1; ++d) { float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X))); - result = exp(result) / sum; + result = exp(result - input_max_f4) / sum; WRITE_IMAGE(output, (int2)(Y * C4 + d, X), TO_FLT4(result)); } float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); - result = exp(result) / sum; + result = exp(min(result - input_max_f4, 0)) / sum; result = result * mask; WRITE_IMAGE(output, (int2)(Y * C4 + C4 - 1, X), TO_FLT4(result)); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc index 972ec3096f..1a500babbf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc @@ -16,14 +16,14 @@ * limitations under the License. */ +#include "src/runtime/kernel/opencl/kernel/prelu.h" +#include #include #include - +#include "src/runtime/kernel/opencl/cl/prelu.cl.inc" #include "src/kernel_registry.h" #include "include/errorcode.h" #include "nnacl/fp32/common_func_fp32.h" -#include "src/runtime/kernel/opencl/kernel/prelu.h" -#include "src/runtime/kernel/opencl/cl/prelu.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -36,7 +36,6 @@ namespace mindspore::kernel { int PReluOpenCLKernel::InitWeights() { auto allocator = ocl_runtime_->GetAllocator(); auto weight_tensor = in_tensors_.at(1); - int C_ = weight_shape_.s[3]; if (weight_is_scalar) { if (weight_tensor->data_type() == kNumberTypeFloat16) { weight_scalar_ = static_cast(*reinterpret_cast(weight_tensor->data_c())); @@ -44,6 +43,7 @@ int PReluOpenCLKernel::InitWeights() { weight_scalar_ = *reinterpret_cast(weight_tensor->data_c()); } } else { + int C_ = weight_tensor->ElementsNum(); auto sizeof_FLT = enable_fp16_ ? sizeof(float16_t) : sizeof(float); size_t weight_size = UP_ROUND(C_, C4NUM) * sizeof_FLT; weight_vector_ = allocator->Malloc(weight_size); @@ -123,7 +123,8 @@ int PReluOpenCLKernel::Prepare() { } Broadcast2GpuShape(out_shape_.s, output_shape.s, out_tensors_.at(0)->shape().size(), 1); Broadcast2GpuShape(weight_shape_.s, weight_shape.s, in_tensors_.at(1)->shape().size(), 1); - weight_is_scalar = weight_shape_.s[3] == 1; + auto param = reinterpret_cast(op_parameter_); + weight_is_scalar = param->channelShared; enable_fp16_ = ocl_runtime_->GetFp16Enable(); std::string source = prelu_source; std::string program_name = "PRelu"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc index 9d2e157c49..746c95106a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc @@ -321,7 +321,7 @@ void TryMergeArithmeticAct(LiteKernel *act, std::set *removed_set) // FullConnection(NO_ACTIVATION) + Activation(RELU/RELU6/TANH) template void TryMergeXxxActivation(LiteKernel *act, std::set *removed_set) { - MS_ASSERT(node); + MS_ASSERT(act); MS_ASSERT(removed_set); auto *act_param = reinterpret_cast(reinterpret_cast(act)->GetParameter()); LiteKernel *node = act->in_kernels().front(); @@ -534,7 +534,7 @@ int TryMergeEltwiseEltwise(LiteKernel *node, std::set *removed_set if (AIsInB(pred, nodes) && IsEltwiseAndOperatorSupported(pred) && pred->out_kernels().size() == 1) { auto *tensor = pred->out_tensors().front(); MS_ASSERT(pred->out_kernels().front() == node); - MS_ASSERT(AIsInB(tensor, node.in_tensors())); + MS_ASSERT(AIsInB(tensor, &node->in_tensors())); pred_eltwises.insert(pred); // create FusionEltwiseParameter for this pred eltwise auto param = CreateFusionEltwiseParameter(pred); diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index d9a60f48fd..4f6e884a1b 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -93,7 +93,6 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, voi cl_int ret = CL_SUCCESS; MS_ASSERT(buffer); MS_ASSERT(image); - MS_ASSERT(img_size.size() == 3); if (data == nullptr) { // copy from cl2.hpp cl_image_desc desc = {CL_MEM_OBJECT_IMAGE2D, img_size.width, img_size.height, 0, 0, 0, 0, 0, 0, (**buffer).get()}; @@ -136,23 +135,27 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const ImageSize &img_size, voi return host_ptr; } +int OpenCLAllocator::GetImgDtypeSize(const ImageSize &img_size) { + size_t dtype_size = 0; + if (img_size.dtype == CL_FLOAT) { + dtype_size = sizeof(cl_float); + } else if (img_size.dtype == CL_HALF_FLOAT) { + dtype_size = sizeof(cl_half); + } else if (img_size.dtype == CL_UNSIGNED_INT8) { + dtype_size = sizeof(cl_uchar); + } else { + MS_LOG(ERROR) << "Unsupported dtype " << img_size.dtype; + return RET_ERROR; + } + uint32_t image_alignment = ocl_runtime_->GetImagePitchAlignment(); + size_t size = UP_ROUND(img_size.width, image_alignment) * img_size.height * C4NUM * dtype_size; + return size; +} + void *OpenCLAllocator::_Malloc(MemType mem_type, void *data, size_t size, const ImageSize &img_size) { auto svm_capabilities = ocl_runtime_->GetSVMCapabilities(); - MS_ASSERT(img_size.size() == 0 || img_size.size() == 3); if (mem_type == MemType::IMG) { - size_t dtype_size = 0; - if (img_size.dtype == CL_FLOAT) { - dtype_size = sizeof(cl_float); - } else if (img_size.dtype == CL_HALF_FLOAT) { - dtype_size = sizeof(cl_half); - } else if (img_size.dtype == CL_UNSIGNED_INT8) { - dtype_size = sizeof(cl_uchar); - } else { - MS_LOG(ERROR) << "Unsupported dtype " << img_size.dtype; - return nullptr; - } - uint32_t image_alignment = ocl_runtime_->GetImagePitchAlignment(); - size = UP_ROUND(img_size.width, image_alignment) * img_size.height * C4NUM * dtype_size; + size = GetImgDtypeSize(img_size); } if (size > ocl_runtime_->GetMaxAllocSize()) { MS_LOG(ERROR) << "MallocData out of max_size, size: " << size; diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.h b/mindspore/lite/src/runtime/opencl/opencl_allocator.h index 64a458b873..a9b1026caa 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.h +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.h @@ -75,6 +75,7 @@ class OpenCLAllocator : public Allocator { void *CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer); void *CreateImage2D(size_t size, const ImageSize &img_size, void *data, size_t flags, bool is_map, cl::Buffer **buffer, cl::Image2D **image); + int GetImgDtypeSize(const ImageSize &img_size); template void ClearMemList(T *list);