* add int8 definitions * packing vulkan int8/int32, quantize vulkan * vulkan dequantize * requantize vulkan (pull/4204/merge)
| @@ -892,6 +892,13 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, | |||
| if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; | |||
| if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; | |||
| } | |||
| if (elemsize / elempack == 1) | |||
| { | |||
| // int8 | |||
| if (elempack == 1) format = VK_FORMAT_R8_SINT; | |||
| if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| } | |||
| // resolve image width height depth | |||
| int width = w; | |||
| @@ -1468,6 +1475,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz | |||
| if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT; | |||
| if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT; | |||
| } | |||
| if (elemsize / elempack == 1) | |||
| { | |||
| // int8 | |||
| if (elempack == 1) format = VK_FORMAT_R8_SINT; | |||
| if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT; | |||
| } | |||
| // resolve image width height depth | |||
| int width = w; | |||
| @@ -450,6 +450,11 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) | |||
| cast_type_to = 1; | |||
| } | |||
| if (src.elemsize == src.elempack * 1u) | |||
| { | |||
| cast_type_to = 4; | |||
| } | |||
| VkMat dst_staging; | |||
| vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging); | |||
| @@ -3032,6 +3032,10 @@ public: | |||
| // to fp32 | fp16 | |||
| // to pack1 | pack4 | pack8 | |||
| mutable ncnn::Layer* uop_packing[2][2][3]; | |||
| // from int8 | |||
| // to int8 | |||
| // to pack1 | pack4 | pack8 | |||
| mutable ncnn::Layer* uop_packing_int8[3]; | |||
| mutable Mutex uop_lock; | |||
| // device is valid and successfully initialized | |||
| @@ -3047,6 +3051,7 @@ VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev) | |||
| pipeline_cache = 0; | |||
| valid = false; | |||
| memset(uop_packing, 0, sizeof(uop_packing)); | |||
| memset(uop_packing_int8, 0, sizeof(uop_packing_int8)); | |||
| } | |||
| int VulkanDevicePrivate::create_dummy_buffer_image() | |||
| @@ -3096,18 +3101,29 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image() | |||
| const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const | |||
| { | |||
| bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1); | |||
| bool use_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3); | |||
| MutexLockGuard lock(uop_lock); | |||
| const ncnn::Layer* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index]; | |||
| const ncnn::Layer* cached_uop = 0; | |||
| if (use_int8) | |||
| { | |||
| cached_uop = uop_packing_int8[packing_type_to_index]; | |||
| } | |||
| else | |||
| { | |||
| cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index]; | |||
| } | |||
| if (cached_uop) | |||
| return cached_uop; | |||
| bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1); | |||
| // create uop | |||
| Option opt; | |||
| opt.use_fp16_packed = use_fp16; // fp16p is always supported | |||
| opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); | |||
| opt.use_int8_packed = use_int8; // int8p is always supported | |||
| opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); | |||
| // fp16/int8 arithmetic are not necessary for packing | |||
| // and may conflict with storage options | |||
| @@ -3132,14 +3148,21 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_ | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack | |||
| pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 | |||
| pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int32 4=int8 | |||
| pd.set(3, cast_type_to_index + 1); | |||
| uop->load_param(pd); | |||
| uop->create_pipeline(opt); | |||
| uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop; | |||
| if (use_int8) | |||
| { | |||
| uop_packing_int8[packing_type_to_index] = uop; | |||
| } | |||
| else | |||
| { | |||
| uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop; | |||
| } | |||
| return uop; | |||
| } | |||
| @@ -3164,6 +3187,8 @@ void VulkanDevicePrivate::destroy_utility_operator() | |||
| opt.use_fp16_packed = use_fp16; | |||
| opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); | |||
| opt.use_int8_packed = false; | |||
| opt.use_int8_storage = false; | |||
| // to pack1 | pack4 | pack8 | |||
| for (int k = 0; k < 3; k++) | |||
| @@ -3183,6 +3208,33 @@ void VulkanDevicePrivate::destroy_utility_operator() | |||
| } | |||
| } | |||
| } | |||
| // int8 | |||
| { | |||
| bool use_int8 = true; | |||
| opt.use_fp16_packed = false; | |||
| opt.use_fp16_storage = false; | |||
| opt.use_int8_packed = use_int8; | |||
| opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); | |||
| // to pack1 | pack4 | pack8 | |||
| for (int k = 0; k < 3; k++) | |||
| { | |||
| // enable pack8 for pack8to1/pack8to4 | |||
| opt.use_shader_pack8 = true; | |||
| ncnn::Layer* uop = uop_packing_int8[k]; | |||
| if (!uop) | |||
| continue; | |||
| uop->destroy_pipeline(opt); | |||
| delete uop; | |||
| uop_packing_int8[k] = 0; | |||
| } | |||
| } | |||
| } | |||
| VulkanDevice::VulkanDevice(int device_index) | |||
| @@ -4232,18 +4284,35 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac | |||
| { | |||
| cast_type_from_index = 0; | |||
| } | |||
| else // if (src.elembits() == 16) | |||
| else if (src.elembits() == 16) | |||
| { | |||
| cast_type_from_index = 1; | |||
| } | |||
| else // if (src.elembits() == 8) | |||
| { | |||
| cast_type_from_index = 3; | |||
| } | |||
| int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index; | |||
| // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index); | |||
| if ((cast_type_from_index == 0 || cast_type_from_index == 1) && (cast_type_to_index == 2 || cast_type_to_index == 3)) | |||
| { | |||
| NCNN_LOGE("convert_packing from fp32/fp16 to int32/int8 is not supported"); | |||
| return; | |||
| } | |||
| if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1)) | |||
| { | |||
| NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16 is not supported"); | |||
| return; | |||
| } | |||
| Option opt2 = opt; | |||
| opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1); | |||
| opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage(); | |||
| opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3); | |||
| opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage(); | |||
| const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index); | |||
| uop->forward(src, dst, cmd, opt2); | |||
| @@ -4809,6 +4878,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option | |||
| custom_defines.append("afp2sfpmat4(v)", "v"); | |||
| } | |||
| if (opt.use_int8_storage) | |||
| { | |||
| custom_defines.append("sint8", "int8_t"); | |||
| } | |||
| else if (opt.use_int8_packed) | |||
| { | |||
| custom_defines.append("sint8", "int"); | |||
| } | |||
| else | |||
| { | |||
| custom_defines.append("sint8", "int"); | |||
| } | |||
| custom_defines.append("sint8vec4", "int"); | |||
| custom_defines.append("sint8vec8", "ivec2"); | |||
| custom_defines.append("aint8", "int"); | |||
| custom_defines.append("aint8vec4", "ivec4"); | |||
| custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)"); | |||
| custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))"); | |||
| if (opt.use_int8_storage) | |||
| { | |||
| custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])"); | |||
| custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}"); | |||
| custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); | |||
| } | |||
| else | |||
| { | |||
| custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)"); | |||
| custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}"); | |||
| custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}"); | |||
| } | |||
| custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])"); | |||
| custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}"); | |||
| custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); | |||
| custom_defines.append("i8buffer_ld8(buf,i)", "ivec8(unpackInt4x8(buf[i].r),unpackInt4x8(buf[i].g))"); | |||
| custom_defines.append("i8buffer_st8(buf,i,v)", "{buf[i]=ivec2(packInt4x8(v.abcd),packInt4x8(v.efgh));}"); | |||
| custom_defines.append("i8buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); | |||
| custom_defines.append("psc(x)", "(x==0?p.x:x)"); | |||
| if (opt.use_fp16_storage) | |||
| @@ -5426,6 +5538,15 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option | |||
| { | |||
| custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n"; | |||
| } | |||
| custom_exts += "struct ivec8 { ivec4 abcd; ivec4 efgh; };\n"; | |||
| if (opt.use_int8_storage) | |||
| { | |||
| custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n"; | |||
| } | |||
| if (opt.use_int8_arithmetic) | |||
| { | |||
| custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n"; | |||
| } | |||
| #if ENABLE_VALIDATION_LAYER | |||
| { | |||
| custom_exts += "#extension GL_EXT_debug_printf : require\n"; | |||
| @@ -5507,11 +5628,22 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option | |||
| NCNN_LOGE("%s", s.getInfoLog()); | |||
| NCNN_LOGE("%s", s.getInfoDebugLog()); | |||
| // for (int i = 0; i < 4; i++) | |||
| // print as line_number: code | |||
| { | |||
| int i = 3; | |||
| std::string s(comp_datas[i], comp_data_sizes[i]); | |||
| NCNN_LOGE("%s", s.c_str()); | |||
| const char* p = comp_datas[3]; | |||
| const char* line_end; | |||
| int line_number = 1; | |||
| while ((line_end = strchr(p, '\n')) != NULL) | |||
| { | |||
| NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p); | |||
| p = line_end + 1; | |||
| } | |||
| if (*p != '\0') | |||
| { | |||
| NCNN_LOGE("%d:\t%s", line_number, p); | |||
| } | |||
| } | |||
| compile_success = false; | |||
| @@ -465,7 +465,7 @@ public: | |||
| // utility operator | |||
| void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; | |||
| // cast_type_to 0=auto(same as src) 1=fp32 2=fp16 | |||
| // cast_type_to 0=auto(same as src) 1=fp32 2=fp16 3=int32 4=int8 | |||
| void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const; | |||
| // VK_KHR_bind_memory2 | |||
| @@ -36,6 +36,8 @@ public: | |||
| // 0 = auto | |||
| // 1 = fp32 | |||
| // 2 = fp16 | |||
| // 3 = int32 | |||
| // 4 = int8 | |||
| int cast_type_from; | |||
| int cast_type_to; | |||
| }; | |||
| @@ -0,0 +1,231 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "dequantize_vulkan.h" | |||
| #include "layer_shader_type.h" | |||
| namespace ncnn { | |||
| Dequantize_vulkan::Dequantize_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| pipeline_dequantize = 0; | |||
| pipeline_dequantize_pack4 = 0; | |||
| pipeline_dequantize_pack8 = 0; | |||
| } | |||
| int Dequantize_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; | |||
| const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; | |||
| const int dims = shape.dims; | |||
| int elempack = 1; | |||
| if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; | |||
| if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; | |||
| if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| const size_t elemsize = elempack * 4u; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage || opt.use_fp16_packed) | |||
| { | |||
| out_elemsize = elempack * 2u; | |||
| } | |||
| else | |||
| { | |||
| out_elemsize = elempack * 4u; | |||
| } | |||
| Mat shape_packed; | |||
| if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| Mat out_shape_packed; | |||
| if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack); | |||
| if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack); | |||
| if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack); | |||
| if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack); | |||
| size_t c = 0; | |||
| size_t in_stride = 0; | |||
| size_t out_stride = 0; | |||
| if (dims == 1) | |||
| { | |||
| c = 1; | |||
| in_stride = shape_packed.w; | |||
| out_stride = out_shape_packed.w; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| c = shape_packed.h; | |||
| in_stride = shape_packed.w; | |||
| out_stride = out_shape_packed.w; | |||
| } | |||
| if (dims == 3 || dims == 4) | |||
| { | |||
| c = shape_packed.c; | |||
| in_stride = shape_packed.cstep; | |||
| out_stride = out_shape_packed.cstep; | |||
| } | |||
| std::vector<vk_specialization_type> specializations(4 + 3); | |||
| specializations[0].i = scale_data_size; | |||
| specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f; | |||
| specializations[2].i = bias_data_size; | |||
| specializations[3].f = bias_data_size == 1 ? bias_data[0] : 0.f; | |||
| specializations[4 + 0].u32 = c; | |||
| specializations[4 + 1].u32 = in_stride; | |||
| specializations[4 + 2].u32 = out_stride; | |||
| const int local_size_x = vkdev->info.subgroup_size(); | |||
| // pack1 | |||
| if (shape.dims == 0 || elempack == 1) | |||
| { | |||
| pipeline_dequantize = new Pipeline(vkdev); | |||
| pipeline_dequantize->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_dequantize->create(LayerShaderType::dequantize, opt, specializations); | |||
| } | |||
| // pack4 | |||
| if (shape.dims == 0 || elempack == 4) | |||
| { | |||
| pipeline_dequantize_pack4 = new Pipeline(vkdev); | |||
| pipeline_dequantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_dequantize_pack4->create(LayerShaderType::dequantize_pack4, opt, specializations); | |||
| } | |||
| // pack8 | |||
| if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) | |||
| { | |||
| pipeline_dequantize_pack8 = new Pipeline(vkdev); | |||
| pipeline_dequantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_dequantize_pack8->create(LayerShaderType::dequantize_pack8, opt, specializations); | |||
| } | |||
| return 0; | |||
| } | |||
| int Dequantize_vulkan::destroy_pipeline(const Option& /*opt*/) | |||
| { | |||
| delete pipeline_dequantize; | |||
| pipeline_dequantize = 0; | |||
| delete pipeline_dequantize_pack4; | |||
| pipeline_dequantize_pack4 = 0; | |||
| delete pipeline_dequantize_pack8; | |||
| pipeline_dequantize_pack8 = 0; | |||
| return 0; | |||
| } | |||
| int Dequantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (scale_data_size > 1) | |||
| { | |||
| cmd.record_upload(scale_data, scale_data_gpu, opt); | |||
| } | |||
| if (bias_data_size > 1) | |||
| { | |||
| cmd.record_upload(bias_data, bias_data_gpu, opt); | |||
| } | |||
| return 0; | |||
| } | |||
| int Dequantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| const int dims = bottom_blob.dims; | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int d = bottom_blob.d; | |||
| const int channels = bottom_blob.c; | |||
| const int elempack = bottom_blob.elempack; | |||
| size_t out_elemsize; | |||
| if (opt.use_fp16_storage || opt.use_fp16_packed) | |||
| { | |||
| out_elemsize = elempack * 2u; | |||
| } | |||
| else | |||
| { | |||
| out_elemsize = elempack * 4u; | |||
| } | |||
| if (dims == 1) | |||
| top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 2) | |||
| top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 3) | |||
| top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 4) | |||
| top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| size_t c = 0; | |||
| size_t in_stride = 0; | |||
| size_t out_stride = 0; | |||
| if (dims == 1) | |||
| { | |||
| c = 1; | |||
| in_stride = bottom_blob.w; | |||
| out_stride = top_blob.w; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| c = bottom_blob.h; | |||
| in_stride = bottom_blob.w; | |||
| out_stride = top_blob.w; | |||
| } | |||
| if (dims == 3 || dims == 4) | |||
| { | |||
| c = bottom_blob.c; | |||
| in_stride = bottom_blob.cstep; | |||
| out_stride = top_blob.cstep; | |||
| } | |||
| std::vector<VkMat> bindings(4); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = scale_data_gpu; | |||
| bindings[3] = bias_data_gpu; | |||
| std::vector<vk_constant_type> constants(3); | |||
| constants[0].u32 = c; | |||
| constants[1].u32 = in_stride; | |||
| constants[2].u32 = out_stride; | |||
| VkMat dispatcher; | |||
| dispatcher.w = in_stride * c; | |||
| dispatcher.h = 1; | |||
| dispatcher.c = 1; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_dequantize_pack8 | |||
| : elempack == 4 ? pipeline_dequantize_pack4 | |||
| : pipeline_dequantize; | |||
| cmd.record_pipeline(pipeline, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,46 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_DEQUANTIZE_VULKAN_H | |||
| #define LAYER_DEQUANTIZE_VULKAN_H | |||
| #include "dequantize.h" | |||
| namespace ncnn { | |||
| class Dequantize_vulkan : virtual public Dequantize | |||
| { | |||
| public: | |||
| Dequantize_vulkan(); | |||
| virtual int create_pipeline(const Option& opt); | |||
| virtual int destroy_pipeline(const Option& opt); | |||
| virtual int upload_model(VkTransfer& cmd, const Option& opt); | |||
| using Dequantize::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat scale_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| Pipeline* pipeline_dequantize; | |||
| Pipeline* pipeline_dequantize_pack4; | |||
| Pipeline* pipeline_dequantize_pack8; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_DEQUANTIZE_VULKAN_H | |||
| @@ -45,6 +45,8 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| const int local_size_x = vkdev->info.subgroup_size(); | |||
| bool use_int8_shader = cast_type_from == 4 || cast_type_to == 4; | |||
| std::vector<vk_specialization_type> specializations(2 + 3); | |||
| specializations[0].i = cast_type_from; | |||
| specializations[1].i = cast_type_to; | |||
| @@ -91,7 +93,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing = new Pipeline(vkdev); | |||
| pipeline_packing->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing->create(LayerShaderType::packing, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing->create(LayerShaderType::packing_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing->create(LayerShaderType::packing, opt, specializations); | |||
| } | |||
| } | |||
| if (shape.dims == 0 || elempack < out_elempack) | |||
| { | |||
| @@ -126,7 +135,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing_pack1to4 = new Pipeline(vkdev); | |||
| pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations); | |||
| } | |||
| } | |||
| if (shape.dims == 0 || (elempack == 1 && out_elempack == 8)) | |||
| @@ -138,7 +154,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing_pack1to8 = new Pipeline(vkdev); | |||
| pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations); | |||
| } | |||
| } | |||
| if (shape.dims == 0 || (elempack == 4 && out_elempack == 8)) | |||
| @@ -150,7 +173,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing_pack4to8 = new Pipeline(vkdev); | |||
| pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations); | |||
| } | |||
| } | |||
| } | |||
| if (shape.dims == 0 || elempack > out_elempack) | |||
| @@ -186,7 +216,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing_pack4to1 = new Pipeline(vkdev); | |||
| pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations); | |||
| } | |||
| } | |||
| if (shape.dims == 0 || (elempack == 8 && out_elempack == 1)) | |||
| @@ -198,7 +235,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing_pack8to1 = new Pipeline(vkdev); | |||
| pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations); | |||
| } | |||
| } | |||
| if (shape.dims == 0 || (elempack == 8 && out_elempack == 4)) | |||
| @@ -210,7 +254,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_packing_pack8to4 = new Pipeline(vkdev); | |||
| pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations); | |||
| if (use_int8_shader) | |||
| { | |||
| pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_int8, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations); | |||
| } | |||
| } | |||
| } | |||
| @@ -296,10 +347,14 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| { | |||
| out_elemsize = out_elempack * 4u; | |||
| } | |||
| else // if (cast_type_to == 2) | |||
| else if (cast_type_to == 2) | |||
| { | |||
| out_elemsize = out_elempack * 2u; | |||
| } | |||
| else // if (cast_type_to == 4) | |||
| { | |||
| out_elemsize = out_elempack * 1u; | |||
| } | |||
| if (dims == 1) | |||
| { | |||
| @@ -0,0 +1,215 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "quantize_vulkan.h" | |||
| #include "layer_shader_type.h" | |||
| namespace ncnn { | |||
| Quantize_vulkan::Quantize_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| pipeline_quantize = 0; | |||
| pipeline_quantize_pack4 = 0; | |||
| pipeline_quantize_pack8 = 0; | |||
| } | |||
| int Quantize_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; | |||
| const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; | |||
| const int dims = shape.dims; | |||
| int elempack = 0; | |||
| if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; | |||
| if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; | |||
| if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| size_t elemsize; | |||
| const size_t out_elemsize = elempack * 1u; | |||
| if (opt.use_fp16_storage || opt.use_fp16_packed) | |||
| { | |||
| elemsize = elempack * 2u; | |||
| } | |||
| else | |||
| { | |||
| elemsize = elempack * 4u; | |||
| } | |||
| Mat shape_packed; | |||
| if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| Mat out_shape_packed; | |||
| if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack); | |||
| if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack); | |||
| if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack); | |||
| if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack); | |||
| size_t c = 0; | |||
| size_t in_stride = 0; | |||
| size_t out_stride = 0; | |||
| if (dims == 1) | |||
| { | |||
| c = 1; | |||
| in_stride = shape_packed.w; | |||
| out_stride = out_shape_packed.w; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| c = shape_packed.h; | |||
| in_stride = shape_packed.w; | |||
| out_stride = out_shape_packed.w; | |||
| } | |||
| if (dims == 3 || dims == 4) | |||
| { | |||
| c = shape_packed.c; | |||
| in_stride = shape_packed.cstep; | |||
| out_stride = out_shape_packed.cstep; | |||
| } | |||
| std::vector<vk_specialization_type> specializations(2 + 3); | |||
| specializations[0].i = scale_data_size; | |||
| specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f; | |||
| specializations[2 + 0].u32 = c; | |||
| specializations[2 + 1].u32 = in_stride; | |||
| specializations[2 + 2].u32 = out_stride; | |||
| const int local_size_x = vkdev->info.subgroup_size(); | |||
| // pack1 | |||
| if (shape.dims == 0 || elempack == 1) | |||
| { | |||
| pipeline_quantize = new Pipeline(vkdev); | |||
| pipeline_quantize->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_quantize->create(LayerShaderType::quantize, opt, specializations); | |||
| } | |||
| // pack4 | |||
| if (shape.dims == 0 || elempack == 4) | |||
| { | |||
| pipeline_quantize_pack4 = new Pipeline(vkdev); | |||
| pipeline_quantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_quantize_pack4->create(LayerShaderType::quantize_pack4, opt, specializations); | |||
| } | |||
| // pack8 | |||
| if (shape.dims == 0 || elempack == 8) | |||
| { | |||
| pipeline_quantize_pack8 = new Pipeline(vkdev); | |||
| pipeline_quantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_quantize_pack8->create(LayerShaderType::quantize_pack8, opt, specializations); | |||
| } | |||
| return 0; | |||
| } | |||
| int Quantize_vulkan::destroy_pipeline(const Option& /*opt*/) | |||
| { | |||
| delete pipeline_quantize; | |||
| pipeline_quantize = 0; | |||
| delete pipeline_quantize_pack4; | |||
| pipeline_quantize_pack4 = 0; | |||
| delete pipeline_quantize_pack8; | |||
| pipeline_quantize_pack8 = 0; | |||
| return 0; | |||
| } | |||
| int Quantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (scale_data_size > 1) | |||
| { | |||
| cmd.record_upload(scale_data, scale_data_gpu, opt); | |||
| } | |||
| return 0; | |||
| } | |||
| int Quantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| const int dims = bottom_blob.dims; | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int d = bottom_blob.d; | |||
| const int channels = bottom_blob.c; | |||
| const int elempack = bottom_blob.elempack; | |||
| const size_t out_elemsize = 1u * elempack; | |||
| if (dims == 1) | |||
| top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 2) | |||
| top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 3) | |||
| top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 4) | |||
| top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| size_t c = 0; | |||
| size_t in_stride = 0; | |||
| size_t out_stride = 0; | |||
| if (dims == 1) | |||
| { | |||
| c = 1; | |||
| in_stride = bottom_blob.w; | |||
| out_stride = top_blob.w; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| c = bottom_blob.h; | |||
| in_stride = bottom_blob.w; | |||
| out_stride = top_blob.w; | |||
| } | |||
| if (dims == 3 || dims == 4) | |||
| { | |||
| c = bottom_blob.c; | |||
| in_stride = bottom_blob.cstep; | |||
| out_stride = top_blob.cstep; | |||
| } | |||
| std::vector<VkMat> bindings(3); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = scale_data_gpu; | |||
| std::vector<vk_constant_type> constants(3); | |||
| constants[0].u32 = c; | |||
| constants[1].u32 = in_stride; | |||
| constants[2].u32 = out_stride; | |||
| VkMat dispatcher; | |||
| dispatcher.w = in_stride * c; | |||
| dispatcher.h = 1; | |||
| dispatcher.c = 1; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_quantize_pack8 | |||
| : elempack == 4 ? pipeline_quantize_pack4 | |||
| : pipeline_quantize; | |||
| cmd.record_pipeline(pipeline, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,45 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_QUANTIZE_VULKAN_H | |||
| #define LAYER_QUANTIZE_VULKAN_H | |||
| #include "quantize.h" | |||
| namespace ncnn { | |||
| class Quantize_vulkan : virtual public Quantize | |||
| { | |||
| public: | |||
| Quantize_vulkan(); | |||
| virtual int create_pipeline(const Option& opt); | |||
| virtual int destroy_pipeline(const Option& opt); | |||
| virtual int upload_model(VkTransfer& cmd, const Option& opt); | |||
| using Quantize::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat scale_data_gpu; | |||
| Pipeline* pipeline_quantize; | |||
| Pipeline* pipeline_quantize_pack4; | |||
| Pipeline* pipeline_quantize_pack8; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_QUANTIZE_VULKAN_H | |||
| @@ -0,0 +1,231 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "requantize_vulkan.h" | |||
| #include "layer_shader_type.h" | |||
| namespace ncnn { | |||
| Requantize_vulkan::Requantize_vulkan() | |||
| { | |||
| support_vulkan = true; | |||
| pipeline_requantize = 0; | |||
| pipeline_requantize_pack4 = 0; | |||
| pipeline_requantize_pack8 = 0; | |||
| } | |||
| int Requantize_vulkan::create_pipeline(const Option& opt) | |||
| { | |||
| const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; | |||
| const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; | |||
| const int dims = shape.dims; | |||
| int elempack = 1; | |||
| if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; | |||
| if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; | |||
| if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; | |||
| int out_elempack = 1; | |||
| if (dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; | |||
| if (dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; | |||
| if (dims == 3 || dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; | |||
| const size_t elemsize = elempack * 4u; | |||
| const size_t out_elemsize = out_elempack * 1u; | |||
| Mat shape_packed; | |||
| if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); | |||
| Mat out_shape_packed; | |||
| if (dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); | |||
| size_t c = 0; | |||
| size_t in_stride = 0; | |||
| size_t out_stride = 0; | |||
| if (dims == 1) | |||
| { | |||
| c = 1; | |||
| in_stride = shape_packed.w; | |||
| out_stride = out_shape_packed.w; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| c = shape_packed.h; | |||
| in_stride = shape_packed.w; | |||
| out_stride = out_shape_packed.w; | |||
| } | |||
| if (dims == 3 || dims == 4) | |||
| { | |||
| c = shape_packed.c; | |||
| in_stride = shape_packed.cstep; | |||
| out_stride = out_shape_packed.cstep; | |||
| } | |||
| std::vector<vk_specialization_type> specializations(9 + 3); | |||
| specializations[0].i = scale_in_data_size; | |||
| specializations[1].f = scale_in_data_size == 1 ? scale_in_data[0] : 1.f; | |||
| specializations[2].i = scale_out_data_size; | |||
| specializations[3].f = scale_out_data_size == 1 ? scale_out_data[0] : 1.f; | |||
| specializations[4].i = bias_data_size; | |||
| specializations[5].f = bias_data_size == 1 ? bias_data[0] : 0.f; | |||
| specializations[6].i = activation_type; | |||
| specializations[7].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[8].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[9 + 0].u32 = c; | |||
| specializations[9 + 1].u32 = in_stride; | |||
| specializations[9 + 2].u32 = out_stride; | |||
| const int local_size_x = vkdev->info.subgroup_size(); | |||
| // pack1 | |||
| if (shape.dims == 0 || elempack == 1) | |||
| { | |||
| pipeline_requantize = new Pipeline(vkdev); | |||
| pipeline_requantize->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_requantize->create(LayerShaderType::requantize, opt, specializations); | |||
| } | |||
| // pack4 | |||
| if (shape.dims == 0 || elempack == 4) | |||
| { | |||
| pipeline_requantize_pack4 = new Pipeline(vkdev); | |||
| pipeline_requantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_requantize_pack4->create(LayerShaderType::requantize_pack4, opt, specializations); | |||
| } | |||
| // pack8 | |||
| if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) | |||
| { | |||
| pipeline_requantize_pack8 = new Pipeline(vkdev); | |||
| pipeline_requantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1); | |||
| pipeline_requantize_pack8->create(LayerShaderType::requantize_pack8, opt, specializations); | |||
| } | |||
| return 0; | |||
| } | |||
| int Requantize_vulkan::destroy_pipeline(const Option& /*opt*/) | |||
| { | |||
| delete pipeline_requantize; | |||
| pipeline_requantize = 0; | |||
| delete pipeline_requantize_pack4; | |||
| pipeline_requantize_pack4 = 0; | |||
| delete pipeline_requantize_pack8; | |||
| pipeline_requantize_pack8 = 0; | |||
| return 0; | |||
| } | |||
| int Requantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| { | |||
| if (scale_in_data_size > 1) | |||
| { | |||
| cmd.record_upload(scale_in_data, scale_in_data_gpu, opt); | |||
| } | |||
| if (scale_out_data_size > 1) | |||
| { | |||
| cmd.record_upload(scale_out_data, scale_out_data_gpu, opt); | |||
| } | |||
| if (bias_data_size > 1) | |||
| { | |||
| cmd.record_upload(bias_data, bias_data_gpu, opt); | |||
| } | |||
| return 0; | |||
| } | |||
| int Requantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| const int dims = bottom_blob.dims; | |||
| const int w = bottom_blob.w; | |||
| const int h = bottom_blob.h; | |||
| const int d = bottom_blob.d; | |||
| const int channels = bottom_blob.c; | |||
| const int elempack = bottom_blob.elempack; | |||
| size_t out_elemsize = 1u * elempack; | |||
| if (dims == 1) | |||
| top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 2) | |||
| top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 3) | |||
| top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (dims == 4) | |||
| top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| size_t c = 0; | |||
| size_t in_stride = 0; | |||
| size_t out_stride = 0; | |||
| if (dims == 1) | |||
| { | |||
| c = 1; | |||
| in_stride = bottom_blob.w; | |||
| out_stride = top_blob.w; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| c = bottom_blob.h; | |||
| in_stride = bottom_blob.w; | |||
| out_stride = top_blob.w; | |||
| } | |||
| if (dims == 3 || dims == 4) | |||
| { | |||
| c = bottom_blob.c; | |||
| in_stride = bottom_blob.cstep; | |||
| out_stride = top_blob.cstep; | |||
| } | |||
| std::vector<VkMat> bindings(5); | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = scale_in_data_gpu; | |||
| bindings[3] = scale_out_data_gpu; | |||
| bindings[4] = bias_data_gpu; | |||
| std::vector<vk_constant_type> constants(3); | |||
| constants[0].u32 = c; | |||
| constants[1].u32 = in_stride; | |||
| constants[2].u32 = out_stride; | |||
| VkMat dispatcher; | |||
| dispatcher.w = in_stride * c; | |||
| dispatcher.h = 1; | |||
| dispatcher.c = 1; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_requantize_pack8 | |||
| : elempack == 4 ? pipeline_requantize_pack4 | |||
| : pipeline_requantize; | |||
| cmd.record_pipeline(pipeline, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,47 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_REQUANTIZE_VULKAN_H | |||
| #define LAYER_REQUANTIZE_VULKAN_H | |||
| #include "requantize.h" | |||
| namespace ncnn { | |||
| class Requantize_vulkan : virtual public Requantize | |||
| { | |||
| public: | |||
| Requantize_vulkan(); | |||
| virtual int create_pipeline(const Option& opt); | |||
| virtual int destroy_pipeline(const Option& opt); | |||
| virtual int upload_model(VkTransfer& cmd, const Option& opt); | |||
| using Requantize::forward; | |||
| virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; | |||
| public: | |||
| VkMat scale_in_data_gpu; | |||
| VkMat scale_out_data_gpu; | |||
| VkMat bias_data_gpu; | |||
| Pipeline* pipeline_requantize; | |||
| Pipeline* pipeline_requantize_pack4; | |||
| Pipeline* pipeline_requantize_pack8; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_REQUANTIZE_VULKAN_H | |||
| @@ -0,0 +1,80 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#version 450

// scale: number of scale values, and the value used when there is exactly one
layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;

// bias: number of bias values (0 = no bias), and the value used when there is exactly one
layout (constant_id = 2) const int bias_data_size = 0;
layout (constant_id = 3) const float bias_value = 0.f;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_blob_data[]; };

layout (push_constant) uniform parameter
{
    uint c;
    uint in_stride;
    uint out_stride;
} p;

// Converts one int element to floating point: out = in * scale + bias.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    // one invocation per input slot; surplus invocations do nothing
    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat index into a group (row) and a position inside it (col)
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    const int q = bottom_blob_data[idx];

    // a single scale comes from the constant, otherwise it is looked up per row
    afp scale = afp(scale_value);
    if (scale_data_size != 1)
    {
        scale = buffer_ld1(scale_blob_data, row);
    }

    // no bias -> 0, single bias -> constant, otherwise looked up per row
    afp bias = afp(0.f);
    if (bias_data_size == 1)
    {
        bias = afp(bias_value);
    }
    else if (bias_data_size != 0)
    {
        bias = buffer_ld1(bias_blob_data, row);
    }

    afp result = afp(q) * scale + bias;

    // the output uses its own row stride
    const uint out_idx = row * psc(out_stride) + col;

    buffer_st1(top_blob_data, out_idx, result);
}
| @@ -0,0 +1,80 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#version 450

// scale: number of scale values, and the value used when there is exactly one
layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;

// bias: number of bias values (0 = no bias), and the value used when there is exactly one
layout (constant_id = 2) const int bias_data_size = 0;
layout (constant_id = 3) const float bias_value = 0.f;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };

layout (push_constant) uniform parameter
{
    uint c;
    uint in_stride;
    uint out_stride;
} p;

// Converts one ivec4 element to floating point: out = in * scale + bias.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    // one invocation per input slot; surplus invocations do nothing
    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat index into a group (row) and a position inside it (col)
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    const ivec4 q = bottom_blob_data[idx];

    // a single scale is broadcast from the constant, otherwise it is looked up per row
    afpvec4 scale = afpvec4(scale_value);
    if (scale_data_size != 1)
    {
        scale = buffer_ld4(scale_blob_data, row);
    }

    // no bias -> 0, single bias -> constant, otherwise looked up per row
    afpvec4 bias = afpvec4(0.f);
    if (bias_data_size == 1)
    {
        bias = afpvec4(bias_value);
    }
    else if (bias_data_size != 0)
    {
        bias = buffer_ld4(bias_blob_data, row);
    }

    afpvec4 result = afpvec4(q) * scale + bias;

    // the output uses its own row stride
    const uint out_idx = row * psc(out_stride) + col;

    buffer_st4(top_blob_data, out_idx, result);
}
| @@ -0,0 +1,84 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#version 450

// scale: number of scale values, and the value used when there is exactly one
layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;

// bias: number of bias values (0 = no bias), and the value used when there is exactly one
layout (constant_id = 2) const int bias_data_size = 0;
layout (constant_id = 3) const float bias_value = 0.f;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };

layout (push_constant) uniform parameter
{
    uint c;
    uint in_stride;
    uint out_stride;
} p;

// Converts one ivec8 element to floating point: out = in * scale + bias,
// handled as two vec4 halves.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    // one invocation per input slot; surplus invocations do nothing
    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat index into a group (row) and a position inside it (col)
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    const ivec8 q = bottom_blob_data[idx];

    // a single scale is broadcast from the constant, otherwise it is looked up per row
    afpvec8 scale;
    if (scale_data_size != 1)
    {
        scale = buffer_ld8(scale_blob_data, row);
    }
    else
    {
        scale = afpvec8(afpvec4(scale_value), afpvec4(scale_value));
    }

    // no bias -> 0, single bias -> constant, otherwise looked up per row
    afpvec8 bias;
    if (bias_data_size == 0 || bias_data_size == 1)
    {
        const afpvec4 half_bias = bias_data_size == 0 ? afpvec4(0.f) : afpvec4(bias_value);
        bias[0] = half_bias;
        bias[1] = half_bias;
    }
    else
    {
        bias = buffer_ld8(bias_blob_data, row);
    }

    afpvec8 result;
    result[0] = afpvec4(q.abcd) * scale[0] + bias[0];
    result[1] = afpvec4(q.efgh) * scale[1] + bias[1];

    // the output uses its own row stride
    const uint out_idx = row * psc(out_stride) + col;

    buffer_st8(top_blob_data, out_idx, result);
}
| @@ -0,0 +1,73 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#version 450

// storage type of the source and of the destination;
// the value 3 selects the int32 bindings, any other value the int8 bindings
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

// Copies one 4-lane element, converting between int8 and int32 storage when
// the source and destination cast types differ.
void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    if (col >= psc(n) || row >= psc(c))
        return;

    // index with rows n elements apart
    const uint idx_n = row * psc(n) + col;

    // same storage type on both sides: plain element copy
    if (cast_type_from == cast_type_to)
    {
        i8buffer_cp4(top_blob_data, idx_n, bottom_blob_data, idx_n);
        return;
    }

    // index with rows stride elements apart, used on the int8 side
    const uint idx_stride = row * psc(stride) + col;

    const bool from_int32 = cast_type_from == 3;
    const bool to_int32 = cast_type_to == 3;

    ivec4 lanes;
    if (from_int32)
    {
        lanes = bottom_blob_int32_data[idx_n];
    }
    else
    {
        lanes = i8buffer_ld4(bottom_blob_data, idx_stride);
    }

    if (to_int32)
    {
        top_blob_int32_data[idx_n] = lanes;
    }
    else
    {
        i8buffer_st4(top_blob_data, idx_stride, lanes);
    }
}
| @@ -0,0 +1,79 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#version 450

// storage type of the source and of the destination;
// the value 3 selects the int32 bindings, any other value the int8 bindings
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

// Gathers four scalar elements, taken from four consecutive source rows at
// the same column, into one 4-lane output element.
void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    if (col >= psc(n) || row >= psc(c))
        return;

    // source rows 4*row .. 4*row+3, each stride elements apart
    const uvec4 src = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col;

    // destination rows are n elements apart
    const uint dst = row * psc(n) + col;

    // A direct i8buffer_cp1to4 copy for equal cast types is currently disabled:
    // if (cast_type_from == cast_type_to)
    // {
    //     i8buffer_cp1to4(top_blob_data, dst, bottom_blob_data, src);
    //     return;
    // }

    ivec4 lanes;
    if (cast_type_from == 3)
    {
        lanes = ivec4(bottom_blob_int32_data[src.r],
                      bottom_blob_int32_data[src.g],
                      bottom_blob_int32_data[src.b],
                      bottom_blob_int32_data[src.a]);
    }
    else
    {
        lanes.r = i8buffer_ld1(bottom_blob_data, src.r);
        lanes.g = i8buffer_ld1(bottom_blob_data, src.g);
        lanes.b = i8buffer_ld1(bottom_blob_data, src.b);
        lanes.a = i8buffer_ld1(bottom_blob_data, src.a);
    }

    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, dst, lanes);
    }
    else
    {
        top_blob_int32_data[dst] = lanes;
    }
}
| @@ -0,0 +1,88 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
#version 450

// storage type of the source and of the destination;
// the value 3 selects the int32 bindings, any other value the int8 bindings
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

// Gathers eight scalar elements, taken from eight consecutive source rows at
// the same column, into one 8-lane output element.
void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    if (col >= psc(n) || row >= psc(c))
        return;

    // source rows 8*row .. 8*row+3, then the four rows that follow them
    const uvec4 src_lo = (row * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + col;
    const uvec4 src_hi = src_lo + psc(stride) * 4;

    // destination rows are n elements apart
    const uint dst = row * psc(n) + col;

    // A direct i8buffer_cp1to8 copy for equal cast types is currently disabled:
    // if (cast_type_from == cast_type_to)
    // {
    //     i8buffer_cp1to8(top_blob_data, dst, bottom_blob_data, src_lo, src_hi);
    //     return;
    // }

    ivec8 lanes;
    if (cast_type_from == 3)
    {
        lanes.abcd = ivec4(bottom_blob_int32_data[src_lo.r],
                           bottom_blob_int32_data[src_lo.g],
                           bottom_blob_int32_data[src_lo.b],
                           bottom_blob_int32_data[src_lo.a]);
        lanes.efgh = ivec4(bottom_blob_int32_data[src_hi.r],
                           bottom_blob_int32_data[src_hi.g],
                           bottom_blob_int32_data[src_hi.b],
                           bottom_blob_int32_data[src_hi.a]);
    }
    else
    {
        lanes.abcd.r = i8buffer_ld1(bottom_blob_data, src_lo.r);
        lanes.abcd.g = i8buffer_ld1(bottom_blob_data, src_lo.g);
        lanes.abcd.b = i8buffer_ld1(bottom_blob_data, src_lo.b);
        lanes.abcd.a = i8buffer_ld1(bottom_blob_data, src_lo.a);
        lanes.efgh.r = i8buffer_ld1(bottom_blob_data, src_hi.r);
        lanes.efgh.g = i8buffer_ld1(bottom_blob_data, src_hi.g);
        lanes.efgh.b = i8buffer_ld1(bottom_blob_data, src_hi.b);
        lanes.efgh.a = i8buffer_ld1(bottom_blob_data, src_hi.a);
    }

    if (cast_type_to != 3)
    {
        i8buffer_st8(top_blob_data, dst, lanes);
    }
    else
    {
        top_blob_int32_data[dst] = lanes;
    }
}
| @@ -0,0 +1,79 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int cast_type_from = 0; | |||
| layout (constant_id = 1) const int cast_type_to = 1; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; }; | |||
| layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; }; | |||
| layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint n; | |||
| uint c; | |||
| uint stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation splits one 4-wide element into four scalars written to four consecutive rows. | |||
| const uint col = gl_GlobalInvocationID.x; | |||
| const uint row = gl_GlobalInvocationID.y; | |||
| // nothing to do outside the psc(n) x psc(c) grid | |||
| if (!(col < psc(n) && row < psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // destination offsets: rows row*4+0..3, rows are psc(stride) apart | |||
| const uvec4 dst4 = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col; | |||
| // source slot of the 4-wide element | |||
| const uint src = row * psc(n) + col; | |||
| // A direct-copy shortcut for cast_type_from == cast_type_to (buffer_cp4to1) is currently disabled. | |||
| ivec4 elems; | |||
| if (cast_type_from != 3) | |||
| { | |||
| // any cast type other than 3 reads the int8 binding | |||
| elems = i8buffer_ld4(bottom_blob_data, src); | |||
| } | |||
| else | |||
| { | |||
| // cast type 3 reads the int32 binding | |||
| elems = bottom_blob_int32_data[src]; | |||
| } | |||
| if (cast_type_to != 3) | |||
| { | |||
| // int8 destination binding, one scalar per row | |||
| i8buffer_st1(top_blob_data, dst4.x, elems.x); | |||
| i8buffer_st1(top_blob_data, dst4.y, elems.y); | |||
| i8buffer_st1(top_blob_data, dst4.z, elems.z); | |||
| i8buffer_st1(top_blob_data, dst4.w, elems.w); | |||
| } | |||
| else | |||
| { | |||
| // int32 destination binding, one scalar per row | |||
| top_blob_int32_data[dst4.x] = elems.x; | |||
| top_blob_int32_data[dst4.y] = elems.y; | |||
| top_blob_int32_data[dst4.z] = elems.z; | |||
| top_blob_int32_data[dst4.w] = elems.w; | |||
| } | |||
| } | |||
| @@ -0,0 +1,75 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int cast_type_from = 0; | |||
| layout (constant_id = 1) const int cast_type_to = 1; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; }; | |||
| layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; | |||
| layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint n; | |||
| uint c; | |||
| uint stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation joins two 4-wide elements from two consecutive rows into one 8-wide element. | |||
| const uint col = gl_GlobalInvocationID.x; | |||
| const uint row = gl_GlobalInvocationID.y; | |||
| // nothing to do outside the psc(n) x psc(c) grid | |||
| if (!(col < psc(n) && row < psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // destination slot of the 8-wide element | |||
| const uint dst = row * psc(n) + col; | |||
| // source offsets: rows row*2 and row*2+1, rows are psc(stride) apart | |||
| const uvec2 src2 = (row * 2 + uvec2(0, 1)) * psc(stride) + col; | |||
| // A direct-copy shortcut for cast_type_from == cast_type_to (buffer_cp4to8) is currently disabled. | |||
| ivec8 elems; | |||
| if (cast_type_from != 3) | |||
| { | |||
| // any cast type other than 3 reads the int8 binding | |||
| elems.abcd = i8buffer_ld4(bottom_blob_data, src2.x); | |||
| elems.efgh = i8buffer_ld4(bottom_blob_data, src2.y); | |||
| } | |||
| else | |||
| { | |||
| // cast type 3 reads the int32 binding | |||
| elems.abcd = bottom_blob_int32_data[src2.x]; | |||
| elems.efgh = bottom_blob_int32_data[src2.y]; | |||
| } | |||
| if (cast_type_to != 3) | |||
| { | |||
| // int8 destination binding | |||
| i8buffer_st8(top_blob_data, dst, elems); | |||
| } | |||
| else | |||
| { | |||
| // int32 destination binding | |||
| top_blob_int32_data[dst] = elems; | |||
| } | |||
| } | |||
| @@ -0,0 +1,88 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int cast_type_from = 0; | |||
| layout (constant_id = 1) const int cast_type_to = 1; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; }; | |||
| layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; }; | |||
| layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint n; | |||
| uint c; | |||
| uint stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation splits one 8-wide element into eight scalars written to eight consecutive rows. | |||
| const uint col = gl_GlobalInvocationID.x; | |||
| const uint row = gl_GlobalInvocationID.y; | |||
| // nothing to do outside the psc(n) x psc(c) grid | |||
| if (!(col < psc(n) && row < psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // destination offsets: rows row*8+0..3 (lo) and row*8+4..7 (hi), rows are psc(stride) apart | |||
| const uvec4 lo = (row * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + col; | |||
| const uvec4 hi = lo + psc(stride) * 4; | |||
| // source slot of the 8-wide element | |||
| const uint src = row * psc(n) + col; | |||
| // A direct-copy shortcut for cast_type_from == cast_type_to (i8buffer_cp8to1) is currently disabled. | |||
| ivec8 elems; | |||
| if (cast_type_from != 3) | |||
| { | |||
| // any cast type other than 3 reads the int8 binding | |||
| elems = i8buffer_ld8(bottom_blob_data, src); | |||
| } | |||
| else | |||
| { | |||
| // cast type 3 reads the int32 binding | |||
| elems = bottom_blob_int32_data[src]; | |||
| } | |||
| if (cast_type_to != 3) | |||
| { | |||
| // int8 destination binding, one scalar per row | |||
| i8buffer_st1(top_blob_data, lo.x, elems.abcd.x); | |||
| i8buffer_st1(top_blob_data, lo.y, elems.abcd.y); | |||
| i8buffer_st1(top_blob_data, lo.z, elems.abcd.z); | |||
| i8buffer_st1(top_blob_data, lo.w, elems.abcd.w); | |||
| i8buffer_st1(top_blob_data, hi.x, elems.efgh.x); | |||
| i8buffer_st1(top_blob_data, hi.y, elems.efgh.y); | |||
| i8buffer_st1(top_blob_data, hi.z, elems.efgh.z); | |||
| i8buffer_st1(top_blob_data, hi.w, elems.efgh.w); | |||
| } | |||
| else | |||
| { | |||
| // int32 destination binding, one scalar per row | |||
| top_blob_int32_data[lo.x] = elems.abcd.x; | |||
| top_blob_int32_data[lo.y] = elems.abcd.y; | |||
| top_blob_int32_data[lo.z] = elems.abcd.z; | |||
| top_blob_int32_data[lo.w] = elems.abcd.w; | |||
| top_blob_int32_data[hi.x] = elems.efgh.x; | |||
| top_blob_int32_data[hi.y] = elems.efgh.y; | |||
| top_blob_int32_data[hi.z] = elems.efgh.z; | |||
| top_blob_int32_data[hi.w] = elems.efgh.w; | |||
| } | |||
| } | |||
| @@ -0,0 +1,75 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int cast_type_from = 0; | |||
| layout (constant_id = 1) const int cast_type_to = 1; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; }; | |||
| layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; | |||
| layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint n; | |||
| uint c; | |||
| uint stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation splits one 8-wide element into two 4-wide elements written to two consecutive rows. | |||
| const uint col = gl_GlobalInvocationID.x; | |||
| const uint row = gl_GlobalInvocationID.y; | |||
| // nothing to do outside the psc(n) x psc(c) grid | |||
| if (!(col < psc(n) && row < psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // destination offsets: rows row*2 and row*2+1, rows are psc(stride) apart | |||
| const uvec2 dst2 = (row * 2 + uvec2(0, 1)) * psc(stride) + col; | |||
| // source slot of the 8-wide element | |||
| const uint src = row * psc(n) + col; | |||
| // A direct-copy shortcut for cast_type_from == cast_type_to (buffer_cp8to4) is currently disabled. | |||
| ivec8 elems; | |||
| if (cast_type_from != 3) | |||
| { | |||
| // any cast type other than 3 reads the int8 binding | |||
| elems = i8buffer_ld8(bottom_blob_data, src); | |||
| } | |||
| else | |||
| { | |||
| // cast type 3 reads the int32 binding | |||
| elems = bottom_blob_int32_data[src]; | |||
| } | |||
| if (cast_type_to != 3) | |||
| { | |||
| // int8 destination binding: first half, then second half | |||
| i8buffer_st4(top_blob_data, dst2.x, elems.abcd); | |||
| i8buffer_st4(top_blob_data, dst2.y, elems.efgh); | |||
| } | |||
| else | |||
| { | |||
| // int32 destination binding: first half, then second half | |||
| top_blob_int32_data[dst2.x] = elems.abcd; | |||
| top_blob_int32_data[dst2.y] = elems.efgh; | |||
| } | |||
| } | |||
| @@ -0,0 +1,63 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int scale_data_size = 0; | |||
| layout (constant_id = 1) const float scale_value = 1.f; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint c; | |||
| uint in_stride; | |||
| uint out_stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation quantizes one scalar: multiply by a scale, clamp to [-127, 127], round, store as int8. | |||
| const uint src = gl_GlobalInvocationID.x; | |||
| // the flat index covers psc(c) groups of psc(in_stride) input elements | |||
| if (!(src < psc(in_stride) * psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // split the flat index into the group index and the position inside the group | |||
| const uint outer = src / psc(in_stride); | |||
| const uint inner = src % psc(in_stride); | |||
| // the output uses its own per-group stride | |||
| const uint dst = outer * psc(out_stride) + inner; | |||
| afp factor; | |||
| if (scale_data_size != 1) | |||
| { | |||
| // scale read from scale_blob at the group index | |||
| factor = buffer_ld1(scale_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single scale taken from the scale_value specialization constant | |||
| factor = afp(scale_value); | |||
| } | |||
| afp value = buffer_ld1(bottom_blob_data, src); | |||
| afp limited = clamp(value * factor, afp(-127.f), afp(127.f)); | |||
| int quantized = int(round(limited)); | |||
| i8buffer_st1(top_blob_data, dst, quantized); | |||
| } | |||
| @@ -0,0 +1,63 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int scale_data_size = 0; | |||
| layout (constant_id = 1) const float scale_value = 1.f; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint c; | |||
| uint in_stride; | |||
| uint out_stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation quantizes one 4-wide element: multiply by a scale, clamp to [-127, 127], round, store as int8. | |||
| const uint src = gl_GlobalInvocationID.x; | |||
| // the flat index covers psc(c) groups of psc(in_stride) input elements | |||
| if (!(src < psc(in_stride) * psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // split the flat index into the group index and the position inside the group | |||
| const uint outer = src / psc(in_stride); | |||
| const uint inner = src % psc(in_stride); | |||
| // the output uses its own per-group stride | |||
| const uint dst = outer * psc(out_stride) + inner; | |||
| afpvec4 factor; | |||
| if (scale_data_size != 1) | |||
| { | |||
| // 4-wide scale read from scale_blob at the group index | |||
| factor = buffer_ld4(scale_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single scale from the scale_value specialization constant, replicated to all lanes | |||
| factor = afpvec4(scale_value); | |||
| } | |||
| afpvec4 value = buffer_ld4(bottom_blob_data, src); | |||
| afpvec4 limited = clamp(value * factor, afp(-127.f), afp(127.f)); | |||
| ivec4 quantized = ivec4(round(limited)); | |||
| i8buffer_st4(top_blob_data, dst, quantized); | |||
| } | |||
| @@ -0,0 +1,65 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| layout (constant_id = 0) const int scale_data_size = 0; | |||
| layout (constant_id = 1) const float scale_value = 1.f; | |||
| #define shape_constant_id_offset 2 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint c; | |||
| uint in_stride; | |||
| uint out_stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation quantizes one 8-wide element, handled as two 4-wide halves. | |||
| const uint src = gl_GlobalInvocationID.x; | |||
| // the flat index covers psc(c) groups of psc(in_stride) input elements | |||
| if (!(src < psc(in_stride) * psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // split the flat index into the group index and the position inside the group | |||
| const uint outer = src / psc(in_stride); | |||
| const uint inner = src % psc(in_stride); | |||
| // the output uses its own per-group stride | |||
| const uint dst = outer * psc(out_stride) + inner; | |||
| afpvec8 factor; | |||
| if (scale_data_size != 1) | |||
| { | |||
| // 8-wide scale read from scale_blob at the group index | |||
| factor = buffer_ld8(scale_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single scale from the scale_value specialization constant, replicated to all lanes | |||
| factor = afpvec8(afpvec4(scale_value), afpvec4(scale_value)); | |||
| } | |||
| afpvec8 value = buffer_ld8(bottom_blob_data, src); | |||
| // multiply, clamp to [-127, 127] and round each half separately | |||
| afpvec4 limited_lo = clamp(value[0] * factor[0], afp(-127.f), afp(127.f)); | |||
| afpvec4 limited_hi = clamp(value[1] * factor[1], afp(-127.f), afp(127.f)); | |||
| ivec8 quantized; | |||
| quantized.abcd = ivec4(round(limited_lo)); | |||
| quantized.efgh = ivec4(round(limited_hi)); | |||
| i8buffer_st8(top_blob_data, dst, quantized); | |||
| } | |||
| @@ -0,0 +1,103 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #extension GL_GOOGLE_include_directive: enable | |||
| #include "vulkan_activation.comp" | |||
| layout (constant_id = 0) const int scale_in_data_size = 0; | |||
| layout (constant_id = 1) const float scale_in_value = 1.f; | |||
| layout (constant_id = 2) const int scale_out_data_size = 0; | |||
| layout (constant_id = 3) const float scale_out_value = 1.f; | |||
| layout (constant_id = 4) const int bias_data_size = 0; | |||
| layout (constant_id = 5) const float bias_value = 0.f; | |||
| layout (constant_id = 6) const int activation_type = 0; | |||
| layout (constant_id = 7) const float activation_param_0 = 0; | |||
| layout (constant_id = 8) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 9 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer scale_in_blob { sfp scale_in_blob_data[]; }; | |||
| layout (binding = 3) readonly buffer scale_out_blob { sfp scale_out_blob_data[]; }; | |||
| layout (binding = 4) readonly buffer bias_blob { sfp bias_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint c; | |||
| uint in_stride; | |||
| uint out_stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation requantizes one int32 scalar: | |||
| // value * scale_in + bias -> activation -> * scale_out -> clamp to [-127, 127] -> round -> int8 store. | |||
| const uint src = gl_GlobalInvocationID.x; | |||
| // the flat index covers psc(c) groups of psc(in_stride) input elements | |||
| if (!(src < psc(in_stride) * psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // split the flat index into the group index and the position inside the group | |||
| const uint outer = src / psc(in_stride); | |||
| const uint inner = src % psc(in_stride); | |||
| // the output uses its own per-group stride | |||
| const uint dst = outer * psc(out_stride) + inner; | |||
| afp factor_in; | |||
| if (scale_in_data_size != 1) | |||
| { | |||
| // input scale read from scale_in_blob at the group index | |||
| factor_in = buffer_ld1(scale_in_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single input scale from the scale_in_value specialization constant | |||
| factor_in = afp(scale_in_value); | |||
| } | |||
| // bias is zero when bias_data_size is 0, a constant when it is 1, otherwise read per group | |||
| afp offset_term = afp(0.f); | |||
| if (bias_data_size == 1) | |||
| { | |||
| offset_term = afp(bias_value); | |||
| } | |||
| else if (bias_data_size != 0) | |||
| { | |||
| offset_term = buffer_ld1(bias_blob_data, outer); | |||
| } | |||
| afp factor_out; | |||
| if (scale_out_data_size != 1) | |||
| { | |||
| // output scale read from scale_out_blob at the group index | |||
| factor_out = buffer_ld1(scale_out_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single output scale from the scale_out_value specialization constant | |||
| factor_out = afp(scale_out_value); | |||
| } | |||
| int raw = bottom_blob_data[src]; | |||
| afp real = afp(raw) * factor_in + offset_term; | |||
| real = activation_afp(real, activation_type, activation_param_0, activation_param_1); | |||
| afp limited = clamp(real * factor_out, afp(-127.f), afp(127.f)); | |||
| int quantized = int(round(limited)); | |||
| i8buffer_st1(top_blob_data, dst, quantized); | |||
| } | |||
| @@ -0,0 +1,103 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #extension GL_GOOGLE_include_directive: enable | |||
| #include "vulkan_activation.comp" | |||
| layout (constant_id = 0) const int scale_in_data_size = 0; | |||
| layout (constant_id = 1) const float scale_in_value = 1.f; | |||
| layout (constant_id = 2) const int scale_out_data_size = 0; | |||
| layout (constant_id = 3) const float scale_out_value = 1.f; | |||
| layout (constant_id = 4) const int bias_data_size = 0; | |||
| layout (constant_id = 5) const float bias_value = 0.f; | |||
| layout (constant_id = 6) const int activation_type = 0; | |||
| layout (constant_id = 7) const float activation_param_0 = 0; | |||
| layout (constant_id = 8) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 9 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer scale_in_blob { sfpvec4 scale_in_blob_data[]; }; | |||
| layout (binding = 3) readonly buffer scale_out_blob { sfpvec4 scale_out_blob_data[]; }; | |||
| layout (binding = 4) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint c; | |||
| uint in_stride; | |||
| uint out_stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation requantizes one 4-wide int32 element: | |||
| // value * scale_in + bias -> activation -> * scale_out -> clamp to [-127, 127] -> round -> int8 store. | |||
| const uint src = gl_GlobalInvocationID.x; | |||
| // the flat index covers psc(c) groups of psc(in_stride) input elements | |||
| if (!(src < psc(in_stride) * psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // split the flat index into the group index and the position inside the group | |||
| const uint outer = src / psc(in_stride); | |||
| const uint inner = src % psc(in_stride); | |||
| // the output uses its own per-group stride | |||
| const uint dst = outer * psc(out_stride) + inner; | |||
| afpvec4 factor_in; | |||
| if (scale_in_data_size != 1) | |||
| { | |||
| // 4-wide input scale read from scale_in_blob at the group index | |||
| factor_in = buffer_ld4(scale_in_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single input scale from the scale_in_value specialization constant, replicated | |||
| factor_in = afpvec4(scale_in_value); | |||
| } | |||
| // bias is zero when bias_data_size is 0, a constant when it is 1, otherwise read per group | |||
| afpvec4 offset_term = afpvec4(0.f); | |||
| if (bias_data_size == 1) | |||
| { | |||
| offset_term = afpvec4(bias_value); | |||
| } | |||
| else if (bias_data_size != 0) | |||
| { | |||
| offset_term = buffer_ld4(bias_blob_data, outer); | |||
| } | |||
| afpvec4 factor_out; | |||
| if (scale_out_data_size != 1) | |||
| { | |||
| // 4-wide output scale read from scale_out_blob at the group index | |||
| factor_out = buffer_ld4(scale_out_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single output scale from the scale_out_value specialization constant, replicated | |||
| factor_out = afpvec4(scale_out_value); | |||
| } | |||
| ivec4 raw = bottom_blob_data[src]; | |||
| afpvec4 real = afpvec4(raw) * factor_in + offset_term; | |||
| real = activation_afpvec4(real, activation_type, activation_param_0, activation_param_1); | |||
| afpvec4 limited = clamp(real * factor_out, afp(-127.f), afp(127.f)); | |||
| ivec4 quantized = ivec4(round(limited)); | |||
| i8buffer_st4(top_blob_data, dst, quantized); | |||
| } | |||
| @@ -0,0 +1,107 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #extension GL_GOOGLE_include_directive: enable | |||
| #include "vulkan_activation.comp" | |||
| layout (constant_id = 0) const int scale_in_data_size = 0; | |||
| layout (constant_id = 1) const float scale_in_value = 1.f; | |||
| layout (constant_id = 2) const int scale_out_data_size = 0; | |||
| layout (constant_id = 3) const float scale_out_value = 1.f; | |||
| layout (constant_id = 4) const int bias_data_size = 0; | |||
| layout (constant_id = 5) const float bias_value = 0.f; | |||
| layout (constant_id = 6) const int activation_type = 0; | |||
| layout (constant_id = 7) const float activation_param_0 = 0; | |||
| layout (constant_id = 8) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 9 | |||
| layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; | |||
| layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer scale_in_blob { sfpvec8 scale_in_blob_data[]; }; | |||
| layout (binding = 3) readonly buffer scale_out_blob { sfpvec8 scale_out_blob_data[]; }; | |||
| layout (binding = 4) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| uint c; | |||
| uint in_stride; | |||
| uint out_stride; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation requantizes one 8-wide int32 element, handled as two 4-wide halves: | |||
| // value * scale_in + bias -> activation -> * scale_out -> clamp to [-127, 127] -> round -> int8 store. | |||
| const uint src = gl_GlobalInvocationID.x; | |||
| // the flat index covers psc(c) groups of psc(in_stride) input elements | |||
| if (!(src < psc(in_stride) * psc(c))) | |||
| { | |||
| return; | |||
| } | |||
| // split the flat index into the group index and the position inside the group | |||
| const uint outer = src / psc(in_stride); | |||
| const uint inner = src % psc(in_stride); | |||
| // the output uses its own per-group stride | |||
| const uint dst = outer * psc(out_stride) + inner; | |||
| afpvec8 factor_in; | |||
| if (scale_in_data_size != 1) | |||
| { | |||
| // 8-wide input scale read from scale_in_blob at the group index | |||
| factor_in = buffer_ld8(scale_in_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single input scale from the scale_in_value specialization constant, replicated | |||
| factor_in = afpvec8(afpvec4(scale_in_value), afpvec4(scale_in_value)); | |||
| } | |||
| // bias is zero when bias_data_size is 0, a constant when it is 1, otherwise read per group | |||
| afpvec8 offset_term = afpvec8(afpvec4(0.f), afpvec4(0.f)); | |||
| if (bias_data_size == 1) | |||
| { | |||
| offset_term = afpvec8(afpvec4(bias_value), afpvec4(bias_value)); | |||
| } | |||
| else if (bias_data_size != 0) | |||
| { | |||
| offset_term = buffer_ld8(bias_blob_data, outer); | |||
| } | |||
| afpvec8 factor_out; | |||
| if (scale_out_data_size != 1) | |||
| { | |||
| // 8-wide output scale read from scale_out_blob at the group index | |||
| factor_out = buffer_ld8(scale_out_blob_data, outer); | |||
| } | |||
| else | |||
| { | |||
| // single output scale from the scale_out_value specialization constant, replicated | |||
| factor_out = afpvec8(afpvec4(scale_out_value), afpvec4(scale_out_value)); | |||
| } | |||
| ivec8 raw = bottom_blob_data[src]; | |||
| afpvec8 real; | |||
| real[0] = afpvec4(raw.abcd) * factor_in[0] + offset_term[0]; | |||
| real[1] = afpvec4(raw.efgh) * factor_in[1] + offset_term[1]; | |||
| real = activation_afpvec8(real, activation_type, activation_param_0, activation_param_1); | |||
| // scale, clamp and round each half separately | |||
| afpvec4 limited_lo = clamp(real[0] * factor_out[0], afp(-127.f), afp(127.f)); | |||
| afpvec4 limited_hi = clamp(real[1] * factor_out[1], afp(-127.f), afp(127.f)); | |||
| ivec8 quantized; | |||
| quantized.abcd = ivec4(round(limited_lo)); | |||
| quantized.efgh = ivec4(round(limited_hi)); | |||
| i8buffer_st8(top_blob_data, dst, quantized); | |||
| } | |||
| @@ -1043,6 +1043,9 @@ int Net::load_param(const DataReader& dr) | |||
| // fp16a makes no sense when fp16 storage disabled | |||
| if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; | |||
| // int8a makes no sense when int8 storage disabled | |||
| if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false; | |||
| // fp16 uniform makes no sense when fp16 arithmetic disabled | |||
| if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false; | |||
| } | |||
| @@ -1339,6 +1342,9 @@ int Net::load_param_bin(const DataReader& dr) | |||
| // fp16a makes no sense when fp16 storage disabled | |||
| if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; | |||
| // int8a makes no sense when int8 storage disabled | |||
| if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false; | |||
| // fp16 uniform makes no sense when fp16 arithmetic disabled | |||
| if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false; | |||
| } | |||
| @@ -142,12 +142,8 @@ static int test_dequantize_3() | |||
| || test_dequantize_pack8(RandomIntMat(15, 24), 24, 24) | |||
| || test_dequantize_pack8(RandomIntMat(15, 24), 24, 1) | |||
| || test_dequantize_pack8(RandomIntMat(15, 24), 24, 0) | |||
| || test_dequantize_pack8(RandomIntMat(128), 1, 128) | |||
| || test_dequantize_pack8(RandomIntMat(128), 1, 1) | |||
| || test_dequantize_pack8(RandomIntMat(128), 1, 0) | |||
| || test_dequantize_pack8(RandomIntMat(128), 128, 128) | |||
| || test_dequantize_pack8(RandomIntMat(128), 128, 1) | |||
| || test_dequantize_pack8(RandomIntMat(128), 128, 0); | |||
| || test_dequantize_pack8(RandomIntMat(128), 1, 0); | |||
| } | |||
| int main() | |||
| @@ -217,15 +217,12 @@ static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempac | |||
| } | |||
| #if NCNN_VULKAN | |||
| static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_elempack) | |||
| static int test_packing_gpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack) | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, out_elempack); | |||
| pd.set(2, 1); // cast_type_from | |||
| pd.set(3, 1); // cast_type_to | |||
| pd.set(4, 0); // storage_type_from | |||
| pd.set(5, 0); // storage_type_to | |||
| std::vector<ncnn::Mat> weights(0); | |||
| @@ -297,12 +294,112 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_ | |||
| if (CompareMat(b, d, 0.001) != 0) | |||
| { | |||
| fprintf(stderr, "test_packing_gpu_buffer failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack); | |||
| fprintf(stderr, "test_packing_gpu failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| // Checks the Vulkan "Packing" layer on int8 data against the naive CPU packing. | |||
| // | |||
| // Only the shape of `a` (dims, w, h, d, c) is used; the actual test data is a | |||
| // fresh RandomS8Mat of the same shape, repacked on the CPU to `in_elempack`. | |||
| // The layer is configured to convert to `out_elempack` with cast type 4 on | |||
| // both sides (the value the int8 paths use). | |||
| // | |||
| // Returns 0 when the GPU result matches the CPU reference, -1 otherwise. | |||
| static int test_packing_gpu_int8(const ncnn::Mat& a, int in_elempack, int out_elempack) | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, out_elempack); | |||
| pd.set(2, 4); // cast_type_from | |||
| pd.set(3, 4); // cast_type_to | |||
| std::vector<ncnn::Mat> weights(0); | |||
| // fp16 and int8 storage/arithmetic are switched off so the data is moved | |||
| // around as-is; packing layout and pack8 shaders stay enabled | |||
| ncnn::Option opt; | |||
| opt.num_threads = 1; | |||
| opt.use_vulkan_compute = true; | |||
| opt.use_int8_inference = false; | |||
| opt.use_fp16_packed = false; | |||
| opt.use_fp16_storage = false; | |||
| opt.use_fp16_arithmetic = false; | |||
| opt.use_int8_storage = false; | |||
| opt.use_int8_arithmetic = false; | |||
| opt.use_packing_layout = true; | |||
| opt.use_shader_pack8 = true; | |||
| ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); | |||
| ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); | |||
| ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); | |||
| opt.blob_vkallocator = blob_vkallocator; | |||
| opt.workspace_vkallocator = blob_vkallocator; | |||
| opt.staging_vkallocator = staging_vkallocator; | |||
| // drop int8 features the device cannot provide | |||
| if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false; | |||
| if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false; | |||
| ncnn::Layer* op = ncnn::create_layer_vulkan("Packing"); | |||
| op->vkdev = vkdev; | |||
| op->load_param(pd); | |||
| ncnn::ModelBinFromMatArray mb(weights.data()); | |||
| op->load_model(mb); | |||
| op->create_pipeline(opt); | |||
| // int8 input with the same shape as a | |||
| ncnn::Mat a8; | |||
| if (a.dims == 1) a8 = RandomS8Mat(a.w); | |||
| if (a.dims == 2) a8 = RandomS8Mat(a.w, a.h); | |||
| if (a.dims == 3) a8 = RandomS8Mat(a.w, a.h, a.c); | |||
| if (a.dims == 4) a8 = RandomS8Mat(a.w, a.h, a.d, a.c); | |||
| ncnn::Mat ap; | |||
| ncnn::convert_packing(a8, ap, in_elempack, opt); | |||
| // cpu reference | |||
| ncnn::Mat b; | |||
| packing_cpu_naive(ap, b, out_elempack); | |||
| ncnn::Mat c; | |||
| { | |||
| // forward | |||
| // scoped so the command object and the VkMats are released before the | |||
| // allocators they were created from are handed back below | |||
| ncnn::VkCompute cmd(vkdev); | |||
| // upload | |||
| ncnn::VkMat a_gpu; | |||
| cmd.record_clone(ap, a_gpu, opt); | |||
| ncnn::VkMat c_gpu; | |||
| op->forward(a_gpu, c_gpu, cmd, opt); | |||
| // download | |||
| cmd.record_clone(c_gpu, c, opt); | |||
| cmd.submit_and_wait(); | |||
| } | |||
| op->destroy_pipeline(opt); | |||
| delete op; | |||
| // return the allocators acquired above so they are not left checked out | |||
| // after every test case | |||
| vkdev->reclaim_blob_allocator(blob_vkallocator); | |||
| vkdev->reclaim_staging_allocator(staging_vkallocator); | |||
| // CompareMat is fed float data here, so widen both results first | |||
| ncnn::Mat b32; | |||
| ncnn::cast_int8_to_float32(b, b32, opt); | |||
| ncnn::Mat c32; | |||
| ncnn::cast_int8_to_float32(c, c32, opt); | |||
| if (CompareMat(b32, c32, 0.001) != 0) | |||
| { | |||
| fprintf(stderr, "test_packing_gpu_int8 failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| // Runs the fp32 packing check and then the int8 one for a single | |||
| // in_elempack/out_elempack pair. The int8 check is not run when the fp32 | |||
| // check already failed. Returns 0 when both pass, 1 otherwise. | |||
| static int test_packing_gpu(const ncnn::Mat& a, int in_elempack, int out_elempack) | |||
| { | |||
| if (test_packing_gpu_fp32(a, in_elempack, out_elempack)) | |||
| return 1; | |||
| if (test_packing_gpu_int8(a, in_elempack, out_elempack)) | |||
| return 1; | |||
| return 0; | |||
| } | |||
| #endif | |||
| static int test_packing_cpu(const ncnn::Mat& a) | |||
| @@ -329,15 +426,15 @@ static int test_packing_cpu(const ncnn::Mat& a) | |||
| static int test_packing_gpu(const ncnn::Mat& a) | |||
| { | |||
| return 0 | |||
| || test_packing_gpu_buffer(a, 1, 1) | |||
| || test_packing_gpu_buffer(a, 4, 4) | |||
| || test_packing_gpu_buffer(a, 8, 8) | |||
| || test_packing_gpu_buffer(a, 1, 4) | |||
| || test_packing_gpu_buffer(a, 4, 1) | |||
| || test_packing_gpu_buffer(a, 1, 8) | |||
| || test_packing_gpu_buffer(a, 8, 1) | |||
| || test_packing_gpu_buffer(a, 4, 8) | |||
| || test_packing_gpu_buffer(a, 8, 4); | |||
| || test_packing_gpu(a, 1, 1) | |||
| || test_packing_gpu(a, 4, 4) | |||
| || test_packing_gpu(a, 8, 8) | |||
| || test_packing_gpu(a, 1, 4) | |||
| || test_packing_gpu(a, 4, 1) | |||
| || test_packing_gpu(a, 1, 8) | |||
| || test_packing_gpu(a, 8, 1) | |||
| || test_packing_gpu(a, 4, 8) | |||
| || test_packing_gpu(a, 8, 4); | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| @@ -24,7 +24,7 @@ static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high) | |||
| } | |||
| else | |||
| { | |||
| if (a.dims == 1) scale_data.create(a.w); | |||
| if (a.dims == 1) scale_data.create(1); | |||
| if (a.dims == 2) scale_data.create(a.h); | |||
| if (a.dims == 3) scale_data.create(a.c); | |||
| Randomize(scale_data, scale_low, scale_high); | |||
| @@ -24,7 +24,7 @@ static int test_quantize_oom(const ncnn::Mat& a, float scale_low, float scale_hi | |||
| } | |||
| else | |||
| { | |||
| if (a.dims == 1) scale_data.create(a.w); | |||
| if (a.dims == 1) scale_data.create(1); | |||
| if (a.dims == 2) scale_data.create(a.h); | |||
| if (a.dims == 3) scale_data.create(a.c); | |||
| Randomize(scale_data, scale_low, scale_high); | |||
| @@ -759,7 +759,32 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n | |||
| std::vector<ncnn::VkMat> a_gpu(a.size()); | |||
| for (size_t i = 0; i < a_gpu.size(); i++) | |||
| { | |||
| cmd.record_upload(a[i], a_gpu[i], opt); | |||
| if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING) | |||
| { | |||
| // resolve dst_elempack | |||
| int dims = a[i].dims; | |||
| int elemcount = 0; | |||
| if (dims == 1) elemcount = a[i].elempack * a[i].w; | |||
| if (dims == 2) elemcount = a[i].elempack * a[i].h; | |||
| if (dims == 3 || dims == 4) elemcount = a[i].elempack * a[i].c; | |||
| const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1; | |||
| ncnn::Mat a4; | |||
| ncnn::convert_packing(a[i], a4, dst_elempack, opt); | |||
| ncnn::Option opt_upload = opt; | |||
| opt_upload.use_fp16_packed = false; | |||
| opt_upload.use_fp16_storage = false; | |||
| opt_upload.use_int8_packed = false; | |||
| opt_upload.use_int8_storage = false; | |||
| cmd.record_clone(a4, a_gpu[i], opt_upload); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(a[i], a_gpu[i], opt); | |||
| } | |||
| } | |||
| std::vector<ncnn::VkMat> d_gpu(top_blob_count); | |||
| @@ -1082,7 +1107,33 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n | |||
| { | |||
| // upload | |||
| ncnn::VkMat a_gpu; | |||
| cmd.record_upload(a, a_gpu, opt); | |||
| if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING) | |||
| { | |||
| // resolve dst_elempack | |||
| int dims = a.dims; | |||
| int elemcount = 0; | |||
| if (dims == 1) elemcount = a.elempack * a.w; | |||
| if (dims == 2) elemcount = a.elempack * a.h; | |||
| if (dims == 3 || dims == 4) elemcount = a.elempack * a.c; | |||
| const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1; | |||
| ncnn::Mat a4; | |||
| ncnn::convert_packing(a, a4, dst_elempack, opt); | |||
| ncnn::Option opt_upload = opt; | |||
| opt_upload.use_fp16_packed = false; | |||
| opt_upload.use_fp16_storage = false; | |||
| opt_upload.use_int8_packed = false; | |||
| opt_upload.use_int8_storage = false; | |||
| cmd.record_clone(a4, a_gpu, opt_upload); | |||
| } | |||
| else | |||
| { | |||
| cmd.record_upload(a, a_gpu, opt); | |||
| } | |||
| ncnn::VkMat d_gpu; | |||
| if (op->support_inplace) | |||