From 9f832c19c170322ecd01749a6d129e7e915ccdb1 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 26 Jun 2025 10:47:25 +0800 Subject: [PATCH] vulkan int8 packing quantize dequantize requantize (#3731) * add int8 definitions * packing vulkan int8/int32, quantize vulkan * vulkan dequantize * requantize vulkan --- src/allocator.cpp | 17 ++ src/command.cpp | 5 + src/gpu.cpp | 152 +++++++++++- src/gpu.h | 2 +- src/layer/packing.h | 2 + src/layer/vulkan/dequantize_vulkan.cpp | 231 ++++++++++++++++++ src/layer/vulkan/dequantize_vulkan.h | 46 ++++ src/layer/vulkan/packing_vulkan.cpp | 71 +++++- src/layer/vulkan/quantize_vulkan.cpp | 215 ++++++++++++++++ src/layer/vulkan/quantize_vulkan.h | 45 ++++ src/layer/vulkan/requantize_vulkan.cpp | 231 ++++++++++++++++++ src/layer/vulkan/requantize_vulkan.h | 47 ++++ src/layer/vulkan/shader/dequantize.comp | 80 ++++++ src/layer/vulkan/shader/dequantize_pack4.comp | 80 ++++++ src/layer/vulkan/shader/dequantize_pack8.comp | 84 +++++++ src/layer/vulkan/shader/packing_int8.comp | 73 ++++++ .../vulkan/shader/packing_pack1to4_int8.comp | 79 ++++++ .../vulkan/shader/packing_pack1to8_int8.comp | 88 +++++++ .../vulkan/shader/packing_pack4to1_int8.comp | 79 ++++++ .../vulkan/shader/packing_pack4to8_int8.comp | 75 ++++++ .../vulkan/shader/packing_pack8to1_int8.comp | 88 +++++++ .../vulkan/shader/packing_pack8to4_int8.comp | 75 ++++++ src/layer/vulkan/shader/quantize.comp | 63 +++++ src/layer/vulkan/shader/quantize_pack4.comp | 63 +++++ src/layer/vulkan/shader/quantize_pack8.comp | 65 +++++ src/layer/vulkan/shader/requantize.comp | 103 ++++++++ src/layer/vulkan/shader/requantize_pack4.comp | 103 ++++++++ src/layer/vulkan/shader/requantize_pack8.comp | 107 ++++++++ src/net.cpp | 6 + tests/test_dequantize.cpp | 6 +- tests/test_packing.cpp | 125 ++++++++-- tests/test_quantize.cpp | 2 +- tests/test_quantize_oom.cpp | 2 +- tests/testutil.cpp | 55 ++++- 34 files changed, 2523 insertions(+), 42 deletions(-) create mode 100644 
src/layer/vulkan/dequantize_vulkan.cpp create mode 100644 src/layer/vulkan/dequantize_vulkan.h create mode 100644 src/layer/vulkan/quantize_vulkan.cpp create mode 100644 src/layer/vulkan/quantize_vulkan.h create mode 100644 src/layer/vulkan/requantize_vulkan.cpp create mode 100644 src/layer/vulkan/requantize_vulkan.h create mode 100644 src/layer/vulkan/shader/dequantize.comp create mode 100644 src/layer/vulkan/shader/dequantize_pack4.comp create mode 100644 src/layer/vulkan/shader/dequantize_pack8.comp create mode 100644 src/layer/vulkan/shader/packing_int8.comp create mode 100644 src/layer/vulkan/shader/packing_pack1to4_int8.comp create mode 100644 src/layer/vulkan/shader/packing_pack1to8_int8.comp create mode 100644 src/layer/vulkan/shader/packing_pack4to1_int8.comp create mode 100644 src/layer/vulkan/shader/packing_pack4to8_int8.comp create mode 100644 src/layer/vulkan/shader/packing_pack8to1_int8.comp create mode 100644 src/layer/vulkan/shader/packing_pack8to4_int8.comp create mode 100644 src/layer/vulkan/shader/quantize.comp create mode 100644 src/layer/vulkan/shader/quantize_pack4.comp create mode 100644 src/layer/vulkan/shader/quantize_pack8.comp create mode 100644 src/layer/vulkan/shader/requantize.comp create mode 100644 src/layer/vulkan/shader/requantize_pack4.comp create mode 100644 src/layer/vulkan/shader/requantize_pack8.comp diff --git a/src/allocator.cpp b/src/allocator.cpp index 98115ccec..e036cc976 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -892,6 +892,13 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; } + if (elemsize / elempack == 1) + { + // int8 + if (elempack == 1) format = VK_FORMAT_R8_SINT; + if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT; + if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT; + } // resolve image width height depth int width = w; @@ -1468,6 +1475,16 @@ 
VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT; if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT; } + if (elemsize / elempack == 1) + { + // int8 + if (elempack == 1) format = VK_FORMAT_R8_SINT; + if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT; + if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT; + if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT; + if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT; + if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT; + } // resolve image width height depth int width = w; diff --git a/src/command.cpp b/src/command.cpp index 98af7c5d0..27f037178 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -450,6 +450,11 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) cast_type_to = 1; } + if (src.elemsize == src.elempack * 1u) + { + cast_type_to = 4; + } + VkMat dst_staging; vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging); diff --git a/src/gpu.cpp b/src/gpu.cpp index d6e5090f1..b9a7c76c3 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -3032,6 +3032,10 @@ public: // to fp32 | fp16 // to pack1 | pack4 | pack8 mutable ncnn::Layer* uop_packing[2][2][3]; + // from int8 + // to int8 + // to pack1 | pack4 | pack8 + mutable ncnn::Layer* uop_packing_int8[3]; mutable Mutex uop_lock; // device is valid and sucessfully initialized @@ -3047,6 +3051,7 @@ VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev) pipeline_cache = 0; valid = false; memset(uop_packing, 0, sizeof(uop_packing)); + memset(uop_packing_int8, 0, sizeof(uop_packing_int8)); } int VulkanDevicePrivate::create_dummy_buffer_image() @@ -3096,18 +3101,29 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image() const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const { + bool use_fp16 = (cast_type_from_index 
== 1 || cast_type_to_index == 1); + bool use_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3); + MutexLockGuard lock(uop_lock); - const ncnn::Layer* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index]; + const ncnn::Layer* cached_uop = 0; + if (use_int8) + { + cached_uop = uop_packing_int8[packing_type_to_index]; + } + else + { + cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index]; + } if (cached_uop) return cached_uop; - bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1); - // create uop Option opt; opt.use_fp16_packed = use_fp16; // fp16p is always supported opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); + opt.use_int8_packed = use_int8; // int8p is always supported + opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); // fp16/int8 arithmetic are not necessary for packing // and may conflict with storage options @@ -3132,14 +3148,21 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_ ncnn::ParamDict pd; pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 
4 : 8); // out_elempack - pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 + pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int32 4=int8 pd.set(3, cast_type_to_index + 1); uop->load_param(pd); uop->create_pipeline(opt); - uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop; + if (use_int8) + { + uop_packing_int8[packing_type_to_index] = uop; + } + else + { + uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop; + } return uop; } @@ -3164,6 +3187,8 @@ void VulkanDevicePrivate::destroy_utility_operator() opt.use_fp16_packed = use_fp16; opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); + opt.use_int8_packed = false; + opt.use_int8_storage = false; // to pack1 | pack4 | pack8 for (int k = 0; k < 3; k++) @@ -3183,6 +3208,33 @@ void VulkanDevicePrivate::destroy_utility_operator() } } } + + // int8 + { + bool use_int8 = true; + + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_int8_packed = use_int8; + opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); + + // to pack1 | pack4 | pack8 + for (int k = 0; k < 3; k++) + { + // enable pack8 for pack8to1/pack8to4 + opt.use_shader_pack8 = true; + + ncnn::Layer* uop = uop_packing_int8[k]; + if (!uop) + continue; + + uop->destroy_pipeline(opt); + + delete uop; + + uop_packing_int8[k] = 0; + } + } } VulkanDevice::VulkanDevice(int device_index) @@ -4232,18 +4284,35 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac { cast_type_from_index = 0; } - else // if (src.elembits() == 16) + else if (src.elembits() == 16) { cast_type_from_index = 1; } + else // if (src.elembits() == 8) + { + cast_type_from_index = 3; + } int cast_type_to_index = cast_type_to ? 
cast_type_to - 1 : cast_type_from_index; // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index); + if ((cast_type_from_index == 0 || cast_type_from_index == 1) && (cast_type_to_index == 2 || cast_type_to_index == 3)) + { + NCNN_LOGE("convert_packing from fp32/fp16 to int32/int8 is not supported"); + return; + } + if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1)) + { + NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16 is not supported"); + return; + } + Option opt2 = opt; opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1); opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage(); + opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3); + opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage(); const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index); uop->forward(src, dst, cmd, opt2); @@ -4809,6 +4878,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.append("afp2sfpmat4(v)", "v"); } + if (opt.use_int8_storage) + { + custom_defines.append("sint8", "int8_t"); + } + else if (opt.use_int8_packed) + { + custom_defines.append("sint8", "int"); + } + else + { + custom_defines.append("sint8", "int"); + } + + custom_defines.append("sint8vec4", "int"); + custom_defines.append("sint8vec8", "ivec2"); + + custom_defines.append("aint8", "int"); + custom_defines.append("aint8vec4", "ivec4"); + + custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)"); + custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))"); + + if (opt.use_int8_storage) + { + 
custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])"); + custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}"); + custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); + } + else + { + custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)"); + custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}"); + custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}"); + } + + custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])"); + custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}"); + custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); + + custom_defines.append("i8buffer_ld8(buf,i)", "ivec8(unpackInt4x8(buf[i].r),unpackInt4x8(buf[i].g))"); + custom_defines.append("i8buffer_st8(buf,i,v)", "{buf[i]=ivec2(packInt4x8(v.abcd),packInt4x8(v.efgh));}"); + custom_defines.append("i8buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); + custom_defines.append("psc(x)", "(x==0?p.x:x)"); if (opt.use_fp16_storage) @@ -5426,6 +5538,15 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option { custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n"; } + custom_exts += "struct ivec8 { ivec4 abcd; ivec4 efgh; };\n"; + if (opt.use_int8_storage) + { + custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n"; + } + if (opt.use_int8_arithmetic) + { + custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n"; + } #if ENABLE_VALIDATION_LAYER { custom_exts += "#extension GL_EXT_debug_printf : require\n"; @@ -5507,11 +5628,22 @@ int compile_spirv_module(const char* comp_data, int 
comp_data_size, const Option NCNN_LOGE("%s", s.getInfoLog()); NCNN_LOGE("%s", s.getInfoDebugLog()); - // for (int i = 0; i < 4; i++) + // print as line_number: code { - int i = 3; - std::string s(comp_datas[i], comp_data_sizes[i]); - NCNN_LOGE("%s", s.c_str()); + const char* p = comp_datas[3]; + const char* line_end; + int line_number = 1; + + while ((line_end = strchr(p, '\n')) != NULL) + { + NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p); + p = line_end + 1; + } + + if (*p != '\0') + { + NCNN_LOGE("%d:\t%s", line_number, p); + } } compile_success = false; diff --git a/src/gpu.h b/src/gpu.h index d9668b837..cefb02363 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -465,7 +465,7 @@ public: // utility operator void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; - // cast_type_to 0=auto(same as src) 1=fp32 2=fp16 + // cast_type_to 0=auto(same as src) 1=fp32 2=fp16 3=int32 4=int8 void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const; // VK_KHR_bind_memory2 diff --git a/src/layer/packing.h b/src/layer/packing.h index f590f0fe1..bdb511da9 100644 --- a/src/layer/packing.h +++ b/src/layer/packing.h @@ -36,6 +36,8 @@ public: // 0 = auto // 1 = fp32 // 2 = fp16 + // 3 = int32 + // 4 = int8 int cast_type_from; int cast_type_to; }; diff --git a/src/layer/vulkan/dequantize_vulkan.cpp b/src/layer/vulkan/dequantize_vulkan.cpp new file mode 100644 index 000000000..6ffccbc9e --- /dev/null +++ b/src/layer/vulkan/dequantize_vulkan.cpp @@ -0,0 +1,231 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_vulkan.h" + +#include "layer_shader_type.h" + +namespace ncnn { + +Dequantize_vulkan::Dequantize_vulkan() +{ + support_vulkan = true; + + pipeline_dequantize = 0; + pipeline_dequantize_pack4 = 0; + pipeline_dequantize_pack8 = 0; +} + +int Dequantize_vulkan::create_pipeline(const Option& opt) +{ + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + const int dims = shape.dims; + + int elempack = 1; + if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 
4 : 1; + + const size_t elemsize = elempack * 4u; + size_t out_elemsize; + if (opt.use_fp16_storage || opt.use_fp16_packed) + { + out_elemsize = elempack * 2u; + } + else + { + out_elemsize = elempack * 4u; + } + + Mat shape_packed; + if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); + if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); + + Mat out_shape_packed; + if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack); + if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack); + if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack); + if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack); + + size_t c = 0; + size_t in_stride = 0; + size_t out_stride = 0; + if (dims == 1) + { + c = 1; + in_stride = shape_packed.w; + out_stride = out_shape_packed.w; + } + if (dims == 2) + { + c = shape_packed.h; + in_stride = shape_packed.w; + out_stride = out_shape_packed.w; + } + if (dims == 3 || dims == 4) + { + c = shape_packed.c; + in_stride = shape_packed.cstep; + out_stride = out_shape_packed.cstep; + } + + std::vector specializations(4 + 3); + specializations[0].i = scale_data_size; + specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f; + specializations[2].i = bias_data_size; + specializations[3].f = bias_data_size == 1 ? 
bias_data[0] : 0.f; + specializations[4 + 0].u32 = c; + specializations[4 + 1].u32 = in_stride; + specializations[4 + 2].u32 = out_stride; + + const int local_size_x = vkdev->info.subgroup_size(); + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_dequantize = new Pipeline(vkdev); + pipeline_dequantize->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_dequantize->create(LayerShaderType::dequantize, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_dequantize_pack4 = new Pipeline(vkdev); + pipeline_dequantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_dequantize_pack4->create(LayerShaderType::dequantize_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_dequantize_pack8 = new Pipeline(vkdev); + pipeline_dequantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_dequantize_pack8->create(LayerShaderType::dequantize_pack8, opt, specializations); + } + + return 0; +} + +int Dequantize_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_dequantize; + pipeline_dequantize = 0; + + delete pipeline_dequantize_pack4; + pipeline_dequantize_pack4 = 0; + + delete pipeline_dequantize_pack8; + pipeline_dequantize_pack8 = 0; + + return 0; +} + +int Dequantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (scale_data_size > 1) + { + cmd.record_upload(scale_data, scale_data_gpu, opt); + } + + if (bias_data_size > 1) + { + cmd.record_upload(bias_data, bias_data_gpu, opt); + } + + return 0; +} + +int Dequantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + size_t out_elemsize; + if 
(opt.use_fp16_storage || opt.use_fp16_packed) + { + out_elemsize = elempack * 2u; + } + else + { + out_elemsize = elempack * 4u; + } + + if (dims == 1) + top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 2) + top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 3) + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 4) + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + size_t c = 0; + size_t in_stride = 0; + size_t out_stride = 0; + if (dims == 1) + { + c = 1; + in_stride = bottom_blob.w; + out_stride = top_blob.w; + } + if (dims == 2) + { + c = bottom_blob.h; + in_stride = bottom_blob.w; + out_stride = top_blob.w; + } + if (dims == 3 || dims == 4) + { + c = bottom_blob.c; + in_stride = bottom_blob.cstep; + out_stride = top_blob.cstep; + } + + std::vector bindings(4); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = scale_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(3); + constants[0].u32 = c; + constants[1].u32 = in_stride; + constants[2].u32 = out_stride; + + VkMat dispatcher; + dispatcher.w = in_stride * c; + dispatcher.h = 1; + dispatcher.c = 1; + + const Pipeline* pipeline = elempack == 8 ? pipeline_dequantize_pack8 + : elempack == 4 ? pipeline_dequantize_pack4 + : pipeline_dequantize; + + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/dequantize_vulkan.h b/src/layer/vulkan/dequantize_vulkan.h new file mode 100644 index 000000000..08ee83fc5 --- /dev/null +++ b/src/layer/vulkan/dequantize_vulkan.h @@ -0,0 +1,46 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_VULKAN_H +#define LAYER_DEQUANTIZE_VULKAN_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_vulkan : virtual public Dequantize +{ +public: + Dequantize_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + using Dequantize::forward; + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + VkMat scale_data_gpu; + VkMat bias_data_gpu; + + Pipeline* pipeline_dequantize; + Pipeline* pipeline_dequantize_pack4; + Pipeline* pipeline_dequantize_pack8; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_VULKAN_H diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp index d314d4554..aba9c6557 100644 --- a/src/layer/vulkan/packing_vulkan.cpp +++ b/src/layer/vulkan/packing_vulkan.cpp @@ -45,6 +45,8 @@ int Packing_vulkan::create_pipeline(const Option& opt) const int local_size_x = vkdev->info.subgroup_size(); + bool use_int8_shader = cast_type_from == 4 || cast_type_to == 4; + std::vector specializations(2 + 3); specializations[0].i = cast_type_from; specializations[1].i = cast_type_to; @@ -91,7 +93,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing = new Pipeline(vkdev); pipeline_packing->set_optimal_local_size_xyz(local_size_x, 1, 1); 
- pipeline_packing->create(LayerShaderType::packing, opt, specializations); + if (use_int8_shader) + { + pipeline_packing->create(LayerShaderType::packing_int8, opt, specializations); + } + else + { + pipeline_packing->create(LayerShaderType::packing, opt, specializations); + } } if (shape.dims == 0 || elempack < out_elempack) { @@ -126,7 +135,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing_pack1to4 = new Pipeline(vkdev); pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_x, 1, 1); - pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations); + if (use_int8_shader) + { + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_int8, opt, specializations); + } + else + { + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations); + } } if (shape.dims == 0 || (elempack == 1 && out_elempack == 8)) @@ -138,7 +154,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing_pack1to8 = new Pipeline(vkdev); pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_x, 1, 1); - pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations); + if (use_int8_shader) + { + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_int8, opt, specializations); + } + else + { + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations); + } } if (shape.dims == 0 || (elempack == 4 && out_elempack == 8)) @@ -150,7 +173,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing_pack4to8 = new Pipeline(vkdev); pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_x, 1, 1); - pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations); + if (use_int8_shader) + { + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_int8, opt, specializations); + } + else + { + 
pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations); + } } } if (shape.dims == 0 || elempack > out_elempack) @@ -186,7 +216,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing_pack4to1 = new Pipeline(vkdev); pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_x, 1, 1); - pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations); + if (use_int8_shader) + { + pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_int8, opt, specializations); + } + else + { + pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations); + } } if (shape.dims == 0 || (elempack == 8 && out_elempack == 1)) @@ -198,7 +235,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing_pack8to1 = new Pipeline(vkdev); pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_x, 1, 1); - pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations); + if (use_int8_shader) + { + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_int8, opt, specializations); + } + else + { + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations); + } } if (shape.dims == 0 || (elempack == 8 && out_elempack == 4)) @@ -210,7 +254,14 @@ int Packing_vulkan::create_pipeline(const Option& opt) pipeline_packing_pack8to4 = new Pipeline(vkdev); pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_x, 1, 1); - pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations); + if (use_int8_shader) + { + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_int8, opt, specializations); + } + else + { + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations); + } } } @@ -296,10 +347,14 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { out_elemsize = 
out_elempack * 4u; } - else // if (cast_type_to == 2) + else if (cast_type_to == 2) { out_elemsize = out_elempack * 2u; } + else // if (cast_type_to == 4) + { + out_elemsize = out_elempack * 1u; + } if (dims == 1) { diff --git a/src/layer/vulkan/quantize_vulkan.cpp b/src/layer/vulkan/quantize_vulkan.cpp new file mode 100644 index 000000000..8ad860147 --- /dev/null +++ b/src/layer/vulkan/quantize_vulkan.cpp @@ -0,0 +1,215 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_vulkan.h" + +#include "layer_shader_type.h" + +namespace ncnn { + +Quantize_vulkan::Quantize_vulkan() +{ + support_vulkan = true; + + pipeline_quantize = 0; + pipeline_quantize_pack4 = 0; + pipeline_quantize_pack8 = 0; +} + +int Quantize_vulkan::create_pipeline(const Option& opt) +{ + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + const int dims = shape.dims; + + int elempack = 0; + if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 
4 : 1; + + size_t elemsize; + const size_t out_elemsize = elempack * 1u; + if (opt.use_fp16_storage || opt.use_fp16_packed) + { + elemsize = elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Mat shape_packed; + if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); + if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); + + Mat out_shape_packed; + if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack); + if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack); + if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack); + if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack); + + size_t c = 0; + size_t in_stride = 0; + size_t out_stride = 0; + if (dims == 1) + { + c = 1; + in_stride = shape_packed.w; + out_stride = out_shape_packed.w; + } + if (dims == 2) + { + c = shape_packed.h; + in_stride = shape_packed.w; + out_stride = out_shape_packed.w; + } + if (dims == 3 || dims == 4) + { + c = shape_packed.c; + in_stride = shape_packed.cstep; + out_stride = out_shape_packed.cstep; + } + + std::vector specializations(2 + 3); + specializations[0].i = scale_data_size; + specializations[1].f = scale_data_size == 1 ? 
scale_data[0] : 1.f; + specializations[2 + 0].u32 = c; + specializations[2 + 1].u32 = in_stride; + specializations[2 + 2].u32 = out_stride; + + const int local_size_x = vkdev->info.subgroup_size(); + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_quantize = new Pipeline(vkdev); + pipeline_quantize->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_quantize->create(LayerShaderType::quantize, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_quantize_pack4 = new Pipeline(vkdev); + pipeline_quantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_quantize_pack4->create(LayerShaderType::quantize_pack4, opt, specializations); + } + + // pack8 + if (shape.dims == 0 || elempack == 8) + { + pipeline_quantize_pack8 = new Pipeline(vkdev); + pipeline_quantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_quantize_pack8->create(LayerShaderType::quantize_pack8, opt, specializations); + } + + return 0; +} + +int Quantize_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_quantize; + pipeline_quantize = 0; + + delete pipeline_quantize_pack4; + pipeline_quantize_pack4 = 0; + + delete pipeline_quantize_pack8; + pipeline_quantize_pack8 = 0; + + return 0; +} + +int Quantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (scale_data_size > 1) + { + cmd.record_upload(scale_data, scale_data_gpu, opt); + } + + return 0; +} + +int Quantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const size_t out_elemsize = 1u * elempack; + + if (dims == 1) + top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 2) + top_blob.create(w, h, out_elemsize, elempack, 
opt.blob_vkallocator); + if (dims == 3) + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 4) + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + size_t c = 0; + size_t in_stride = 0; + size_t out_stride = 0; + if (dims == 1) + { + c = 1; + in_stride = bottom_blob.w; + out_stride = top_blob.w; + } + if (dims == 2) + { + c = bottom_blob.h; + in_stride = bottom_blob.w; + out_stride = top_blob.w; + } + if (dims == 3 || dims == 4) + { + c = bottom_blob.c; + in_stride = bottom_blob.cstep; + out_stride = top_blob.cstep; + } + + std::vector bindings(3); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = scale_data_gpu; + + std::vector constants(3); + constants[0].u32 = c; + constants[1].u32 = in_stride; + constants[2].u32 = out_stride; + + VkMat dispatcher; + dispatcher.w = in_stride * c; + dispatcher.h = 1; + dispatcher.c = 1; + + const Pipeline* pipeline = elempack == 8 ? pipeline_quantize_pack8 + : elempack == 4 ? pipeline_quantize_pack4 + : pipeline_quantize; + + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/quantize_vulkan.h b/src/layer/vulkan/quantize_vulkan.h new file mode 100644 index 000000000..9a1963932 --- /dev/null +++ b/src/layer/vulkan/quantize_vulkan.h @@ -0,0 +1,45 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_QUANTIZE_VULKAN_H
+#define LAYER_QUANTIZE_VULKAN_H
+
+#include "quantize.h"
+
+namespace ncnn {
+// Quantize layer variant that records its work on VkMat blobs through a VkCompute command recorder.
+class Quantize_vulkan : virtual public Quantize
+{
+public:
+    Quantize_vulkan();
+    // build / release the three Pipeline objects declared below
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+    // records the upload of scale_data into scale_data_gpu (only when scale_data_size > 1)
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+    // keep the base-class forward overloads visible next to the VkMat overload
+    using Quantize::forward;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    VkMat scale_data_gpu; // bound as shader binding 2 in forward
+    // one pipeline per elempack (1, 4, 8); forward selects by bottom_blob.elempack
+    Pipeline* pipeline_quantize;
+    Pipeline* pipeline_quantize_pack4;
+    Pipeline* pipeline_quantize_pack8;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_QUANTIZE_VULKAN_H
diff --git a/src/layer/vulkan/requantize_vulkan.cpp b/src/layer/vulkan/requantize_vulkan.cpp
new file mode 100644
index 000000000..e85743c4e
--- /dev/null
+++ b/src/layer/vulkan/requantize_vulkan.cpp
@@ -0,0 +1,231 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "requantize_vulkan.h" + +#include "layer_shader_type.h" + +namespace ncnn { + +Requantize_vulkan::Requantize_vulkan() +{ + support_vulkan = true; + + pipeline_requantize = 0; + pipeline_requantize_pack4 = 0; + pipeline_requantize_pack8 = 0; +} + +int Requantize_vulkan::create_pipeline(const Option& opt) +{ + const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + const int dims = shape.dims; + + int elempack = 1; + if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (dims == 3 || dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 
4 : 1; + + const size_t elemsize = elempack * 4u; + const size_t out_elemsize = out_elempack * 1u; + + Mat shape_packed; + if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); + if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); + + Mat out_shape_packed; + if (dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + size_t c = 0; + size_t in_stride = 0; + size_t out_stride = 0; + if (dims == 1) + { + c = 1; + in_stride = shape_packed.w; + out_stride = out_shape_packed.w; + } + if (dims == 2) + { + c = shape_packed.h; + in_stride = shape_packed.w; + out_stride = out_shape_packed.w; + } + if (dims == 3 || dims == 4) + { + c = shape_packed.c; + in_stride = shape_packed.cstep; + out_stride = out_shape_packed.cstep; + } + + std::vector specializations(9 + 3); + specializations[0].i = scale_in_data_size; + specializations[1].f = scale_in_data_size == 1 ? scale_in_data[0] : 1.f; + specializations[2].i = scale_out_data_size; + specializations[3].f = scale_out_data_size == 1 ? scale_out_data[0] : 1.f; + specializations[4].i = bias_data_size; + specializations[5].f = bias_data_size == 1 ? bias_data[0] : 0.f; + specializations[6].i = activation_type; + specializations[7].f = activation_params.w >= 1 ? 
activation_params[0] : 0.f; + specializations[8].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[9 + 0].u32 = c; + specializations[9 + 1].u32 = in_stride; + specializations[9 + 2].u32 = out_stride; + + const int local_size_x = vkdev->info.subgroup_size(); + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_requantize = new Pipeline(vkdev); + pipeline_requantize->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_requantize->create(LayerShaderType::requantize, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_requantize_pack4 = new Pipeline(vkdev); + pipeline_requantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_requantize_pack4->create(LayerShaderType::requantize_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_requantize_pack8 = new Pipeline(vkdev); + pipeline_requantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1); + pipeline_requantize_pack8->create(LayerShaderType::requantize_pack8, opt, specializations); + } + + return 0; +} + +int Requantize_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_requantize; + pipeline_requantize = 0; + + delete pipeline_requantize_pack4; + pipeline_requantize_pack4 = 0; + + delete pipeline_requantize_pack8; + pipeline_requantize_pack8 = 0; + + return 0; +} + +int Requantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (scale_in_data_size > 1) + { + cmd.record_upload(scale_in_data, scale_in_data_gpu, opt); + } + + if (scale_out_data_size > 1) + { + cmd.record_upload(scale_out_data, scale_out_data_gpu, opt); + } + + if (bias_data_size > 1) + { + cmd.record_upload(bias_data, bias_data_gpu, opt); + } + + return 0; +} + +int Requantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = 
bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + size_t out_elemsize = 1u * elempack; + + if (dims == 1) + top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 2) + top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 3) + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); + if (dims == 4) + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + size_t c = 0; + size_t in_stride = 0; + size_t out_stride = 0; + if (dims == 1) + { + c = 1; + in_stride = bottom_blob.w; + out_stride = top_blob.w; + } + if (dims == 2) + { + c = bottom_blob.h; + in_stride = bottom_blob.w; + out_stride = top_blob.w; + } + if (dims == 3 || dims == 4) + { + c = bottom_blob.c; + in_stride = bottom_blob.cstep; + out_stride = top_blob.cstep; + } + + std::vector bindings(5); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = scale_in_data_gpu; + bindings[3] = scale_out_data_gpu; + bindings[4] = bias_data_gpu; + + std::vector constants(3); + constants[0].u32 = c; + constants[1].u32 = in_stride; + constants[2].u32 = out_stride; + + VkMat dispatcher; + dispatcher.w = in_stride * c; + dispatcher.h = 1; + dispatcher.c = 1; + + const Pipeline* pipeline = elempack == 8 ? pipeline_requantize_pack8 + : elempack == 4 ? pipeline_requantize_pack4 + : pipeline_requantize; + + cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/requantize_vulkan.h b/src/layer/vulkan/requantize_vulkan.h new file mode 100644 index 000000000..c0a86199e --- /dev/null +++ b/src/layer/vulkan/requantize_vulkan.h @@ -0,0 +1,47 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REQUANTIZE_VULKAN_H
+#define LAYER_REQUANTIZE_VULKAN_H
+
+#include "requantize.h"
+
+namespace ncnn {
+// Requantize layer variant that records its work on VkMat blobs through a VkCompute command recorder.
+class Requantize_vulkan : virtual public Requantize
+{
+public:
+    Requantize_vulkan();
+    // build / release the three Pipeline objects declared below
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+    // records uploads of the scale/bias arrays that hold more than one value
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+    // keep the base-class forward overloads visible next to the VkMat overload
+    using Requantize::forward;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    VkMat scale_in_data_gpu; // shader binding 2 in forward
+    VkMat scale_out_data_gpu; // shader binding 3 in forward
+    VkMat bias_data_gpu; // shader binding 4 in forward
+    // one pipeline per elempack (1, 4, 8); forward selects by bottom_blob.elempack
+    Pipeline* pipeline_requantize;
+    Pipeline* pipeline_requantize_pack4;
+    Pipeline* pipeline_requantize_pack8;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REQUANTIZE_VULKAN_H
diff --git a/src/layer/vulkan/shader/dequantize.comp b/src/layer/vulkan/shader/dequantize.comp
new file mode 100644
index 000000000..4dd77d713
--- /dev/null
+++ b/src/layer/vulkan/shader/dequantize.comp
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Dequantize, one value per element: each invocation writes float(v) * scale + bias for one int input.
+layout (constant_id = 0) const int scale_data_size = 0; // 1 -> use scale_value, otherwise read scale_blob
+layout (constant_id = 1) const float scale_value = 1.f;
+layout (constant_id = 2) const int bias_data_size = 0; // 0 -> no bias, 1 -> use bias_value, otherwise read bias_blob
+layout (constant_id = 3) const float bias_value = 0.f;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfp bias_blob_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p;
+
+void main()
+{
+    const uint gi = gl_GlobalInvocationID.x;
+    // NOTE(review): psc() is defined outside this file; presumably it picks the specialization constant or the p.* field - confirm
+    if (gi >= psc(in_stride) * psc(c))
+        return;
+    // split the flat index into row/channel gy and offset gx inside it
+    const uint gy = gi / psc(in_stride);
+    const uint gx = gi % psc(in_stride);
+
+    int v = bottom_blob_data[gi];
+
+    afp scale;
+    if (scale_data_size == 1)
+    {
+        scale = afp(scale_value);
+    }
+    else
+    {
+        scale = buffer_ld1(scale_blob_data, gy); // one scale per gy
+    }
+
+    afp bias;
+    if (bias_data_size == 0)
+    {
+        bias = afp(0.f);
+    }
+    else if (bias_data_size == 1)
+    {
+        bias = afp(bias_value);
+    }
+    else
+    {
+        bias = buffer_ld1(bias_blob_data, gy); // one bias per gy
+    }
+
+    afp v_fp = afp(v) * scale + bias;
+    // the output is addressed with its own stride
+    const uint outgi = gy * psc(out_stride) + gx;
+
+    buffer_st1(top_blob_data, outgi, v_fp);
+}
diff --git a/src/layer/vulkan/shader/dequantize_pack4.comp b/src/layer/vulkan/shader/dequantize_pack4.comp
new file mode 100644
index 000000000..b54d0af92
--- /dev/null
+++ b/src/layer/vulkan/shader/dequantize_pack4.comp
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Dequantize, four lanes per element: each invocation writes float(v) * scale + bias for one ivec4 input.
+layout (constant_id = 0) const int scale_data_size = 0; // 1 -> use scale_value, otherwise read scale_blob
+layout (constant_id = 1) const float scale_value = 1.f;
+layout (constant_id = 2) const int bias_data_size = 0; // 0 -> no bias, 1 -> use bias_value, otherwise read bias_blob
+layout (constant_id = 3) const float bias_value = 0.f;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p;
+
+void main()
+{
+    const uint gi = gl_GlobalInvocationID.x;
+    // NOTE(review): psc() is defined outside this file; presumably it picks the specialization constant or the p.* field - confirm
+    if (gi >= psc(in_stride) * psc(c))
+        return;
+    // split the flat index into row/channel gy and offset gx inside it
+    const uint gy = gi / psc(in_stride);
+    const uint gx = gi % psc(in_stride);
+
+    ivec4 v = bottom_blob_data[gi];
+
+    afpvec4 scale;
+    if (scale_data_size == 1)
+    {
+        scale = afpvec4(scale_value); // broadcast the single scale to all four lanes
+    }
+    else
+    {
+        scale = buffer_ld4(scale_blob_data, gy); // one 4-lane scale per gy
+    }
+
+    afpvec4 bias;
+    if (bias_data_size == 0)
+    {
+        bias = afpvec4(0.f);
+    }
+    else if (bias_data_size == 1)
+    {
+        bias = afpvec4(bias_value);
+    }
+    else
+    {
+        bias = buffer_ld4(bias_blob_data, gy); // one 4-lane bias per gy
+    }
+
+    afpvec4 v_fp = afpvec4(v) * scale + bias;
+    // the output is addressed with its own stride
+    const uint outgi = gy * psc(out_stride) + gx;
+
+    buffer_st4(top_blob_data, outgi, v_fp);
+}
diff --git a/src/layer/vulkan/shader/dequantize_pack8.comp b/src/layer/vulkan/shader/dequantize_pack8.comp
new file mode 100644
index 000000000..63b759b7c
--- /dev/null
+++ b/src/layer/vulkan/shader/dequantize_pack8.comp
@@ -0,0 +1,84 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Dequantize, eight lanes per element: each invocation writes float(v) * scale + bias for one ivec8 input.
+layout (constant_id = 0) const int scale_data_size = 0; // 1 -> use scale_value, otherwise read scale_blob
+layout (constant_id = 1) const float scale_value = 1.f;
+layout (constant_id = 2) const int bias_data_size = 0; // 0 -> no bias, 1 -> use bias_value, otherwise read bias_blob
+layout (constant_id = 3) const float bias_value = 0.f;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p;
+
+void main()
+{
+    const uint gi = gl_GlobalInvocationID.x;
+    // NOTE(review): psc() is defined outside this file; presumably it picks the specialization constant or the p.* field - confirm
+    if (gi >= psc(in_stride) * psc(c))
+        return;
+    // split the flat index into row/channel gy and offset gx inside it
+    const uint gy = gi / psc(in_stride);
+    const uint gx = gi % psc(in_stride);
+
+    ivec8 v = bottom_blob_data[gi];
+
+    afpvec8 scale;
+    if (scale_data_size == 1)
+    {
+        scale = afpvec8(afpvec4(scale_value), afpvec4(scale_value)); // broadcast to both 4-lane halves
+    }
+    else
+    {
+        scale = buffer_ld8(scale_blob_data, gy); // one 8-lane scale per gy
+    }
+
+    afpvec8 bias;
+    if (bias_data_size == 0)
+    {
+        bias[0] = afpvec4(0.f);
+        bias[1] = afpvec4(0.f);
+    }
+    else if (bias_data_size == 1)
+    {
+        bias[0] = afpvec4(bias_value);
+        bias[1] = afpvec4(bias_value);
+    }
+    else
+    {
+        bias = buffer_ld8(bias_blob_data, gy); // one 8-lane bias per gy
+    }
+    // the two 4-lane halves (abcd / efgh) are processed separately
+    afpvec8 v_fp;
+    v_fp[0] = afpvec4(v.abcd) * scale[0] + bias[0];
+    v_fp[1] = afpvec4(v.efgh) * scale[1] + bias[1];
+    // the output is addressed with its own stride
+    const uint outgi = gy * psc(out_stride) + gx;
+
+    buffer_st8(top_blob_data, outgi, v_fp);
+}
diff --git a/src/layer/vulkan/shader/packing_int8.comp b/src/layer/vulkan/shader/packing_int8.comp
new file mode 100644
index 000000000..4ea5a8f93
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_int8.comp
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Same-packing copy/convert of 4-lane elements between the int8 buffers and the int32 buffers.
+layout (constant_id = 0) const int cast_type_from = 0; // the value 3 selects the int32 source buffer
+layout (constant_id = 1) const int cast_type_to = 1; // the value 3 selects the int32 destination buffer
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+void main()
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    // NOTE(review): psc() is defined outside this file; presumably it picks the specialization constant or the p.* field - confirm
+    if (gx >= psc(n) || gy >= psc(c))
+        return;
+
+    const uint gi = gy * psc(n) + gx;
+    // identical source and destination type: plain element copy at the same index
+    if (cast_type_from == cast_type_to)
+    {
+        i8buffer_cp4(top_blob_data, gi, bottom_blob_data, gi);
+        return;
+    }
+    // in the conversion path the int8 buffers use a row pitch of stride, the int32 buffers a pitch of n
+    const uint gi2 = gy * psc(stride) + gx;
+
+    ivec4 v;
+    if (cast_type_from == 3)
+    {
+        v = bottom_blob_int32_data[gi];
+    }
+    else
+    {
+        v = i8buffer_ld4(bottom_blob_data, gi2);
+    }
+
+    if (cast_type_to == 3)
+    {
+        top_blob_int32_data[gi] = v;
+    }
+    else
+    {
+        i8buffer_st4(top_blob_data, gi2, v);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack1to4_int8.comp b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
new file mode 100644
index 000000000..fb99d5d34
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
@@ -0,0 +1,79 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Pack 1 -> 4: gathers four scalar values from four consecutive source rows into one 4-lane element.
+layout (constant_id = 0) const int cast_type_from = 0; // the value 3 selects the int32 source buffer
+layout (constant_id = 1) const int cast_type_to = 1; // the value 3 selects the int32 destination buffer
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+void main()
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    // psc() is defined outside this file
+    if (gx >= psc(n) || gy >= psc(c))
+        return;
+    // source indices: rows gy*4 .. gy*4+3, each row stride elements long, column gx
+    const uvec4 gi4 = (gy * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + gx;
+    // destination index: row gy with n elements per row
+    const uint gi = gy * psc(n) + gx;
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, gi4);
+//         return;
+//     }
+
+    ivec4 v;
+    if (cast_type_from == 3)
+    {
+        v.r = bottom_blob_int32_data[gi4.r];
+        v.g = bottom_blob_int32_data[gi4.g];
+        v.b = bottom_blob_int32_data[gi4.b];
+        v.a = bottom_blob_int32_data[gi4.a];
+    }
+    else
+    {
+        v.r = i8buffer_ld1(bottom_blob_data, gi4.r);
+        v.g = i8buffer_ld1(bottom_blob_data, gi4.g);
+        v.b = i8buffer_ld1(bottom_blob_data, gi4.b);
+        v.a = i8buffer_ld1(bottom_blob_data, gi4.a);
+    }
+
+    if (cast_type_to == 3)
+    {
+        top_blob_int32_data[gi] = v;
+    }
+    else
+    {
+        i8buffer_st4(top_blob_data, gi, v);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack1to8_int8.comp b/src/layer/vulkan/shader/packing_pack1to8_int8.comp
new file mode 100644
index 000000000..4f7b14732
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack1to8_int8.comp
@@ -0,0 +1,88 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Pack 1 -> 8: gathers eight scalar values from eight consecutive source rows into one 8-lane element.
+layout (constant_id = 0) const int cast_type_from = 0; // the value 3 selects the int32 source buffer
+layout (constant_id = 1) const int cast_type_to = 1; // the value 3 selects the int32 destination buffer
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+void main()
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    // psc() is defined outside this file
+    if (gx >= psc(n) || gy >= psc(c))
+        return;
+    // source indices: gi4 covers rows gy*8 .. gy*8+3, gi8 the following four rows; column gx
+    const uvec4 gi4 = (gy * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + gx;
+    const uvec4 gi8 = gi4 + psc(stride) * 4;
+    // destination index: row gy with n elements per row
+    const uint gi = gy * psc(n) + gx;
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         i8buffer_cp1to8(top_blob_data, gi, bottom_blob_data, gi4, gi8);
+//         return;
+//     }
+
+    ivec8 v;
+    if (cast_type_from == 3)
+    {
+        v.abcd.r = bottom_blob_int32_data[gi4.r];
+        v.abcd.g = bottom_blob_int32_data[gi4.g];
+        v.abcd.b = bottom_blob_int32_data[gi4.b];
+        v.abcd.a = bottom_blob_int32_data[gi4.a];
+        v.efgh.r = bottom_blob_int32_data[gi8.r];
+        v.efgh.g = bottom_blob_int32_data[gi8.g];
+        v.efgh.b = bottom_blob_int32_data[gi8.b];
+        v.efgh.a = bottom_blob_int32_data[gi8.a];
+    }
+    else
+    {
+        v.abcd.r = i8buffer_ld1(bottom_blob_data, gi4.r);
+        v.abcd.g = i8buffer_ld1(bottom_blob_data, gi4.g);
+        v.abcd.b = i8buffer_ld1(bottom_blob_data, gi4.b);
+        v.abcd.a = i8buffer_ld1(bottom_blob_data, gi4.a);
+        v.efgh.r = i8buffer_ld1(bottom_blob_data, gi8.r);
+        v.efgh.g = i8buffer_ld1(bottom_blob_data, gi8.g);
+        v.efgh.b = i8buffer_ld1(bottom_blob_data, gi8.b);
+        v.efgh.a = i8buffer_ld1(bottom_blob_data, gi8.a);
+    }
+
+    if (cast_type_to == 3)
+    {
+        top_blob_int32_data[gi] = v;
+    }
+    else
+    {
+        i8buffer_st8(top_blob_data, gi, v);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack4to1_int8.comp b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
new file mode 100644
index 000000000..53145a40c
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
@@ -0,0 +1,79 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Pack 4 -> 1: scatters the four lanes of one source element to four consecutive destination rows.
+layout (constant_id = 0) const int cast_type_from = 0; // the value 3 selects the int32 source buffer
+layout (constant_id = 1) const int cast_type_to = 1; // the value 3 selects the int32 destination buffer
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+void main()
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    // psc() is defined outside this file
+    if (gx >= psc(n) || gy >= psc(c))
+        return;
+    // source index: row gy with n elements per row
+    const uint gi = gy * psc(n) + gx;
+    // destination indices: rows gy*4 .. gy*4+3, each row stride elements long, column gx
+    const uvec4 gi4 = (gy * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + gx;
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         buffer_cp4to1(top_blob_data, gi4, bottom_blob_data, gi);
+//         return;
+//     }
+
+    ivec4 v;
+    if (cast_type_from == 3)
+    {
+        v = bottom_blob_int32_data[gi];
+    }
+    else
+    {
+        v = i8buffer_ld4(bottom_blob_data, gi);
+    }
+
+    if (cast_type_to == 3)
+    {
+        top_blob_int32_data[gi4.r] = v.r;
+        top_blob_int32_data[gi4.g] = v.g;
+        top_blob_int32_data[gi4.b] = v.b;
+        top_blob_int32_data[gi4.a] = v.a;
+    }
+    else
+    {
+        i8buffer_st1(top_blob_data, gi4.r, v.r);
+        i8buffer_st1(top_blob_data, gi4.g, v.g);
+        i8buffer_st1(top_blob_data, gi4.b, v.b);
+        i8buffer_st1(top_blob_data, gi4.a, v.a);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack4to8_int8.comp b/src/layer/vulkan/shader/packing_pack4to8_int8.comp
new file mode 100644
index 000000000..112dd2472
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack4to8_int8.comp
@@ -0,0 +1,75 @@
+// Tencent is pleased to support 
the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Pack 4 -> 8: joins two 4-lane elements from two consecutive source rows into one 8-lane element.
+layout (constant_id = 0) const int cast_type_from = 0; // the value 3 selects the int32 source buffer
+layout (constant_id = 1) const int cast_type_to = 1; // the value 3 selects the int32 destination buffer
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };
+// same three shape values as the specialization constants above, supplied at dispatch time
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+void main()
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+    // psc() is defined outside this file
+    if (gx >= psc(n) || gy >= psc(c))
+        return;
+    // source indices: rows gy*2 and gy*2+1, each row stride elements long, column gx
+    const uvec2 gi2 = (gy * 2 + uvec2(0, 1)) * psc(stride) + gx;
+    // destination index: row gy with n elements per row
+    const uint gi = gy * psc(n) + gx;
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         buffer_cp4to8(top_blob_data, gi, bottom_blob_data, gi2);
+//         return;
+//     }
+
+    ivec8 v;
+    if (cast_type_from == 3)
+    {
+        v.abcd = bottom_blob_int32_data[gi2.r];
+        v.efgh = bottom_blob_int32_data[gi2.g];
+    }
+    else
+    {
+        v.abcd = i8buffer_ld4(bottom_blob_data, gi2.r);
+        v.efgh = i8buffer_ld4(bottom_blob_data, gi2.g);
+    }
+
+    if (cast_type_to == 3)
+    {
+        top_blob_int32_data[gi] = v;
+    }
+    else
+    {
+        i8buffer_st8(top_blob_data, gi, v);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack8to1_int8.comp b/src/layer/vulkan/shader/packing_pack8to1_int8.comp
new file mode 100644
index 000000000..6d1c9a4a1
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack8to1_int8.comp
@@ -0,0 +1,88 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+ +#version 450 + +layout (constant_id = 0) const int cast_type_from = 0; +layout (constant_id = 1) const int cast_type_to = 1; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; }; +layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; }; +layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; }; + +layout (push_constant) uniform parameter +{ + uint n; + uint c; + uint stride; +} p; + +void main() +{ + const uint gx = gl_GlobalInvocationID.x; + const uint gy = gl_GlobalInvocationID.y; + + if (gx >= psc(n) || gy >= psc(c)) + return; + + const uint gi = gy * psc(n) + gx; + + const uvec4 gi4 = (gy * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + gx; + const uvec4 gi8 = gi4 + psc(stride) * 4; + +// if (cast_type_from == cast_type_to) +// { +// i8buffer_cp8to1(top_blob_data, gi4, gi8, bottom_blob_data, gi); +// return; +// } + + ivec8 v; + if (cast_type_from == 3) + { + v = bottom_blob_int32_data[gi]; + } + else + { + v = i8buffer_ld8(bottom_blob_data, gi); + } + + if (cast_type_to == 3) + { + top_blob_int32_data[gi4.r] = v.abcd.r; + top_blob_int32_data[gi4.g] = v.abcd.g; + top_blob_int32_data[gi4.b] = v.abcd.b; + top_blob_int32_data[gi4.a] = v.abcd.a; + top_blob_int32_data[gi8.r] = v.efgh.r; + top_blob_int32_data[gi8.g] = v.efgh.g; + top_blob_int32_data[gi8.b] = v.efgh.b; + top_blob_int32_data[gi8.a] = v.efgh.a; + } + else + { + i8buffer_st1(top_blob_data, gi4.r, v.abcd.r); + i8buffer_st1(top_blob_data, gi4.g, v.abcd.g); + i8buffer_st1(top_blob_data, gi4.b, v.abcd.b); + i8buffer_st1(top_blob_data, gi4.a, v.abcd.a); + i8buffer_st1(top_blob_data, gi8.r, 
v.efgh.r); + i8buffer_st1(top_blob_data, gi8.g, v.efgh.g); + i8buffer_st1(top_blob_data, gi8.b, v.efgh.b); + i8buffer_st1(top_blob_data, gi8.a, v.efgh.a); + } +} diff --git a/src/layer/vulkan/shader/packing_pack8to4_int8.comp b/src/layer/vulkan/shader/packing_pack8to4_int8.comp new file mode 100644 index 000000000..c3df6dc6d --- /dev/null +++ b/src/layer/vulkan/shader/packing_pack8to4_int8.comp @@ -0,0 +1,75 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +layout (constant_id = 0) const int cast_type_from = 0; +layout (constant_id = 1) const int cast_type_to = 1; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; }; +layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; +layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; }; + +layout (push_constant) uniform parameter +{ + uint n; + uint c; + uint stride; +} p; + +void main() +{ + const uint gx = gl_GlobalInvocationID.x; + const uint gy = gl_GlobalInvocationID.y; + + if (gx >= psc(n) || gy >= psc(c)) + return; + + const uint gi = gy * psc(n) + gx; + + const uvec2 gi2 = (gy * 2 + uvec2(0, 1)) * psc(stride) + gx; + +// if (cast_type_from == cast_type_to) +// { +// buffer_cp8to4(top_blob_data, gi2, bottom_blob_data, gi); +// return; +// } + + ivec8 v; + if (cast_type_from == 3) + { + v = bottom_blob_int32_data[gi]; + } + else + { + v = i8buffer_ld8(bottom_blob_data, gi); + } + + if (cast_type_to == 3) + { + top_blob_int32_data[gi2.r] = v.abcd; + top_blob_int32_data[gi2.g] = v.efgh; + } + else + { + i8buffer_st4(top_blob_data, gi2.r, v.abcd); + i8buffer_st4(top_blob_data, gi2.g, v.efgh); + } +} diff --git a/src/layer/vulkan/shader/quantize.comp b/src/layer/vulkan/shader/quantize.comp new file mode 100644 index 000000000..58d23f852 --- /dev/null +++ b/src/layer/vulkan/shader/quantize.comp @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +layout (constant_id = 0) const int scale_data_size = 0; +layout (constant_id = 1) const float scale_value = 1.f; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; +layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + uint c; + uint in_stride; + uint out_stride; +} p; + +void main() +{ + const uint gi = gl_GlobalInvocationID.x; + + if (gi >= psc(in_stride) * psc(c)) + return; + + const uint gy = gi / psc(in_stride); + const uint gx = gi % psc(in_stride); + + afp v = buffer_ld1(bottom_blob_data, gi); + + afp scale; + if (scale_data_size == 1) + { + scale = afp(scale_value); + } + else + { + scale = buffer_ld1(scale_blob_data, gy); + } + + int v_int = int(round(clamp(v * scale, afp(-127.f), afp(127.f)))); + + const uint outgi = gy * psc(out_stride) + gx; + + i8buffer_st1(top_blob_data, outgi, v_int); +} diff --git a/src/layer/vulkan/shader/quantize_pack4.comp b/src/layer/vulkan/shader/quantize_pack4.comp new file mode 100644 index 000000000..7b58eff1a --- /dev/null +++ 
b/src/layer/vulkan/shader/quantize_pack4.comp @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +layout (constant_id = 0) const int scale_data_size = 0; +layout (constant_id = 1) const float scale_value = 1.f; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + uint c; + uint in_stride; + uint out_stride; +} p; + +void main() +{ + const uint gi = gl_GlobalInvocationID.x; + + if (gi >= psc(in_stride) * psc(c)) + return; + + const uint gy = gi / psc(in_stride); + const uint gx = gi % psc(in_stride); + + afpvec4 v = buffer_ld4(bottom_blob_data, gi); + + afpvec4 scale; + if (scale_data_size == 1) + { + scale = afpvec4(scale_value); + } + else + { + scale = buffer_ld4(scale_blob_data, gy); + } + + ivec4 v_int = ivec4(round(clamp(v * scale, afp(-127.f), afp(127.f)))); + + 
const uint outgi = gy * psc(out_stride) + gx; + + i8buffer_st4(top_blob_data, outgi, v_int); +} diff --git a/src/layer/vulkan/shader/quantize_pack8.comp b/src/layer/vulkan/shader/quantize_pack8.comp new file mode 100644 index 000000000..032f8ff1f --- /dev/null +++ b/src/layer/vulkan/shader/quantize_pack8.comp @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +layout (constant_id = 0) const int scale_data_size = 0; +layout (constant_id = 1) const float scale_value = 1.f; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + uint c; + uint in_stride; + uint out_stride; +} p; + +void main() +{ + const uint gi = gl_GlobalInvocationID.x; + + if (gi >= psc(in_stride) * psc(c)) + return; + + const uint gy = gi / psc(in_stride); + const uint gx = gi % psc(in_stride); + + afpvec8 v = buffer_ld8(bottom_blob_data, gi); + + afpvec8 scale; + if (scale_data_size == 1) + { + scale = afpvec8(afpvec4(scale_value), afpvec4(scale_value)); + } + else + { + scale = buffer_ld8(scale_blob_data, gy); + } + + ivec8 v_int; + v_int.abcd = ivec4(round(clamp(v[0] * scale[0], afp(-127.f), afp(127.f)))); + v_int.efgh = ivec4(round(clamp(v[1] * scale[1], afp(-127.f), afp(127.f)))); + + const uint outgi = gy * psc(out_stride) + gx; + + i8buffer_st8(top_blob_data, outgi, v_int); +} diff --git a/src/layer/vulkan/shader/requantize.comp b/src/layer/vulkan/shader/requantize.comp new file mode 100644 index 000000000..cb2ef7432 --- /dev/null +++ b/src/layer/vulkan/shader/requantize.comp @@ -0,0 +1,103 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int scale_in_data_size = 0; +layout (constant_id = 1) const float scale_in_value = 1.f; +layout (constant_id = 2) const int scale_out_data_size = 0; +layout (constant_id = 3) const float scale_out_value = 1.f; +layout (constant_id = 4) const int bias_data_size = 0; +layout (constant_id = 5) const float bias_value = 0.f; +layout (constant_id = 6) const int activation_type = 0; +layout (constant_id = 7) const float activation_param_0 = 0; +layout (constant_id = 8) const float activation_param_1 = 0; + +#define shape_constant_id_offset 9 +layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; +layout (binding = 2) readonly buffer scale_in_blob { sfp scale_in_blob_data[]; }; +layout (binding = 3) readonly buffer scale_out_blob { sfp scale_out_blob_data[]; }; +layout (binding = 4) readonly buffer bias_blob { sfp bias_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + uint c; + uint in_stride; + uint out_stride; +} p; + +void main() +{ + const uint gi = gl_GlobalInvocationID.x; + + if (gi >= psc(in_stride) * psc(c)) + return; + + const uint gy = gi / psc(in_stride); + const uint gx = gi % 
psc(in_stride); + + int v = bottom_blob_data[gi]; + + afp scale_in; + if (scale_in_data_size == 1) + { + scale_in = afp(scale_in_value); + } + else + { + scale_in = buffer_ld1(scale_in_blob_data, gy); + } + + afp bias; + if (bias_data_size == 0) + { + bias = afp(0.f); + } + else if (bias_data_size == 1) + { + bias = afp(bias_value); + } + else + { + bias = buffer_ld1(bias_blob_data, gy); + } + + afp v_fp = afp(v) * scale_in + bias; + + v_fp = activation_afp(v_fp, activation_type, activation_param_0, activation_param_1); + + afp scale_out; + if (scale_out_data_size == 1) + { + scale_out = afp(scale_out_value); + } + else + { + scale_out = buffer_ld1(scale_out_blob_data, gy); + } + + int v_int = int(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f)))); + + const uint outgi = gy * psc(out_stride) + gx; + + i8buffer_st1(top_blob_data, outgi, v_int); +} diff --git a/src/layer/vulkan/shader/requantize_pack4.comp b/src/layer/vulkan/shader/requantize_pack4.comp new file mode 100644 index 000000000..2fcbc862b --- /dev/null +++ b/src/layer/vulkan/shader/requantize_pack4.comp @@ -0,0 +1,103 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int scale_in_data_size = 0; +layout (constant_id = 1) const float scale_in_value = 1.f; +layout (constant_id = 2) const int scale_out_data_size = 0; +layout (constant_id = 3) const float scale_out_value = 1.f; +layout (constant_id = 4) const int bias_data_size = 0; +layout (constant_id = 5) const float bias_value = 0.f; +layout (constant_id = 6) const int activation_type = 0; +layout (constant_id = 7) const float activation_param_0 = 0; +layout (constant_id = 8) const float activation_param_1 = 0; + +#define shape_constant_id_offset 9 +layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer scale_in_blob { sfpvec4 scale_in_blob_data[]; }; +layout (binding = 3) readonly buffer scale_out_blob { sfpvec4 scale_out_blob_data[]; }; +layout (binding = 4) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + uint c; + uint in_stride; + uint out_stride; +} p; + +void main() +{ + const uint gi = gl_GlobalInvocationID.x; + + if (gi >= psc(in_stride) * psc(c)) + return; + + const uint gy = gi / psc(in_stride); + const uint gx = gi % psc(in_stride); + + ivec4 v = bottom_blob_data[gi]; + + afpvec4 scale_in; + if (scale_in_data_size == 1) + { + scale_in = afpvec4(scale_in_value); + } + else + { + scale_in = buffer_ld4(scale_in_blob_data, gy); + } + + afpvec4 bias; + if (bias_data_size == 0) + { + bias = afpvec4(0.f); + } + else if (bias_data_size == 1) + { + bias = afpvec4(bias_value); + } + else + { + bias = 
buffer_ld4(bias_blob_data, gy); + } + + afpvec4 v_fp = afpvec4(v) * scale_in + bias; + + v_fp = activation_afpvec4(v_fp, activation_type, activation_param_0, activation_param_1); + + afpvec4 scale_out; + if (scale_out_data_size == 1) + { + scale_out = afpvec4(scale_out_value); + } + else + { + scale_out = buffer_ld4(scale_out_blob_data, gy); + } + + ivec4 v_int = ivec4(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f)))); + + const uint outgi = gy * psc(out_stride) + gx; + + i8buffer_st4(top_blob_data, outgi, v_int); +} diff --git a/src/layer/vulkan/shader/requantize_pack8.comp b/src/layer/vulkan/shader/requantize_pack8.comp new file mode 100644 index 000000000..fedff0151 --- /dev/null +++ b/src/layer/vulkan/shader/requantize_pack8.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int scale_in_data_size = 0; +layout (constant_id = 1) const float scale_in_value = 1.f; +layout (constant_id = 2) const int scale_out_data_size = 0; +layout (constant_id = 3) const float scale_out_value = 1.f; +layout (constant_id = 4) const int bias_data_size = 0; +layout (constant_id = 5) const float bias_value = 0.f; +layout (constant_id = 6) const int activation_type = 0; +layout (constant_id = 7) const float activation_param_0 = 0; +layout (constant_id = 8) const float activation_param_1 = 0; + +#define shape_constant_id_offset 9 +layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; +layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; +layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; + +layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer scale_in_blob { sfpvec8 scale_in_blob_data[]; }; +layout (binding = 3) readonly buffer scale_out_blob { sfpvec8 scale_out_blob_data[]; }; +layout (binding = 4) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + uint c; + uint in_stride; + uint out_stride; +} p; + +void main() +{ + const uint gi = gl_GlobalInvocationID.x; + + if (gi >= psc(in_stride) * psc(c)) + return; + + const uint gy = gi / psc(in_stride); + const uint gx = gi % psc(in_stride); + + ivec8 v = bottom_blob_data[gi]; + + afpvec8 scale_in; + if (scale_in_data_size == 1) + { + scale_in = afpvec8(afpvec4(scale_in_value), afpvec4(scale_in_value)); + } + else + { + scale_in = buffer_ld8(scale_in_blob_data, gy); + } + + afpvec8 bias; + if (bias_data_size == 0) + { + bias = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + else if (bias_data_size == 1) + { + bias = 
afpvec8(afpvec4(bias_value), afpvec4(bias_value)); + } + else + { + bias = buffer_ld8(bias_blob_data, gy); + } + + afpvec8 v_fp; + v_fp[0] = afpvec4(v.abcd) * scale_in[0] + bias[0]; + v_fp[1] = afpvec4(v.efgh) * scale_in[1] + bias[1]; + + v_fp = activation_afpvec8(v_fp, activation_type, activation_param_0, activation_param_1); + + afpvec8 scale_out; + if (scale_out_data_size == 1) + { + scale_out = afpvec8(afpvec4(scale_out_value), afpvec4(scale_out_value)); + } + else + { + scale_out = buffer_ld8(scale_out_blob_data, gy); + } + + ivec8 v_int; + v_int.abcd = ivec4(round(clamp(v_fp[0] * scale_out[0], afp(-127.f), afp(127.f)))); + v_int.efgh = ivec4(round(clamp(v_fp[1] * scale_out[1], afp(-127.f), afp(127.f)))); + + const uint outgi = gy * psc(out_stride) + gx; + + i8buffer_st8(top_blob_data, outgi, v_int); +} diff --git a/src/net.cpp b/src/net.cpp index 21b99fcf5..05f121dd9 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1043,6 +1043,9 @@ int Net::load_param(const DataReader& dr) // fp16a makes no sense when fp16 storage disabled if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; + // int8a makes no sense when int8 storage disabled + if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false; + // fp16 uniform makes no sense when fp16 arithmetic disabled if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false; } @@ -1339,6 +1342,9 @@ int Net::load_param_bin(const DataReader& dr) // fp16a makes no sense when fp16 storage disabled if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; + // int8a makes no sense when int8 storage disabled + if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false; + // fp16 uniform makes no sense when fp16 arithmetic disabled if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false; } diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp index 431201b2e..f80684539 100644 --- a/tests/test_dequantize.cpp +++ 
b/tests/test_dequantize.cpp @@ -142,12 +142,8 @@ static int test_dequantize_3() || test_dequantize_pack8(RandomIntMat(15, 24), 24, 24) || test_dequantize_pack8(RandomIntMat(15, 24), 24, 1) || test_dequantize_pack8(RandomIntMat(15, 24), 24, 0) - || test_dequantize_pack8(RandomIntMat(128), 1, 128) || test_dequantize_pack8(RandomIntMat(128), 1, 1) - || test_dequantize_pack8(RandomIntMat(128), 1, 0) - || test_dequantize_pack8(RandomIntMat(128), 128, 128) - || test_dequantize_pack8(RandomIntMat(128), 128, 1) - || test_dequantize_pack8(RandomIntMat(128), 128, 0); + || test_dequantize_pack8(RandomIntMat(128), 1, 0); } int main() diff --git a/tests/test_packing.cpp b/tests/test_packing.cpp index 2d84199eb..a8e5c6c28 100644 --- a/tests/test_packing.cpp +++ b/tests/test_packing.cpp @@ -217,15 +217,12 @@ static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempac } #if NCNN_VULKAN - -static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_elempack) +static int test_packing_gpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack) { ncnn::ParamDict pd; pd.set(0, out_elempack); pd.set(2, 1); // cast_type_from pd.set(3, 1); // cast_type_to - pd.set(4, 0); // storage_type_from - pd.set(5, 0); // storage_type_to std::vector weights(0); @@ -297,12 +294,112 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_ if (CompareMat(b, d, 0.001) != 0) { - fprintf(stderr, "test_packing_gpu_buffer failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack); + fprintf(stderr, "test_packing_gpu failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack); return -1; } return 0; } + +static int test_packing_gpu_int8(const ncnn::Mat& a, int in_elempack, int out_elempack) +{ + ncnn::ParamDict pd; + pd.set(0, out_elempack); + pd.set(2, 4); // cast_type_from + pd.set(3, 4); // cast_type_to + + 
std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = true; + opt.use_int8_inference = false; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_int8_storage = false; + opt.use_int8_arithmetic = false; + opt.use_packing_layout = true; + opt.use_shader_pack8 = true; + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false; + if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false; + + ncnn::Layer* op = ncnn::create_layer_vulkan("Packing"); + + op->vkdev = vkdev; + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + ncnn::Mat a8; + if (a.dims == 1) a8 = RandomS8Mat(a.w); + if (a.dims == 2) a8 = RandomS8Mat(a.w, a.h); + if (a.dims == 3) a8 = RandomS8Mat(a.w, a.h, a.c); + if (a.dims == 4) a8 = RandomS8Mat(a.w, a.h, a.d, a.c); + + ncnn::Mat ap; + ncnn::convert_packing(a8, ap, in_elempack, opt); + + ncnn::Mat b; + packing_cpu_naive(ap, b, out_elempack); + + ncnn::Mat c; + + // forward + ncnn::VkCompute cmd(vkdev); + + // upload + ncnn::VkMat a_gpu; + cmd.record_clone(ap, a_gpu, opt); + + ncnn::VkMat c_gpu; + op->forward(a_gpu, c_gpu, cmd, opt); + + // download + cmd.record_clone(c_gpu, c, opt); + + cmd.submit_and_wait(); + + op->destroy_pipeline(opt); + + delete op; + + ncnn::Mat b32; + ncnn::cast_int8_to_float32(b, b32, opt); + + ncnn::Mat c32; + ncnn::cast_int8_to_float32(c, c32, opt); + + if (CompareMat(b32, c32, 0.001) != 0) + { + fprintf(stderr, "test_packing_gpu_int8 failed a.dims=%d a=(%d %d %d %d) in_elempack=%d 
out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack); + return -1; + } + + return 0; +} + +static int test_packing_gpu(const ncnn::Mat& a, int in_elempack, int out_elempack) +{ + return 0 + || test_packing_gpu_fp32(a, in_elempack, out_elempack) + || test_packing_gpu_int8(a, in_elempack, out_elempack); +} #endif static int test_packing_cpu(const ncnn::Mat& a) @@ -329,15 +426,15 @@ static int test_packing_cpu(const ncnn::Mat& a) static int test_packing_gpu(const ncnn::Mat& a) { return 0 - || test_packing_gpu_buffer(a, 1, 1) - || test_packing_gpu_buffer(a, 4, 4) - || test_packing_gpu_buffer(a, 8, 8) - || test_packing_gpu_buffer(a, 1, 4) - || test_packing_gpu_buffer(a, 4, 1) - || test_packing_gpu_buffer(a, 1, 8) - || test_packing_gpu_buffer(a, 8, 1) - || test_packing_gpu_buffer(a, 4, 8) - || test_packing_gpu_buffer(a, 8, 4); + || test_packing_gpu(a, 1, 1) + || test_packing_gpu(a, 4, 4) + || test_packing_gpu(a, 8, 8) + || test_packing_gpu(a, 1, 4) + || test_packing_gpu(a, 4, 1) + || test_packing_gpu(a, 1, 8) + || test_packing_gpu(a, 8, 1) + || test_packing_gpu(a, 4, 8) + || test_packing_gpu(a, 8, 4); } #endif // NCNN_VULKAN diff --git a/tests/test_quantize.cpp b/tests/test_quantize.cpp index a6e67b23d..be137a49f 100644 --- a/tests/test_quantize.cpp +++ b/tests/test_quantize.cpp @@ -24,7 +24,7 @@ static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high) } else { - if (a.dims == 1) scale_data.create(a.w); + if (a.dims == 1) scale_data.create(1); if (a.dims == 2) scale_data.create(a.h); if (a.dims == 3) scale_data.create(a.c); Randomize(scale_data, scale_low, scale_high); diff --git a/tests/test_quantize_oom.cpp b/tests/test_quantize_oom.cpp index ca78535ed..cc029e0bb 100644 --- a/tests/test_quantize_oom.cpp +++ b/tests/test_quantize_oom.cpp @@ -24,7 +24,7 @@ static int test_quantize_oom(const ncnn::Mat& a, float scale_low, float scale_hi } else { - if (a.dims == 1) scale_data.create(a.w); + if (a.dims == 1) scale_data.create(1); 
if (a.dims == 2) scale_data.create(a.h); if (a.dims == 3) scale_data.create(a.c); Randomize(scale_data, scale_low, scale_high); diff --git a/tests/testutil.cpp b/tests/testutil.cpp index fa2f0cc01..db7b4ca8e 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -759,7 +759,32 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector a_gpu(a.size()); for (size_t i = 0; i < a_gpu.size(); i++) { - cmd.record_upload(a[i], a_gpu[i], opt); + if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING) + { + // resolve dst_elempack + int dims = a[i].dims; + int elemcount = 0; + if (dims == 1) elemcount = a[i].elempack * a[i].w; + if (dims == 2) elemcount = a[i].elempack * a[i].h; + if (dims == 3 || dims == 4) elemcount = a[i].elempack * a[i].c; + + const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1; + + ncnn::Mat a4; + ncnn::convert_packing(a[i], a4, dst_elempack, opt); + + ncnn::Option opt_upload = opt; + opt_upload.use_fp16_packed = false; + opt_upload.use_fp16_storage = false; + opt_upload.use_int8_packed = false; + opt_upload.use_int8_storage = false; + + cmd.record_clone(a4, a_gpu[i], opt_upload); + } + else + { + cmd.record_upload(a[i], a_gpu[i], opt); + } } std::vector d_gpu(top_blob_count); @@ -1082,7 +1107,33 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorsupport_inplace)