Browse Source

vulkan int8 packing quantize dequantize requantize (#3731)

* add int8 definitions
* packing vulkan int8/int32, quantize vulkan
* vulkan dequantize
* requantize vulkan
pull/4204/merge
nihui GitHub 11 months ago
parent
commit
9f832c19c1
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
34 changed files with 2523 additions and 42 deletions
  1. +17
    -0
      src/allocator.cpp
  2. +5
    -0
      src/command.cpp
  3. +142
    -10
      src/gpu.cpp
  4. +1
    -1
      src/gpu.h
  5. +2
    -0
      src/layer/packing.h
  6. +231
    -0
      src/layer/vulkan/dequantize_vulkan.cpp
  7. +46
    -0
      src/layer/vulkan/dequantize_vulkan.h
  8. +63
    -8
      src/layer/vulkan/packing_vulkan.cpp
  9. +215
    -0
      src/layer/vulkan/quantize_vulkan.cpp
  10. +45
    -0
      src/layer/vulkan/quantize_vulkan.h
  11. +231
    -0
      src/layer/vulkan/requantize_vulkan.cpp
  12. +47
    -0
      src/layer/vulkan/requantize_vulkan.h
  13. +80
    -0
      src/layer/vulkan/shader/dequantize.comp
  14. +80
    -0
      src/layer/vulkan/shader/dequantize_pack4.comp
  15. +84
    -0
      src/layer/vulkan/shader/dequantize_pack8.comp
  16. +73
    -0
      src/layer/vulkan/shader/packing_int8.comp
  17. +79
    -0
      src/layer/vulkan/shader/packing_pack1to4_int8.comp
  18. +88
    -0
      src/layer/vulkan/shader/packing_pack1to8_int8.comp
  19. +79
    -0
      src/layer/vulkan/shader/packing_pack4to1_int8.comp
  20. +75
    -0
      src/layer/vulkan/shader/packing_pack4to8_int8.comp
  21. +88
    -0
      src/layer/vulkan/shader/packing_pack8to1_int8.comp
  22. +75
    -0
      src/layer/vulkan/shader/packing_pack8to4_int8.comp
  23. +63
    -0
      src/layer/vulkan/shader/quantize.comp
  24. +63
    -0
      src/layer/vulkan/shader/quantize_pack4.comp
  25. +65
    -0
      src/layer/vulkan/shader/quantize_pack8.comp
  26. +103
    -0
      src/layer/vulkan/shader/requantize.comp
  27. +103
    -0
      src/layer/vulkan/shader/requantize_pack4.comp
  28. +107
    -0
      src/layer/vulkan/shader/requantize_pack8.comp
  29. +6
    -0
      src/net.cpp
  30. +1
    -5
      tests/test_dequantize.cpp
  31. +111
    -14
      tests/test_packing.cpp
  32. +1
    -1
      tests/test_quantize.cpp
  33. +1
    -1
      tests/test_quantize_oom.cpp
  34. +53
    -2
      tests/testutil.cpp

+ 17
- 0
src/allocator.cpp View File

@@ -892,6 +892,13 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize,
if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
}
if (elemsize / elempack == 1)
{
// int8
if (elempack == 1) format = VK_FORMAT_R8_SINT;
if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
}

// resolve image width height depth
int width = w;
@@ -1468,6 +1475,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
}
if (elemsize / elempack == 1)
{
// int8
if (elempack == 1) format = VK_FORMAT_R8_SINT;
if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT;
if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT;
if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT;
}

// resolve image width height depth
int width = w;


+ 5
- 0
src/command.cpp View File

@@ -450,6 +450,11 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)
cast_type_to = 1;
}

if (src.elemsize == src.elempack * 1u)
{
cast_type_to = 4;
}

VkMat dst_staging;
vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging);



+ 142
- 10
src/gpu.cpp View File

@@ -3032,6 +3032,10 @@ public:
// to fp32 | fp16
// to pack1 | pack4 | pack8
mutable ncnn::Layer* uop_packing[2][2][3];
// from int8
// to int8
// to pack1 | pack4 | pack8
mutable ncnn::Layer* uop_packing_int8[3];
mutable Mutex uop_lock;

// device is valid and successfully initialized
@@ -3047,6 +3051,7 @@ VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev)
pipeline_cache = 0;
valid = false;
memset(uop_packing, 0, sizeof(uop_packing));
memset(uop_packing_int8, 0, sizeof(uop_packing_int8));
}

int VulkanDevicePrivate::create_dummy_buffer_image()
@@ -3096,18 +3101,29 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image()

const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
{
bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);
bool use_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3);

MutexLockGuard lock(uop_lock);

const ncnn::Layer* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
const ncnn::Layer* cached_uop = 0;
if (use_int8)
{
cached_uop = uop_packing_int8[packing_type_to_index];
}
else
{
cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
}
if (cached_uop)
return cached_uop;

bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);

// create uop
Option opt;
opt.use_fp16_packed = use_fp16; // fp16p is always supported
opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
opt.use_int8_packed = use_int8; // int8p is always supported
opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();

// fp16/int8 arithmetic are not necessary for packing
// and may conflict with storage options
@@ -3132,14 +3148,21 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_

ncnn::ParamDict pd;
pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8
pd.set(3, cast_type_to_index + 1);

uop->load_param(pd);

uop->create_pipeline(opt);

uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
if (use_int8)
{
uop_packing_int8[packing_type_to_index] = uop;
}
else
{
uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
}

return uop;
}
@@ -3164,6 +3187,8 @@ void VulkanDevicePrivate::destroy_utility_operator()

opt.use_fp16_packed = use_fp16;
opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
opt.use_int8_packed = false;
opt.use_int8_storage = false;

// to pack1 | pack4 | pack8
for (int k = 0; k < 3; k++)
@@ -3183,6 +3208,33 @@ void VulkanDevicePrivate::destroy_utility_operator()
}
}
}

// int8
{
bool use_int8 = true;

opt.use_fp16_packed = false;
opt.use_fp16_storage = false;
opt.use_int8_packed = use_int8;
opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();

// to pack1 | pack4 | pack8
for (int k = 0; k < 3; k++)
{
// enable pack8 for pack8to1/pack8to4
opt.use_shader_pack8 = true;

ncnn::Layer* uop = uop_packing_int8[k];
if (!uop)
continue;

uop->destroy_pipeline(opt);

delete uop;

uop_packing_int8[k] = 0;
}
}
}

VulkanDevice::VulkanDevice(int device_index)
@@ -4232,18 +4284,35 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac
{
cast_type_from_index = 0;
}
else // if (src.elembits() == 16)
else if (src.elembits() == 16)
{
cast_type_from_index = 1;
}
else // if (src.elembits() == 8)
{
cast_type_from_index = 3;
}

int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index;

// NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

if ((cast_type_from_index == 0 || cast_type_from_index == 1) && (cast_type_to_index == 2 || cast_type_to_index == 3))
{
NCNN_LOGE("convert_packing from fp32/fp16 to int32/int8 is not supported");
return;
}
if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1))
{
NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16 is not supported");
return;
}

Option opt2 = opt;
opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage();
opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3);
opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage();

const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index);
uop->forward(src, dst, cmd, opt2);
@@ -4809,6 +4878,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.append("afp2sfpmat4(v)", "v");
}

if (opt.use_int8_storage)
{
custom_defines.append("sint8", "int8_t");
}
else if (opt.use_int8_packed)
{
custom_defines.append("sint8", "int");
}
else
{
custom_defines.append("sint8", "int");
}

custom_defines.append("sint8vec4", "int");
custom_defines.append("sint8vec8", "ivec2");

custom_defines.append("aint8", "int");
custom_defines.append("aint8vec4", "ivec4");

custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)");
custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))");

if (opt.use_int8_storage)
{
custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])");
custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}");
custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
}
else
{
custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)");
custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}");
custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}");
}

custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])");
custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}");
custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

custom_defines.append("i8buffer_ld8(buf,i)", "ivec8(unpackInt4x8(buf[i].r),unpackInt4x8(buf[i].g))");
custom_defines.append("i8buffer_st8(buf,i,v)", "{buf[i]=ivec2(packInt4x8(v.abcd),packInt4x8(v.efgh));}");
custom_defines.append("i8buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

custom_defines.append("psc(x)", "(x==0?p.x:x)");

if (opt.use_fp16_storage)
@@ -5426,6 +5538,15 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
{
custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n";
}
custom_exts += "struct ivec8 { ivec4 abcd; ivec4 efgh; };\n";
if (opt.use_int8_storage)
{
custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n";
}
if (opt.use_int8_arithmetic)
{
custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n";
}
#if ENABLE_VALIDATION_LAYER
{
custom_exts += "#extension GL_EXT_debug_printf : require\n";
@@ -5507,11 +5628,22 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
NCNN_LOGE("%s", s.getInfoLog());
NCNN_LOGE("%s", s.getInfoDebugLog());

// for (int i = 0; i < 4; i++)
// print as line_number: code
{
int i = 3;
std::string s(comp_datas[i], comp_data_sizes[i]);
NCNN_LOGE("%s", s.c_str());
const char* p = comp_datas[3];
const char* line_end;
int line_number = 1;

while ((line_end = strchr(p, '\n')) != NULL)
{
NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p);
p = line_end + 1;
}

if (*p != '\0')
{
NCNN_LOGE("%d:\t%s", line_number, p);
}
}

compile_success = false;


+ 1
- 1
src/gpu.h View File

@@ -465,7 +465,7 @@ public:

// utility operator
void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
// cast_type_to 0=auto(same as src) 1=fp32 2=fp16
// cast_type_to 0=auto(same as src) 1=fp32 2=fp16 3=int32 4=int8
void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const;

// VK_KHR_bind_memory2


+ 2
- 0
src/layer/packing.h View File

@@ -36,6 +36,8 @@ public:
// 0 = auto
// 1 = fp32
// 2 = fp16
// 3 = int32
// 4 = int8
int cast_type_from;
int cast_type_to;
};


+ 231
- 0
src/layer/vulkan/dequantize_vulkan.cpp View File

@@ -0,0 +1,231 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "dequantize_vulkan.h"

#include "layer_shader_type.h"

namespace ncnn {

// Marks the layer as Vulkan-capable; every pipeline slot starts out null
// until create_pipeline() decides which variants to build.
Dequantize_vulkan::Dequantize_vulkan()
    : pipeline_dequantize(0), pipeline_dequantize_pack4(0), pipeline_dequantize_pack8(0)
{
    support_vulkan = true;
}

int Dequantize_vulkan::create_pipeline(const Option& opt)
{
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

const int dims = shape.dims;

int elempack = 1;
if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

const size_t elemsize = elempack * 4u;
size_t out_elemsize;
if (opt.use_fp16_storage || opt.use_fp16_packed)
{
out_elemsize = elempack * 2u;
}
else
{
out_elemsize = elempack * 4u;
}

Mat shape_packed;
if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

Mat out_shape_packed;
if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack);
if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack);
if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack);

size_t c = 0;
size_t in_stride = 0;
size_t out_stride = 0;
if (dims == 1)
{
c = 1;
in_stride = shape_packed.w;
out_stride = out_shape_packed.w;
}
if (dims == 2)
{
c = shape_packed.h;
in_stride = shape_packed.w;
out_stride = out_shape_packed.w;
}
if (dims == 3 || dims == 4)
{
c = shape_packed.c;
in_stride = shape_packed.cstep;
out_stride = out_shape_packed.cstep;
}

std::vector<vk_specialization_type> specializations(4 + 3);
specializations[0].i = scale_data_size;
specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f;
specializations[2].i = bias_data_size;
specializations[3].f = bias_data_size == 1 ? bias_data[0] : 0.f;
specializations[4 + 0].u32 = c;
specializations[4 + 1].u32 = in_stride;
specializations[4 + 2].u32 = out_stride;

const int local_size_x = vkdev->info.subgroup_size();

// pack1
if (shape.dims == 0 || elempack == 1)
{
pipeline_dequantize = new Pipeline(vkdev);
pipeline_dequantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_dequantize->create(LayerShaderType::dequantize, opt, specializations);
}

// pack4
if (shape.dims == 0 || elempack == 4)
{
pipeline_dequantize_pack4 = new Pipeline(vkdev);
pipeline_dequantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_dequantize_pack4->create(LayerShaderType::dequantize_pack4, opt, specializations);
}

// pack8
if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
{
pipeline_dequantize_pack8 = new Pipeline(vkdev);
pipeline_dequantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_dequantize_pack8->create(LayerShaderType::dequantize_pack8, opt, specializations);
}

return 0;
}

int Dequantize_vulkan::destroy_pipeline(const Option& /*opt*/)
{
delete pipeline_dequantize;
pipeline_dequantize = 0;

delete pipeline_dequantize_pack4;
pipeline_dequantize_pack4 = 0;

delete pipeline_dequantize_pack8;
pipeline_dequantize_pack8 = 0;

return 0;
}

// Records uploads of the scale / bias tables to the GPU.
// A table holding a single value is not uploaded: create_pipeline() passes
// that value as a specialization constant instead. Always returns 0.
int Dequantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    // multi-element scale table
    if (scale_data_size > 1)
        cmd.record_upload(scale_data, scale_data_gpu, opt);

    // multi-element bias table
    if (bias_data_size > 1)
        cmd.record_upload(bias_data, bias_data_gpu, opt);

    return 0;
}

int Dequantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
{
const int dims = bottom_blob.dims;
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int channels = bottom_blob.c;
const int elempack = bottom_blob.elempack;

size_t out_elemsize;
if (opt.use_fp16_storage || opt.use_fp16_packed)
{
out_elemsize = elempack * 2u;
}
else
{
out_elemsize = elempack * 4u;
}

if (dims == 1)
top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
if (dims == 2)
top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
if (dims == 3)
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
if (dims == 4)
top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

size_t c = 0;
size_t in_stride = 0;
size_t out_stride = 0;
if (dims == 1)
{
c = 1;
in_stride = bottom_blob.w;
out_stride = top_blob.w;
}
if (dims == 2)
{
c = bottom_blob.h;
in_stride = bottom_blob.w;
out_stride = top_blob.w;
}
if (dims == 3 || dims == 4)
{
c = bottom_blob.c;
in_stride = bottom_blob.cstep;
out_stride = top_blob.cstep;
}

std::vector<VkMat> bindings(4);
bindings[0] = bottom_blob;
bindings[1] = top_blob;
bindings[2] = scale_data_gpu;
bindings[3] = bias_data_gpu;

std::vector<vk_constant_type> constants(3);
constants[0].u32 = c;
constants[1].u32 = in_stride;
constants[2].u32 = out_stride;

VkMat dispatcher;
dispatcher.w = in_stride * c;
dispatcher.h = 1;
dispatcher.c = 1;

const Pipeline* pipeline = elempack == 8 ? pipeline_dequantize_pack8
: elempack == 4 ? pipeline_dequantize_pack4
: pipeline_dequantize;

cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

return 0;
}

} // namespace ncnn

+ 46
- 0
src/layer/vulkan/dequantize_vulkan.h View File

@@ -0,0 +1,46 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_DEQUANTIZE_VULKAN_H
#define LAYER_DEQUANTIZE_VULKAN_H

#include "dequantize.h"

namespace ncnn {

// Vulkan implementation of the Dequantize layer.
// Owns one compute pipeline per supported element packing (1 / 4 / 8) and
// the GPU copies of the scale and bias tables.
class Dequantize_vulkan : virtual public Dequantize
{
public:
// Sets support_vulkan and nulls all pipeline pointers.
Dequantize_vulkan();

// Build / release the compute pipelines; both return 0.
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);

// Records uploads of scale_data / bias_data; each table is uploaded only
// when it holds more than one element.
virtual int upload_model(VkTransfer& cmd, const Option& opt);

// Keep the base-class forward overloads visible next to the VkMat overload.
using Dequantize::forward;
// Records the GPU dequantize of bottom_blob into top_blob; returns -100 if
// top_blob is empty after allocation, 0 otherwise.
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
// GPU-side tables; left empty when the corresponding table has at most one element.
VkMat scale_data_gpu;
VkMat bias_data_gpu;

// One pipeline per element packing; null when that variant was not created.
Pipeline* pipeline_dequantize;
Pipeline* pipeline_dequantize_pack4;
Pipeline* pipeline_dequantize_pack8;
};

} // namespace ncnn

#endif // LAYER_DEQUANTIZE_VULKAN_H

+ 63
- 8
src/layer/vulkan/packing_vulkan.cpp View File

@@ -45,6 +45,8 @@ int Packing_vulkan::create_pipeline(const Option& opt)

const int local_size_x = vkdev->info.subgroup_size();

bool use_int8_shader = cast_type_from == 4 || cast_type_to == 4;

std::vector<vk_specialization_type> specializations(2 + 3);
specializations[0].i = cast_type_from;
specializations[1].i = cast_type_to;
@@ -91,7 +93,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing = new Pipeline(vkdev);
pipeline_packing->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing->create(LayerShaderType::packing, opt, specializations);
if (use_int8_shader)
{
pipeline_packing->create(LayerShaderType::packing_int8, opt, specializations);
}
else
{
pipeline_packing->create(LayerShaderType::packing, opt, specializations);
}
}
if (shape.dims == 0 || elempack < out_elempack)
{
@@ -126,7 +135,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing_pack1to4 = new Pipeline(vkdev);
pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations);
if (use_int8_shader)
{
pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_int8, opt, specializations);
}
else
{
pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations);
}
}

if (shape.dims == 0 || (elempack == 1 && out_elempack == 8))
@@ -138,7 +154,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing_pack1to8 = new Pipeline(vkdev);
pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations);
if (use_int8_shader)
{
pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_int8, opt, specializations);
}
else
{
pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations);
}
}

if (shape.dims == 0 || (elempack == 4 && out_elempack == 8))
@@ -150,7 +173,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing_pack4to8 = new Pipeline(vkdev);
pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations);
if (use_int8_shader)
{
pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_int8, opt, specializations);
}
else
{
pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations);
}
}
}
if (shape.dims == 0 || elempack > out_elempack)
@@ -186,7 +216,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing_pack4to1 = new Pipeline(vkdev);
pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations);
if (use_int8_shader)
{
pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_int8, opt, specializations);
}
else
{
pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations);
}
}

if (shape.dims == 0 || (elempack == 8 && out_elempack == 1))
@@ -198,7 +235,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing_pack8to1 = new Pipeline(vkdev);
pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations);
if (use_int8_shader)
{
pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_int8, opt, specializations);
}
else
{
pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations);
}
}

if (shape.dims == 0 || (elempack == 8 && out_elempack == 4))
@@ -210,7 +254,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

pipeline_packing_pack8to4 = new Pipeline(vkdev);
pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations);
if (use_int8_shader)
{
pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_int8, opt, specializations);
}
else
{
pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations);
}
}
}

@@ -296,10 +347,14 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
{
out_elemsize = out_elempack * 4u;
}
else // if (cast_type_to == 2)
else if (cast_type_to == 2)
{
out_elemsize = out_elempack * 2u;
}
else // if (cast_type_to == 3)
{
out_elemsize = out_elempack * 1u;
}

if (dims == 1)
{


+ 215
- 0
src/layer/vulkan/quantize_vulkan.cpp View File

@@ -0,0 +1,215 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "quantize_vulkan.h"

#include "layer_shader_type.h"

namespace ncnn {

// Marks the layer as Vulkan-capable; every pipeline slot starts out null
// until create_pipeline() decides which variants to build.
Quantize_vulkan::Quantize_vulkan()
    : pipeline_quantize(0), pipeline_quantize_pack4(0), pipeline_quantize_pack8(0)
{
    support_vulkan = true;
}

// Builds the quantize compute pipelines.
//
// When a bottom shape hint is available only the pipeline matching the
// resolved elempack is created. When the shape is unknown (dims == 0) the
// pack1 and pack4 pipelines are always created, and the pack8 pipeline only
// when opt.use_shader_pack8 is set, matching Dequantize_vulkan::create_pipeline.
// Always returns 0.
int Quantize_vulkan::create_pipeline(const Option& opt)
{
    // a default-constructed Mat stands in for a missing shape hint
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

    const int dims = shape.dims;

    // Default to pack1 (not 0) so the element sizes below are never zero when
    // the shape is unknown; this mirrors Dequantize_vulkan::create_pipeline.
    int elempack = 1;
    if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
    if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
    if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

    // input lanes are 2 bytes with any fp16 option, else 4; output lanes are 1 byte
    size_t elemsize;
    const size_t out_elemsize = elempack * 1u;
    if (opt.use_fp16_storage || opt.use_fp16_packed)
    {
        elemsize = elempack * 2u;
    }
    else
    {
        elemsize = elempack * 4u;
    }

    // packed shape descriptors (no data attached)
    Mat shape_packed;
    if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
    if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
    if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
    if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

    Mat out_shape_packed;
    if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack);

    // group count and per-group strides; all stay zero when the shape is unknown
    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 2)
    {
        c = shape_packed.h;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 3 || dims == 4)
    {
        c = shape_packed.c;
        in_stride = shape_packed.cstep;
        out_stride = out_shape_packed.cstep;
    }

    // slots 0..1: scale table size plus its value when it is a single scalar
    // slots 2..4: shape constants
    std::vector<vk_specialization_type> specializations(2 + 3);
    specializations[0].i = scale_data_size;
    specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f;
    specializations[2 + 0].u32 = c;
    specializations[2 + 1].u32 = in_stride;
    specializations[2 + 2].u32 = out_stride;

    const int local_size_x = vkdev->info.subgroup_size();

    // pack1
    if (shape.dims == 0 || elempack == 1)
    {
        pipeline_quantize = new Pipeline(vkdev);
        pipeline_quantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_quantize->create(LayerShaderType::quantize, opt, specializations);
    }

    // pack4
    if (shape.dims == 0 || elempack == 4)
    {
        pipeline_quantize_pack4 = new Pipeline(vkdev);
        pipeline_quantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_quantize_pack4->create(LayerShaderType::quantize_pack4, opt, specializations);
    }

    // pack8: with an unknown shape, build it only when pack8 shaders are
    // enabled, so no pack8 pipeline is compiled when the option is off
    if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
    {
        pipeline_quantize_pack8 = new Pipeline(vkdev);
        pipeline_quantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_quantize_pack8->create(LayerShaderType::quantize_pack8, opt, specializations);
    }

    return 0;
}

int Quantize_vulkan::destroy_pipeline(const Option& /*opt*/)
{
delete pipeline_quantize;
pipeline_quantize = 0;

delete pipeline_quantize_pack4;
pipeline_quantize_pack4 = 0;

delete pipeline_quantize_pack8;
pipeline_quantize_pack8 = 0;

return 0;
}

// Records the upload of the scale table to the GPU.
// A table holding a single value is not uploaded: create_pipeline() passes
// that value as a specialization constant instead. Always returns 0.
int Quantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    // multi-element scale table
    if (scale_data_size > 1)
        cmd.record_upload(scale_data, scale_data_gpu, opt);

    return 0;
}

int Quantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
{
const int dims = bottom_blob.dims;
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int channels = bottom_blob.c;
const int elempack = bottom_blob.elempack;

const size_t out_elemsize = 1u * elempack;

if (dims == 1)
top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
if (dims == 2)
top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
if (dims == 3)
top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
if (dims == 4)
top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

size_t c = 0;
size_t in_stride = 0;
size_t out_stride = 0;
if (dims == 1)
{
c = 1;
in_stride = bottom_blob.w;
out_stride = top_blob.w;
}
if (dims == 2)
{
c = bottom_blob.h;
in_stride = bottom_blob.w;
out_stride = top_blob.w;
}
if (dims == 3 || dims == 4)
{
c = bottom_blob.c;
in_stride = bottom_blob.cstep;
out_stride = top_blob.cstep;
}

std::vector<VkMat> bindings(3);
bindings[0] = bottom_blob;
bindings[1] = top_blob;
bindings[2] = scale_data_gpu;

std::vector<vk_constant_type> constants(3);
constants[0].u32 = c;
constants[1].u32 = in_stride;
constants[2].u32 = out_stride;

VkMat dispatcher;
dispatcher.w = in_stride * c;
dispatcher.h = 1;
dispatcher.c = 1;

const Pipeline* pipeline = elempack == 8 ? pipeline_quantize_pack8
: elempack == 4 ? pipeline_quantize_pack4
: pipeline_quantize;

cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

return 0;
}

} // namespace ncnn

+ 45
- 0
src/layer/vulkan/quantize_vulkan.h View File

@@ -0,0 +1,45 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_QUANTIZE_VULKAN_H
#define LAYER_QUANTIZE_VULKAN_H

#include "quantize.h"

namespace ncnn {

// Vulkan (GPU) variant of the Quantize layer. It keeps the Quantize interface
// and adds pipeline management plus a forward overload that works on VkMat.
class Quantize_vulkan : virtual public Quantize
{
public:
Quantize_vulkan();

// Create / release the compute pipelines held in the members below.
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);

// Records weight uploads on the transfer command; presumably this is what
// fills scale_data_gpu (implementation not visible here - confirm in the .cpp).
virtual int upload_model(VkTransfer& cmd, const Option& opt);

// Keep the base-class forward overloads visible next to the VkMat one.
using Quantize::forward;
// GPU forward: creates top_blob with 1 byte per element (same shape and
// elempack as bottom_blob) and records a single pipeline dispatch on cmd.
// Returns -100 when top_blob could not be created, 0 otherwise.
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
// Scale values as a GPU buffer; passed to the shader as binding 2.
VkMat scale_data_gpu;

// One pipeline per input elempack: forward uses the pack8 one for
// elempack == 8, the pack4 one for elempack == 4 and the plain one otherwise.
Pipeline* pipeline_quantize;
Pipeline* pipeline_quantize_pack4;
Pipeline* pipeline_quantize_pack8;
};

} // namespace ncnn

#endif // LAYER_QUANTIZE_VULKAN_H

+ 231
- 0
src/layer/vulkan/requantize_vulkan.cpp View File

@@ -0,0 +1,231 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "requantize_vulkan.h"

#include "layer_shader_type.h"

namespace ncnn {

// Starts with no pipeline built (create_pipeline allocates them later) and
// flags the layer as having a Vulkan implementation.
Requantize_vulkan::Requantize_vulkan()
    : pipeline_requantize(0),
      pipeline_requantize_pack4(0),
      pipeline_requantize_pack8(0)
{
    support_vulkan = true;
}

int Requantize_vulkan::create_pipeline(const Option& opt)
{
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

const int dims = shape.dims;

int elempack = 1;
if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

int out_elempack = 1;
if (dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
if (dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
if (dims == 3 || dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;

const size_t elemsize = elempack * 4u;
const size_t out_elemsize = out_elempack * 1u;

Mat shape_packed;
if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

Mat out_shape_packed;
if (dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);

size_t c = 0;
size_t in_stride = 0;
size_t out_stride = 0;
if (dims == 1)
{
c = 1;
in_stride = shape_packed.w;
out_stride = out_shape_packed.w;
}
if (dims == 2)
{
c = shape_packed.h;
in_stride = shape_packed.w;
out_stride = out_shape_packed.w;
}
if (dims == 3 || dims == 4)
{
c = shape_packed.c;
in_stride = shape_packed.cstep;
out_stride = out_shape_packed.cstep;
}

std::vector<vk_specialization_type> specializations(9 + 3);
specializations[0].i = scale_in_data_size;
specializations[1].f = scale_in_data_size == 1 ? scale_in_data[0] : 1.f;
specializations[2].i = scale_out_data_size;
specializations[3].f = scale_out_data_size == 1 ? scale_out_data[0] : 1.f;
specializations[4].i = bias_data_size;
specializations[5].f = bias_data_size == 1 ? bias_data[0] : 0.f;
specializations[6].i = activation_type;
specializations[7].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[8].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[9 + 0].u32 = c;
specializations[9 + 1].u32 = in_stride;
specializations[9 + 2].u32 = out_stride;

const int local_size_x = vkdev->info.subgroup_size();

// pack1
if (shape.dims == 0 || elempack == 1)
{
pipeline_requantize = new Pipeline(vkdev);
pipeline_requantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_requantize->create(LayerShaderType::requantize, opt, specializations);
}

// pack4
if (shape.dims == 0 || elempack == 4)
{
pipeline_requantize_pack4 = new Pipeline(vkdev);
pipeline_requantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_requantize_pack4->create(LayerShaderType::requantize_pack4, opt, specializations);
}

// pack8
if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
{
pipeline_requantize_pack8 = new Pipeline(vkdev);
pipeline_requantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
pipeline_requantize_pack8->create(LayerShaderType::requantize_pack8, opt, specializations);
}

return 0;
}

int Requantize_vulkan::destroy_pipeline(const Option& /*opt*/)
{
delete pipeline_requantize;
pipeline_requantize = 0;

delete pipeline_requantize_pack4;
pipeline_requantize_pack4 = 0;

delete pipeline_requantize_pack8;
pipeline_requantize_pack8 = 0;

return 0;
}

// Records the uploads of the per-row parameter arrays to their GPU buffers.
// An array is uploaded only when it holds more than one value; a single value
// is handed to the shader as a specialization constant in create_pipeline.
// Always returns 0.
int Requantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
{
    const bool upload_scale_in = scale_in_data_size > 1;
    const bool upload_scale_out = scale_out_data_size > 1;
    const bool upload_bias = bias_data_size > 1;

    if (upload_scale_in)
        cmd.record_upload(scale_in_data, scale_in_data_gpu, opt);

    if (upload_scale_out)
        cmd.record_upload(scale_out_data, scale_out_data_gpu, opt);

    if (upload_bias)
        cmd.record_upload(bias_data, bias_data_gpu, opt);

    return 0;
}

// Records one requantize dispatch on cmd.
//
// top_blob gets the same shape and elempack as bottom_blob but 1 byte per
// lane. The shader receives bottom, top, both scale buffers and the bias
// buffer (bindings 0..4) plus three push constants (c, in_stride, out_stride),
// and is dispatched over in_stride * c invocations along x.
//
// Returns -100 when top_blob is empty after the creation step, 0 otherwise.
int Requantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int elempack = bottom_blob.elempack;

    // output lanes are one byte each
    size_t out_elemsize = 1u * elempack;

    {
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int d = bottom_blob.d;
        const int channels = bottom_blob.c;

        if (dims == 1)
            top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
        else if (dims == 2)
            top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
        else if (dims == 3)
            top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
        else if (dims == 4)
            top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
    }

    if (top_blob.empty())
        return -100;

    // rows and row strides as seen by the shader:
    // 1-D is a single row, 2-D uses h rows of w, 3-D / 4-D use c rows of cstep
    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    switch (dims)
    {
    case 1:
        c = 1;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
        break;
    case 2:
        c = bottom_blob.h;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
        break;
    case 3:
    case 4:
        c = bottom_blob.c;
        in_stride = bottom_blob.cstep;
        out_stride = top_blob.cstep;
        break;
    default:
        break;
    }

    std::vector<VkMat> bindings(5);
    bindings[0] = bottom_blob;
    bindings[1] = top_blob;
    bindings[2] = scale_in_data_gpu;
    bindings[3] = scale_out_data_gpu;
    bindings[4] = bias_data_gpu;

    std::vector<vk_constant_type> constants(3);
    constants[0].u32 = c;
    constants[1].u32 = in_stride;
    constants[2].u32 = out_stride;

    // flat 1-D dispatch covering every input element
    VkMat dispatcher;
    dispatcher.w = in_stride * c;
    dispatcher.h = 1;
    dispatcher.c = 1;

    // the pipeline variant follows the input packing
    const Pipeline* pipeline = pipeline_requantize;
    if (elempack == 8)
        pipeline = pipeline_requantize_pack8;
    else if (elempack == 4)
        pipeline = pipeline_requantize_pack4;

    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

    return 0;
}

} // namespace ncnn

+ 47
- 0
src/layer/vulkan/requantize_vulkan.h View File

@@ -0,0 +1,47 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_REQUANTIZE_VULKAN_H
#define LAYER_REQUANTIZE_VULKAN_H

#include "requantize.h"

namespace ncnn {

// Vulkan (GPU) variant of the Requantize layer. It keeps the Requantize
// interface and adds pipeline management plus a forward overload on VkMat.
class Requantize_vulkan : virtual public Requantize
{
public:
Requantize_vulkan();

// Create / release the compute pipelines held in the members below.
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);

// Records uploads of the scale / bias arrays that hold more than one value.
virtual int upload_model(VkTransfer& cmd, const Option& opt);

// Keep the base-class forward overloads visible next to the VkMat one.
using Requantize::forward;
// GPU forward: creates top_blob with 1 byte per element (same shape and
// elempack as bottom_blob) and records a single pipeline dispatch on cmd.
// Returns -100 when top_blob could not be created, 0 otherwise.
virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

public:
// GPU copies of the parameter arrays, bound as shader bindings 2, 3 and 4.
// Each is uploaded only when the matching *_data_size is greater than 1.
VkMat scale_in_data_gpu;
VkMat scale_out_data_gpu;
VkMat bias_data_gpu;

// One pipeline per input elempack: forward uses the pack8 one for
// elempack == 8, the pack4 one for elempack == 4 and the plain one otherwise.
Pipeline* pipeline_requantize;
Pipeline* pipeline_requantize_pack4;
Pipeline* pipeline_requantize_pack8;
};

} // namespace ncnn

#endif // LAYER_REQUANTIZE_VULKAN_H

+ 80
- 0
src/layer/vulkan/shader/dequantize.comp View File

@@ -0,0 +1,80 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Dequantize, scalar variant: every invocation converts one int value to
// floating point as value * scale + bias.
// sfp / afp, psc() and buffer_ld1 / buffer_st1 are not defined in this file;
// presumably they come from the shader build prelude - confirm there.

// Number of scale / bias values, and the value itself for the single-value case.
layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;
layout (constant_id = 2) const int bias_data_size = 0;
layout (constant_id = 3) const float bias_value = 0.f;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_blob_data[]; };

layout (push_constant) uniform parameter
{
    uint c;
    uint in_stride;
    uint out_stride;
} p;

void main()
{
    const uint idx = gl_GlobalInvocationID.x;
    const uint row_len = psc(in_stride);

    // flat 1-D dispatch: drop invocations past the last input element
    if (idx >= row_len * psc(c))
        return;

    // row picks the scale / bias entry, col is the position inside the row
    const uint row = idx / row_len;
    const uint col = idx % row_len;

    int v = bottom_blob_data[idx];

    // a single scale comes from the specialization constant, otherwise one per row
    afp scale = afp(scale_value);
    if (scale_data_size != 1)
    {
        scale = buffer_ld1(scale_blob_data, row);
    }

    // no bias -> 0, single bias -> constant, otherwise one per row
    afp bias = afp(0.f);
    if (bias_data_size == 1)
    {
        bias = afp(bias_value);
    }
    else if (bias_data_size != 0)
    {
        bias = buffer_ld1(bias_blob_data, row);
    }

    afp v_fp = afp(v) * scale + bias;

    // output rows may use a different stride than input rows
    const uint out_idx = row * psc(out_stride) + col;

    buffer_st1(top_blob_data, out_idx, v_fp);
}

+ 80
- 0
src/layer/vulkan/shader/dequantize_pack4.comp View File

@@ -0,0 +1,80 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Dequantize, pack4 variant: every invocation converts one ivec4 to floating
// point as value * scale + bias, component-wise.
// sfpvec4 / afpvec4, psc() and buffer_ld4 / buffer_st4 are not defined in this
// file; presumably they come from the shader build prelude - confirm there.

// Number of scale / bias values, and the value itself for the single-value case.
layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;
layout (constant_id = 2) const int bias_data_size = 0;
layout (constant_id = 3) const float bias_value = 0.f;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };

layout (push_constant) uniform parameter
{
    uint c;
    uint in_stride;
    uint out_stride;
} p;

void main()
{
    const uint idx = gl_GlobalInvocationID.x;
    const uint row_len = psc(in_stride);

    // flat 1-D dispatch: drop invocations past the last input element
    if (idx >= row_len * psc(c))
        return;

    // row picks the scale / bias entry, col is the position inside the row
    const uint row = idx / row_len;
    const uint col = idx % row_len;

    ivec4 v = bottom_blob_data[idx];

    // a single scale is broadcast from the constant, otherwise one vec4 per row
    afpvec4 scale = afpvec4(scale_value);
    if (scale_data_size != 1)
    {
        scale = buffer_ld4(scale_blob_data, row);
    }

    // no bias -> 0, single bias -> broadcast constant, otherwise one vec4 per row
    afpvec4 bias = afpvec4(0.f);
    if (bias_data_size == 1)
    {
        bias = afpvec4(bias_value);
    }
    else if (bias_data_size != 0)
    {
        bias = buffer_ld4(bias_blob_data, row);
    }

    afpvec4 v_fp = afpvec4(v) * scale + bias;

    // output rows may use a different stride than input rows
    const uint out_idx = row * psc(out_stride) + col;

    buffer_st4(top_blob_data, out_idx, v_fp);
}

+ 84
- 0
src/layer/vulkan/shader/dequantize_pack8.comp View File

@@ -0,0 +1,84 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Dequantize, pack8 variant: every invocation converts one ivec8 (two ivec4
// halves, abcd and efgh) to floating point as value * scale + bias.
// ivec8, sfpvec8 / afpvec8, psc() and buffer_ld8 / buffer_st8 are not defined
// in this file; presumably they come from the shader build prelude - confirm.

// Number of scale / bias values, and the value itself for the single-value case.
layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;
layout (constant_id = 2) const int bias_data_size = 0;
layout (constant_id = 3) const float bias_value = 0.f;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };

layout (push_constant) uniform parameter
{
    uint c;
    uint in_stride;
    uint out_stride;
} p;

void main()
{
    const uint idx = gl_GlobalInvocationID.x;
    const uint row_len = psc(in_stride);

    // flat 1-D dispatch: drop invocations past the last input element
    if (idx >= row_len * psc(c))
        return;

    // row picks the scale / bias entry, col is the position inside the row
    const uint row = idx / row_len;
    const uint col = idx % row_len;

    ivec8 v = bottom_blob_data[idx];

    // a single scale is broadcast into both halves, otherwise one entry per row
    afpvec8 scale;
    if (scale_data_size != 1)
    {
        scale = buffer_ld8(scale_blob_data, row);
    }
    else
    {
        scale[0] = afpvec4(scale_value);
        scale[1] = afpvec4(scale_value);
    }

    // no bias -> 0, single bias -> broadcast constant, otherwise one entry per row
    afpvec8 bias;
    if (bias_data_size == 0)
    {
        bias = afpvec8(afpvec4(0.f), afpvec4(0.f));
    }
    else if (bias_data_size == 1)
    {
        bias = afpvec8(afpvec4(bias_value), afpvec4(bias_value));
    }
    else
    {
        bias = buffer_ld8(bias_blob_data, row);
    }

    // the two halves are converted independently
    afpvec8 v_fp;
    v_fp[0] = afpvec4(v.abcd) * scale[0] + bias[0];
    v_fp[1] = afpvec4(v.efgh) * scale[1] + bias[1];

    // output rows may use a different stride than input rows
    const uint out_idx = row * psc(out_stride) + col;

    buffer_st8(top_blob_data, out_idx, v_fp);
}

+ 73
- 0
src/layer/vulkan/shader/packing_int8.comp View File

@@ -0,0 +1,73 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Integer "packing" with unchanged pack size (4 lanes in, 4 lanes out): either
// a straight copy or a conversion between the int8 and int32 buffers.
// sint8vec4, psc() and the i8buffer_* helpers are not defined in this file;
// presumably they come from the shader build prelude - confirm there.

// Element type selectors; the value 3 selects the int32 buffers below, any
// other value the int8 buffers.
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // 2-D dispatch over n columns and c rows
    if (!(col < psc(n) && row < psc(c)))
        return;

    // index with rows n apart: used for the plain copy and for the int32 buffers
    const uint idx_n = row * psc(n) + col;

    if (cast_type_from == cast_type_to)
    {
        i8buffer_cp4(top_blob_data, idx_n, bottom_blob_data, idx_n);
        return;
    }

    // index with rows `stride` apart: used for the int8 buffers when converting
    const uint idx_stride = row * psc(stride) + col;

    ivec4 v;
    if (cast_type_from != 3)
    {
        v = i8buffer_ld4(bottom_blob_data, idx_stride);
    }
    else
    {
        v = bottom_blob_int32_data[idx_n];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, idx_stride, v);
    }
    else
    {
        top_blob_int32_data[idx_n] = v;
    }
}

+ 79
- 0
src/layer/vulkan/shader/packing_pack1to4_int8.comp View File

@@ -0,0 +1,79 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Integer packing 1 -> 4: gathers four scalar inputs that sit `stride` apart
// into one 4-lane output element, optionally converting int8 <-> int32.
// sint8 / sint8vec4, psc() and the i8buffer_* helpers are not defined in this
// file; presumably they come from the shader build prelude - confirm there.

// Element type selectors; the value 3 selects the int32 buffers below, any
// other value the int8 buffers.
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // 2-D dispatch over n columns and c packed rows
    if (!(col < psc(n) && row < psc(c)))
        return;

    // packed output element
    const uint dst = row * psc(n) + col;

    // the four scalar source rows feeding it: 4 * row + {0, 1, 2, 3}
    const uvec4 src = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col;

    // same-type copy shortcut kept disabled, as in the original:
    // if (cast_type_from == cast_type_to)
    // {
    //     i8buffer_cp1to4(top_blob_data, gi, bottom_blob_data, gi4);
    //     return;
    // }

    ivec4 v;
    if (cast_type_from != 3)
    {
        v.x = i8buffer_ld1(bottom_blob_data, src.x);
        v.y = i8buffer_ld1(bottom_blob_data, src.y);
        v.z = i8buffer_ld1(bottom_blob_data, src.z);
        v.w = i8buffer_ld1(bottom_blob_data, src.w);
    }
    else
    {
        v = ivec4(bottom_blob_int32_data[src.x],
                  bottom_blob_int32_data[src.y],
                  bottom_blob_int32_data[src.z],
                  bottom_blob_int32_data[src.w]);
    }

    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, dst, v);
    }
    else
    {
        top_blob_int32_data[dst] = v;
    }
}

+ 88
- 0
src/layer/vulkan/shader/packing_pack1to8_int8.comp View File

@@ -0,0 +1,88 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Integer packing 1 -> 8: gathers eight scalar inputs that sit `stride` apart
// into one 8-lane output element, optionally converting int8 <-> int32.
// sint8 / sint8vec8, ivec8, psc() and the i8buffer_* helpers are not defined in
// this file; presumably they come from the shader build prelude - confirm.

// Element type selectors; the value 3 selects the int32 buffers below, any
// other value the int8 buffers.
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // 2-D dispatch over n columns and c packed rows
    if (!(col < psc(n) && row < psc(c)))
        return;

    // packed output element
    const uint dst = row * psc(n) + col;

    // scalar source rows 8 * row + {0..3} and, four rows further, {4..7}
    const uvec4 src_lo = (row * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + col;
    const uvec4 src_hi = src_lo + psc(stride) * 4;

    // same-type copy shortcut kept disabled, as in the original:
    // if (cast_type_from == cast_type_to)
    // {
    //     i8buffer_cp1to8(top_blob_data, gi, bottom_blob_data, gi4, gi8);
    //     return;
    // }

    ivec8 v;
    if (cast_type_from != 3)
    {
        v.abcd.x = i8buffer_ld1(bottom_blob_data, src_lo.x);
        v.abcd.y = i8buffer_ld1(bottom_blob_data, src_lo.y);
        v.abcd.z = i8buffer_ld1(bottom_blob_data, src_lo.z);
        v.abcd.w = i8buffer_ld1(bottom_blob_data, src_lo.w);
        v.efgh.x = i8buffer_ld1(bottom_blob_data, src_hi.x);
        v.efgh.y = i8buffer_ld1(bottom_blob_data, src_hi.y);
        v.efgh.z = i8buffer_ld1(bottom_blob_data, src_hi.z);
        v.efgh.w = i8buffer_ld1(bottom_blob_data, src_hi.w);
    }
    else
    {
        v.abcd = ivec4(bottom_blob_int32_data[src_lo.x],
                       bottom_blob_int32_data[src_lo.y],
                       bottom_blob_int32_data[src_lo.z],
                       bottom_blob_int32_data[src_lo.w]);
        v.efgh = ivec4(bottom_blob_int32_data[src_hi.x],
                       bottom_blob_int32_data[src_hi.y],
                       bottom_blob_int32_data[src_hi.z],
                       bottom_blob_int32_data[src_hi.w]);
    }

    if (cast_type_to != 3)
    {
        i8buffer_st8(top_blob_data, dst, v);
    }
    else
    {
        top_blob_int32_data[dst] = v;
    }
}

+ 79
- 0
src/layer/vulkan/shader/packing_pack4to1_int8.comp View File

@@ -0,0 +1,79 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Integer packing 4 -> 1: scatters the four lanes of one input element to four
// scalar outputs that sit `stride` apart, optionally converting int8 <-> int32.
// sint8 / sint8vec4, psc() and the i8buffer_* helpers are not defined in this
// file; presumably they come from the shader build prelude - confirm there.

// Element type selectors; the value 3 selects the int32 buffers below, any
// other value the int8 buffers.
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // 2-D dispatch over n columns and c packed rows
    if (!(col < psc(n) && row < psc(c)))
        return;

    // the four scalar destination rows: 4 * row + {0, 1, 2, 3}
    const uvec4 dst = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col;

    // packed input element
    const uint src = row * psc(n) + col;

    // same-type copy shortcut kept disabled, as in the original:
    // if (cast_type_from == cast_type_to)
    // {
    //     buffer_cp4to1(top_blob_data, gi4, bottom_blob_data, gi);
    //     return;
    // }

    ivec4 v;
    if (cast_type_from != 3)
    {
        v = i8buffer_ld4(bottom_blob_data, src);
    }
    else
    {
        v = bottom_blob_int32_data[src];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st1(top_blob_data, dst.x, v.x);
        i8buffer_st1(top_blob_data, dst.y, v.y);
        i8buffer_st1(top_blob_data, dst.z, v.z);
        i8buffer_st1(top_blob_data, dst.w, v.w);
    }
    else
    {
        top_blob_int32_data[dst.x] = v.x;
        top_blob_int32_data[dst.y] = v.y;
        top_blob_int32_data[dst.z] = v.z;
        top_blob_int32_data[dst.w] = v.w;
    }
}

+ 75
- 0
src/layer/vulkan/shader/packing_pack4to8_int8.comp View File

@@ -0,0 +1,75 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Integer packing 4 -> 8: joins two 4-lane inputs that sit `stride` apart into
// one 8-lane output element, optionally converting int8 <-> int32.
// sint8vec4 / sint8vec8, ivec8, psc() and the i8buffer_* helpers are not defined
// in this file; presumably they come from the shader build prelude - confirm.

// Element type selectors; the value 3 selects the int32 buffers below, any
// other value the int8 buffers.
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // 2-D dispatch over n columns and c packed rows
    if (!(col < psc(n) && row < psc(c)))
        return;

    // packed output element
    const uint dst = row * psc(n) + col;

    // the two pack4 source rows: 2 * row + {0, 1}
    const uvec2 src = (row * 2 + uvec2(0, 1)) * psc(stride) + col;

    // same-type copy shortcut kept disabled, as in the original:
    // if (cast_type_from == cast_type_to)
    // {
    //     buffer_cp4to8(top_blob_data, gi, bottom_blob_data, gi2);
    //     return;
    // }

    ivec8 v;
    if (cast_type_from != 3)
    {
        v.abcd = i8buffer_ld4(bottom_blob_data, src.x);
        v.efgh = i8buffer_ld4(bottom_blob_data, src.y);
    }
    else
    {
        v.abcd = bottom_blob_int32_data[src.x];
        v.efgh = bottom_blob_int32_data[src.y];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st8(top_blob_data, dst, v);
    }
    else
    {
        top_blob_int32_data[dst] = v;
    }
}

+ 88
- 0
src/layer/vulkan/shader/packing_pack8to1_int8.comp View File

@@ -0,0 +1,88 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Integer packing 8 -> 1: scatters the eight lanes of one input element to
// eight scalar outputs that sit `stride` apart, optionally converting
// int8 <-> int32.
// sint8 / sint8vec8, ivec8, psc() and the i8buffer_* helpers are not defined in
// this file; presumably they come from the shader build prelude - confirm.

// Element type selectors; the value 3 selects the int32 buffers below, any
// other value the int8 buffers.
layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

// Shape constants; the same three names also exist as push constants below.
#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
    uint n;
    uint c;
    uint stride;
} p;

void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // 2-D dispatch over n columns and c packed rows
    if (!(col < psc(n) && row < psc(c)))
        return;

    // scalar destination rows 8 * row + {0..3} and, four rows further, {4..7}
    const uvec4 dst_lo = (row * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + col;
    const uvec4 dst_hi = dst_lo + psc(stride) * 4;

    // packed input element
    const uint src = row * psc(n) + col;

    // same-type copy shortcut kept disabled, as in the original:
    // if (cast_type_from == cast_type_to)
    // {
    //     i8buffer_cp8to1(top_blob_data, gi4, gi8, bottom_blob_data, gi);
    //     return;
    // }

    ivec8 v;
    if (cast_type_from != 3)
    {
        v = i8buffer_ld8(bottom_blob_data, src);
    }
    else
    {
        v = bottom_blob_int32_data[src];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st1(top_blob_data, dst_lo.x, v.abcd.x);
        i8buffer_st1(top_blob_data, dst_lo.y, v.abcd.y);
        i8buffer_st1(top_blob_data, dst_lo.z, v.abcd.z);
        i8buffer_st1(top_blob_data, dst_lo.w, v.abcd.w);
        i8buffer_st1(top_blob_data, dst_hi.x, v.efgh.x);
        i8buffer_st1(top_blob_data, dst_hi.y, v.efgh.y);
        i8buffer_st1(top_blob_data, dst_hi.z, v.efgh.z);
        i8buffer_st1(top_blob_data, dst_hi.w, v.efgh.w);
    }
    else
    {
        top_blob_int32_data[dst_lo.x] = v.abcd.x;
        top_blob_int32_data[dst_lo.y] = v.abcd.y;
        top_blob_int32_data[dst_lo.z] = v.abcd.z;
        top_blob_int32_data[dst_lo.w] = v.abcd.w;
        top_blob_int32_data[dst_hi.x] = v.efgh.x;
        top_blob_int32_data[dst_hi.y] = v.efgh.y;
        top_blob_int32_data[dst_hi.z] = v.efgh.z;
        top_blob_int32_data[dst_hi.w] = v.efgh.w;
    }
}

+ 75
- 0
src/layer/vulkan/shader/packing_pack8to4_int8.comp View File

@@ -0,0 +1,75 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int cast_type_from = 0;
layout (constant_id = 1) const int cast_type_to = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; };
layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; };
layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

layout (push_constant) uniform parameter
{
uint n;
uint c;
uint stride;
} p;

// Repack int8/int32 data from elempack 8 to elempack 4.
// Each invocation reads one pack8 element and writes its two vec4 halves to
// destination rows row*2 and row*2+1.
void main()
{
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    if (col >= psc(n) || row >= psc(c))
        return;

    // index of the pack8 source element
    const uint src = row * psc(n) + col;

    // indices of the two pack4 destination elements
    const uvec2 dst = (row * 2 + uvec2(0, 1)) * psc(stride) + col;

    // disabled same-type copy fast path, kept for reference
    // if (cast_type_from == cast_type_to)
    // {
    //     buffer_cp8to4(top_blob_data, dst, bottom_blob_data, src);
    //     return;
    // }

    // load from the int32 binding when cast_type_from is 3, else from the int8 binding
    ivec8 lanes;
    if (cast_type_from != 3)
    {
        lanes = i8buffer_ld8(bottom_blob_data, src);
    }
    else
    {
        lanes = bottom_blob_int32_data[src];
    }

    // store to the int32 binding when cast_type_to is 3, else to the int8 binding
    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, dst.x, lanes.abcd);
        i8buffer_st4(top_blob_data, dst.y, lanes.efgh);
    }
    else
    {
        top_blob_int32_data[dst.x] = lanes.abcd;
        top_blob_int32_data[dst.y] = lanes.efgh;
    }
}

+ 63
- 0
src/layer/vulkan/shader/quantize.comp View File

@@ -0,0 +1,63 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };

layout (push_constant) uniform parameter
{
uint c;
uint in_stride;
uint out_stride;
} p;

// Quantize one element: multiply by the scale, clamp to [-127, 127], round,
// and store as int8.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat input index into (row, column) using the input stride
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    // the output uses its own row stride
    const uint dst = row * psc(out_stride) + col;

    // a single scale comes from the specialization constant,
    // otherwise one scale is read per row
    afp factor = afp(scale_value);
    if (scale_data_size != 1)
    {
        factor = buffer_ld1(scale_blob_data, row);
    }

    afp x = buffer_ld1(bottom_blob_data, idx);

    afp bounded = clamp(x * factor, afp(-127.f), afp(127.f));
    int q = int(round(bounded));

    i8buffer_st1(top_blob_data, dst, q);
}

+ 63
- 0
src/layer/vulkan/shader/quantize_pack4.comp View File

@@ -0,0 +1,63 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };

layout (push_constant) uniform parameter
{
uint c;
uint in_stride;
uint out_stride;
} p;

// Quantize one pack4 element: multiply by the scale, clamp to [-127, 127],
// round, and store as four int8 values.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat input index into (row, column) using the input stride
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    // the output uses its own row stride
    const uint dst = row * psc(out_stride) + col;

    // a single scale is broadcast from the specialization constant,
    // otherwise one vec4 of scales is read per row
    afpvec4 factor = afpvec4(scale_value);
    if (scale_data_size != 1)
    {
        factor = buffer_ld4(scale_blob_data, row);
    }

    afpvec4 x = buffer_ld4(bottom_blob_data, idx);

    afpvec4 bounded = clamp(x * factor, afp(-127.f), afp(127.f));
    ivec4 q = ivec4(round(bounded));

    i8buffer_st4(top_blob_data, dst, q);
}

+ 65
- 0
src/layer/vulkan/shader/quantize_pack8.comp View File

@@ -0,0 +1,65 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

layout (constant_id = 0) const int scale_data_size = 0;
layout (constant_id = 1) const float scale_value = 1.f;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };

layout (push_constant) uniform parameter
{
uint c;
uint in_stride;
uint out_stride;
} p;

// Quantize one pack8 element: multiply by the scale, clamp to [-127, 127],
// round, and store as eight int8 values.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat input index into (row, column) using the input stride
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    // the output uses its own row stride
    const uint dst = row * psc(out_stride) + col;

    // a single scale is broadcast from the specialization constant,
    // otherwise eight scales are read per row
    afpvec8 factor = afpvec8(afpvec4(scale_value), afpvec4(scale_value));
    if (scale_data_size != 1)
    {
        factor = buffer_ld8(scale_blob_data, row);
    }

    afpvec8 x = buffer_ld8(bottom_blob_data, idx);

    // the two vec4 halves are processed separately
    afpvec4 bounded_lo = clamp(x[0] * factor[0], afp(-127.f), afp(127.f));
    afpvec4 bounded_hi = clamp(x[1] * factor[1], afp(-127.f), afp(127.f));

    ivec8 q;
    q.abcd = ivec4(round(bounded_lo));
    q.efgh = ivec4(round(bounded_hi));

    i8buffer_st8(top_blob_data, dst, q);
}

+ 103
- 0
src/layer/vulkan/shader/requantize.comp View File

@@ -0,0 +1,103 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int scale_in_data_size = 0;
layout (constant_id = 1) const float scale_in_value = 1.f;
layout (constant_id = 2) const int scale_out_data_size = 0;
layout (constant_id = 3) const float scale_out_value = 1.f;
layout (constant_id = 4) const int bias_data_size = 0;
layout (constant_id = 5) const float bias_value = 0.f;
layout (constant_id = 6) const int activation_type = 0;
layout (constant_id = 7) const float activation_param_0 = 0;
layout (constant_id = 8) const float activation_param_1 = 0;

#define shape_constant_id_offset 9
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_in_blob { sfp scale_in_blob_data[]; };
layout (binding = 3) readonly buffer scale_out_blob { sfp scale_out_blob_data[]; };
layout (binding = 4) readonly buffer bias_blob { sfp bias_blob_data[]; };

layout (push_constant) uniform parameter
{
uint c;
uint in_stride;
uint out_stride;
} p;

// Requantize one int32 element to int8:
// v * scale_in + bias -> activation -> * scale_out -> clamp to [-127, 127] -> round.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat input index into (row, column) using the input stride
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    // the output uses its own row stride
    const uint dst = row * psc(out_stride) + col;

    int acc = bottom_blob_data[idx];

    // input scale: specialization constant when there is a single value, else per row
    afp in_factor = afp(scale_in_value);
    if (scale_in_data_size != 1)
    {
        in_factor = buffer_ld1(scale_in_blob_data, row);
    }

    // bias: zero when absent, specialization constant when single, else per row
    afp offset = afp(0.f);
    if (bias_data_size == 1)
    {
        offset = afp(bias_value);
    }
    else if (bias_data_size != 0)
    {
        offset = buffer_ld1(bias_blob_data, row);
    }

    afp real = afp(acc) * in_factor + offset;

    real = activation_afp(real, activation_type, activation_param_0, activation_param_1);

    // output scale: specialization constant when there is a single value, else per row
    afp out_factor = afp(scale_out_value);
    if (scale_out_data_size != 1)
    {
        out_factor = buffer_ld1(scale_out_blob_data, row);
    }

    afp bounded = clamp(real * out_factor, afp(-127.f), afp(127.f));
    int q = int(round(bounded));

    i8buffer_st1(top_blob_data, dst, q);
}

+ 103
- 0
src/layer/vulkan/shader/requantize_pack4.comp View File

@@ -0,0 +1,103 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int scale_in_data_size = 0;
layout (constant_id = 1) const float scale_in_value = 1.f;
layout (constant_id = 2) const int scale_out_data_size = 0;
layout (constant_id = 3) const float scale_out_value = 1.f;
layout (constant_id = 4) const int bias_data_size = 0;
layout (constant_id = 5) const float bias_value = 0.f;
layout (constant_id = 6) const int activation_type = 0;
layout (constant_id = 7) const float activation_param_0 = 0;
layout (constant_id = 8) const float activation_param_1 = 0;

#define shape_constant_id_offset 9
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_in_blob { sfpvec4 scale_in_blob_data[]; };
layout (binding = 3) readonly buffer scale_out_blob { sfpvec4 scale_out_blob_data[]; };
layout (binding = 4) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };

layout (push_constant) uniform parameter
{
uint c;
uint in_stride;
uint out_stride;
} p;

// Requantize one pack4 int32 element to int8:
// v * scale_in + bias -> activation -> * scale_out -> clamp to [-127, 127] -> round.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat input index into (row, column) using the input stride
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    // the output uses its own row stride
    const uint dst = row * psc(out_stride) + col;

    ivec4 acc = bottom_blob_data[idx];

    // input scale: broadcast constant when there is a single value, else per row
    afpvec4 in_factor = afpvec4(scale_in_value);
    if (scale_in_data_size != 1)
    {
        in_factor = buffer_ld4(scale_in_blob_data, row);
    }

    // bias: zero when absent, broadcast constant when single, else per row
    afpvec4 offset = afpvec4(0.f);
    if (bias_data_size == 1)
    {
        offset = afpvec4(bias_value);
    }
    else if (bias_data_size != 0)
    {
        offset = buffer_ld4(bias_blob_data, row);
    }

    afpvec4 real = afpvec4(acc) * in_factor + offset;

    real = activation_afpvec4(real, activation_type, activation_param_0, activation_param_1);

    // output scale: broadcast constant when there is a single value, else per row
    afpvec4 out_factor = afpvec4(scale_out_value);
    if (scale_out_data_size != 1)
    {
        out_factor = buffer_ld4(scale_out_blob_data, row);
    }

    afpvec4 bounded = clamp(real * out_factor, afp(-127.f), afp(127.f));
    ivec4 q = ivec4(round(bounded));

    i8buffer_st4(top_blob_data, dst, q);
}

+ 107
- 0
src/layer/vulkan/shader/requantize_pack8.comp View File

@@ -0,0 +1,107 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int scale_in_data_size = 0;
layout (constant_id = 1) const float scale_in_value = 1.f;
layout (constant_id = 2) const int scale_out_data_size = 0;
layout (constant_id = 3) const float scale_out_value = 1.f;
layout (constant_id = 4) const int bias_data_size = 0;
layout (constant_id = 5) const float bias_value = 0.f;
layout (constant_id = 6) const int activation_type = 0;
layout (constant_id = 7) const float activation_param_0 = 0;
layout (constant_id = 8) const float activation_param_1 = 0;

#define shape_constant_id_offset 9
layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
layout (binding = 2) readonly buffer scale_in_blob { sfpvec8 scale_in_blob_data[]; };
layout (binding = 3) readonly buffer scale_out_blob { sfpvec8 scale_out_blob_data[]; };
layout (binding = 4) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };

layout (push_constant) uniform parameter
{
uint c;
uint in_stride;
uint out_stride;
} p;

// Requantize one pack8 int32 element to int8:
// v * scale_in + bias -> activation -> * scale_out -> clamp to [-127, 127] -> round.
void main()
{
    const uint idx = gl_GlobalInvocationID.x;

    if (idx >= psc(in_stride) * psc(c))
        return;

    // split the flat input index into (row, column) using the input stride
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);

    // the output uses its own row stride
    const uint dst = row * psc(out_stride) + col;

    ivec8 acc = bottom_blob_data[idx];

    // input scale: broadcast constant when there is a single value, else per row
    afpvec8 in_factor = afpvec8(afpvec4(scale_in_value), afpvec4(scale_in_value));
    if (scale_in_data_size != 1)
    {
        in_factor = buffer_ld8(scale_in_blob_data, row);
    }

    // bias: zero when absent, broadcast constant when single, else per row
    afpvec8 offset = afpvec8(afpvec4(0.f), afpvec4(0.f));
    if (bias_data_size == 1)
    {
        offset = afpvec8(afpvec4(bias_value), afpvec4(bias_value));
    }
    else if (bias_data_size != 0)
    {
        offset = buffer_ld8(bias_blob_data, row);
    }

    // the two vec4 halves are processed separately
    afpvec8 real;
    real[0] = afpvec4(acc.abcd) * in_factor[0] + offset[0];
    real[1] = afpvec4(acc.efgh) * in_factor[1] + offset[1];

    real = activation_afpvec8(real, activation_type, activation_param_0, activation_param_1);

    // output scale: broadcast constant when there is a single value, else per row
    afpvec8 out_factor = afpvec8(afpvec4(scale_out_value), afpvec4(scale_out_value));
    if (scale_out_data_size != 1)
    {
        out_factor = buffer_ld8(scale_out_blob_data, row);
    }

    afpvec4 bounded_lo = clamp(real[0] * out_factor[0], afp(-127.f), afp(127.f));
    afpvec4 bounded_hi = clamp(real[1] * out_factor[1], afp(-127.f), afp(127.f));

    ivec8 q;
    q.abcd = ivec4(round(bounded_lo));
    q.efgh = ivec4(round(bounded_hi));

    i8buffer_st8(top_blob_data, dst, q);
}

+ 6
- 0
src/net.cpp View File

@@ -1043,6 +1043,9 @@ int Net::load_param(const DataReader& dr)
// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;

// int8a makes no sense when int8 storage disabled
if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false;

// fp16 uniform makes no sense when fp16 arithmetic disabled
if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
@@ -1339,6 +1342,9 @@ int Net::load_param_bin(const DataReader& dr)
// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;

// int8a makes no sense when int8 storage disabled
if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false;

// fp16 uniform makes no sense when fp16 arithmetic disabled
if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}


+ 1
- 5
tests/test_dequantize.cpp View File

@@ -142,12 +142,8 @@ static int test_dequantize_3()
|| test_dequantize_pack8(RandomIntMat(15, 24), 24, 24)
|| test_dequantize_pack8(RandomIntMat(15, 24), 24, 1)
|| test_dequantize_pack8(RandomIntMat(15, 24), 24, 0)
|| test_dequantize_pack8(RandomIntMat(128), 1, 128)
|| test_dequantize_pack8(RandomIntMat(128), 1, 1)
|| test_dequantize_pack8(RandomIntMat(128), 1, 0)
|| test_dequantize_pack8(RandomIntMat(128), 128, 128)
|| test_dequantize_pack8(RandomIntMat(128), 128, 1)
|| test_dequantize_pack8(RandomIntMat(128), 128, 0);
|| test_dequantize_pack8(RandomIntMat(128), 1, 0);
}

int main()


+ 111
- 14
tests/test_packing.cpp View File

@@ -217,15 +217,12 @@ static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempac
}

#if NCNN_VULKAN

static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_elempack)
static int test_packing_gpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack)
{
ncnn::ParamDict pd;
pd.set(0, out_elempack);
pd.set(2, 1); // cast_type_from
pd.set(3, 1); // cast_type_to
pd.set(4, 0); // storage_type_from
pd.set(5, 0); // storage_type_to

std::vector<ncnn::Mat> weights(0);

@@ -297,12 +294,112 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_

if (CompareMat(b, d, 0.001) != 0)
{
fprintf(stderr, "test_packing_gpu_buffer failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
fprintf(stderr, "test_packing_gpu failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
return -1;
}

return 0;
}

// Run the Vulkan Packing layer on a random int8 blob and compare the result
// against packing_cpu_naive.
//
// a             only its dims and extents are used; the int8 test data is
//               generated with RandomS8Mat using the same shape
// in_elempack   elempack the input is converted to before upload
// out_elempack  elempack requested from the Packing layer
//
// Returns 0 when GPU and CPU results match, -1 otherwise.
static int test_packing_gpu_int8(const ncnn::Mat& a, int in_elempack, int out_elempack)
{
    ncnn::ParamDict pd;
    pd.set(0, out_elempack);
    pd.set(2, 4); // cast_type_from
    pd.set(3, 4); // cast_type_to

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = false;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_shader_pack8 = true;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
    if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;

    ncnn::Layer* op = ncnn::create_layer_vulkan("Packing");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    ncnn::Mat a8;
    if (a.dims == 1) a8 = RandomS8Mat(a.w);
    if (a.dims == 2) a8 = RandomS8Mat(a.w, a.h);
    if (a.dims == 3) a8 = RandomS8Mat(a.w, a.h, a.c);
    if (a.dims == 4) a8 = RandomS8Mat(a.w, a.h, a.d, a.c);

    ncnn::Mat ap;
    ncnn::convert_packing(a8, ap, in_elempack, opt);

    // CPU reference result
    ncnn::Mat b;
    packing_cpu_naive(ap, b, out_elempack);

    ncnn::Mat c;

    // The command object and the GPU mats live in their own scope so they are
    // destroyed before the allocators are handed back to the device below.
    {
        // forward
        ncnn::VkCompute cmd(vkdev);

        // upload
        ncnn::VkMat a_gpu;
        cmd.record_clone(ap, a_gpu, opt);

        ncnn::VkMat c_gpu;
        op->forward(a_gpu, c_gpu, cmd, opt);

        // download
        cmd.record_clone(c_gpu, c, opt);

        cmd.submit_and_wait();
    }

    op->destroy_pipeline(opt);

    delete op;

    // return the allocators acquired above, otherwise every call leaks them
    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    // CompareMat is applied to float mats here, so widen both int8 results first
    ncnn::Mat b32;
    ncnn::cast_int8_to_float32(b, b32, opt);

    ncnn::Mat c32;
    ncnn::cast_int8_to_float32(c, c32, opt);

    if (CompareMat(b32, c32, 0.001) != 0)
    {
        fprintf(stderr, "test_packing_gpu_int8 failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
        return -1;
    }

    return 0;
}

// Run the fp32 and the int8 GPU packing tests for one elempack pair.
// The int8 test is skipped once the fp32 test has failed.
// Returns 0 when both pass, 1 otherwise.
static int test_packing_gpu(const ncnn::Mat& a, int in_elempack, int out_elempack)
{
    const bool failed = test_packing_gpu_fp32(a, in_elempack, out_elempack) != 0
                        || test_packing_gpu_int8(a, in_elempack, out_elempack) != 0;

    return failed ? 1 : 0;
}
#endif

static int test_packing_cpu(const ncnn::Mat& a)
@@ -329,15 +426,15 @@ static int test_packing_cpu(const ncnn::Mat& a)
static int test_packing_gpu(const ncnn::Mat& a)
{
return 0
|| test_packing_gpu_buffer(a, 1, 1)
|| test_packing_gpu_buffer(a, 4, 4)
|| test_packing_gpu_buffer(a, 8, 8)
|| test_packing_gpu_buffer(a, 1, 4)
|| test_packing_gpu_buffer(a, 4, 1)
|| test_packing_gpu_buffer(a, 1, 8)
|| test_packing_gpu_buffer(a, 8, 1)
|| test_packing_gpu_buffer(a, 4, 8)
|| test_packing_gpu_buffer(a, 8, 4);
|| test_packing_gpu(a, 1, 1)
|| test_packing_gpu(a, 4, 4)
|| test_packing_gpu(a, 8, 8)
|| test_packing_gpu(a, 1, 4)
|| test_packing_gpu(a, 4, 1)
|| test_packing_gpu(a, 1, 8)
|| test_packing_gpu(a, 8, 1)
|| test_packing_gpu(a, 4, 8)
|| test_packing_gpu(a, 8, 4);
}
#endif // NCNN_VULKAN



+ 1
- 1
tests/test_quantize.cpp View File

@@ -24,7 +24,7 @@ static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high)
}
else
{
if (a.dims == 1) scale_data.create(a.w);
if (a.dims == 1) scale_data.create(1);
if (a.dims == 2) scale_data.create(a.h);
if (a.dims == 3) scale_data.create(a.c);
Randomize(scale_data, scale_low, scale_high);


+ 1
- 1
tests/test_quantize_oom.cpp View File

@@ -24,7 +24,7 @@ static int test_quantize_oom(const ncnn::Mat& a, float scale_low, float scale_hi
}
else
{
if (a.dims == 1) scale_data.create(a.w);
if (a.dims == 1) scale_data.create(1);
if (a.dims == 2) scale_data.create(a.h);
if (a.dims == 3) scale_data.create(a.c);
Randomize(scale_data, scale_low, scale_high);


+ 53
- 2
tests/testutil.cpp View File

@@ -759,7 +759,32 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
std::vector<ncnn::VkMat> a_gpu(a.size());
for (size_t i = 0; i < a_gpu.size(); i++)
{
cmd.record_upload(a[i], a_gpu[i], opt);
if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
{
// resolve dst_elempack
int dims = a[i].dims;
int elemcount = 0;
if (dims == 1) elemcount = a[i].elempack * a[i].w;
if (dims == 2) elemcount = a[i].elempack * a[i].h;
if (dims == 3 || dims == 4) elemcount = a[i].elempack * a[i].c;

const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;

ncnn::Mat a4;
ncnn::convert_packing(a[i], a4, dst_elempack, opt);

ncnn::Option opt_upload = opt;
opt_upload.use_fp16_packed = false;
opt_upload.use_fp16_storage = false;
opt_upload.use_int8_packed = false;
opt_upload.use_int8_storage = false;

cmd.record_clone(a4, a_gpu[i], opt_upload);
}
else
{
cmd.record_upload(a[i], a_gpu[i], opt);
}
}

std::vector<ncnn::VkMat> d_gpu(top_blob_count);
@@ -1082,7 +1107,33 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
{
// upload
ncnn::VkMat a_gpu;
cmd.record_upload(a, a_gpu, opt);

if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
{
// resolve dst_elempack
int dims = a.dims;
int elemcount = 0;
if (dims == 1) elemcount = a.elempack * a.w;
if (dims == 2) elemcount = a.elempack * a.h;
if (dims == 3 || dims == 4) elemcount = a.elempack * a.c;

const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;

ncnn::Mat a4;
ncnn::convert_packing(a, a4, dst_elempack, opt);

ncnn::Option opt_upload = opt;
opt_upload.use_fp16_packed = false;
opt_upload.use_fp16_storage = false;
opt_upload.use_int8_packed = false;
opt_upload.use_int8_storage = false;

cmd.record_clone(a4, a_gpu, opt_upload);
}
else
{
cmd.record_upload(a, a_gpu, opt);
}

ncnn::VkMat d_gpu;
if (op->support_inplace)


Loading…
Cancel
Save