vulkan int8 packing quantize dequantize requantize (#3731)

* add int8 definitions * packing vulkan int8/int32, quantize vulkan * vulkan dequantize * requantize vulkan
1 year ago · 9f832c19c1
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -892,6 +892,13 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize,
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }
    if (elemsize / elempack == 1)
    {
        // int8
        if (elempack == 1) format = VK_FORMAT_R8_SINT;
        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
    }

    // resolve image width height depth
    int width = w;
@@ -1468,6 +1475,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }
    if (elemsize / elempack == 1)
    {
        // int8
        if (elempack == 1) format = VK_FORMAT_R8_SINT;
        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT;
    }

    // resolve image width height depth
    int width = w;
--- a/src/command.cpp
+++ b/src/command.cpp
@@ -450,6 +450,11 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)
        cast_type_to = 1;
    }

    if (src.elemsize == src.elempack * 1u)
    {
        cast_type_to = 4;
    }

    VkMat dst_staging;
    vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging);

--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -3032,6 +3032,10 @@ public:
    // to fp32 | fp16
    // to pack1 | pack4 | pack8
    mutable ncnn::Layer* uop_packing[2][2][3];
    // from int8
    // to int8
    // to pack1 | pack4 | pack8
    mutable ncnn::Layer* uop_packing_int8[3];
    mutable Mutex uop_lock;

    // device is valid and successfully initialized
@@ -3047,6 +3051,7 @@ VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev)
    pipeline_cache = 0;
    valid = false;
    memset(uop_packing, 0, sizeof(uop_packing));
    memset(uop_packing_int8, 0, sizeof(uop_packing_int8));
 }

 int VulkanDevicePrivate::create_dummy_buffer_image()
@@ -3096,18 +3101,29 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image()

 const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
 {
    bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);
    bool use_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3);

    MutexLockGuard lock(uop_lock);

    const ncnn::Layer* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
    const ncnn::Layer* cached_uop = 0;
    if (use_int8)
    {
        cached_uop = uop_packing_int8[packing_type_to_index];
    }
    else
    {
        cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
    }
    if (cached_uop)
        return cached_uop;

    bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);

    // create uop
    Option opt;
    opt.use_fp16_packed = use_fp16; // fp16p is always supported
    opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
    opt.use_int8_packed = use_int8; // int8p is always supported
    opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();

    // fp16/int8 arithmetic are not necessary for packing
    // and may conflict with storage options
@@ -3132,14 +3148,21 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_

    ncnn::ParamDict pd;
    pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16
    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16 3=int8
    pd.set(3, cast_type_to_index + 1);

    uop->load_param(pd);

    uop->create_pipeline(opt);

    uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
    if (use_int8)
    {
        uop_packing_int8[packing_type_to_index] = uop;
    }
    else
    {
        uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
    }

    return uop;
 }
@@ -3164,6 +3187,8 @@ void VulkanDevicePrivate::destroy_utility_operator()

            opt.use_fp16_packed = use_fp16;
            opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
            opt.use_int8_packed = false;
            opt.use_int8_storage = false;

            // to pack1 | pack4 | pack8
            for (int k = 0; k < 3; k++)
@@ -3183,6 +3208,33 @@ void VulkanDevicePrivate::destroy_utility_operator()
            }
        }
    }

    // int8
    {
        bool use_int8 = true;

        opt.use_fp16_packed = false;
        opt.use_fp16_storage = false;
        opt.use_int8_packed = use_int8;
        opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();

        // to pack1 | pack4 | pack8
        for (int k = 0; k < 3; k++)
        {
            // enable pack8 for pack8to1/pack8to4
            opt.use_shader_pack8 = true;

            ncnn::Layer* uop = uop_packing_int8[k];
            if (!uop)
                continue;

            uop->destroy_pipeline(opt);

            delete uop;

            uop_packing_int8[k] = 0;
        }
    }
 }

 VulkanDevice::VulkanDevice(int device_index)
@@ -4232,18 +4284,35 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac
    {
        cast_type_from_index = 0;
    }
    else // if (src.elembits() == 16)
    else if (src.elembits() == 16)
    {
        cast_type_from_index = 1;
    }
    else // if (src.elembits() == 8)
    {
        cast_type_from_index = 3;
    }

    int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index;

    // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

    if ((cast_type_from_index == 0 || cast_type_from_index == 1) && (cast_type_to_index == 2 || cast_type_to_index == 3))
    {
        NCNN_LOGE("convert_packing from fp32/fp16 to int32/int8 is not supported");
        return;
    }
    if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1))
    {
        NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16 is not supported");
        return;
    }

    Option opt2 = opt;
    opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
    opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage();
    opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3);
    opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage();

    const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index);
    uop->forward(src, dst, cmd, opt2);
@@ -4809,6 +4878,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
        custom_defines.append("afp2sfpmat4(v)", "v");
    }

    if (opt.use_int8_storage)
    {
        custom_defines.append("sint8", "int8_t");
    }
    else if (opt.use_int8_packed)
    {
        custom_defines.append("sint8", "int");
    }
    else
    {
        custom_defines.append("sint8", "int");
    }

    custom_defines.append("sint8vec4", "int");
    custom_defines.append("sint8vec8", "ivec2");

    custom_defines.append("aint8", "int");
    custom_defines.append("aint8vec4", "ivec4");

    custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)");
    custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))");

    if (opt.use_int8_storage)
    {
        custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])");
        custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}");
        custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
    }
    else
    {
        custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)");
        custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}");
        custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}");
    }

    custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])");
    custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}");
    custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

    custom_defines.append("i8buffer_ld8(buf,i)", "ivec8(unpackInt4x8(buf[i].r),unpackInt4x8(buf[i].g))");
    custom_defines.append("i8buffer_st8(buf,i,v)", "{buf[i]=ivec2(packInt4x8(v.abcd),packInt4x8(v.efgh));}");
    custom_defines.append("i8buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

    custom_defines.append("psc(x)", "(x==0?p.x:x)");

    if (opt.use_fp16_storage)
@@ -5426,6 +5538,15 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
    {
        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n";
    }
    custom_exts += "struct ivec8 { ivec4 abcd; ivec4 efgh; };\n";
    if (opt.use_int8_storage)
    {
        custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n";
    }
    if (opt.use_int8_arithmetic)
    {
        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n";
    }
 #if ENABLE_VALIDATION_LAYER
    {
        custom_exts += "#extension GL_EXT_debug_printf : require\n";
@@ -5507,11 +5628,22 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
            NCNN_LOGE("%s", s.getInfoLog());
            NCNN_LOGE("%s", s.getInfoDebugLog());

            // for (int i = 0; i < 4; i++)
            // print as line_number: code
            {
                int i = 3;
                std::string s(comp_datas[i], comp_data_sizes[i]);
                NCNN_LOGE("%s", s.c_str());
                const char* p = comp_datas[3];
                const char* line_end;
                int line_number = 1;

                while ((line_end = strchr(p, '\n')) != NULL)
                {
                    NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p);
                    p = line_end + 1;
                }

                if (*p != '\0')
                {
                    NCNN_LOGE("%d:\t%s", line_number, p);
                }
            }

            compile_success = false;
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -465,7 +465,7 @@ public:

    // utility operator
    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
    // cast_type_to   0=auto(same as src)  1=fp32  2=fp16
    // cast_type_to   0=auto(same as src)  1=fp32  2=fp16  3=int32  4=int8
    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const;

    // VK_KHR_bind_memory2
--- a/src/layer/packing.h
+++ b/src/layer/packing.h
@@ -36,6 +36,8 @@ public:
    // 0 = auto
    // 1 = fp32
    // 2 = fp16
    // 3 = int32
    // 4 = int8
    int cast_type_from;
    int cast_type_to;
 };
--- a/src/layer/vulkan/dequantize_vulkan.cpp
+++ b/src/layer/vulkan/dequantize_vulkan.cpp
@@ -0,0 +1,231 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "dequantize_vulkan.h"

 #include "layer_shader_type.h"

 namespace ncnn {

 // Construct the layer with no pipelines; they are built later in create_pipeline().
 Dequantize_vulkan::Dequantize_vulkan()
 {
    // every pipeline slot starts out empty
    pipeline_dequantize_pack8 = 0;
    pipeline_dequantize_pack4 = 0;
    pipeline_dequantize = 0;

    // flag this layer as having a vulkan implementation
    support_vulkan = true;
 }

 int Dequantize_vulkan::create_pipeline(const Option& opt)
 {
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

    const int dims = shape.dims;

    int elempack = 1;
    if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
    if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
    if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

    const size_t elemsize = elempack * 4u;
    size_t out_elemsize;
    if (opt.use_fp16_storage || opt.use_fp16_packed)
    {
        out_elemsize = elempack * 2u;
    }
    else
    {
        out_elemsize = elempack * 4u;
    }

    Mat shape_packed;
    if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
    if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
    if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
    if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

    Mat out_shape_packed;
    if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack);

    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 2)
    {
        c = shape_packed.h;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 3 || dims == 4)
    {
        c = shape_packed.c;
        in_stride = shape_packed.cstep;
        out_stride = out_shape_packed.cstep;
    }

    std::vector<vk_specialization_type> specializations(4 + 3);
    specializations[0].i = scale_data_size;
    specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f;
    specializations[2].i = bias_data_size;
    specializations[3].f = bias_data_size == 1 ? bias_data[0] : 0.f;
    specializations[4 + 0].u32 = c;
    specializations[4 + 1].u32 = in_stride;
    specializations[4 + 2].u32 = out_stride;

    const int local_size_x = vkdev->info.subgroup_size();

    // pack1
    if (shape.dims == 0 || elempack == 1)
    {
        pipeline_dequantize = new Pipeline(vkdev);
        pipeline_dequantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_dequantize->create(LayerShaderType::dequantize, opt, specializations);
    }

    // pack4
    if (shape.dims == 0 || elempack == 4)
    {
        pipeline_dequantize_pack4 = new Pipeline(vkdev);
        pipeline_dequantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_dequantize_pack4->create(LayerShaderType::dequantize_pack4, opt, specializations);
    }

    // pack8
    if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
    {
        pipeline_dequantize_pack8 = new Pipeline(vkdev);
        pipeline_dequantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_dequantize_pack8->create(LayerShaderType::dequantize_pack8, opt, specializations);
    }

    return 0;
 }

 // Release every pipeline created by create_pipeline() and reset the slots.
 // The option argument is unused. Always returns 0.
 int Dequantize_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
    // deleting a null slot is harmless, so no per-slot check is needed
    delete pipeline_dequantize;
    delete pipeline_dequantize_pack4;
    delete pipeline_dequantize_pack8;

    pipeline_dequantize = 0;
    pipeline_dequantize_pack4 = 0;
    pipeline_dequantize_pack8 = 0;

    return 0;
 }

 // Record uploads of the scale and bias tables to the gpu.
 // Only tables with more than one entry are uploaded; a single value is
 // instead passed to the shader as a specialization constant in create_pipeline().
 // Always returns 0.
 int Dequantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
    const bool upload_scale = scale_data_size > 1;
    const bool upload_bias = bias_data_size > 1;

    if (upload_scale)
        cmd.record_upload(scale_data, scale_data_gpu, opt);

    if (upload_bias)
        cmd.record_upload(bias_data, bias_data_gpu, opt);

    return 0;
 }

 // Record the dequantize dispatch for bottom_blob into cmd.
 //
 // top_blob is created with the same dims and elempack as bottom_blob, with
 // 2 bytes per element when fp16 storage or fp16 packed is enabled and
 // 4 bytes per element otherwise.
 //
 // Returns -100 if top_blob is empty after creation, 0 otherwise.
 int Dequantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
    const int dims = bottom_blob.dims;
    const int elempack = bottom_blob.elempack;

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;

    const bool fp16_output = opt.use_fp16_storage || opt.use_fp16_packed;
    const size_t out_elemsize = fp16_output ? elempack * 2u : elempack * 4u;

    // allocate the output with the same geometry as the input
    switch (dims)
    {
    case 1:
        top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    case 2:
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    case 3:
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    case 4:
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    default:
        break;
    }

    if (top_blob.empty())
        return -100;

    // c is the count of scale / bias groups, the strides are the per-group element counts
    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
    }
    else if (dims == 2)
    {
        c = bottom_blob.h;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
    }
    else if (dims == 3 || dims == 4)
    {
        c = bottom_blob.c;
        in_stride = bottom_blob.cstep;
        out_stride = top_blob.cstep;
    }

    // binding order: input, output, scale table, bias table
    std::vector<VkMat> bindings(4);
    bindings[0] = bottom_blob;
    bindings[1] = top_blob;
    bindings[2] = scale_data_gpu;
    bindings[3] = bias_data_gpu;

    std::vector<vk_constant_type> constants(3);
    constants[0].u32 = c;
    constants[1].u32 = in_stride;
    constants[2].u32 = out_stride;

    // flat 1-d dispatch over in_stride * c invocations
    VkMat dispatcher;
    dispatcher.w = in_stride * c;
    dispatcher.h = 1;
    dispatcher.c = 1;

    // pick the shader variant that matches the input packing
    const Pipeline* pipeline = pipeline_dequantize;
    if (elempack == 8)
        pipeline = pipeline_dequantize_pack8;
    else if (elempack == 4)
        pipeline = pipeline_dequantize_pack4;

    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/vulkan/dequantize_vulkan.h
+++ b/src/layer/vulkan/dequantize_vulkan.h
@@ -0,0 +1,46 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_DEQUANTIZE_VULKAN_H
 #define LAYER_DEQUANTIZE_VULKAN_H

 #include "dequantize.h"

 namespace ncnn {

 // Vulkan implementation of the Dequantize layer.
 class Dequantize_vulkan : virtual public Dequantize
 {
 public:
    // resets all pipeline pointers to 0 and sets support_vulkan
    Dequantize_vulkan();

    // builds the pack1 / pack4 / pack8 pipelines needed for the hinted shape
    virtual int create_pipeline(const Option& opt);
    // deletes all pipelines and resets the pointers to 0
    virtual int destroy_pipeline(const Option& opt);

    // uploads scale_data / bias_data when they hold more than one value
    virtual int upload_model(VkTransfer& cmd, const Option& opt);

    using Dequantize::forward;
    // records the dequantize dispatch; returns -100 if top_blob ends up empty
    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

 public:
    // gpu copies of the scale and bias tables, bound as bindings 2 and 3 in forward()
    VkMat scale_data_gpu;
    VkMat bias_data_gpu;

    // one pipeline per input elempack (1, 4, 8); 0 when not created
    Pipeline* pipeline_dequantize;
    Pipeline* pipeline_dequantize_pack4;
    Pipeline* pipeline_dequantize_pack8;
 };

 } // namespace ncnn

 #endif // LAYER_DEQUANTIZE_VULKAN_H
--- a/src/layer/vulkan/packing_vulkan.cpp
+++ b/src/layer/vulkan/packing_vulkan.cpp
@@ -45,6 +45,8 @@ int Packing_vulkan::create_pipeline(const Option& opt)

    const int local_size_x = vkdev->info.subgroup_size();

    bool use_int8_shader = cast_type_from == 4 || cast_type_to == 4;

    std::vector<vk_specialization_type> specializations(2 + 3);
    specializations[0].i = cast_type_from;
    specializations[1].i = cast_type_to;
@@ -91,7 +93,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

        pipeline_packing = new Pipeline(vkdev);
        pipeline_packing->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_packing->create(LayerShaderType::packing, opt, specializations);
        if (use_int8_shader)
        {
            pipeline_packing->create(LayerShaderType::packing_int8, opt, specializations);
        }
        else
        {
            pipeline_packing->create(LayerShaderType::packing, opt, specializations);
        }
    }
    if (shape.dims == 0 || elempack < out_elempack)
    {
@@ -126,7 +135,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

            pipeline_packing_pack1to4 = new Pipeline(vkdev);
            pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_x, 1, 1);
            pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations);
            if (use_int8_shader)
            {
                pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_int8, opt, specializations);
            }
            else
            {
                pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations);
            }
        }

        if (shape.dims == 0 || (elempack == 1 && out_elempack == 8))
@@ -138,7 +154,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

            pipeline_packing_pack1to8 = new Pipeline(vkdev);
            pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_x, 1, 1);
            pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations);
            if (use_int8_shader)
            {
                pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_int8, opt, specializations);
            }
            else
            {
                pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations);
            }
        }

        if (shape.dims == 0 || (elempack == 4 && out_elempack == 8))
@@ -150,7 +173,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

            pipeline_packing_pack4to8 = new Pipeline(vkdev);
            pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_x, 1, 1);
            pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations);
            if (use_int8_shader)
            {
                pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_int8, opt, specializations);
            }
            else
            {
                pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations);
            }
        }
    }
    if (shape.dims == 0 || elempack > out_elempack)
@@ -186,7 +216,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

            pipeline_packing_pack4to1 = new Pipeline(vkdev);
            pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_x, 1, 1);
            pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations);
            if (use_int8_shader)
            {
                pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_int8, opt, specializations);
            }
            else
            {
                pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations);
            }
        }

        if (shape.dims == 0 || (elempack == 8 && out_elempack == 1))
@@ -198,7 +235,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

            pipeline_packing_pack8to1 = new Pipeline(vkdev);
            pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_x, 1, 1);
            pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations);
            if (use_int8_shader)
            {
                pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_int8, opt, specializations);
            }
            else
            {
                pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations);
            }
        }

        if (shape.dims == 0 || (elempack == 8 && out_elempack == 4))
@@ -210,7 +254,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)

            pipeline_packing_pack8to4 = new Pipeline(vkdev);
            pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_x, 1, 1);
            pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations);
            if (use_int8_shader)
            {
                pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_int8, opt, specializations);
            }
            else
            {
                pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations);
            }
        }
    }

@@ -296,10 +347,14 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    {
        out_elemsize = out_elempack * 4u;
    }
    else // if (cast_type_to == 2)
    else if (cast_type_to == 2)
    {
        out_elemsize = out_elempack * 2u;
    }
    else // if (cast_type_to == 3)
    {
        out_elemsize = out_elempack * 1u;
    }

    if (dims == 1)
    {
--- a/src/layer/vulkan/quantize_vulkan.cpp
+++ b/src/layer/vulkan/quantize_vulkan.cpp
@@ -0,0 +1,215 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "quantize_vulkan.h"

 #include "layer_shader_type.h"

 namespace ncnn {

 // Construct the layer with no pipelines; they are built later in create_pipeline().
 Quantize_vulkan::Quantize_vulkan()
 {
    // every pipeline slot starts out empty
    pipeline_quantize_pack8 = 0;
    pipeline_quantize_pack4 = 0;
    pipeline_quantize = 0;

    // flag this layer as having a vulkan implementation
    support_vulkan = true;
 }

 int Quantize_vulkan::create_pipeline(const Option& opt)
 {
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

    const int dims = shape.dims;

    int elempack = 0;
    if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
    if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
    if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

    size_t elemsize;
    const size_t out_elemsize = elempack * 1u;
    if (opt.use_fp16_storage || opt.use_fp16_packed)
    {
        elemsize = elempack * 2u;
    }
    else
    {
        elemsize = elempack * 4u;
    }

    Mat shape_packed;
    if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
    if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
    if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
    if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

    Mat out_shape_packed;
    if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
    if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack);

    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 2)
    {
        c = shape_packed.h;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 3 || dims == 4)
    {
        c = shape_packed.c;
        in_stride = shape_packed.cstep;
        out_stride = out_shape_packed.cstep;
    }

    std::vector<vk_specialization_type> specializations(2 + 3);
    specializations[0].i = scale_data_size;
    specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f;
    specializations[2 + 0].u32 = c;
    specializations[2 + 1].u32 = in_stride;
    specializations[2 + 2].u32 = out_stride;

    const int local_size_x = vkdev->info.subgroup_size();

    // pack1
    if (shape.dims == 0 || elempack == 1)
    {
        pipeline_quantize = new Pipeline(vkdev);
        pipeline_quantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_quantize->create(LayerShaderType::quantize, opt, specializations);
    }

    // pack4
    if (shape.dims == 0 || elempack == 4)
    {
        pipeline_quantize_pack4 = new Pipeline(vkdev);
        pipeline_quantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_quantize_pack4->create(LayerShaderType::quantize_pack4, opt, specializations);
    }

    // pack8
    if (shape.dims == 0 || elempack == 8)
    {
        pipeline_quantize_pack8 = new Pipeline(vkdev);
        pipeline_quantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_quantize_pack8->create(LayerShaderType::quantize_pack8, opt, specializations);
    }

    return 0;
 }

 // Release every pipeline created by create_pipeline() and reset the slots.
 // The option argument is unused. Always returns 0.
 int Quantize_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
    // deleting a null slot is harmless, so no per-slot check is needed
    delete pipeline_quantize;
    delete pipeline_quantize_pack4;
    delete pipeline_quantize_pack8;

    pipeline_quantize = 0;
    pipeline_quantize_pack4 = 0;
    pipeline_quantize_pack8 = 0;

    return 0;
 }

 // Record the upload of the scale table to the gpu.
 // The table is only uploaded when it has more than one entry; a single scale
 // is instead passed to the shader as a specialization constant in create_pipeline().
 // Always returns 0.
 int Quantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
    const bool upload_scale = scale_data_size > 1;

    if (upload_scale)
        cmd.record_upload(scale_data, scale_data_gpu, opt);

    return 0;
 }

 // Record the quantize dispatch for bottom_blob into cmd.
 //
 // top_blob is created with the same dims and elempack as bottom_blob and
 // one byte per element.
 //
 // Returns -100 if top_blob is empty after creation, 0 otherwise.
 int Quantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
    const int dims = bottom_blob.dims;
    const int elempack = bottom_blob.elempack;

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;

    const size_t out_elemsize = 1u * elempack;

    // allocate the output with the same geometry as the input
    switch (dims)
    {
    case 1:
        top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    case 2:
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    case 3:
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    case 4:
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
        break;
    default:
        break;
    }

    if (top_blob.empty())
        return -100;

    // c is the count of scale groups, the strides are the per-group element counts
    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
    }
    else if (dims == 2)
    {
        c = bottom_blob.h;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
    }
    else if (dims == 3 || dims == 4)
    {
        c = bottom_blob.c;
        in_stride = bottom_blob.cstep;
        out_stride = top_blob.cstep;
    }

    // binding order: input, output, scale table
    std::vector<VkMat> bindings(3);
    bindings[0] = bottom_blob;
    bindings[1] = top_blob;
    bindings[2] = scale_data_gpu;

    std::vector<vk_constant_type> constants(3);
    constants[0].u32 = c;
    constants[1].u32 = in_stride;
    constants[2].u32 = out_stride;

    // flat 1-d dispatch over in_stride * c invocations
    VkMat dispatcher;
    dispatcher.w = in_stride * c;
    dispatcher.h = 1;
    dispatcher.c = 1;

    // pick the shader variant that matches the input packing
    const Pipeline* pipeline = pipeline_quantize;
    if (elempack == 8)
        pipeline = pipeline_quantize_pack8;
    else if (elempack == 4)
        pipeline = pipeline_quantize_pack4;

    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/vulkan/quantize_vulkan.h
+++ b/src/layer/vulkan/quantize_vulkan.h
@@ -0,0 +1,45 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_QUANTIZE_VULKAN_H
 #define LAYER_QUANTIZE_VULKAN_H

 #include "quantize.h"

 namespace ncnn {

 // Vulkan implementation of the Quantize layer.
 class Quantize_vulkan : virtual public Quantize
 {
 public:
    Quantize_vulkan();

    // Pipeline lifecycle hooks.
    // NOTE(review): presumably these create and release the three pipelines declared below - confirm in quantize_vulkan.cpp.
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // NOTE(review): presumably fills scale_data_gpu - confirm in quantize_vulkan.cpp.
    virtual int upload_model(VkTransfer& cmd, const Option& opt);

    // keep the forward overloads inherited from Quantize visible next to the VkMat overload
    using Quantize::forward;
    // Allocates top_blob and records the quantize dispatch on cmd; returns 0, or -100 if top_blob ends up empty.
    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

 public:
    // bound as binding 2 of the quantize shader in forward()
    VkMat scale_data_gpu;

    // forward() picks one of these by the input elempack: pack8 for 8, pack4 for 4, the plain one otherwise
    Pipeline* pipeline_quantize;
    Pipeline* pipeline_quantize_pack4;
    Pipeline* pipeline_quantize_pack8;
 };

 } // namespace ncnn

 #endif // LAYER_QUANTIZE_VULKAN_H
--- a/src/layer/vulkan/requantize_vulkan.cpp
+++ b/src/layer/vulkan/requantize_vulkan.cpp
@@ -0,0 +1,231 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "requantize_vulkan.h"

 #include "layer_shader_type.h"

 namespace ncnn {

 // Marks the layer as Vulkan-capable and starts with no pipelines allocated.
 Requantize_vulkan::Requantize_vulkan()
 {
    // pipelines are only allocated later, by create_pipeline()
    pipeline_requantize = pipeline_requantize_pack4 = pipeline_requantize_pack8 = 0;

    support_vulkan = true;
 }

 int Requantize_vulkan::create_pipeline(const Option& opt)
 {
    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];

    const int dims = shape.dims;

    int elempack = 1;
    if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
    if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
    if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;

    int out_elempack = 1;
    if (dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1;
    if (dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1;
    if (dims == 3 || dims == 4) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1;

    const size_t elemsize = elempack * 4u;
    const size_t out_elemsize = out_elempack * 1u;

    Mat shape_packed;
    if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
    if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
    if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
    if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);

    Mat out_shape_packed;
    if (dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
    if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
    if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
    if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);

    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 2)
    {
        c = shape_packed.h;
        in_stride = shape_packed.w;
        out_stride = out_shape_packed.w;
    }
    if (dims == 3 || dims == 4)
    {
        c = shape_packed.c;
        in_stride = shape_packed.cstep;
        out_stride = out_shape_packed.cstep;
    }

    std::vector<vk_specialization_type> specializations(9 + 3);
    specializations[0].i = scale_in_data_size;
    specializations[1].f = scale_in_data_size == 1 ? scale_in_data[0] : 1.f;
    specializations[2].i = scale_out_data_size;
    specializations[3].f = scale_out_data_size == 1 ? scale_out_data[0] : 1.f;
    specializations[4].i = bias_data_size;
    specializations[5].f = bias_data_size == 1 ? bias_data[0] : 0.f;
    specializations[6].i = activation_type;
    specializations[7].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
    specializations[8].f = activation_params.w == 2 ? activation_params[1] : 0.f;
    specializations[9 + 0].u32 = c;
    specializations[9 + 1].u32 = in_stride;
    specializations[9 + 2].u32 = out_stride;

    const int local_size_x = vkdev->info.subgroup_size();

    // pack1
    if (shape.dims == 0 || elempack == 1)
    {
        pipeline_requantize = new Pipeline(vkdev);
        pipeline_requantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_requantize->create(LayerShaderType::requantize, opt, specializations);
    }

    // pack4
    if (shape.dims == 0 || elempack == 4)
    {
        pipeline_requantize_pack4 = new Pipeline(vkdev);
        pipeline_requantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_requantize_pack4->create(LayerShaderType::requantize_pack4, opt, specializations);
    }

    // pack8
    if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
    {
        pipeline_requantize_pack8 = new Pipeline(vkdev);
        pipeline_requantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
        pipeline_requantize_pack8->create(LayerShaderType::requantize_pack8, opt, specializations);
    }

    return 0;
 }

 int Requantize_vulkan::destroy_pipeline(const Option& /*opt*/)
 {
    delete pipeline_requantize;
    pipeline_requantize = 0;

    delete pipeline_requantize_pack4;
    pipeline_requantize_pack4 = 0;

    delete pipeline_requantize_pack8;
    pipeline_requantize_pack8 = 0;

    return 0;
 }

 // Records uploads for the scale-in, scale-out and bias arrays that hold more than one
 // value. Arrays of size 1 are not uploaded here; create_pipeline() passes that single
 // value as a specialization constant instead. Always returns 0.
 int Requantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
 {
    const bool has_scale_in_array = scale_in_data_size > 1;
    const bool has_scale_out_array = scale_out_data_size > 1;
    const bool has_bias_array = bias_data_size > 1;

    if (has_scale_in_array)
        cmd.record_upload(scale_in_data, scale_in_data_gpu, opt);

    if (has_scale_out_array)
        cmd.record_upload(scale_out_data, scale_out_data_gpu, opt);

    if (has_bias_array)
        cmd.record_upload(bias_data, bias_data_gpu, opt);

    return 0;
 }

 int Requantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
 {
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    size_t out_elemsize = 1u * elempack;

    if (dims == 1)
        top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
    if (dims == 2)
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
    if (dims == 3)
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
    if (dims == 4)
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

    size_t c = 0;
    size_t in_stride = 0;
    size_t out_stride = 0;
    if (dims == 1)
    {
        c = 1;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
    }
    if (dims == 2)
    {
        c = bottom_blob.h;
        in_stride = bottom_blob.w;
        out_stride = top_blob.w;
    }
    if (dims == 3 || dims == 4)
    {
        c = bottom_blob.c;
        in_stride = bottom_blob.cstep;
        out_stride = top_blob.cstep;
    }

    std::vector<VkMat> bindings(5);
    bindings[0] = bottom_blob;
    bindings[1] = top_blob;
    bindings[2] = scale_in_data_gpu;
    bindings[3] = scale_out_data_gpu;
    bindings[4] = bias_data_gpu;

    std::vector<vk_constant_type> constants(3);
    constants[0].u32 = c;
    constants[1].u32 = in_stride;
    constants[2].u32 = out_stride;

    VkMat dispatcher;
    dispatcher.w = in_stride * c;
    dispatcher.h = 1;
    dispatcher.c = 1;

    const Pipeline* pipeline = elempack == 8 ? pipeline_requantize_pack8
                               : elempack == 4 ? pipeline_requantize_pack4
                               : pipeline_requantize;

    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/vulkan/requantize_vulkan.h
+++ b/src/layer/vulkan/requantize_vulkan.h
@@ -0,0 +1,47 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_REQUANTIZE_VULKAN_H
 #define LAYER_REQUANTIZE_VULKAN_H

 #include "requantize.h"

 namespace ncnn {

 // Vulkan implementation of the Requantize layer.
 class Requantize_vulkan : virtual public Requantize
 {
 public:
    // sets support_vulkan and nulls the pipeline pointers
    Requantize_vulkan();

    // create_pipeline() builds the pipeline(s) for the expected packing; destroy_pipeline() deletes them
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // records uploads of the scale / bias arrays that hold more than one value
    virtual int upload_model(VkTransfer& cmd, const Option& opt);

    // keep the forward overloads inherited from Requantize visible next to the VkMat overload
    using Requantize::forward;
    // Allocates top_blob and records the requantize dispatch on cmd; returns 0, or -100 if top_blob ends up empty.
    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

 public:
    // bound as shader bindings 2, 3 and 4 in forward(); each is only uploaded when its array size is > 1
    VkMat scale_in_data_gpu;
    VkMat scale_out_data_gpu;
    VkMat bias_data_gpu;

    // forward() picks one of these by the input elempack: pack8 for 8, pack4 for 4, the plain one otherwise
    Pipeline* pipeline_requantize;
    Pipeline* pipeline_requantize_pack4;
    Pipeline* pipeline_requantize_pack8;
 };

 } // namespace ncnn

 #endif // LAYER_REQUANTIZE_VULKAN_H
--- a/src/layer/vulkan/shader/dequantize.comp
+++ b/src/layer/vulkan/shader/dequantize.comp
@@ -0,0 +1,80 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Scalar dequantize: every invocation turns one int element into a floating-point
 // value computed as value * scale + bias.

 // scale_data_size == 1 uses scale_value; any other size reads scale_blob per row.
 layout (constant_id = 0) const int scale_data_size = 0;
 layout (constant_id = 1) const float scale_value = 1.f;
 // bias_data_size == 0 means no bias, == 1 uses bias_value, otherwise bias_blob is read per row.
 layout (constant_id = 2) const int bias_data_size = 0;
 layout (constant_id = 3) const float bias_value = 0.f;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };
 layout (binding = 3) readonly buffer bias_blob { sfp bias_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 void main()
 {
    const uint idx = gl_GlobalInvocationID.x;

    // drop invocations beyond the in_stride * c input slots
    const uint total = psc(in_stride) * psc(c);
    if (idx >= total)
        return;

    // row picks the scale / bias entry, col is the offset inside that row
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);
    const uint out_idx = row * psc(out_stride) + col;

    int q = bottom_blob_data[idx];

    afp scale = afp(scale_value);
    if (scale_data_size != 1)
    {
        scale = buffer_ld1(scale_blob_data, row);
    }

    afp bias = afp(0.f);
    if (bias_data_size == 1)
    {
        bias = afp(bias_value);
    }
    else if (bias_data_size != 0)
    {
        bias = buffer_ld1(bias_blob_data, row);
    }

    afp dequantized = afp(q) * scale + bias;

    buffer_st1(top_blob_data, out_idx, dequantized);
 }
--- a/src/layer/vulkan/shader/dequantize_pack4.comp
+++ b/src/layer/vulkan/shader/dequantize_pack4.comp
@@ -0,0 +1,80 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack4 dequantize: every invocation turns one ivec4 element into four floating-point
 // lanes computed as value * scale + bias.

 // scale_data_size == 1 broadcasts scale_value; any other size reads scale_blob per row.
 layout (constant_id = 0) const int scale_data_size = 0;
 layout (constant_id = 1) const float scale_value = 1.f;
 // bias_data_size == 0 means no bias, == 1 broadcasts bias_value, otherwise bias_blob is read per row.
 layout (constant_id = 2) const int bias_data_size = 0;
 layout (constant_id = 3) const float bias_value = 0.f;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };
 layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 void main()
 {
    const uint idx = gl_GlobalInvocationID.x;

    // drop invocations beyond the in_stride * c input slots
    const uint total = psc(in_stride) * psc(c);
    if (idx >= total)
        return;

    // row picks the scale / bias entry, col is the offset inside that row
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);
    const uint out_idx = row * psc(out_stride) + col;

    ivec4 q = bottom_blob_data[idx];

    afpvec4 scale = afpvec4(scale_value);
    if (scale_data_size != 1)
    {
        scale = buffer_ld4(scale_blob_data, row);
    }

    afpvec4 bias = afpvec4(0.f);
    if (bias_data_size == 1)
    {
        bias = afpvec4(bias_value);
    }
    else if (bias_data_size != 0)
    {
        bias = buffer_ld4(bias_blob_data, row);
    }

    afpvec4 dequantized = afpvec4(q) * scale + bias;

    buffer_st4(top_blob_data, out_idx, dequantized);
 }
--- a/src/layer/vulkan/shader/dequantize_pack8.comp
+++ b/src/layer/vulkan/shader/dequantize_pack8.comp
@@ -0,0 +1,84 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack8 dequantize: every invocation turns one ivec8 element into eight floating-point
 // lanes, handled as two groups of four, each computed as value * scale + bias.

 // scale_data_size == 1 broadcasts scale_value; any other size reads scale_blob per row.
 layout (constant_id = 0) const int scale_data_size = 0;
 layout (constant_id = 1) const float scale_value = 1.f;
 // bias_data_size == 0 means no bias, == 1 broadcasts bias_value, otherwise bias_blob is read per row.
 layout (constant_id = 2) const int bias_data_size = 0;
 layout (constant_id = 3) const float bias_value = 0.f;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };
 layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 void main()
 {
    const uint idx = gl_GlobalInvocationID.x;

    // drop invocations beyond the in_stride * c input slots
    const uint total = psc(in_stride) * psc(c);
    if (idx >= total)
        return;

    // row picks the scale / bias entry, col is the offset inside that row
    const uint row = idx / psc(in_stride);
    const uint col = idx % psc(in_stride);
    const uint out_idx = row * psc(out_stride) + col;

    ivec8 q = bottom_blob_data[idx];

    afpvec8 scale;
    if (scale_data_size != 1)
    {
        scale = buffer_ld8(scale_blob_data, row);
    }
    else
    {
        scale = afpvec8(afpvec4(scale_value), afpvec4(scale_value));
    }

    afpvec8 bias;
    if (bias_data_size == 1)
    {
        bias[0] = afpvec4(bias_value);
        bias[1] = afpvec4(bias_value);
    }
    else if (bias_data_size == 0)
    {
        bias[0] = afpvec4(0.f);
        bias[1] = afpvec4(0.f);
    }
    else
    {
        bias = buffer_ld8(bias_blob_data, row);
    }

    // lanes 0-3 come from q.abcd, lanes 4-7 from q.efgh
    afpvec8 dequantized;
    dequantized[0] = afpvec4(q.abcd) * scale[0] + bias[0];
    dequantized[1] = afpvec4(q.efgh) * scale[1] + bias[1];

    buffer_st8(top_blob_data, out_idx, dequantized);
 }
--- a/src/layer/vulkan/shader/packing_int8.comp
+++ b/src/layer/vulkan/shader/packing_int8.comp
@@ -0,0 +1,73 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack4 -> pack4 copy / cast between the int8 and int32 buffers (no repacking).

 // cast_type_from == 3 reads the int32 input buffer, any other value reads the int8 one;
 // cast_type_to == 3 writes the int32 output buffer, any other value writes the int8 one.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // only n x c invocations carry work
    if (row >= psc(c) || col >= psc(n))
        return;

    // element index using n as the row pitch
    const uint idx_n = row * psc(n) + col;

    if (cast_type_from == cast_type_to)
    {
        // same type on both sides: copy between the int8 buffers at the same index
        i8buffer_cp4(top_blob_data, idx_n, bottom_blob_data, idx_n);
        return;
    }

    // element index using stride as the row pitch; used for the int8 buffers when casting
    const uint idx_stride = row * psc(stride) + col;

    ivec4 v;
    if (cast_type_from != 3)
    {
        v = i8buffer_ld4(bottom_blob_data, idx_stride);
    }
    else
    {
        v = bottom_blob_int32_data[idx_n];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, idx_stride, v);
    }
    else
    {
        top_blob_int32_data[idx_n] = v;
    }
 }
--- a/src/layer/vulkan/shader/packing_pack1to4_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
@@ -0,0 +1,79 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack1 -> pack4 repacking: each invocation gathers four scalars, taken from four
 // consecutive stride-sized rows, into one 4-lane element.

 // cast_type_from == 3 reads the int32 input buffer, any other value reads the int8 one;
 // cast_type_to == 3 writes the int32 output buffer, any other value writes the int8 one.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // only n x c invocations carry work
    if (row >= psc(c) || col >= psc(n))
        return;

    // destination: one packed element, rows n elements apart
    const uint dst_idx = row * psc(n) + col;

    // sources: the same column in scalar rows 4*row .. 4*row+3, rows stride elements apart
    const uvec4 src_idx = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col;

 //     if (cast_type_from == cast_type_to)
 //     {
 //         i8buffer_cp1to4(top_blob_data, dst_idx, bottom_blob_data, src_idx);
 //         return;
 //     }

    ivec4 v;
    if (cast_type_from != 3)
    {
        v.r = i8buffer_ld1(bottom_blob_data, src_idx.r);
        v.g = i8buffer_ld1(bottom_blob_data, src_idx.g);
        v.b = i8buffer_ld1(bottom_blob_data, src_idx.b);
        v.a = i8buffer_ld1(bottom_blob_data, src_idx.a);
    }
    else
    {
        v.r = bottom_blob_int32_data[src_idx.r];
        v.g = bottom_blob_int32_data[src_idx.g];
        v.b = bottom_blob_int32_data[src_idx.b];
        v.a = bottom_blob_int32_data[src_idx.a];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, dst_idx, v);
    }
    else
    {
        top_blob_int32_data[dst_idx] = v;
    }
 }
--- a/src/layer/vulkan/shader/packing_pack1to8_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack1to8_int8.comp
@@ -0,0 +1,88 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack1 -> pack8 repacking: each invocation gathers eight scalars, taken from eight
 // consecutive stride-sized rows, into one 8-lane element.

 // cast_type_from == 3 reads the int32 input buffer, any other value reads the int8 one;
 // cast_type_to == 3 writes the int32 output buffer, any other value writes the int8 one.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };

 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // only n x c invocations carry work
    if (row >= psc(c) || col >= psc(n))
        return;

    // destination: one packed element, rows n elements apart
    const uint dst_idx = row * psc(n) + col;

    // sources: scalar rows 8*row .. 8*row+3, then the next four rows, stride elements apart
    const uvec4 src_lo = (row * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + col;
    const uvec4 src_hi = src_lo + psc(stride) * 4;

 //     if (cast_type_from == cast_type_to)
 //     {
 //         i8buffer_cp1to8(top_blob_data, dst_idx, bottom_blob_data, src_lo, src_hi);
 //         return;
 //     }

    ivec8 v;
    if (cast_type_from != 3)
    {
        v.abcd.r = i8buffer_ld1(bottom_blob_data, src_lo.r);
        v.abcd.g = i8buffer_ld1(bottom_blob_data, src_lo.g);
        v.abcd.b = i8buffer_ld1(bottom_blob_data, src_lo.b);
        v.abcd.a = i8buffer_ld1(bottom_blob_data, src_lo.a);
        v.efgh.r = i8buffer_ld1(bottom_blob_data, src_hi.r);
        v.efgh.g = i8buffer_ld1(bottom_blob_data, src_hi.g);
        v.efgh.b = i8buffer_ld1(bottom_blob_data, src_hi.b);
        v.efgh.a = i8buffer_ld1(bottom_blob_data, src_hi.a);
    }
    else
    {
        v.abcd.r = bottom_blob_int32_data[src_lo.r];
        v.abcd.g = bottom_blob_int32_data[src_lo.g];
        v.abcd.b = bottom_blob_int32_data[src_lo.b];
        v.abcd.a = bottom_blob_int32_data[src_lo.a];
        v.efgh.r = bottom_blob_int32_data[src_hi.r];
        v.efgh.g = bottom_blob_int32_data[src_hi.g];
        v.efgh.b = bottom_blob_int32_data[src_hi.b];
        v.efgh.a = bottom_blob_int32_data[src_hi.a];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st8(top_blob_data, dst_idx, v);
    }
    else
    {
        top_blob_int32_data[dst_idx] = v;
    }
 }
--- a/src/layer/vulkan/shader/packing_pack4to1_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
@@ -0,0 +1,79 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack4 -> pack1 repacking: each invocation scatters the four lanes of one element
 // into four consecutive stride-sized scalar rows.

 // cast_type_from == 3 reads the int32 input buffer, any other value reads the int8 one;
 // cast_type_to == 3 writes the int32 output buffer, any other value writes the int8 one.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; };

 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // only n x c invocations carry work
    if (row >= psc(c) || col >= psc(n))
        return;

    // destinations: the same column in scalar rows 4*row .. 4*row+3, rows stride elements apart
    const uvec4 dst_idx = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col;

    // source: one packed element, rows n elements apart
    const uint src_idx = row * psc(n) + col;

 //     if (cast_type_from == cast_type_to)
 //     {
 //         buffer_cp4to1(top_blob_data, dst_idx, bottom_blob_data, src_idx);
 //         return;
 //     }

    ivec4 v;
    if (cast_type_from != 3)
    {
        v = i8buffer_ld4(bottom_blob_data, src_idx);
    }
    else
    {
        v = bottom_blob_int32_data[src_idx];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st1(top_blob_data, dst_idx.r, v.r);
        i8buffer_st1(top_blob_data, dst_idx.g, v.g);
        i8buffer_st1(top_blob_data, dst_idx.b, v.b);
        i8buffer_st1(top_blob_data, dst_idx.a, v.a);
    }
    else
    {
        top_blob_int32_data[dst_idx.r] = v.r;
        top_blob_int32_data[dst_idx.g] = v.g;
        top_blob_int32_data[dst_idx.b] = v.b;
        top_blob_int32_data[dst_idx.a] = v.a;
    }
 }
--- a/src/layer/vulkan/shader/packing_pack4to8_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack4to8_int8.comp
@@ -0,0 +1,75 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Pack4 -> pack8 repacking: each invocation joins two 4-lane elements, taken from two
 // consecutive stride-sized rows, into one 8-lane element.

 // cast_type_from == 3 reads the int32 input buffer, any other value reads the int8 one;
 // cast_type_to == 3 writes the int32 output buffer, any other value writes the int8 one.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants; the push-constant block below carries the same three fields.
 // NOTE(review): psc() is defined outside this file and presumably selects between the two - confirm.
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; };

 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // only n x c invocations carry work
    if (row >= psc(c) || col >= psc(n))
        return;

    // destination: one packed element, rows n elements apart
    const uint dst_idx = row * psc(n) + col;

    // sources: the same column in pack4 rows 2*row and 2*row+1, rows stride elements apart
    const uvec2 src_idx = (row * 2 + uvec2(0, 1)) * psc(stride) + col;

 //     if (cast_type_from == cast_type_to)
 //     {
 //         buffer_cp4to8(top_blob_data, dst_idx, bottom_blob_data, src_idx);
 //         return;
 //     }

    ivec8 v;
    if (cast_type_from != 3)
    {
        v.abcd = i8buffer_ld4(bottom_blob_data, src_idx.r);
        v.efgh = i8buffer_ld4(bottom_blob_data, src_idx.g);
    }
    else
    {
        v.abcd = bottom_blob_int32_data[src_idx.r];
        v.efgh = bottom_blob_int32_data[src_idx.g];
    }

    if (cast_type_to != 3)
    {
        i8buffer_st8(top_blob_data, dst_idx, v);
    }
    else
    {
        top_blob_int32_data[dst_idx] = v;
    }
 }
--- a/src/layer/vulkan/shader/packing_pack8to1_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack8to1_int8.comp
@@ -0,0 +1,88 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Element-type selectors. main() only compares them against 3: a value of 3
 // routes that side through the int32 buffers, any other value through the
 // int8 buffers.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   n      : upper bound for gl_GlobalInvocationID.x and input row length
 //   c      : upper bound for gl_GlobalInvocationID.y (number of input rows)
 //   stride : element distance between consecutive output rows
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 // Bindings 0/1 are the int8 and int32 views of the 8-lane input, bindings 2/3
 // the int8 and int32 views of the scalar output. main() touches exactly one
 // input view and one output view, chosen by cast_type_from / cast_type_to.
 layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 // Splits one 8-lane input element into eight scalars, each written to its own
 // output row. One invocation handles one input element at (x, y).
 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // invocations outside the psc(n) x psc(c) grid have nothing to do
    if (col >= psc(n))
        return;
    if (row >= psc(c))
        return;

    // destination rows row*8 .. row*8+3 (lo) and row*8+4 .. row*8+7 (hi),
    // psc(stride) elements apart
    const uvec4 dst_lo = (row * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + col;
    const uvec4 dst_hi = dst_lo + psc(stride) * 4;

    // flat index of the 8-lane source element
    const uint src = row * psc(n) + col;

    // NOTE: the direct-copy shortcut for cast_type_from == cast_type_to is not enabled.

    // fetch all eight lanes as 32-bit integers, whatever the input view is
    ivec8 lanes;
    if (cast_type_from != 3)
    {
        lanes = i8buffer_ld8(bottom_blob_data, src);
    }
    else
    {
        lanes = bottom_blob_int32_data[src];
    }

    // scatter lane k to destination row row*8+k through the selected output view
    if (cast_type_to != 3)
    {
        i8buffer_st1(top_blob_data, dst_lo.x, lanes.abcd.x);
        i8buffer_st1(top_blob_data, dst_lo.y, lanes.abcd.y);
        i8buffer_st1(top_blob_data, dst_lo.z, lanes.abcd.z);
        i8buffer_st1(top_blob_data, dst_lo.w, lanes.abcd.w);
        i8buffer_st1(top_blob_data, dst_hi.x, lanes.efgh.x);
        i8buffer_st1(top_blob_data, dst_hi.y, lanes.efgh.y);
        i8buffer_st1(top_blob_data, dst_hi.z, lanes.efgh.z);
        i8buffer_st1(top_blob_data, dst_hi.w, lanes.efgh.w);
    }
    else
    {
        top_blob_int32_data[dst_lo.x] = lanes.abcd.x;
        top_blob_int32_data[dst_lo.y] = lanes.abcd.y;
        top_blob_int32_data[dst_lo.z] = lanes.abcd.z;
        top_blob_int32_data[dst_lo.w] = lanes.abcd.w;
        top_blob_int32_data[dst_hi.x] = lanes.efgh.x;
        top_blob_int32_data[dst_hi.y] = lanes.efgh.y;
        top_blob_int32_data[dst_hi.z] = lanes.efgh.z;
        top_blob_int32_data[dst_hi.w] = lanes.efgh.w;
    }
 }
--- a/src/layer/vulkan/shader/packing_pack8to4_int8.comp
+++ b/src/layer/vulkan/shader/packing_pack8to4_int8.comp
@@ -0,0 +1,75 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Element-type selectors. main() only compares them against 3: a value of 3
 // routes that side through the int32 buffers, any other value through the
 // int8 buffers.
 layout (constant_id = 0) const int cast_type_from = 0;
 layout (constant_id = 1) const int cast_type_to = 1;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   n      : upper bound for gl_GlobalInvocationID.x and input row length
 //   c      : upper bound for gl_GlobalInvocationID.y (number of input rows)
 //   stride : element distance between consecutive output rows
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;

 // Bindings 0/1 are the int8 and int32 views of the 8-lane input, bindings 2/3
 // the int8 and int32 views of the 4-lane output. main() touches exactly one
 // input view and one output view, chosen by cast_type_from / cast_type_to.
 layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; };
 layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; };
 layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
 layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint n;
    uint c;
    uint stride;
 } p;

 // Splits one 8-lane input element into two 4-lane elements written to two
 // consecutive output rows. One invocation handles one input element at (x, y).
 void main()
 {
    const uint col = gl_GlobalInvocationID.x;
    const uint row = gl_GlobalInvocationID.y;

    // invocations outside the psc(n) x psc(c) grid have nothing to do
    if (col >= psc(n))
        return;
    if (row >= psc(c))
        return;

    // destination rows 2*row and 2*row+1, psc(stride) elements apart
    const uvec2 dst2 = (row * 2 + uvec2(0, 1)) * psc(stride) + col;

    // flat index of the 8-lane source element
    const uint src = row * psc(n) + col;

    // NOTE: the direct-copy shortcut for cast_type_from == cast_type_to is not enabled.

    // fetch all eight lanes as 32-bit integers, whatever the input view is
    ivec8 lanes;
    if (cast_type_from != 3)
    {
        lanes = i8buffer_ld8(bottom_blob_data, src);
    }
    else
    {
        lanes = bottom_blob_int32_data[src];
    }

    // lower half goes to the first row, upper half to the second
    if (cast_type_to != 3)
    {
        i8buffer_st4(top_blob_data, dst2.x, lanes.abcd);
        i8buffer_st4(top_blob_data, dst2.y, lanes.efgh);
    }
    else
    {
        top_blob_int32_data[dst2.x] = lanes.abcd;
        top_blob_int32_data[dst2.y] = lanes.efgh;
    }
 }
--- a/src/layer/vulkan/shader/quantize.comp
+++ b/src/layer/vulkan/shader/quantize.comp
@@ -0,0 +1,63 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Scale selection: when scale_data_size is 1, main() applies scale_value to
 // every element; for any other value it reads one scale per row from
 // scale_blob.
 layout (constant_id = 0) const int scale_data_size = 0;
 layout (constant_id = 1) const float scale_value = 1.f;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   c          : number of rows; in_stride * c elements are processed in total
 //   in_stride  : row length used to split the flat invocation index
 //   out_stride : element distance between consecutive output rows
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 // binding 0: floating-point input, binding 1: int8 output,
 // binding 2: per-row scales (only read when scale_data_size != 1)
 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 // Quantizes one scalar per invocation: multiply by the scale, clamp to
 // [-127, 127], round, and store through i8buffer_st1.
 void main()
 {
    const uint src = gl_GlobalInvocationID.x;

    // the dispatch is flat; anything past the last input element is ignored
    if (src >= psc(in_stride) * psc(c))
        return;

    // recover (row, col) so the output can use its own row stride
    const uint row = src / psc(in_stride);
    const uint col = src % psc(in_stride);
    const uint dst = row * psc(out_stride) + col;

    // a single shared scale, or one scale per row
    afp factor;
    if (scale_data_size != 1)
    {
        factor = buffer_ld1(scale_blob_data, row);
    }
    else
    {
        factor = afp(scale_value);
    }

    afp x = buffer_ld1(bottom_blob_data, src);

    int q = int(round(clamp(x * factor, afp(-127.f), afp(127.f))));

    i8buffer_st1(top_blob_data, dst, q);
 }
--- a/src/layer/vulkan/shader/quantize_pack4.comp
+++ b/src/layer/vulkan/shader/quantize_pack4.comp
@@ -0,0 +1,63 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Scale selection: when scale_data_size is 1, main() applies scale_value to
 // every lane; for any other value it reads one 4-lane scale per row from
 // scale_blob.
 layout (constant_id = 0) const int scale_data_size = 0;
 layout (constant_id = 1) const float scale_value = 1.f;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   c          : number of rows; in_stride * c elements are processed in total
 //   in_stride  : row length used to split the flat invocation index
 //   out_stride : element distance between consecutive output rows
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 // binding 0: 4-lane floating-point input, binding 1: 4-lane int8 output,
 // binding 2: per-row 4-lane scales (only read when scale_data_size != 1)
 layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 // Quantizes one 4-lane element per invocation: multiply by the scale, clamp
 // to [-127, 127], round, and store through i8buffer_st4.
 void main()
 {
    const uint src = gl_GlobalInvocationID.x;

    // the dispatch is flat; anything past the last input element is ignored
    if (src >= psc(in_stride) * psc(c))
        return;

    // recover (row, col) so the output can use its own row stride
    const uint row = src / psc(in_stride);
    const uint col = src % psc(in_stride);
    const uint dst = row * psc(out_stride) + col;

    // a single shared scale broadcast to all lanes, or one 4-lane scale per row
    afpvec4 factor;
    if (scale_data_size != 1)
    {
        factor = buffer_ld4(scale_blob_data, row);
    }
    else
    {
        factor = afpvec4(scale_value);
    }

    afpvec4 x = buffer_ld4(bottom_blob_data, src);

    ivec4 q = ivec4(round(clamp(x * factor, afp(-127.f), afp(127.f))));

    i8buffer_st4(top_blob_data, dst, q);
 }
--- a/src/layer/vulkan/shader/quantize_pack8.comp
+++ b/src/layer/vulkan/shader/quantize_pack8.comp
@@ -0,0 +1,65 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Scale selection: when scale_data_size is 1, main() applies scale_value to
 // every lane; for any other value it reads one 8-lane scale per row from
 // scale_blob.
 layout (constant_id = 0) const int scale_data_size = 0;
 layout (constant_id = 1) const float scale_value = 1.f;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   c          : number of rows; in_stride * c elements are processed in total
 //   in_stride  : row length used to split the flat invocation index
 //   out_stride : element distance between consecutive output rows
 #define shape_constant_id_offset 2
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 // binding 0: 8-lane floating-point input, binding 1: 8-lane int8 output,
 // binding 2: per-row 8-lane scales (only read when scale_data_size != 1)
 layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 // Quantizes one 8-lane element per invocation: multiply by the scale, clamp
 // to [-127, 127], round, and store through i8buffer_st8. The two 4-lane
 // halves are processed separately.
 void main()
 {
    const uint src = gl_GlobalInvocationID.x;

    // the dispatch is flat; anything past the last input element is ignored
    if (src >= psc(in_stride) * psc(c))
        return;

    // recover (row, col) so the output can use its own row stride
    const uint row = src / psc(in_stride);
    const uint col = src % psc(in_stride);
    const uint dst = row * psc(out_stride) + col;

    // a single shared scale broadcast to all lanes, or one 8-lane scale per row
    afpvec8 factor;
    if (scale_data_size != 1)
    {
        factor = buffer_ld8(scale_blob_data, row);
    }
    else
    {
        factor = afpvec8(afpvec4(scale_value), afpvec4(scale_value));
    }

    afpvec8 x = buffer_ld8(bottom_blob_data, src);

    ivec8 q;
    q.abcd = ivec4(round(clamp(x[0] * factor[0], afp(-127.f), afp(127.f))));
    q.efgh = ivec4(round(clamp(x[1] * factor[1], afp(-127.f), afp(127.f))));

    i8buffer_st8(top_blob_data, dst, q);
 }
--- a/src/layer/vulkan/shader/requantize.comp
+++ b/src/layer/vulkan/shader/requantize.comp
@@ -0,0 +1,103 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 // Input scale, output scale and bias each come as a (size, value) pair.
 // In main(): a size of 1 means "use the scalar *_value for every element";
 // for bias a size of 0 additionally means "no bias" (0 is used); any other
 // size means one entry per row is read from the matching buffer.
 layout (constant_id = 0) const int scale_in_data_size = 0;
 layout (constant_id = 1) const float scale_in_value = 1.f;
 layout (constant_id = 2) const int scale_out_data_size = 0;
 layout (constant_id = 3) const float scale_out_value = 1.f;
 layout (constant_id = 4) const int bias_data_size = 0;
 layout (constant_id = 5) const float bias_value = 0.f;
 // Forwarded unchanged to activation_afp() in main().
 layout (constant_id = 6) const int activation_type = 0;
 layout (constant_id = 7) const float activation_param_0 = 0;
 layout (constant_id = 8) const float activation_param_1 = 0;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   c          : number of rows; in_stride * c elements are processed in total
 //   in_stride  : row length used to split the flat invocation index
 //   out_stride : element distance between consecutive output rows
 #define shape_constant_id_offset 9
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 // binding 0: int32 input, binding 1: int8 output,
 // bindings 2-4: per-row input scales, output scales and biases
 layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_in_blob { sfp scale_in_blob_data[]; };
 layout (binding = 3) readonly buffer scale_out_blob { sfp scale_out_blob_data[]; };
 layout (binding = 4) readonly buffer bias_blob { sfp bias_blob_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 // Requantizes one int32 scalar per invocation:
 //   q = round(clamp(activation(v * scale_in + bias) * scale_out, -127, 127))
 // and stores the result through i8buffer_st1.
 void main()
 {
    const uint src = gl_GlobalInvocationID.x;

    // the dispatch is flat; anything past the last input element is ignored
    if (src >= psc(in_stride) * psc(c))
        return;

    // recover (row, col) so the output can use its own row stride
    const uint row = src / psc(in_stride);
    const uint col = src % psc(in_stride);
    const uint dst = row * psc(out_stride) + col;

    // input scale: shared scalar or one per row
    afp scale_i;
    if (scale_in_data_size != 1)
    {
        scale_i = buffer_ld1(scale_in_blob_data, row);
    }
    else
    {
        scale_i = afp(scale_in_value);
    }

    // bias: shared scalar, absent (zero), or one per row
    afp bias_v;
    if (bias_data_size == 1)
    {
        bias_v = afp(bias_value);
    }
    else if (bias_data_size == 0)
    {
        bias_v = afp(0.f);
    }
    else
    {
        bias_v = buffer_ld1(bias_blob_data, row);
    }

    // output scale: shared scalar or one per row
    afp scale_o;
    if (scale_out_data_size != 1)
    {
        scale_o = buffer_ld1(scale_out_blob_data, row);
    }
    else
    {
        scale_o = afp(scale_out_value);
    }

    int acc = bottom_blob_data[src];

    // dequantize, then apply the fused activation in floating point
    afp val = afp(acc) * scale_i + bias_v;
    val = activation_afp(val, activation_type, activation_param_0, activation_param_1);

    int q = int(round(clamp(val * scale_o, afp(-127.f), afp(127.f))));

    i8buffer_st1(top_blob_data, dst, q);
 }
--- a/src/layer/vulkan/shader/requantize_pack4.comp
+++ b/src/layer/vulkan/shader/requantize_pack4.comp
@@ -0,0 +1,103 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 // Input scale, output scale and bias each come as a (size, value) pair.
 // In main(): a size of 1 means "broadcast the scalar *_value to all lanes";
 // for bias a size of 0 additionally means "no bias" (0 is used); any other
 // size means one 4-lane entry per row is read from the matching buffer.
 layout (constant_id = 0) const int scale_in_data_size = 0;
 layout (constant_id = 1) const float scale_in_value = 1.f;
 layout (constant_id = 2) const int scale_out_data_size = 0;
 layout (constant_id = 3) const float scale_out_value = 1.f;
 layout (constant_id = 4) const int bias_data_size = 0;
 layout (constant_id = 5) const float bias_value = 0.f;
 // Forwarded unchanged to activation_afpvec4() in main().
 layout (constant_id = 6) const int activation_type = 0;
 layout (constant_id = 7) const float activation_param_0 = 0;
 layout (constant_id = 8) const float activation_param_1 = 0;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   c          : number of rows; in_stride * c elements are processed in total
 //   in_stride  : row length used to split the flat invocation index
 //   out_stride : element distance between consecutive output rows
 #define shape_constant_id_offset 9
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 // binding 0: 4-lane int32 input, binding 1: 4-lane int8 output,
 // bindings 2-4: per-row 4-lane input scales, output scales and biases
 layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_in_blob { sfpvec4 scale_in_blob_data[]; };
 layout (binding = 3) readonly buffer scale_out_blob { sfpvec4 scale_out_blob_data[]; };
 layout (binding = 4) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 // Requantizes one 4-lane int32 element per invocation:
 //   q = round(clamp(activation(v * scale_in + bias) * scale_out, -127, 127))
 // and stores the result through i8buffer_st4.
 void main()
 {
    const uint src = gl_GlobalInvocationID.x;

    // the dispatch is flat; anything past the last input element is ignored
    if (src >= psc(in_stride) * psc(c))
        return;

    // recover (row, col) so the output can use its own row stride
    const uint row = src / psc(in_stride);
    const uint col = src % psc(in_stride);
    const uint dst = row * psc(out_stride) + col;

    // input scale: broadcast scalar or one 4-lane entry per row
    afpvec4 scale_i;
    if (scale_in_data_size != 1)
    {
        scale_i = buffer_ld4(scale_in_blob_data, row);
    }
    else
    {
        scale_i = afpvec4(scale_in_value);
    }

    // bias: broadcast scalar, absent (zero), or one 4-lane entry per row
    afpvec4 bias_v;
    if (bias_data_size == 1)
    {
        bias_v = afpvec4(bias_value);
    }
    else if (bias_data_size == 0)
    {
        bias_v = afpvec4(0.f);
    }
    else
    {
        bias_v = buffer_ld4(bias_blob_data, row);
    }

    // output scale: broadcast scalar or one 4-lane entry per row
    afpvec4 scale_o;
    if (scale_out_data_size != 1)
    {
        scale_o = buffer_ld4(scale_out_blob_data, row);
    }
    else
    {
        scale_o = afpvec4(scale_out_value);
    }

    ivec4 acc = bottom_blob_data[src];

    // dequantize, then apply the fused activation in floating point
    afpvec4 val = afpvec4(acc) * scale_i + bias_v;
    val = activation_afpvec4(val, activation_type, activation_param_0, activation_param_1);

    ivec4 q = ivec4(round(clamp(val * scale_o, afp(-127.f), afp(127.f))));

    i8buffer_st4(top_blob_data, dst, q);
 }
--- a/src/layer/vulkan/shader/requantize_pack8.comp
+++ b/src/layer/vulkan/shader/requantize_pack8.comp
@@ -0,0 +1,107 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 // Input scale, output scale and bias each come as a (size, value) pair.
 // In main(): a size of 1 means "broadcast the scalar *_value to all lanes";
 // for bias a size of 0 additionally means "no bias" (0 is used); any other
 // size means one 8-lane entry per row is read from the matching buffer.
 layout (constant_id = 0) const int scale_in_data_size = 0;
 layout (constant_id = 1) const float scale_in_value = 1.f;
 layout (constant_id = 2) const int scale_out_data_size = 0;
 layout (constant_id = 3) const float scale_out_value = 1.f;
 layout (constant_id = 4) const int bias_data_size = 0;
 layout (constant_id = 5) const float bias_value = 0.f;
 // Forwarded unchanged to activation_afpvec8() in main().
 layout (constant_id = 6) const int activation_type = 0;
 layout (constant_id = 7) const float activation_param_0 = 0;
 layout (constant_id = 8) const float activation_param_1 = 0;

 // Shape constants, always read through psc() in main(). psc() is defined
 // outside this file; presumably it falls back to the push constant of the
 // same name when the specialization constant is left at 0 -- TODO confirm.
 //   c          : number of rows; in_stride * c elements are processed in total
 //   in_stride  : row length used to split the flat invocation index
 //   out_stride : element distance between consecutive output rows
 #define shape_constant_id_offset 9
 layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
 layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;

 // binding 0: 8-lane int32 input, binding 1: 8-lane int8 output,
 // bindings 2-4: per-row 8-lane input scales, output scales and biases
 layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; };
 layout (binding = 2) readonly buffer scale_in_blob { sfpvec8 scale_in_blob_data[]; };
 layout (binding = 3) readonly buffer scale_out_blob { sfpvec8 scale_out_blob_data[]; };
 layout (binding = 4) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };

 // Push-constant block carrying fields with the same names as the shape
 // specialization constants above.
 layout (push_constant) uniform parameter
 {
    uint c;
    uint in_stride;
    uint out_stride;
 } p;

 // Requantizes one 8-lane int32 element per invocation:
 //   q = round(clamp(activation(v * scale_in + bias) * scale_out, -127, 127))
 // and stores the result through i8buffer_st8. The two 4-lane halves are
 // scaled separately; the activation is applied to the full 8-lane value.
 void main()
 {
    const uint src = gl_GlobalInvocationID.x;

    // the dispatch is flat; anything past the last input element is ignored
    if (src >= psc(in_stride) * psc(c))
        return;

    // recover (row, col) so the output can use its own row stride
    const uint row = src / psc(in_stride);
    const uint col = src % psc(in_stride);
    const uint dst = row * psc(out_stride) + col;

    // input scale: broadcast scalar or one 8-lane entry per row
    afpvec8 scale_i;
    if (scale_in_data_size != 1)
    {
        scale_i = buffer_ld8(scale_in_blob_data, row);
    }
    else
    {
        scale_i = afpvec8(afpvec4(scale_in_value), afpvec4(scale_in_value));
    }

    // bias: broadcast scalar, absent (zero), or one 8-lane entry per row
    afpvec8 bias_v;
    if (bias_data_size == 1)
    {
        bias_v = afpvec8(afpvec4(bias_value), afpvec4(bias_value));
    }
    else if (bias_data_size == 0)
    {
        bias_v = afpvec8(afpvec4(0.f), afpvec4(0.f));
    }
    else
    {
        bias_v = buffer_ld8(bias_blob_data, row);
    }

    // output scale: broadcast scalar or one 8-lane entry per row
    afpvec8 scale_o;
    if (scale_out_data_size != 1)
    {
        scale_o = buffer_ld8(scale_out_blob_data, row);
    }
    else
    {
        scale_o = afpvec8(afpvec4(scale_out_value), afpvec4(scale_out_value));
    }

    ivec8 acc = bottom_blob_data[src];

    // dequantize each half, then apply the fused activation in floating point
    afpvec8 val;
    val[0] = afpvec4(acc.abcd) * scale_i[0] + bias_v[0];
    val[1] = afpvec4(acc.efgh) * scale_i[1] + bias_v[1];

    val = activation_afpvec8(val, activation_type, activation_param_0, activation_param_1);

    ivec8 q;
    q.abcd = ivec4(round(clamp(val[0] * scale_o[0], afp(-127.f), afp(127.f))));
    q.efgh = ivec4(round(clamp(val[1] * scale_o[1], afp(-127.f), afp(127.f))));

    i8buffer_st8(top_blob_data, dst, q);
 }
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -1043,6 +1043,9 @@ int Net::load_param(const DataReader& dr)
        // fp16a makes no sense when fp16 storage disabled
        if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;

        // int8a makes no sense when int8 storage disabled
        if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false;

        // fp16 uniform makes no sense when fp16 arithmetic disabled
        if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
    }
@@ -1339,6 +1342,9 @@ int Net::load_param_bin(const DataReader& dr)
        // fp16a makes no sense when fp16 storage disabled
        if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;

        // int8a makes no sense when int8 storage disabled
        if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false;

        // fp16 uniform makes no sense when fp16 arithmetic disabled
        if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
    }
--- a/tests/test_dequantize.cpp
+++ b/tests/test_dequantize.cpp
@@ -142,12 +142,8 @@ static int test_dequantize_3()
           || test_dequantize_pack8(RandomIntMat(15, 24), 24, 24)
           || test_dequantize_pack8(RandomIntMat(15, 24), 24, 1)
           || test_dequantize_pack8(RandomIntMat(15, 24), 24, 0)
           || test_dequantize_pack8(RandomIntMat(128), 1, 128)
           || test_dequantize_pack8(RandomIntMat(128), 1, 1)
           || test_dequantize_pack8(RandomIntMat(128), 1, 0)
           || test_dequantize_pack8(RandomIntMat(128), 128, 128)
           || test_dequantize_pack8(RandomIntMat(128), 128, 1)
           || test_dequantize_pack8(RandomIntMat(128), 128, 0);
           || test_dequantize_pack8(RandomIntMat(128), 1, 0);
 }

 int main()
--- a/tests/test_packing.cpp
+++ b/tests/test_packing.cpp
@@ -217,15 +217,12 @@ static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempac
 }

 #if NCNN_VULKAN

 static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_elempack)
 static int test_packing_gpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack)
 {
    ncnn::ParamDict pd;
    pd.set(0, out_elempack);
    pd.set(2, 1); // cast_type_from
    pd.set(3, 1); // cast_type_to
    pd.set(4, 0); // storage_type_from
    pd.set(5, 0); // storage_type_to

    std::vector<ncnn::Mat> weights(0);

@@ -297,12 +294,112 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_

    if (CompareMat(b, d, 0.001) != 0)
    {
        fprintf(stderr, "test_packing_gpu_buffer failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
        fprintf(stderr, "test_packing_gpu failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
        return -1;
    }

    return 0;
 }

 // Runs the Vulkan "Packing" layer on int8 data and compares the result with
 // the CPU reference produced by packing_cpu_naive().
 //
 // a            - only its dims and extents are used; a random int8 Mat of the
 //                same shape is generated as the actual test input
 // in_elempack  - elempack the input is converted to before it is uploaded
 // out_elempack - elempack requested from the layer (param 0)
 //
 // Returns 0 when GPU and CPU results match, -1 otherwise.
 static int test_packing_gpu_int8(const ncnn::Mat& a, int in_elempack, int out_elempack)
 {
    ncnn::ParamDict pd;
    pd.set(0, out_elempack);
    pd.set(2, 4); // cast_type_from
    pd.set(3, 4); // cast_type_to

    std::vector<ncnn::Mat> weights(0);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_vulkan_compute = true;
    opt.use_int8_inference = false;
    opt.use_fp16_packed = false;
    opt.use_fp16_storage = false;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_storage = false;
    opt.use_int8_arithmetic = false;
    opt.use_packing_layout = true;
    opt.use_shader_pack8 = true;

    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

    // borrowed from the device; both are handed back before returning
    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

    opt.blob_vkallocator = blob_vkallocator;
    opt.workspace_vkallocator = blob_vkallocator;
    opt.staging_vkallocator = staging_vkallocator;

    if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
    if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;

    ncnn::Layer* op = ncnn::create_layer_vulkan("Packing");

    op->vkdev = vkdev;

    op->load_param(pd);

    ncnn::ModelBinFromMatArray mb(weights.data());

    op->load_model(mb);

    op->create_pipeline(opt);

    // random int8 input with the same shape as a
    ncnn::Mat a8;
    if (a.dims == 1) a8 = RandomS8Mat(a.w);
    if (a.dims == 2) a8 = RandomS8Mat(a.w, a.h);
    if (a.dims == 3) a8 = RandomS8Mat(a.w, a.h, a.c);
    if (a.dims == 4) a8 = RandomS8Mat(a.w, a.h, a.d, a.c);

    ncnn::Mat ap;
    ncnn::convert_packing(a8, ap, in_elempack, opt);

    // CPU reference
    ncnn::Mat b;
    packing_cpu_naive(ap, b, out_elempack);

    ncnn::Mat c;

    {
        // forward
        ncnn::VkCompute cmd(vkdev);

        // upload
        ncnn::VkMat a_gpu;
        cmd.record_clone(ap, a_gpu, opt);

        ncnn::VkMat c_gpu;
        op->forward(a_gpu, c_gpu, cmd, opt);

        // download
        cmd.record_clone(c_gpu, c, opt);

        cmd.submit_and_wait();
    } // cmd and the GPU mats are released here, before the allocators are returned

    op->destroy_pipeline(opt);

    delete op;

    // give the allocators back to the device so they are not leaked per call
    vkdev->reclaim_blob_allocator(blob_vkallocator);
    vkdev->reclaim_staging_allocator(staging_vkallocator);

    // compare in float32
    ncnn::Mat b32;
    ncnn::cast_int8_to_float32(b, b32, opt);

    ncnn::Mat c32;
    ncnn::cast_int8_to_float32(c, c32, opt);

    if (CompareMat(b32, c32, 0.001) != 0)
    {
        fprintf(stderr, "test_packing_gpu_int8 failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
        return -1;
    }

    return 0;
 }

 // Runs the fp32 GPU packing test and, only if it passed, the int8 one.
 // Returns 0 when both pass and 1 as soon as either reports a failure.
 static int test_packing_gpu(const ncnn::Mat& a, int in_elempack, int out_elempack)
 {
    if (test_packing_gpu_fp32(a, in_elempack, out_elempack) != 0)
        return 1;

    if (test_packing_gpu_int8(a, in_elempack, out_elempack) != 0)
        return 1;

    return 0;
 }
 #endif

 static int test_packing_cpu(const ncnn::Mat& a)
@@ -329,15 +426,15 @@ static int test_packing_cpu(const ncnn::Mat& a)
 static int test_packing_gpu(const ncnn::Mat& a)
 {
    return 0
           || test_packing_gpu_buffer(a, 1, 1)
           || test_packing_gpu_buffer(a, 4, 4)
           || test_packing_gpu_buffer(a, 8, 8)
           || test_packing_gpu_buffer(a, 1, 4)
           || test_packing_gpu_buffer(a, 4, 1)
           || test_packing_gpu_buffer(a, 1, 8)
           || test_packing_gpu_buffer(a, 8, 1)
           || test_packing_gpu_buffer(a, 4, 8)
           || test_packing_gpu_buffer(a, 8, 4);
           || test_packing_gpu(a, 1, 1)
           || test_packing_gpu(a, 4, 4)
           || test_packing_gpu(a, 8, 8)
           || test_packing_gpu(a, 1, 4)
           || test_packing_gpu(a, 4, 1)
           || test_packing_gpu(a, 1, 8)
           || test_packing_gpu(a, 8, 1)
           || test_packing_gpu(a, 4, 8)
           || test_packing_gpu(a, 8, 4);
 }
 #endif // NCNN_VULKAN

--- a/tests/test_quantize.cpp
+++ b/tests/test_quantize.cpp
@@ -24,7 +24,7 @@ static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high)
    }
    else
    {
        if (a.dims == 1) scale_data.create(a.w);
        if (a.dims == 1) scale_data.create(1);
        if (a.dims == 2) scale_data.create(a.h);
        if (a.dims == 3) scale_data.create(a.c);
        Randomize(scale_data, scale_low, scale_high);
--- a/tests/test_quantize_oom.cpp
+++ b/tests/test_quantize_oom.cpp
@@ -24,7 +24,7 @@ static int test_quantize_oom(const ncnn::Mat& a, float scale_low, float scale_hi
    }
    else
    {
        if (a.dims == 1) scale_data.create(a.w);
        if (a.dims == 1) scale_data.create(1);
        if (a.dims == 2) scale_data.create(a.h);
        if (a.dims == 3) scale_data.create(a.c);
        Randomize(scale_data, scale_low, scale_high);
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -759,7 +759,32 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
            std::vector<ncnn::VkMat> a_gpu(a.size());
            for (size_t i = 0; i < a_gpu.size(); i++)
            {
                cmd.record_upload(a[i], a_gpu[i], opt);
                if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
                {
                    // resolve dst_elempack
                    int dims = a[i].dims;
                    int elemcount = 0;
                    if (dims == 1) elemcount = a[i].elempack * a[i].w;
                    if (dims == 2) elemcount = a[i].elempack * a[i].h;
                    if (dims == 3 || dims == 4) elemcount = a[i].elempack * a[i].c;

                    const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;

                    ncnn::Mat a4;
                    ncnn::convert_packing(a[i], a4, dst_elempack, opt);

                    ncnn::Option opt_upload = opt;
                    opt_upload.use_fp16_packed = false;
                    opt_upload.use_fp16_storage = false;
                    opt_upload.use_int8_packed = false;
                    opt_upload.use_int8_storage = false;

                    cmd.record_clone(a4, a_gpu[i], opt_upload);
                }
                else
                {
                    cmd.record_upload(a[i], a_gpu[i], opt);
                }
            }

            std::vector<ncnn::VkMat> d_gpu(top_blob_count);
@@ -1082,7 +1107,33 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
        {
            // upload
            ncnn::VkMat a_gpu;
            cmd.record_upload(a, a_gpu, opt);

            if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
            {
                // resolve dst_elempack
                int dims = a.dims;
                int elemcount = 0;
                if (dims == 1) elemcount = a.elempack * a.w;
                if (dims == 2) elemcount = a.elempack * a.h;
                if (dims == 3 || dims == 4) elemcount = a.elempack * a.c;

                const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;

                ncnn::Mat a4;
                ncnn::convert_packing(a, a4, dst_elempack, opt);

                ncnn::Option opt_upload = opt;
                opt_upload.use_fp16_packed = false;
                opt_upload.use_fp16_storage = false;
                opt_upload.use_int8_packed = false;
                opt_upload.use_int8_storage = false;

                cmd.record_clone(a4, a_gpu, opt_upload);
            }
            else
            {
                cmd.record_upload(a, a_gpu, opt);
            }

            ncnn::VkMat d_gpu;
            if (op->support_inplace)