From 9f832c19c170322ecd01749a6d129e7e915ccdb1 Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Thu, 26 Jun 2025 10:47:25 +0800
Subject: [PATCH] vulkan int8 packing quantize dequantize requantize (#3731)

* add int8 definitions
* packing vulkan int8/int32, quantize vulkan
* vulkan dequantize
* requantize vulkan
---
 src/allocator.cpp                             |  17 ++
 src/command.cpp                               |   5 +
 src/gpu.cpp                                   | 152 +++++++++++-
 src/gpu.h                                     |   2 +-
 src/layer/packing.h                           |   2 +
 src/layer/vulkan/dequantize_vulkan.cpp        | 231 ++++++++++++++++++
 src/layer/vulkan/dequantize_vulkan.h          |  46 ++++
 src/layer/vulkan/packing_vulkan.cpp           |  71 +++++-
 src/layer/vulkan/quantize_vulkan.cpp          | 215 ++++++++++++++++
 src/layer/vulkan/quantize_vulkan.h            |  45 ++++
 src/layer/vulkan/requantize_vulkan.cpp        | 231 ++++++++++++++++++
 src/layer/vulkan/requantize_vulkan.h          |  47 ++++
 src/layer/vulkan/shader/dequantize.comp       |  80 ++++++
 src/layer/vulkan/shader/dequantize_pack4.comp |  80 ++++++
 src/layer/vulkan/shader/dequantize_pack8.comp |  84 +++++++
 src/layer/vulkan/shader/packing_int8.comp     |  73 ++++++
 .../vulkan/shader/packing_pack1to4_int8.comp  |  79 ++++++
 .../vulkan/shader/packing_pack1to8_int8.comp  |  88 +++++++
 .../vulkan/shader/packing_pack4to1_int8.comp  |  79 ++++++
 .../vulkan/shader/packing_pack4to8_int8.comp  |  75 ++++++
 .../vulkan/shader/packing_pack8to1_int8.comp  |  88 +++++++
 .../vulkan/shader/packing_pack8to4_int8.comp  |  75 ++++++
 src/layer/vulkan/shader/quantize.comp         |  63 +++++
 src/layer/vulkan/shader/quantize_pack4.comp   |  63 +++++
 src/layer/vulkan/shader/quantize_pack8.comp   |  65 +++++
 src/layer/vulkan/shader/requantize.comp       | 103 ++++++++
 src/layer/vulkan/shader/requantize_pack4.comp | 103 ++++++++
 src/layer/vulkan/shader/requantize_pack8.comp | 107 ++++++++
 src/net.cpp                                   |   6 +
 tests/test_dequantize.cpp                     |   6 +-
 tests/test_packing.cpp                        | 125 ++++++++--
 tests/test_quantize.cpp                       |   2 +-
 tests/test_quantize_oom.cpp                   |   2 +-
 tests/testutil.cpp                            |  55 ++++-
 34 files changed, 2523 insertions(+), 42 deletions(-)
 create mode 100644 src/layer/vulkan/dequantize_vulkan.cpp
 create mode 100644 src/layer/vulkan/dequantize_vulkan.h
 create mode 100644 src/layer/vulkan/quantize_vulkan.cpp
 create mode 100644 src/layer/vulkan/quantize_vulkan.h
 create mode 100644 src/layer/vulkan/requantize_vulkan.cpp
 create mode 100644 src/layer/vulkan/requantize_vulkan.h
 create mode 100644 src/layer/vulkan/shader/dequantize.comp
 create mode 100644 src/layer/vulkan/shader/dequantize_pack4.comp
 create mode 100644 src/layer/vulkan/shader/dequantize_pack8.comp
 create mode 100644 src/layer/vulkan/shader/packing_int8.comp
 create mode 100644 src/layer/vulkan/shader/packing_pack1to4_int8.comp
 create mode 100644 src/layer/vulkan/shader/packing_pack1to8_int8.comp
 create mode 100644 src/layer/vulkan/shader/packing_pack4to1_int8.comp
 create mode 100644 src/layer/vulkan/shader/packing_pack4to8_int8.comp
 create mode 100644 src/layer/vulkan/shader/packing_pack8to1_int8.comp
 create mode 100644 src/layer/vulkan/shader/packing_pack8to4_int8.comp
 create mode 100644 src/layer/vulkan/shader/quantize.comp
 create mode 100644 src/layer/vulkan/shader/quantize_pack4.comp
 create mode 100644 src/layer/vulkan/shader/quantize_pack8.comp
 create mode 100644 src/layer/vulkan/shader/requantize.comp
 create mode 100644 src/layer/vulkan/shader/requantize_pack4.comp
 create mode 100644 src/layer/vulkan/shader/requantize_pack8.comp

diff --git a/src/allocator.cpp b/src/allocator.cpp
index 98115ccec..e036cc976 100644
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -892,6 +892,13 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize,
         if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
         if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
     }
+    if (elemsize / elempack == 1)
+    {
+        // int8
+        if (elempack == 1) format = VK_FORMAT_R8_SINT;
+        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
+    }
 
     // resolve image width height depth
     int width = w;
@@ -1468,6 +1475,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
         if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
         if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
     }
+    if (elemsize / elempack == 1)
+    {
+        // int8
+        if (elempack == 1) format = VK_FORMAT_R8_SINT;
+        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT;
+        if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT;
+    }
 
     // resolve image width height depth
     int width = w;
diff --git a/src/command.cpp b/src/command.cpp
index 98af7c5d0..27f037178 100644
--- a/src/command.cpp
+++ b/src/command.cpp
@@ -450,6 +450,11 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)
         cast_type_to = 1;
     }
 
+    if (src.elemsize == src.elempack * 1u)
+    {
+        cast_type_to = 4;
+    }
+
     VkMat dst_staging;
     vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging);
 
diff --git a/src/gpu.cpp b/src/gpu.cpp
index d6e5090f1..b9a7c76c3 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -3032,6 +3032,10 @@ public:
     // to fp32 | fp16
     // to pack1 | pack4 | pack8
     mutable ncnn::Layer* uop_packing[2][2][3];
+    // from int8
+    // to int8
+    // to pack1 | pack4 | pack8
+    mutable ncnn::Layer* uop_packing_int8[3];
     mutable Mutex uop_lock;
 
     // device is valid and sucessfully initialized
@@ -3047,6 +3051,7 @@ VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev)
     pipeline_cache = 0;
     valid = false;
     memset(uop_packing, 0, sizeof(uop_packing));
+    memset(uop_packing_int8, 0, sizeof(uop_packing_int8));
 }
 
 int VulkanDevicePrivate::create_dummy_buffer_image()
@@ -3096,18 +3101,29 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image()
 
 const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
 {
+    bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);
+    bool use_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3);
+
     MutexLockGuard lock(uop_lock);
 
-    const ncnn::Layer* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
+    const ncnn::Layer* cached_uop = 0;
+    if (use_int8)
+    {
+        cached_uop = uop_packing_int8[packing_type_to_index];
+    }
+    else
+    {
+        cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
+    }
     if (cached_uop)
         return cached_uop;
 
-    bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);
-
     // create uop
     Option opt;
     opt.use_fp16_packed = use_fp16; // fp16p is always supported
     opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
+    opt.use_int8_packed = use_int8; // int8p is always supported
+    opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();
 
     // fp16/int8 arithmetic are not necessary for packing
     // and may conflict with storage options
@@ -3132,14 +3148,21 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_
 
     ncnn::ParamDict pd;
     pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
-    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16
+    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16 3=int32 4=int8
     pd.set(3, cast_type_to_index + 1);
 
     uop->load_param(pd);
 
     uop->create_pipeline(opt);
 
-    uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
+    if (use_int8)
+    {
+        uop_packing_int8[packing_type_to_index] = uop;
+    }
+    else
+    {
+        uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop;
+    }
 
     return uop;
 }
@@ -3164,6 +3187,8 @@ void VulkanDevicePrivate::destroy_utility_operator()
 
             opt.use_fp16_packed = use_fp16;
             opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
+            opt.use_int8_packed = false;
+            opt.use_int8_storage = false;
 
             // to pack1 | pack4 | pack8
             for (int k = 0; k < 3; k++)
@@ -3183,6 +3208,33 @@ void VulkanDevicePrivate::destroy_utility_operator()
             }
         }
     }
+
+    // int8
+    {
+        bool use_int8 = true;
+
+        opt.use_fp16_packed = false;
+        opt.use_fp16_storage = false;
+        opt.use_int8_packed = use_int8;
+        opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();
+
+        // to pack1 | pack4 | pack8
+        for (int k = 0; k < 3; k++)
+        {
+            // enable pack8 for pack8to1/pack8to4
+            opt.use_shader_pack8 = true;
+
+            ncnn::Layer* uop = uop_packing_int8[k];
+            if (!uop)
+                continue;
+
+            uop->destroy_pipeline(opt);
+
+            delete uop;
+
+            uop_packing_int8[k] = 0;
+        }
+    }
 }
 
 VulkanDevice::VulkanDevice(int device_index)
@@ -4232,18 +4284,35 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac
     {
         cast_type_from_index = 0;
     }
-    else // if (src.elembits() == 16)
+    else if (src.elembits() == 16)
     {
         cast_type_from_index = 1;
     }
+    else // if (src.elembits() == 8)
+    {
+        cast_type_from_index = 3;
+    }
 
     int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index;
 
     // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
 
+    if ((cast_type_from_index == 0 || cast_type_from_index == 1) && (cast_type_to_index == 2 || cast_type_to_index == 3))
+    {
+        NCNN_LOGE("convert_packing from fp32/fp16 to int32/int8 is not supported");
+        return;
+    }
+    if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1))
+    {
+        NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16 is not supported");
+        return;
+    }
+
     Option opt2 = opt;
     opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
     opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage();
+    opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3);
+    opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage();
 
     const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index);
     uop->forward(src, dst, cmd, opt2);
@@ -4809,6 +4878,49 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
         custom_defines.append("afp2sfpmat4(v)", "v");
     }
 
+    if (opt.use_int8_storage)
+    {
+        custom_defines.append("sint8", "int8_t");
+    }
+    else if (opt.use_int8_packed)
+    {
+        custom_defines.append("sint8", "int");
+    }
+    else
+    {
+        custom_defines.append("sint8", "int");
+    }
+
+    custom_defines.append("sint8vec4", "int");
+    custom_defines.append("sint8vec8", "ivec2");
+
+    custom_defines.append("aint8", "int");
+    custom_defines.append("aint8vec4", "ivec4");
+
+    custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)");
+    custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))");
+
+    if (opt.use_int8_storage)
+    {
+        custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])");
+        custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}");
+        custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
+    }
+    else
+    {
+        custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)");
+        custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}");
+        custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}");
+    }
+
+    custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])");
+    custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}");
+    custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
+
+    custom_defines.append("i8buffer_ld8(buf,i)", "ivec8(unpackInt4x8(buf[i].r),unpackInt4x8(buf[i].g))");
+    custom_defines.append("i8buffer_st8(buf,i,v)", "{buf[i]=ivec2(packInt4x8(v.abcd),packInt4x8(v.efgh));}");
+    custom_defines.append("i8buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
+
     custom_defines.append("psc(x)", "(x==0?p.x:x)");
 
     if (opt.use_fp16_storage)
@@ -5426,6 +5538,15 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
     {
         custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n";
     }
+    custom_exts += "struct ivec8 { ivec4 abcd; ivec4 efgh; };\n";
+    if (opt.use_int8_storage)
+    {
+        custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n";
+    }
+    if (opt.use_int8_arithmetic)
+    {
+        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n";
+    }
 #if ENABLE_VALIDATION_LAYER
     {
         custom_exts += "#extension GL_EXT_debug_printf : require\n";
@@ -5507,11 +5628,22 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
             NCNN_LOGE("%s", s.getInfoLog());
             NCNN_LOGE("%s", s.getInfoDebugLog());
 
-            // for (int i = 0; i < 4; i++)
+            // print as line_number: code
             {
-                int i = 3;
-                std::string s(comp_datas[i], comp_data_sizes[i]);
-                NCNN_LOGE("%s", s.c_str());
+                // the source is length-delimited by comp_data_sizes[3]; honour that length
+                // instead of relying on a NUL terminator (strchr / %s could read past the end)
+                const char* p = comp_datas[3];
+                const char* const end = p + comp_data_sizes[3];
+                int line_number = 1;
+
+                while (p < end)
+                {
+                    const char* line_end = (const char*)memchr(p, '\n', (size_t)(end - p));
+                    if (!line_end) line_end = end; // final line without a trailing newline
+
+                    NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p);
+                    p = line_end < end ? line_end + 1 : end;
+                }
             }
 
             compile_success = false;
diff --git a/src/gpu.h b/src/gpu.h
index d9668b837..cefb02363 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -465,7 +465,7 @@ public:
 
     // utility operator
     void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
-    // cast_type_to   0=auto(same as src)  1=fp32  2=fp16
+    // cast_type_to   0=auto(same as src)  1=fp32  2=fp16  3=int32  4=int8
     void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const;
 
     // VK_KHR_bind_memory2
diff --git a/src/layer/packing.h b/src/layer/packing.h
index f590f0fe1..bdb511da9 100644
--- a/src/layer/packing.h
+++ b/src/layer/packing.h
@@ -36,6 +36,8 @@ public:
     // 0 = auto
     // 1 = fp32
     // 2 = fp16
+    // 3 = int32
+    // 4 = int8
     int cast_type_from;
     int cast_type_to;
 };
diff --git a/src/layer/vulkan/dequantize_vulkan.cpp b/src/layer/vulkan/dequantize_vulkan.cpp
new file mode 100644
index 000000000..6ffccbc9e
--- /dev/null
+++ b/src/layer/vulkan/dequantize_vulkan.cpp
@@ -0,0 +1,231 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "dequantize_vulkan.h"
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+Dequantize_vulkan::Dequantize_vulkan()
+{
+    support_vulkan = true; // mark this layer as having a Vulkan implementation
+
+    pipeline_dequantize = 0; // all three pipelines stay null until create_pipeline() builds them
+    pipeline_dequantize_pack4 = 0;
+    pipeline_dequantize_pack8 = 0;
+}
+
+int Dequantize_vulkan::create_pipeline(const Option& opt)
+{ // builds the pack1 / pack4 / pack8 dequantize pipelines, specialized with scale/bias and shape hints; returns 0
+    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; // empty Mat (dims == 0) when no shape hint exists
+    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    const int dims = shape.dims;
+
+    int elempack = 1; // stays 1 when dims == 0 (no shape hint)
+    if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; // packed along w
+    if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; // packed along h
+    if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; // packed along c
+
+    const size_t elemsize = elempack * 4u; // input: 4 bytes per scalar (presumably int32 -- TODO confirm)
+    size_t out_elemsize; // output: 2 bytes per scalar with fp16 storage/packed, otherwise 4
+    if (opt.use_fp16_storage || opt.use_fp16_packed)
+    {
+        out_elemsize = elempack * 2u;
+    }
+    else
+    {
+        out_elemsize = elempack * 4u;
+    }
+
+    Mat shape_packed; // input shape hint re-expressed in packed elements (left empty when dims == 0)
+    if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
+    if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
+    if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
+    if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);
+
+    Mat out_shape_packed; // same for the output; note it is also keyed on the input dims
+    if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack);
+    if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack);
+    if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
+    if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
+
+    size_t c = 0; // number of outer slices (1, packed h, or packed c); 0 means "unknown"
+    size_t in_stride = 0; // packed elements per input slice (w or cstep)
+    size_t out_stride = 0; // packed elements per output slice (w or cstep)
+    if (dims == 1)
+    {
+        c = 1;
+        in_stride = shape_packed.w;
+        out_stride = out_shape_packed.w;
+    }
+    if (dims == 2)
+    {
+        c = shape_packed.h;
+        in_stride = shape_packed.w;
+        out_stride = out_shape_packed.w;
+    }
+    if (dims == 3 || dims == 4)
+    {
+        c = shape_packed.c;
+        in_stride = shape_packed.cstep;
+        out_stride = out_shape_packed.cstep;
+    }
+
+    std::vector<vk_specialization_type> specializations(4 + 3); // 4 layer parameters + 3 shape hints
+    specializations[0].i = scale_data_size;
+    specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f; // a single scale is passed by value; otherwise 1.f placeholder
+    specializations[2].i = bias_data_size;
+    specializations[3].f = bias_data_size == 1 ? bias_data[0] : 0.f; // a single bias is passed by value; otherwise 0.f placeholder
+    specializations[4 + 0].u32 = c; // size_t values are narrowed to 32-bit here
+    specializations[4 + 1].u32 = in_stride;
+    specializations[4 + 2].u32 = out_stride;
+
+    const int local_size_x = vkdev->info.subgroup_size(); // subgroup size is used as the preferred local size in x
+
+    // pack1 (also built when no shape hint is available)
+    if (shape.dims == 0 || elempack == 1)
+    {
+        pipeline_dequantize = new Pipeline(vkdev);
+        pipeline_dequantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_dequantize->create(LayerShaderType::dequantize, opt, specializations);
+    }
+
+    // pack4 (also built when no shape hint is available)
+    if (shape.dims == 0 || elempack == 4)
+    {
+        pipeline_dequantize_pack4 = new Pipeline(vkdev);
+        pipeline_dequantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_dequantize_pack4->create(LayerShaderType::dequantize_pack4, opt, specializations);
+    }
+
+    // pack8 (without a shape hint it is only built when opt.use_shader_pack8 is set)
+    if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
+    {
+        pipeline_dequantize_pack8 = new Pipeline(vkdev);
+        pipeline_dequantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_dequantize_pack8->create(LayerShaderType::dequantize_pack8, opt, specializations);
+    }
+
+    return 0; // the result of Pipeline::create() is not checked here
+}
+
+int Dequantize_vulkan::destroy_pipeline(const Option& /*opt*/)
+{ // releases every pipeline created by create_pipeline(); the option argument is unused; always returns 0
+    delete pipeline_dequantize; // deleting a null pointer is a no-op, so variants that were never built are fine
+    pipeline_dequantize = 0; // reset so a second destroy_pipeline() call cannot double-delete
+
+    delete pipeline_dequantize_pack4;
+    pipeline_dequantize_pack4 = 0;
+
+    delete pipeline_dequantize_pack8;
+    pipeline_dequantize_pack8 = 0;
+
+    return 0;
+}
+
+int Dequantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
+{ // records uploads of scale/bias data into scale_data_gpu / bias_data_gpu; always returns 0
+    if (scale_data_size > 1) // a single scale is passed as a specialization constant in create_pipeline() instead
+    {
+        cmd.record_upload(scale_data, scale_data_gpu, opt);
+    }
+
+    if (bias_data_size > 1) // likewise, a single bias is passed as a specialization constant
+    {
+        cmd.record_upload(bias_data, bias_data_gpu, opt);
+    }
+
+    return 0;
+}
+
+int Dequantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{ // allocates top_blob with the input's shape/packing and records one dequantize dispatch; returns 0, or -100 if top_blob is empty
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int d = bottom_blob.d;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack; // output keeps the input packing
+
+    size_t out_elemsize; // same fp16 / fp32 choice as in create_pipeline()
+    if (opt.use_fp16_storage || opt.use_fp16_packed)
+    {
+        out_elemsize = elempack * 2u;
+    }
+    else
+    {
+        out_elemsize = elempack * 4u;
+    }
+
+    if (dims == 1)
+        top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
+    if (dims == 2)
+        top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
+    if (dims == 3)
+        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
+    if (dims == 4)
+        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_vkallocator);
+    if (top_blob.empty()) // no usable output blob: create() failed or no branch above ran
+        return -100;
+
+    size_t c = 0; // number of outer slices: 1, h, or c depending on dims
+    size_t in_stride = 0; // elements per input slice (w or cstep)
+    size_t out_stride = 0; // elements per output slice (w or cstep)
+    if (dims == 1)
+    {
+        c = 1;
+        in_stride = bottom_blob.w;
+        out_stride = top_blob.w;
+    }
+    if (dims == 2)
+    {
+        c = bottom_blob.h;
+        in_stride = bottom_blob.w;
+        out_stride = top_blob.w;
+    }
+    if (dims == 3 || dims == 4)
+    {
+        c = bottom_blob.c;
+        in_stride = bottom_blob.cstep;
+        out_stride = top_blob.cstep;
+    }
+
+    std::vector<VkMat> bindings(4); // 0 = input, 1 = output, 2 = scale, 3 = bias
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+    bindings[2] = scale_data_gpu; // only populated by upload_model() when scale_data_size > 1
+    bindings[3] = bias_data_gpu; // only populated by upload_model() when bias_data_size > 1
+
+    std::vector<vk_constant_type> constants(3); // runtime counterparts of specialization slots 4..6
+    constants[0].u32 = c;
+    constants[1].u32 = in_stride;
+    constants[2].u32 = out_stride;
+
+    VkMat dispatcher; // only w/h/c are filled in, to describe a flat 1-D dispatch range
+    dispatcher.w = in_stride * c; // input stride times number of slices
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    const Pipeline* pipeline = elempack == 8 ? pipeline_dequantize_pack8 // variant is chosen by the input packing
+                               : elempack == 4 ? pipeline_dequantize_pack4
+                               : pipeline_dequantize;
+
+    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/dequantize_vulkan.h b/src/layer/vulkan/dequantize_vulkan.h
new file mode 100644
index 000000000..08ee83fc5
--- /dev/null
+++ b/src/layer/vulkan/dequantize_vulkan.h
@@ -0,0 +1,46 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_DEQUANTIZE_VULKAN_H
+#define LAYER_DEQUANTIZE_VULKAN_H
+
+#include "dequantize.h"
+
+namespace ncnn {
+
+class Dequantize_vulkan : virtual public Dequantize // Vulkan variant of the Dequantize layer
+{
+public:
+    Dequantize_vulkan();
+
+    virtual int create_pipeline(const Option& opt); // builds the pack1/pack4/pack8 compute pipelines
+    virtual int destroy_pipeline(const Option& opt); // deletes them again
+
+    virtual int upload_model(VkTransfer& cmd, const Option& opt); // uploads scale/bias when they hold more than one value
+
+    using Dequantize::forward; // keep the base-class forward overloads visible next to the VkMat one
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    VkMat scale_data_gpu; // written by upload_model(), bound at slot 2 in forward()
+    VkMat bias_data_gpu; // written by upload_model(), bound at slot 3 in forward()
+
+    Pipeline* pipeline_dequantize; // elempack 1
+    Pipeline* pipeline_dequantize_pack4; // elempack 4
+    Pipeline* pipeline_dequantize_pack8; // elempack 8
+};
+
+} // namespace ncnn
+
+#endif // LAYER_DEQUANTIZE_VULKAN_H
diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp
index d314d4554..aba9c6557 100644
--- a/src/layer/vulkan/packing_vulkan.cpp
+++ b/src/layer/vulkan/packing_vulkan.cpp
@@ -45,6 +45,8 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
     const int local_size_x = vkdev->info.subgroup_size();
 
+    bool use_int8_shader = cast_type_from == 4 || cast_type_to == 4;
+
     std::vector<vk_specialization_type> specializations(2 + 3);
     specializations[0].i = cast_type_from;
     specializations[1].i = cast_type_to;
@@ -91,7 +93,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
         pipeline_packing = new Pipeline(vkdev);
         pipeline_packing->set_optimal_local_size_xyz(local_size_x, 1, 1);
-        pipeline_packing->create(LayerShaderType::packing, opt, specializations);
+        if (use_int8_shader)
+        {
+            pipeline_packing->create(LayerShaderType::packing_int8, opt, specializations);
+        }
+        else
+        {
+            pipeline_packing->create(LayerShaderType::packing, opt, specializations);
+        }
     }
     if (shape.dims == 0 || elempack < out_elempack)
     {
@@ -126,7 +135,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
             pipeline_packing_pack1to4 = new Pipeline(vkdev);
             pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_x, 1, 1);
-            pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations);
+            if (use_int8_shader)
+            {
+                pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_int8, opt, specializations);
+            }
+            else
+            {
+                pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations);
+            }
         }
 
         if (shape.dims == 0 || (elempack == 1 && out_elempack == 8))
@@ -138,7 +154,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
             pipeline_packing_pack1to8 = new Pipeline(vkdev);
             pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_x, 1, 1);
-            pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations);
+            if (use_int8_shader)
+            {
+                pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_int8, opt, specializations);
+            }
+            else
+            {
+                pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations);
+            }
         }
 
         if (shape.dims == 0 || (elempack == 4 && out_elempack == 8))
@@ -150,7 +173,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
             pipeline_packing_pack4to8 = new Pipeline(vkdev);
             pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_x, 1, 1);
-            pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations);
+            if (use_int8_shader)
+            {
+                pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_int8, opt, specializations);
+            }
+            else
+            {
+                pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations);
+            }
         }
     }
     if (shape.dims == 0 || elempack > out_elempack)
@@ -186,7 +216,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
             pipeline_packing_pack4to1 = new Pipeline(vkdev);
             pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_x, 1, 1);
-            pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations);
+            if (use_int8_shader)
+            {
+                pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_int8, opt, specializations);
+            }
+            else
+            {
+                pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations);
+            }
         }
 
         if (shape.dims == 0 || (elempack == 8 && out_elempack == 1))
@@ -198,7 +235,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
             pipeline_packing_pack8to1 = new Pipeline(vkdev);
             pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_x, 1, 1);
-            pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations);
+            if (use_int8_shader)
+            {
+                pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_int8, opt, specializations);
+            }
+            else
+            {
+                pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations);
+            }
         }
 
         if (shape.dims == 0 || (elempack == 8 && out_elempack == 4))
@@ -210,7 +254,14 @@ int Packing_vulkan::create_pipeline(const Option& opt)
 
             pipeline_packing_pack8to4 = new Pipeline(vkdev);
             pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_x, 1, 1);
-            pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations);
+            if (use_int8_shader)
+            {
+                pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_int8, opt, specializations);
+            }
+            else
+            {
+                pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations);
+            }
         }
     }
 
@@ -296,10 +347,14 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
     {
         out_elemsize = out_elempack * 4u;
     }
-    else // if (cast_type_to == 2)
+    else if (cast_type_to == 2)
     {
         out_elemsize = out_elempack * 2u;
     }
+    else // if (cast_type_to == 4), i.e. int8: 1 byte per element
+    {
+        out_elemsize = out_elempack * 1u;
+    }
 
     if (dims == 1)
     {
diff --git a/src/layer/vulkan/quantize_vulkan.cpp b/src/layer/vulkan/quantize_vulkan.cpp
new file mode 100644
index 000000000..8ad860147
--- /dev/null
+++ b/src/layer/vulkan/quantize_vulkan.cpp
@@ -0,0 +1,215 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "quantize_vulkan.h"
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+Quantize_vulkan::Quantize_vulkan()
+    : pipeline_quantize(0), pipeline_quantize_pack4(0), pipeline_quantize_pack8(0)
+{
+    // pipelines start out null; create_pipeline() allocates the ones it needs
+
+    // mark the layer as having a Vulkan implementation
+    support_vulkan = true;
+}
+
+// Create the pack1/pack4/pack8 quantize pipelines for this layer.
+// Shape hints (bottom_shapes/top_shapes) are folded into specialization
+// constants; without a hint (dims == 0) those constants are left at 0.
+// Always returns 0.
+int Quantize_vulkan::create_pipeline(const Option& opt)
+{
+    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    const int dims = shape.dims;
+
+    // start from 1, like Requantize_vulkan, so a missing shape hint never leaves a zero pack size
+    int elempack = 1;
+    if (dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1;
+    if (dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1;
+    if (dims == 3 || dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1;
+
+    // input scalars take 2 bytes with the fp16 options and 4 otherwise; output scalars take 1 byte
+    const size_t elemsize = (opt.use_fp16_storage || opt.use_fp16_packed) ? elempack * 2u : elempack * 4u;
+    const size_t out_elemsize = elempack * 1u;
+
+    Mat shape_packed;
+    if (dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
+    if (dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
+    if (dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
+    if (dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);
+
+    Mat out_shape_packed;
+    if (dims == 1) out_shape_packed = Mat(out_shape.w / elempack, (void*)0, out_elemsize, elempack);
+    if (dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / elempack, (void*)0, out_elemsize, elempack);
+    if (dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
+    if (dims == 4) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / elempack, (void*)0, out_elemsize, elempack);
+
+    // treat the blob as c rows of in_stride (input) / out_stride (output) packed elements
+    size_t c = 0;
+    size_t in_stride = 0;
+    size_t out_stride = 0;
+    if (dims == 1)
+    {
+        c = 1;
+        in_stride = shape_packed.w;
+        out_stride = out_shape_packed.w;
+    }
+    if (dims == 2)
+    {
+        c = shape_packed.h;
+        in_stride = shape_packed.w;
+        out_stride = out_shape_packed.w;
+    }
+    if (dims == 3 || dims == 4)
+    {
+        c = shape_packed.c;
+        in_stride = shape_packed.cstep;
+        out_stride = out_shape_packed.cstep;
+    }
+
+    std::vector<vk_specialization_type> specializations(2 + 3);
+    specializations[0].i = scale_data_size;
+    specializations[1].f = scale_data_size == 1 ? scale_data[0] : 1.f;
+    specializations[2 + 0].u32 = c;
+    specializations[2 + 1].u32 = in_stride;
+    specializations[2 + 2].u32 = out_stride;
+
+    const int local_size_x = vkdev->info.subgroup_size();
+
+    // pack1
+    if (shape.dims == 0 || elempack == 1)
+    {
+        pipeline_quantize = new Pipeline(vkdev);
+        pipeline_quantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_quantize->create(LayerShaderType::quantize, opt, specializations);
+    }
+
+    // pack4
+    if (shape.dims == 0 || elempack == 4)
+    {
+        pipeline_quantize_pack4 = new Pipeline(vkdev);
+        pipeline_quantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_quantize_pack4->create(LayerShaderType::quantize_pack4, opt, specializations);
+    }
+
+    // pack8
+    // with no shape hint, build it only when pack8 shaders are enabled (matches Requantize_vulkan)
+    if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
+    {
+        pipeline_quantize_pack8 = new Pipeline(vkdev);
+        pipeline_quantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_quantize_pack8->create(LayerShaderType::quantize_pack8, opt, specializations);
+    }
+
+    return 0;
+}
+
+// Release every pipeline owned by this layer and reset the pointers. Always returns 0.
+int Quantize_vulkan::destroy_pipeline(const Option& /*opt*/)
+{
+    // deleting a null pointer is a no-op, so variants that were never built are fine
+    Pipeline** const owned[] = {&pipeline_quantize, &pipeline_quantize_pack4, &pipeline_quantize_pack8};
+    for (int i = 0; i < 3; i++)
+    {
+        delete *owned[i];
+        *owned[i] = 0;
+    }
+
+    return 0;
+}
+
+// Upload the scale table when it has several entries; a single scale is baked in by create_pipeline().
+int Quantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
+{
+    if (scale_data_size <= 1)
+        return 0;
+
+    cmd.record_upload(scale_data, scale_data_gpu, opt);
+    return 0;
+}
+
+// Record the quantize dispatch for one blob.
+// Creates top_blob with the input's shape and elempack at one byte per scalar,
+// then runs the pipeline that matches the input elempack.
+// Returns -100 when top_blob is empty after creation, 0 otherwise.
+int Quantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int pack = bottom_blob.elempack;
+    const size_t top_elemsize = 1u * pack;
+
+    switch (dims)
+    {
+    case 1:
+        top_blob.create(bottom_blob.w, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    case 2:
+        top_blob.create(bottom_blob.w, bottom_blob.h, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    case 3:
+        top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.c, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    case 4:
+        top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.d, bottom_blob.c, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    }
+    if (top_blob.empty())
+        return -100;
+
+    // view both blobs as `rows` runs of src_stride / dst_stride packed elements
+    size_t rows = 0;
+    size_t src_stride = 0;
+    size_t dst_stride = 0;
+    if (dims == 1 || dims == 2)
+    {
+        rows = dims == 1 ? 1 : bottom_blob.h;
+        src_stride = bottom_blob.w;
+        dst_stride = top_blob.w;
+    }
+    else if (dims == 3 || dims == 4)
+    {
+        rows = bottom_blob.c;
+        src_stride = bottom_blob.cstep;
+        dst_stride = top_blob.cstep;
+    }
+
+    std::vector<VkMat> bindings(3);
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+    bindings[2] = scale_data_gpu;
+
+    std::vector<vk_constant_type> constants(3);
+    constants[0].u32 = rows;
+    constants[1].u32 = src_stride;
+    constants[2].u32 = dst_stride;
+
+    VkMat dispatcher;
+    dispatcher.w = src_stride * rows;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    const Pipeline* pipeline = pipeline_quantize;
+    if (pack == 4) pipeline = pipeline_quantize_pack4;
+    if (pack == 8) pipeline = pipeline_quantize_pack8;
+
+    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/quantize_vulkan.h b/src/layer/vulkan/quantize_vulkan.h
new file mode 100644
index 000000000..9a1963932
--- /dev/null
+++ b/src/layer/vulkan/quantize_vulkan.h
@@ -0,0 +1,45 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_QUANTIZE_VULKAN_H
+#define LAYER_QUANTIZE_VULKAN_H
+
+#include "quantize.h"
+
+namespace ncnn {
+
+class Quantize_vulkan : virtual public Quantize // Quantize layer with a Vulkan compute path
+{
+public:
+    Quantize_vulkan();
+    // build / release the compute pipelines
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+    // upload scale_data to scale_data_gpu when it holds more than one value
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+    // keep the inherited Quantize::forward overloads visible alongside the VkMat one
+    using Quantize::forward;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    VkMat scale_data_gpu; // filled by upload_model()
+    // one pipeline per input elempack (1, 4, 8); null until create_pipeline() builds it
+    Pipeline* pipeline_quantize;
+    Pipeline* pipeline_quantize_pack4;
+    Pipeline* pipeline_quantize_pack8;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_QUANTIZE_VULKAN_H
diff --git a/src/layer/vulkan/requantize_vulkan.cpp b/src/layer/vulkan/requantize_vulkan.cpp
new file mode 100644
index 000000000..e85743c4e
--- /dev/null
+++ b/src/layer/vulkan/requantize_vulkan.cpp
@@ -0,0 +1,231 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "requantize_vulkan.h"
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+Requantize_vulkan::Requantize_vulkan()
+    : pipeline_requantize(0), pipeline_requantize_pack4(0), pipeline_requantize_pack8(0)
+{
+    // pipelines start out null; create_pipeline() allocates the ones it needs
+
+    // mark the layer as having a Vulkan implementation
+    support_vulkan = true;
+}
+
+// Create the pack1/pack4/pack8 requantize pipelines.
+// Scalar quantization parameters, the activation settings and, when a shape
+// hint is available, the flattened shape are passed as specialization constants.
+// Always returns 0.
+int Requantize_vulkan::create_pipeline(const Option& opt)
+{
+    const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
+    const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
+
+    const int dims = shape.dims;
+
+    // packing runs along w (1-D), h (2-D) or c (3-D/4-D); a length of 1 stands for "no hint" and gives pack 1
+    const int axis_len = dims == 1 ? shape.w : dims == 2 ? shape.h : (dims == 3 || dims == 4) ? shape.c : 1;
+    const int out_axis_len = dims == 1 ? out_shape.w : dims == 2 ? out_shape.h : (dims == 3 || dims == 4) ? out_shape.c : 1;
+    const int elempack = opt.use_shader_pack8 && axis_len % 8 == 0 ? 8 : axis_len % 4 == 0 ? 4 : 1;
+    const int out_elempack = opt.use_shader_pack8 && out_axis_len % 8 == 0 ? 8 : out_axis_len % 4 == 0 ? 4 : 1;
+
+    // 4 bytes per input scalar, 1 byte per output scalar
+    const size_t elemsize = elempack * 4u;
+    const size_t out_elemsize = out_elempack * 1u;
+
+    // data-less Mats that only describe the packed input/output layouts
+    Mat in_packed;
+    if (dims == 1) in_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
+    if (dims == 2) in_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
+    if (dims == 3) in_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
+    if (dims == 4) in_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);
+
+    Mat out_packed;
+    if (dims == 1) out_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack);
+    if (dims == 2) out_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack);
+    if (dims == 3) out_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
+    if (dims == 4) out_packed = Mat(out_shape.w, out_shape.h, out_shape.d, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack);
+
+    // flatten to `rows` runs of in_stride / out_stride packed elements
+    size_t rows = 0;
+    size_t in_stride = 0;
+    size_t out_stride = 0;
+    if (dims == 1 || dims == 2)
+    {
+        rows = dims == 1 ? 1 : in_packed.h;
+        in_stride = in_packed.w;
+        out_stride = out_packed.w;
+    }
+    else if (dims == 3 || dims == 4)
+    {
+        rows = in_packed.c;
+        in_stride = in_packed.cstep;
+        out_stride = out_packed.cstep;
+    }
+
+    // constants 0-5: scale_in / scale_out / bias (size + single value), 6-8: activation, 9-11: shape
+    // a table with more than one entry gets a neutral constant here and is uploaded by upload_model()
+    std::vector<vk_specialization_type> spec(9 + 3);
+    spec[0].i = scale_in_data_size;
+    spec[1].f = scale_in_data_size == 1 ? scale_in_data[0] : 1.f;
+    spec[2].i = scale_out_data_size;
+    spec[3].f = scale_out_data_size == 1 ? scale_out_data[0] : 1.f;
+    spec[4].i = bias_data_size;
+    spec[5].f = bias_data_size == 1 ? bias_data[0] : 0.f;
+    spec[6].i = activation_type;
+    spec[7].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+    spec[8].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+    spec[9 + 0].u32 = rows;
+    spec[9 + 1].u32 = in_stride;
+    spec[9 + 2].u32 = out_stride;
+
+    // the subgroup size is used as the preferred local size in x
+    const int local_size_x = vkdev->info.subgroup_size();
+
+    // pack1
+    if (shape.dims == 0 || elempack == 1)
+    {
+        pipeline_requantize = new Pipeline(vkdev);
+        pipeline_requantize->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_requantize->create(LayerShaderType::requantize, opt, spec);
+    }
+
+    // pack4
+    if (shape.dims == 0 || elempack == 4)
+    {
+        pipeline_requantize_pack4 = new Pipeline(vkdev);
+        pipeline_requantize_pack4->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_requantize_pack4->create(LayerShaderType::requantize_pack4, opt, spec);
+    }
+
+    // pack8 (without a shape hint, only when pack8 shaders are enabled)
+    if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8)
+    {
+        pipeline_requantize_pack8 = new Pipeline(vkdev);
+        pipeline_requantize_pack8->set_optimal_local_size_xyz(local_size_x, 1, 1);
+        pipeline_requantize_pack8->create(LayerShaderType::requantize_pack8, opt, spec);
+    }
+
+    return 0;
+}
+
+// Release every pipeline owned by this layer and reset the pointers. Always returns 0.
+int Requantize_vulkan::destroy_pipeline(const Option& /*opt*/)
+{
+    // deleting a null pointer is a no-op, so variants that were never built are fine
+    Pipeline** const owned[] = {&pipeline_requantize, &pipeline_requantize_pack4, &pipeline_requantize_pack8};
+    for (int i = 0; i < 3; i++)
+    {
+        delete *owned[i];
+        *owned[i] = 0;
+    }
+
+    return 0;
+}
+
+// Upload whichever of the scale-in, scale-out and bias tables hold more than
+// one value; single values are passed as specialization constants instead.
+int Requantize_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
+{
+    const bool upload_scale_in = scale_in_data_size > 1;
+    const bool upload_scale_out = scale_out_data_size > 1;
+    const bool upload_bias = bias_data_size > 1;
+
+    if (upload_scale_in)
+        cmd.record_upload(scale_in_data, scale_in_data_gpu, opt);
+
+    if (upload_scale_out)
+        cmd.record_upload(scale_out_data, scale_out_data_gpu, opt);
+
+    if (upload_bias)
+        cmd.record_upload(bias_data, bias_data_gpu, opt);
+
+    return 0;
+}
+
+// Record the requantize dispatch for one blob.
+// Creates top_blob with the input's shape and elempack at one byte per scalar,
+// then runs the pipeline that matches the input elempack.
+// Returns -100 when top_blob is empty after creation, 0 otherwise.
+int Requantize_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+{
+    const int dims = bottom_blob.dims;
+    const int pack = bottom_blob.elempack;
+    const size_t top_elemsize = 1u * pack;
+
+    switch (dims)
+    {
+    case 1:
+        top_blob.create(bottom_blob.w, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    case 2:
+        top_blob.create(bottom_blob.w, bottom_blob.h, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    case 3:
+        top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.c, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    case 4:
+        top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.d, bottom_blob.c, top_elemsize, pack, opt.blob_vkallocator);
+        break;
+    }
+    if (top_blob.empty())
+        return -100;
+
+    // view both blobs as `rows` runs of src_stride / dst_stride packed elements
+    size_t rows = 0;
+    size_t src_stride = 0;
+    size_t dst_stride = 0;
+    if (dims == 1 || dims == 2)
+    {
+        rows = dims == 1 ? 1 : bottom_blob.h;
+        src_stride = bottom_blob.w;
+        dst_stride = top_blob.w;
+    }
+    else if (dims == 3 || dims == 4)
+    {
+        rows = bottom_blob.c;
+        src_stride = bottom_blob.cstep;
+        dst_stride = top_blob.cstep;
+    }
+
+    std::vector<VkMat> bindings(5);
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+    bindings[2] = scale_in_data_gpu;
+    bindings[3] = scale_out_data_gpu;
+    bindings[4] = bias_data_gpu;
+
+    std::vector<vk_constant_type> constants(3);
+    constants[0].u32 = rows;
+    constants[1].u32 = src_stride;
+    constants[2].u32 = dst_stride;
+
+    VkMat dispatcher;
+    dispatcher.w = src_stride * rows;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    const Pipeline* pipeline = pipeline_requantize;
+    if (pack == 4) pipeline = pipeline_requantize_pack4;
+    if (pack == 8) pipeline = pipeline_requantize_pack8;
+
+    cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/requantize_vulkan.h b/src/layer/vulkan/requantize_vulkan.h
new file mode 100644
index 000000000..c0a86199e
--- /dev/null
+++ b/src/layer/vulkan/requantize_vulkan.h
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REQUANTIZE_VULKAN_H
+#define LAYER_REQUANTIZE_VULKAN_H
+
+#include "requantize.h"
+
+namespace ncnn {
+
+class Requantize_vulkan : virtual public Requantize // Requantize layer with a Vulkan compute path
+{
+public:
+    Requantize_vulkan();
+    // build / release the compute pipelines
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+    // upload the scale-in, scale-out and bias tables that hold more than one value
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+    // keep the inherited Requantize::forward overloads visible alongside the VkMat one
+    using Requantize::forward;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    VkMat scale_in_data_gpu; // filled by upload_model() from scale_in_data
+    VkMat scale_out_data_gpu; // filled by upload_model() from scale_out_data
+    VkMat bias_data_gpu; // filled by upload_model() from bias_data
+    // one pipeline per input elempack (1, 4, 8); null until create_pipeline() builds it
+    Pipeline* pipeline_requantize;
+    Pipeline* pipeline_requantize_pack4;
+    Pipeline* pipeline_requantize_pack8;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REQUANTIZE_VULKAN_H
diff --git a/src/layer/vulkan/shader/dequantize.comp b/src/layer/vulkan/shader/dequantize.comp
new file mode 100644
index 000000000..4dd77d713
--- /dev/null
+++ b/src/layer/vulkan/shader/dequantize.comp
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int scale_data_size = 0; // 1: use scale_value, anything else: read scale_blob
+layout (constant_id = 1) const float scale_value = 1.f;
+layout (constant_id = 2) const int bias_data_size = 0; // 0: no bias, 1: use bias_value, anything else: read bias_blob
+layout (constant_id = 3) const float bias_value = 0.f;
+// shape constants; main() reads them through psc() (presumably falling back to the push constants below - confirm in the shader preamble)
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;
+// binding 0: integer input, 1: output, 2 and 3: scale and bias tables indexed by row in main()
+layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfp bias_blob_data[]; };
+// push-constant versions of the three shape values above
+layout (push_constant) uniform parameter
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p;
+
+// Dequantize one input element per invocation:
+//   out = float(in) * scale + bias
+// scale is a single constant or one value per row; bias is absent, a single
+// constant or one value per row.
+void main()
+{
+    const uint idx = gl_GlobalInvocationID.x;
+
+    // the blob is flattened to c rows of in_stride elements; drop surplus invocations
+    if (idx >= psc(in_stride) * psc(c))
+        return;
+
+    const uint row = idx / psc(in_stride);
+    const uint col = idx % psc(in_stride);
+
+    // a single scale comes from the specialization constant, otherwise one per row
+    afp scale = afp(scale_value);
+    if (scale_data_size != 1)
+    {
+        scale = buffer_ld1(scale_blob_data, row);
+    }
+
+    // no bias -> 0, single bias -> specialization constant, otherwise one per row
+    afp bias = afp(0.f);
+    if (bias_data_size == 1)
+    {
+        bias = afp(bias_value);
+    }
+    else if (bias_data_size != 0)
+    {
+        bias = buffer_ld1(bias_blob_data, row);
+    }
+
+    // convert the integer input to floating point, then scale and shift
+    const int q = bottom_blob_data[idx];
+    const afp dequantized = afp(q) * scale + bias;
+
+    // the output index is rebuilt from out_stride, which is a separate constant from in_stride
+    const uint out_idx = row * psc(out_stride) + col;
+
+    buffer_st1(top_blob_data, out_idx, dequantized);
+}
diff --git a/src/layer/vulkan/shader/dequantize_pack4.comp b/src/layer/vulkan/shader/dequantize_pack4.comp
new file mode 100644
index 000000000..b54d0af92
--- /dev/null
+++ b/src/layer/vulkan/shader/dequantize_pack4.comp
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int scale_data_size = 0; // 1: use scale_value, anything else: read scale_blob
+layout (constant_id = 1) const float scale_value = 1.f;
+layout (constant_id = 2) const int bias_data_size = 0; // 0: no bias, 1: use bias_value, anything else: read bias_blob
+layout (constant_id = 3) const float bias_value = 0.f;
+// shape constants; main() reads them through psc() (presumably falling back to the push constants below - confirm in the shader preamble)
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;
+// binding 0: ivec4 input, 1: pack4 output, 2 and 3: pack4 scale and bias tables indexed by row in main()
+layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; };
+// push-constant versions of the three shape values above
+layout (push_constant) uniform parameter
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p;
+
+// Dequantize one pack4 element per invocation:
+//   out = float(in) * scale + bias   (component-wise on four lanes)
+// scale is a single constant or one vec4 per row; bias is absent, a single
+// constant or one vec4 per row.
+void main()
+{
+    const uint idx = gl_GlobalInvocationID.x;
+
+    // the blob is flattened to c rows of in_stride elements; drop surplus invocations
+    if (idx >= psc(in_stride) * psc(c))
+        return;
+
+    const uint row = idx / psc(in_stride);
+    const uint col = idx % psc(in_stride);
+
+    // a single scale is broadcast from the specialization constant, otherwise one vec4 per row
+    afpvec4 scale = afpvec4(scale_value);
+    if (scale_data_size != 1)
+    {
+        scale = buffer_ld4(scale_blob_data, row);
+    }
+
+    // no bias -> 0, single bias -> broadcast constant, otherwise one vec4 per row
+    afpvec4 bias = afpvec4(0.f);
+    if (bias_data_size == 1)
+    {
+        bias = afpvec4(bias_value);
+    }
+    else if (bias_data_size != 0)
+    {
+        bias = buffer_ld4(bias_blob_data, row);
+    }
+
+    // convert the four integers to floating point, then scale and shift
+    const ivec4 q = bottom_blob_data[idx];
+    const afpvec4 dequantized = afpvec4(q) * scale + bias;
+
+    // the output index is rebuilt from out_stride, which is a separate constant from in_stride
+    const uint out_idx = row * psc(out_stride) + col;
+
+    buffer_st4(top_blob_data, out_idx, dequantized);
+}
diff --git a/src/layer/vulkan/shader/dequantize_pack8.comp b/src/layer/vulkan/shader/dequantize_pack8.comp
new file mode 100644
index 000000000..63b759b7c
--- /dev/null
+++ b/src/layer/vulkan/shader/dequantize_pack8.comp
@@ -0,0 +1,84 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int scale_data_size = 0; // 1: use scale_value, anything else: read scale_blob
+layout (constant_id = 1) const float scale_value = 1.f;
+layout (constant_id = 2) const int bias_data_size = 0; // 0: no bias, 1: use bias_value, anything else: read bias_blob
+layout (constant_id = 3) const float bias_value = 0.f;
+// shape constants; main() reads them through psc() (presumably falling back to the push constants below - confirm in the shader preamble)
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0;
+// binding 0: ivec8 input, 1: pack8 output, 2 and 3: pack8 scale and bias tables indexed by row in main()
+layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; };
+// push-constant versions of the three shape values above
+layout (push_constant) uniform parameter
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p;
+
+// Dequantize one pack8 element per invocation:
+//   out = float(in) * scale + bias, evaluated on the two vec4 halves
+// scale is a single constant or one pack8 value per row; bias is absent, a
+// single constant or one pack8 value per row.
+void main()
+{
+    const uint idx = gl_GlobalInvocationID.x;
+
+    // the blob is flattened to c rows of in_stride elements; drop surplus invocations
+    if (idx >= psc(in_stride) * psc(c))
+        return;
+
+    const uint row = idx / psc(in_stride);
+    const uint col = idx % psc(in_stride);
+
+    // a single scale is broadcast from the specialization constant, otherwise one pack8 value per row
+    afpvec8 scale = afpvec8(afpvec4(scale_value), afpvec4(scale_value));
+    if (scale_data_size != 1)
+    {
+        scale = buffer_ld8(scale_blob_data, row);
+    }
+
+    // no bias -> 0, single bias -> broadcast constant, otherwise one pack8 value per row
+    afpvec8 bias = afpvec8(afpvec4(0.f), afpvec4(0.f));
+    if (bias_data_size == 1)
+    {
+        bias = afpvec8(afpvec4(bias_value), afpvec4(bias_value));
+    }
+    else if (bias_data_size != 0)
+    {
+        bias = buffer_ld8(bias_blob_data, row);
+    }
+
+    // eight integers, accessed as the two ivec4 halves abcd / efgh
+    ivec8 q = bottom_blob_data[idx];
+
+    // each half is converted, scaled and shifted on its own
+    afpvec8 dequantized;
+    dequantized[0] = afpvec4(q.abcd) * scale[0] + bias[0];
+    dequantized[1] = afpvec4(q.efgh) * scale[1] + bias[1];
+
+    // the output index is rebuilt from out_stride, which is a separate constant from in_stride
+    const uint out_idx = row * psc(out_stride) + col;
+
+    buffer_st8(top_blob_data, out_idx, dequantized);
+}
diff --git a/src/layer/vulkan/shader/packing_int8.comp b/src/layer/vulkan/shader/packing_int8.comp
new file mode 100644
index 000000000..4ea5a8f93
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_int8.comp
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // 3 makes main() read the ivec4 input binding
+layout (constant_id = 1) const int cast_type_to = 1; // 3 makes main() write the ivec4 output binding
+// shape constants read in main() through psc(): n bounds x, c bounds y, stride is the row pitch for the converting 8-bit load/store
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+// bindings 0/1: 8-bit and ivec4 input buffers, 2/3: 8-bit and ivec4 output buffers
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };
+// push-constant versions of the three shape values above
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+void main()
+{
+    // one invocation per pack4 element: x is bounded by n, y by c
+    const uint col = gl_GlobalInvocationID.x;
+    const uint row = gl_GlobalInvocationID.y;
+    if (col >= psc(n) || row >= psc(c))
+        return;
+
+    const uint dense_idx = row * psc(n) + col;
+    // identical cast types: straight 8-bit copy at the dense index
+    if (cast_type_from == cast_type_to)
+    {
+        i8buffer_cp4(top_blob_data, dense_idx, bottom_blob_data, dense_idx);
+        return;
+    }
+
+    const uint strided_idx = row * psc(stride) + col;
+
+    // cast type 3 uses the ivec4 bindings at the dense index, anything else the 8-bit ones at the strided index
+    ivec4 v;
+    if (cast_type_from != 3)
+    {
+        v = i8buffer_ld4(bottom_blob_data, strided_idx);
+    }
+    else
+    {
+        v = bottom_blob_int32_data[dense_idx];
+    }
+    if (cast_type_to != 3)
+    {
+        i8buffer_st4(top_blob_data, strided_idx, v);
+    }
+    else
+    {
+        top_blob_int32_data[dense_idx] = v;
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack1to4_int8.comp b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
new file mode 100644
index 000000000..fb99d5d34
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack1to4_int8.comp
@@ -0,0 +1,79 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // 3 makes main() read the int input binding
+layout (constant_id = 1) const int cast_type_to = 1; // 3 makes main() write the ivec4 output binding
+// shape constants read in main() through psc(): n bounds x and is the output row length, c bounds y, stride is the input row pitch
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0;
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0;
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0;
+// bindings 0/1: scalar 8-bit and int input buffers, 2/3: pack4 8-bit and ivec4 output buffers
+layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; };
+layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; };
+layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; };
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; };
+// push-constant versions of the three shape values above
+layout (push_constant) uniform parameter
+{
+    uint n;
+    uint c;
+    uint stride;
+} p;
+
+// Build one pack4 output element per invocation by gathering four scalars
+// from four consecutive input rows.
+void main()
+{
+    const uint col = gl_GlobalInvocationID.x;
+    const uint row = gl_GlobalInvocationID.y;
+    if (col >= psc(n) || row >= psc(c))
+        return;
+
+    // the four sources sit in input rows 4*row .. 4*row+3, `stride` elements apart
+    const uvec4 src_idx = (row * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + col;
+    const uint dst_idx = row * psc(n) + col;
+
+    // NOTE: there is no same-type fast path here (an i8buffer_cp1to4 shortcut was
+    // left disabled), so every element goes through the load + store below
+
+    // cast type 3 reads from the int binding, anything else from the 8-bit one
+    ivec4 v;
+    if (cast_type_from != 3)
+    {
+        v.r = i8buffer_ld1(bottom_blob_data, src_idx.r);
+        v.g = i8buffer_ld1(bottom_blob_data, src_idx.g);
+        v.b = i8buffer_ld1(bottom_blob_data, src_idx.b);
+        v.a = i8buffer_ld1(bottom_blob_data, src_idx.a);
+    }
+    else
+    {
+        v.r = bottom_blob_int32_data[src_idx.r];
+        v.g = bottom_blob_int32_data[src_idx.g];
+        v.b = bottom_blob_int32_data[src_idx.b];
+        v.a = bottom_blob_int32_data[src_idx.a];
+    }
+
+    // cast type 3 writes to the ivec4 binding, anything else to the 8-bit one
+    if (cast_type_to != 3)
+    {
+        i8buffer_st4(top_blob_data, dst_idx, v);
+    }
+    else
+    {
+        top_blob_int32_data[dst_idx] = v;
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack1to8_int8.comp b/src/layer/vulkan/shader/packing_pack1to8_int8.comp
new file mode 100644
index 000000000..4f7b14732
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack1to8_int8.comp
@@ -0,0 +1,88 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // main() reads the int32 source binding only when this is 3
+layout (constant_id = 1) const int cast_type_to = 1; // main() writes the int32 destination binding only when this is 3
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; // compared against gx in main()
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; // compared against gy in main()
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; // distance between consecutive source rows
+
+layout (binding = 0) readonly buffer bottom_blob { sint8 bottom_blob_data[]; }; // scalar source, read unless cast_type_from == 3
+layout (binding = 1) readonly buffer bottom_blob_int32 { int bottom_blob_int32_data[]; }; // scalar source, read when cast_type_from == 3
+layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; // 8-wide destination, written unless cast_type_to == 3
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; }; // 8-wide destination, written when cast_type_to == 3
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint n;
+    uint c;
+    uint stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // gathers 8 scalar elements (rows gy*8..gy*8+7, column gx) into one 8-wide output element
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+
+    if (gx >= psc(n) || gy >= psc(c)) // nothing to do outside the n x c range
+        return;
+
+    const uvec4 gi4 = (gy * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + gx; // source indices for rows gy*8+0..3
+    const uvec4 gi8 = gi4 + psc(stride) * 4; // same column, four rows further: rows gy*8+4..7
+
+    const uint gi = gy * psc(n) + gx; // destination index of the packed element
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         i8buffer_cp1to8(top_blob_data, gi, bottom_blob_data, gi4, gi8);
+//         return;
+//     }
+
+    ivec8 v; // abcd holds rows 0..3, efgh holds rows 4..7
+    if (cast_type_from == 3) // 3 selects the int32 source binding
+    {
+        v.abcd.r = bottom_blob_int32_data[gi4.r];
+        v.abcd.g = bottom_blob_int32_data[gi4.g];
+        v.abcd.b = bottom_blob_int32_data[gi4.b];
+        v.abcd.a = bottom_blob_int32_data[gi4.a];
+        v.efgh.r = bottom_blob_int32_data[gi8.r];
+        v.efgh.g = bottom_blob_int32_data[gi8.g];
+        v.efgh.b = bottom_blob_int32_data[gi8.b];
+        v.efgh.a = bottom_blob_int32_data[gi8.a];
+    }
+    else // any other value reads the sint8 binding through i8buffer_ld1
+    {
+        v.abcd.r = i8buffer_ld1(bottom_blob_data, gi4.r);
+        v.abcd.g = i8buffer_ld1(bottom_blob_data, gi4.g);
+        v.abcd.b = i8buffer_ld1(bottom_blob_data, gi4.b);
+        v.abcd.a = i8buffer_ld1(bottom_blob_data, gi4.a);
+        v.efgh.r = i8buffer_ld1(bottom_blob_data, gi8.r);
+        v.efgh.g = i8buffer_ld1(bottom_blob_data, gi8.g);
+        v.efgh.b = i8buffer_ld1(bottom_blob_data, gi8.b);
+        v.efgh.a = i8buffer_ld1(bottom_blob_data, gi8.a);
+    }
+
+    if (cast_type_to == 3) // 3 selects the int32 destination binding
+    {
+        top_blob_int32_data[gi] = v;
+    }
+    else // any other value writes the sint8vec8 binding through i8buffer_st8
+    {
+        i8buffer_st8(top_blob_data, gi, v);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack4to1_int8.comp b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
new file mode 100644
index 000000000..53145a40c
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack4to1_int8.comp
@@ -0,0 +1,79 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // main() reads the int32 source binding only when this is 3
+layout (constant_id = 1) const int cast_type_to = 1; // main() writes the int32 destination binding only when this is 3
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; // compared against gx in main()
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; // compared against gy in main()
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; // distance between consecutive destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; // 4-wide source, read unless cast_type_from == 3
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; }; // 4-wide source, read when cast_type_from == 3
+layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; }; // scalar destination, written unless cast_type_to == 3
+layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; }; // scalar destination, written when cast_type_to == 3
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint n;
+    uint c;
+    uint stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // scatters one 4-wide source element into 4 scalar outputs (rows gy*4..gy*4+3, column gx)
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+
+    if (gx >= psc(n) || gy >= psc(c)) // nothing to do outside the n x c range
+        return;
+
+    const uint gi = gy * psc(n) + gx; // source index of the packed element
+
+    const uvec4 gi4 = (gy * 4 + uvec4(0, 1, 2, 3)) * psc(stride) + gx; // four destination indices, rows spaced by stride
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         buffer_cp4to1(top_blob_data, gi4, bottom_blob_data, gi);
+//         return;
+//     }
+
+    ivec4 v; // values are staged as ints whichever bindings are used
+    if (cast_type_from == 3) // 3 selects the int32 source binding
+    {
+        v = bottom_blob_int32_data[gi];
+    }
+    else // any other value reads the sint8vec4 binding through i8buffer_ld4
+    {
+        v = i8buffer_ld4(bottom_blob_data, gi);
+    }
+
+    if (cast_type_to == 3) // 3 selects the int32 destination binding
+    {
+        top_blob_int32_data[gi4.r] = v.r;
+        top_blob_int32_data[gi4.g] = v.g;
+        top_blob_int32_data[gi4.b] = v.b;
+        top_blob_int32_data[gi4.a] = v.a;
+    }
+    else // any other value writes the sint8 binding, one i8buffer_st1 per component
+    {
+        i8buffer_st1(top_blob_data, gi4.r, v.r);
+        i8buffer_st1(top_blob_data, gi4.g, v.g);
+        i8buffer_st1(top_blob_data, gi4.b, v.b);
+        i8buffer_st1(top_blob_data, gi4.a, v.a);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack4to8_int8.comp b/src/layer/vulkan/shader/packing_pack4to8_int8.comp
new file mode 100644
index 000000000..112dd2472
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack4to8_int8.comp
@@ -0,0 +1,75 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // main() reads the int32 source binding only when this is 3
+layout (constant_id = 1) const int cast_type_to = 1; // main() writes the int32 destination binding only when this is 3
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; // compared against gx in main()
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; // compared against gy in main()
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; // distance between consecutive source rows
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec4 bottom_blob_data[]; }; // 4-wide source, read unless cast_type_from == 3
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec4 bottom_blob_int32_data[]; }; // 4-wide source, read when cast_type_from == 3
+layout (binding = 2) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; // 8-wide destination, written unless cast_type_to == 3
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec8 top_blob_int32_data[]; }; // 8-wide destination, written when cast_type_to == 3
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint n;
+    uint c;
+    uint stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // joins two 4-wide source elements (rows gy*2 and gy*2+1, column gx) into one 8-wide output element
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+
+    if (gx >= psc(n) || gy >= psc(c)) // nothing to do outside the n x c range
+        return;
+
+    const uvec2 gi2 = (gy * 2 + uvec2(0, 1)) * psc(stride) + gx; // two source indices, rows spaced by stride
+
+    const uint gi = gy * psc(n) + gx; // destination index of the packed element
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         buffer_cp4to8(top_blob_data, gi, bottom_blob_data, gi2);
+//         return;
+//     }
+
+    ivec8 v; // abcd comes from the first source row, efgh from the second
+    if (cast_type_from == 3) // 3 selects the int32 source binding
+    {
+        v.abcd = bottom_blob_int32_data[gi2.r];
+        v.efgh = bottom_blob_int32_data[gi2.g];
+    }
+    else // any other value reads the sint8vec4 binding through i8buffer_ld4
+    {
+        v.abcd = i8buffer_ld4(bottom_blob_data, gi2.r);
+        v.efgh = i8buffer_ld4(bottom_blob_data, gi2.g);
+    }
+
+    if (cast_type_to == 3) // 3 selects the int32 destination binding
+    {
+        top_blob_int32_data[gi] = v;
+    }
+    else // any other value writes the sint8vec8 binding through i8buffer_st8
+    {
+        i8buffer_st8(top_blob_data, gi, v);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack8to1_int8.comp b/src/layer/vulkan/shader/packing_pack8to1_int8.comp
new file mode 100644
index 000000000..6d1c9a4a1
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack8to1_int8.comp
@@ -0,0 +1,88 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // main() reads the int32 source binding only when this is 3
+layout (constant_id = 1) const int cast_type_to = 1; // main() writes the int32 destination binding only when this is 3
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; // compared against gx in main()
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; // compared against gy in main()
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; // distance between consecutive destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; }; // 8-wide source, read unless cast_type_from == 3
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; }; // 8-wide source, read when cast_type_from == 3
+layout (binding = 2) writeonly buffer top_blob { sint8 top_blob_data[]; }; // scalar destination, written unless cast_type_to == 3
+layout (binding = 3) writeonly buffer top_blob_int32 { int top_blob_int32_data[]; }; // scalar destination, written when cast_type_to == 3
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint n;
+    uint c;
+    uint stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // scatters one 8-wide source element into 8 scalar outputs (rows gy*8..gy*8+7, column gx)
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+
+    if (gx >= psc(n) || gy >= psc(c)) // nothing to do outside the n x c range
+        return;
+
+    const uint gi = gy * psc(n) + gx; // source index of the packed element
+
+    const uvec4 gi4 = (gy * 8 + uvec4(0, 1, 2, 3)) * psc(stride) + gx; // destination indices for rows gy*8+0..3
+    const uvec4 gi8 = gi4 + psc(stride) * 4; // same column, four rows further: rows gy*8+4..7
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         i8buffer_cp8to1(top_blob_data, gi4, gi8, bottom_blob_data, gi);
+//         return;
+//     }
+
+    ivec8 v; // abcd goes to rows 0..3, efgh goes to rows 4..7
+    if (cast_type_from == 3) // 3 selects the int32 source binding
+    {
+        v = bottom_blob_int32_data[gi];
+    }
+    else // any other value reads the sint8vec8 binding through i8buffer_ld8
+    {
+        v = i8buffer_ld8(bottom_blob_data, gi);
+    }
+
+    if (cast_type_to == 3) // 3 selects the int32 destination binding
+    {
+        top_blob_int32_data[gi4.r] = v.abcd.r;
+        top_blob_int32_data[gi4.g] = v.abcd.g;
+        top_blob_int32_data[gi4.b] = v.abcd.b;
+        top_blob_int32_data[gi4.a] = v.abcd.a;
+        top_blob_int32_data[gi8.r] = v.efgh.r;
+        top_blob_int32_data[gi8.g] = v.efgh.g;
+        top_blob_int32_data[gi8.b] = v.efgh.b;
+        top_blob_int32_data[gi8.a] = v.efgh.a;
+    }
+    else // any other value writes the sint8 binding, one i8buffer_st1 per component
+    {
+        i8buffer_st1(top_blob_data, gi4.r, v.abcd.r);
+        i8buffer_st1(top_blob_data, gi4.g, v.abcd.g);
+        i8buffer_st1(top_blob_data, gi4.b, v.abcd.b);
+        i8buffer_st1(top_blob_data, gi4.a, v.abcd.a);
+        i8buffer_st1(top_blob_data, gi8.r, v.efgh.r);
+        i8buffer_st1(top_blob_data, gi8.g, v.efgh.g);
+        i8buffer_st1(top_blob_data, gi8.b, v.efgh.b);
+        i8buffer_st1(top_blob_data, gi8.a, v.efgh.a);
+    }
+}
diff --git a/src/layer/vulkan/shader/packing_pack8to4_int8.comp b/src/layer/vulkan/shader/packing_pack8to4_int8.comp
new file mode 100644
index 000000000..c3df6dc6d
--- /dev/null
+++ b/src/layer/vulkan/shader/packing_pack8to4_int8.comp
@@ -0,0 +1,75 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int cast_type_from = 0; // main() reads the int32 source binding only when this is 3
+layout (constant_id = 1) const int cast_type_to = 1; // main() writes the int32 destination binding only when this is 3
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint n = 0; // compared against gx in main()
+layout (constant_id = shape_constant_id_offset + 1) const uint c = 0; // compared against gy in main()
+layout (constant_id = shape_constant_id_offset + 2) const uint stride = 0; // distance between consecutive destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { sint8vec8 bottom_blob_data[]; }; // 8-wide source, read unless cast_type_from == 3
+layout (binding = 1) readonly buffer bottom_blob_int32 { ivec8 bottom_blob_int32_data[]; }; // 8-wide source, read when cast_type_from == 3
+layout (binding = 2) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // 4-wide destination, written unless cast_type_to == 3
+layout (binding = 3) writeonly buffer top_blob_int32 { ivec4 top_blob_int32_data[]; }; // 4-wide destination, written when cast_type_to == 3
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint n;
+    uint c;
+    uint stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // splits one 8-wide source element into two 4-wide outputs (rows gy*2 and gy*2+1, column gx)
+{
+    const uint gx = gl_GlobalInvocationID.x;
+    const uint gy = gl_GlobalInvocationID.y;
+
+    if (gx >= psc(n) || gy >= psc(c)) // nothing to do outside the n x c range
+        return;
+
+    const uint gi = gy * psc(n) + gx; // source index of the packed element
+
+    const uvec2 gi2 = (gy * 2 + uvec2(0, 1)) * psc(stride) + gx; // two destination indices, rows spaced by stride
+
+//     if (cast_type_from == cast_type_to)
+//     {
+//         buffer_cp8to4(top_blob_data, gi2, bottom_blob_data, gi);
+//         return;
+//     }
+
+    ivec8 v; // abcd goes to the first destination row, efgh to the second
+    if (cast_type_from == 3) // 3 selects the int32 source binding
+    {
+        v = bottom_blob_int32_data[gi];
+    }
+    else // any other value reads the sint8vec8 binding through i8buffer_ld8
+    {
+        v = i8buffer_ld8(bottom_blob_data, gi);
+    }
+
+    if (cast_type_to == 3) // 3 selects the int32 destination binding
+    {
+        top_blob_int32_data[gi2.r] = v.abcd;
+        top_blob_int32_data[gi2.g] = v.efgh;
+    }
+    else // any other value writes the sint8vec4 binding through i8buffer_st4
+    {
+        i8buffer_st4(top_blob_data, gi2.r, v.abcd);
+        i8buffer_st4(top_blob_data, gi2.g, v.efgh);
+    }
+}
diff --git a/src/layer/vulkan/shader/quantize.comp b/src/layer/vulkan/shader/quantize.comp
new file mode 100644
index 000000000..58d23f852
--- /dev/null
+++ b/src/layer/vulkan/shader/quantize.comp
@@ -0,0 +1,63 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int scale_data_size = 0; // main() uses scale_value when this is 1, scale_blob otherwise
+layout (constant_id = 1) const float scale_value = 1.f; // only read when scale_data_size == 1
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; // number of rows; in_stride * c bounds the dispatch
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; // elements per source row
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; // distance between destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // floating-point source
+layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // quantized destination
+layout (binding = 2) readonly buffer scale_blob { sfp scale_blob_data[]; }; // indexed by row (gy) when scale_data_size != 1
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // quantizes one element: multiply by scale, clamp to [-127, 127], round, store
+{
+    const uint gi = gl_GlobalInvocationID.x; // flat source index
+
+    if (gi >= psc(in_stride) * psc(c)) // past the last of c rows of in_stride elements
+        return;
+
+    const uint gy = gi / psc(in_stride); // row of this element
+    const uint gx = gi % psc(in_stride); // offset inside the row
+
+    afp v = buffer_ld1(bottom_blob_data, gi);
+
+    afp scale;
+    if (scale_data_size == 1) // single scale: taken from the spec constant, scale_blob is not read
+    {
+        scale = afp(scale_value);
+    }
+    else // otherwise one scale per row, indexed by gy
+    {
+        scale = buffer_ld1(scale_blob_data, gy);
+    }
+
+    int v_int = int(round(clamp(v * scale, afp(-127.f), afp(127.f)))); // clamp is applied before rounding
+
+    const uint outgi = gy * psc(out_stride) + gx; // same row and offset, but destination rows are out_stride apart
+
+    i8buffer_st1(top_blob_data, outgi, v_int);
+}
diff --git a/src/layer/vulkan/shader/quantize_pack4.comp b/src/layer/vulkan/shader/quantize_pack4.comp
new file mode 100644
index 000000000..7b58eff1a
--- /dev/null
+++ b/src/layer/vulkan/shader/quantize_pack4.comp
@@ -0,0 +1,63 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int scale_data_size = 0; // main() uses scale_value when this is 1, scale_blob otherwise
+layout (constant_id = 1) const float scale_value = 1.f; // only read when scale_data_size == 1
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; // number of rows; in_stride * c bounds the dispatch
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; // 4-wide elements per source row
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; // distance between destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; // 4-wide floating-point source
+layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // 4-wide quantized destination
+layout (binding = 2) readonly buffer scale_blob { sfpvec4 scale_blob_data[]; }; // indexed by row (gy) when scale_data_size != 1
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // quantizes one 4-wide element: multiply by scale, clamp to [-127, 127], round, store
+{
+    const uint gi = gl_GlobalInvocationID.x; // flat source index
+
+    if (gi >= psc(in_stride) * psc(c)) // past the last of c rows of in_stride elements
+        return;
+
+    const uint gy = gi / psc(in_stride); // row of this element
+    const uint gx = gi % psc(in_stride); // offset inside the row
+
+    afpvec4 v = buffer_ld4(bottom_blob_data, gi);
+
+    afpvec4 scale;
+    if (scale_data_size == 1) // single scale: the spec constant is broadcast to all 4 lanes
+    {
+        scale = afpvec4(scale_value);
+    }
+    else // otherwise one 4-wide scale per row, indexed by gy
+    {
+        scale = buffer_ld4(scale_blob_data, gy);
+    }
+
+    ivec4 v_int = ivec4(round(clamp(v * scale, afp(-127.f), afp(127.f)))); // clamp is applied before rounding
+
+    const uint outgi = gy * psc(out_stride) + gx; // same row and offset, but destination rows are out_stride apart
+
+    i8buffer_st4(top_blob_data, outgi, v_int);
+}
diff --git a/src/layer/vulkan/shader/quantize_pack8.comp b/src/layer/vulkan/shader/quantize_pack8.comp
new file mode 100644
index 000000000..032f8ff1f
--- /dev/null
+++ b/src/layer/vulkan/shader/quantize_pack8.comp
@@ -0,0 +1,65 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int scale_data_size = 0; // main() uses scale_value when this is 1, scale_blob otherwise
+layout (constant_id = 1) const float scale_value = 1.f; // only read when scale_data_size == 1
+
+#define shape_constant_id_offset 2
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; // number of rows; in_stride * c bounds the dispatch
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; // 8-wide elements per source row
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; // distance between destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; // 8-wide floating-point source
+layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; // 8-wide quantized destination
+layout (binding = 2) readonly buffer scale_blob { sfpvec8 scale_blob_data[]; }; // indexed by row (gy) when scale_data_size != 1
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // quantizes one 8-wide element as two 4-wide halves: scale, clamp to [-127, 127], round, store
+{
+    const uint gi = gl_GlobalInvocationID.x; // flat source index
+
+    if (gi >= psc(in_stride) * psc(c)) // past the last of c rows of in_stride elements
+        return;
+
+    const uint gy = gi / psc(in_stride); // row of this element
+    const uint gx = gi % psc(in_stride); // offset inside the row
+
+    afpvec8 v = buffer_ld8(bottom_blob_data, gi);
+
+    afpvec8 scale;
+    if (scale_data_size == 1) // single scale: the spec constant is broadcast to both halves
+    {
+        scale = afpvec8(afpvec4(scale_value), afpvec4(scale_value));
+    }
+    else // otherwise one 8-wide scale per row, indexed by gy
+    {
+        scale = buffer_ld8(scale_blob_data, gy);
+    }
+
+    ivec8 v_int;
+    v_int.abcd = ivec4(round(clamp(v[0] * scale[0], afp(-127.f), afp(127.f)))); // first half; clamp before rounding
+    v_int.efgh = ivec4(round(clamp(v[1] * scale[1], afp(-127.f), afp(127.f)))); // second half, same treatment
+
+    const uint outgi = gy * psc(out_stride) + gx; // same row and offset, but destination rows are out_stride apart
+
+    i8buffer_st8(top_blob_data, outgi, v_int);
+}
diff --git a/src/layer/vulkan/shader/requantize.comp b/src/layer/vulkan/shader/requantize.comp
new file mode 100644
index 000000000..cb2ef7432
--- /dev/null
+++ b/src/layer/vulkan/shader/requantize.comp
@@ -0,0 +1,103 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#extension GL_GOOGLE_include_directive: enable
+#include "vulkan_activation.comp"
+
+layout (constant_id = 0) const int scale_in_data_size = 0; // main() uses scale_in_value when this is 1, scale_in_blob otherwise
+layout (constant_id = 1) const float scale_in_value = 1.f; // only read when scale_in_data_size == 1
+layout (constant_id = 2) const int scale_out_data_size = 0; // main() uses scale_out_value when this is 1, scale_out_blob otherwise
+layout (constant_id = 3) const float scale_out_value = 1.f; // only read when scale_out_data_size == 1
+layout (constant_id = 4) const int bias_data_size = 0; // 0: bias is zero, 1: bias_value, anything else: bias_blob
+layout (constant_id = 5) const float bias_value = 0.f; // only read when bias_data_size == 1
+layout (constant_id = 6) const int activation_type = 0; // forwarded unchanged to activation_afp() in main()
+layout (constant_id = 7) const float activation_param_0 = 0; // forwarded unchanged to activation_afp() in main()
+layout (constant_id = 8) const float activation_param_1 = 0; // forwarded unchanged to activation_afp() in main()
+
+#define shape_constant_id_offset 9
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; // number of rows; in_stride * c bounds the dispatch
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; // elements per source row
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; // distance between destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { int bottom_blob_data[]; }; // int32 source
+layout (binding = 1) writeonly buffer top_blob { sint8 top_blob_data[]; }; // quantized destination
+layout (binding = 2) readonly buffer scale_in_blob { sfp scale_in_blob_data[]; }; // indexed by row (gy) when scale_in_data_size != 1
+layout (binding = 3) readonly buffer scale_out_blob { sfp scale_out_blob_data[]; }; // indexed by row (gy) when scale_out_data_size != 1
+layout (binding = 4) readonly buffer bias_blob { sfp bias_blob_data[]; }; // indexed by row (gy) when bias_data_size is neither 0 nor 1
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // requantizes one int32 element: v * scale_in + bias, activation, * scale_out, clamp, round, store
+{
+    const uint gi = gl_GlobalInvocationID.x; // flat source index
+
+    if (gi >= psc(in_stride) * psc(c)) // past the last of c rows of in_stride elements
+        return;
+
+    const uint gy = gi / psc(in_stride); // row of this element
+    const uint gx = gi % psc(in_stride); // offset inside the row
+
+    int v = bottom_blob_data[gi];
+
+    afp scale_in;
+    if (scale_in_data_size == 1) // single input scale: taken from the spec constant
+    {
+        scale_in = afp(scale_in_value);
+    }
+    else // otherwise one input scale per row, indexed by gy
+    {
+        scale_in = buffer_ld1(scale_in_blob_data, gy);
+    }
+
+    afp bias;
+    if (bias_data_size == 0) // no bias: add zero
+    {
+        bias = afp(0.f);
+    }
+    else if (bias_data_size == 1) // single bias: taken from the spec constant
+    {
+        bias = afp(bias_value);
+    }
+    else // otherwise one bias per row, indexed by gy
+    {
+        bias = buffer_ld1(bias_blob_data, gy);
+    }
+
+    afp v_fp = afp(v) * scale_in + bias; // convert the int to floating point, then scale and add bias
+
+    v_fp = activation_afp(v_fp, activation_type, activation_param_0, activation_param_1); // presumably provided by the included vulkan_activation.comp - confirm
+
+    afp scale_out;
+    if (scale_out_data_size == 1) // single output scale: taken from the spec constant
+    {
+        scale_out = afp(scale_out_value);
+    }
+    else // otherwise one output scale per row, indexed by gy
+    {
+        scale_out = buffer_ld1(scale_out_blob_data, gy);
+    }
+
+    int v_int = int(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f)))); // clamp is applied before rounding
+
+    const uint outgi = gy * psc(out_stride) + gx; // same row and offset, but destination rows are out_stride apart
+
+    i8buffer_st1(top_blob_data, outgi, v_int);
+}
diff --git a/src/layer/vulkan/shader/requantize_pack4.comp b/src/layer/vulkan/shader/requantize_pack4.comp
new file mode 100644
index 000000000..2fcbc862b
--- /dev/null
+++ b/src/layer/vulkan/shader/requantize_pack4.comp
@@ -0,0 +1,103 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#extension GL_GOOGLE_include_directive: enable
+#include "vulkan_activation.comp"
+
+layout (constant_id = 0) const int scale_in_data_size = 0; // main() uses scale_in_value when this is 1, scale_in_blob otherwise
+layout (constant_id = 1) const float scale_in_value = 1.f; // only read when scale_in_data_size == 1
+layout (constant_id = 2) const int scale_out_data_size = 0; // main() uses scale_out_value when this is 1, scale_out_blob otherwise
+layout (constant_id = 3) const float scale_out_value = 1.f; // only read when scale_out_data_size == 1
+layout (constant_id = 4) const int bias_data_size = 0; // 0: bias is zero, 1: bias_value, anything else: bias_blob
+layout (constant_id = 5) const float bias_value = 0.f; // only read when bias_data_size == 1
+layout (constant_id = 6) const int activation_type = 0; // forwarded unchanged to activation_afpvec4() in main()
+layout (constant_id = 7) const float activation_param_0 = 0; // forwarded unchanged to activation_afpvec4() in main()
+layout (constant_id = 8) const float activation_param_1 = 0; // forwarded unchanged to activation_afpvec4() in main()
+
+#define shape_constant_id_offset 9
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; // number of rows; in_stride * c bounds the dispatch
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; // 4-wide elements per source row
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; // distance between destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { ivec4 bottom_blob_data[]; }; // 4-wide int32 source
+layout (binding = 1) writeonly buffer top_blob { sint8vec4 top_blob_data[]; }; // 4-wide quantized destination
+layout (binding = 2) readonly buffer scale_in_blob { sfpvec4 scale_in_blob_data[]; }; // indexed by row (gy) when scale_in_data_size != 1
+layout (binding = 3) readonly buffer scale_out_blob { sfpvec4 scale_out_blob_data[]; }; // indexed by row (gy) when scale_out_data_size != 1
+layout (binding = 4) readonly buffer bias_blob { sfpvec4 bias_blob_data[]; }; // indexed by row (gy) when bias_data_size is neither 0 nor 1
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // requantizes one 4-wide int32 element: v * scale_in + bias, activation, * scale_out, clamp, round, store
+{
+    const uint gi = gl_GlobalInvocationID.x; // flat source index
+
+    if (gi >= psc(in_stride) * psc(c)) // past the last of c rows of in_stride elements
+        return;
+
+    const uint gy = gi / psc(in_stride); // row of this element
+    const uint gx = gi % psc(in_stride); // offset inside the row
+
+    ivec4 v = bottom_blob_data[gi];
+
+    afpvec4 scale_in;
+    if (scale_in_data_size == 1) // single input scale: the spec constant is broadcast to all 4 lanes
+    {
+        scale_in = afpvec4(scale_in_value);
+    }
+    else // otherwise one 4-wide input scale per row, indexed by gy
+    {
+        scale_in = buffer_ld4(scale_in_blob_data, gy);
+    }
+
+    afpvec4 bias;
+    if (bias_data_size == 0) // no bias: add zero
+    {
+        bias = afpvec4(0.f);
+    }
+    else if (bias_data_size == 1) // single bias: the spec constant is broadcast to all 4 lanes
+    {
+        bias = afpvec4(bias_value);
+    }
+    else // otherwise one 4-wide bias per row, indexed by gy
+    {
+        bias = buffer_ld4(bias_blob_data, gy);
+    }
+
+    afpvec4 v_fp = afpvec4(v) * scale_in + bias; // convert the ints to floating point, then scale and add bias
+
+    v_fp = activation_afpvec4(v_fp, activation_type, activation_param_0, activation_param_1); // presumably provided by the included vulkan_activation.comp - confirm
+
+    afpvec4 scale_out;
+    if (scale_out_data_size == 1) // single output scale: the spec constant is broadcast to all 4 lanes
+    {
+        scale_out = afpvec4(scale_out_value);
+    }
+    else // otherwise one 4-wide output scale per row, indexed by gy
+    {
+        scale_out = buffer_ld4(scale_out_blob_data, gy);
+    }
+
+    ivec4 v_int = ivec4(round(clamp(v_fp * scale_out, afp(-127.f), afp(127.f)))); // clamp is applied before rounding
+
+    const uint outgi = gy * psc(out_stride) + gx; // same row and offset, but destination rows are out_stride apart
+
+    i8buffer_st4(top_blob_data, outgi, v_int);
+}
diff --git a/src/layer/vulkan/shader/requantize_pack8.comp b/src/layer/vulkan/shader/requantize_pack8.comp
new file mode 100644
index 000000000..fedff0151
--- /dev/null
+++ b/src/layer/vulkan/shader/requantize_pack8.comp
@@ -0,0 +1,107 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#extension GL_GOOGLE_include_directive: enable
+#include "vulkan_activation.comp"
+
+layout (constant_id = 0) const int scale_in_data_size = 0; // main() uses scale_in_value when this is 1, scale_in_blob otherwise
+layout (constant_id = 1) const float scale_in_value = 1.f; // only read when scale_in_data_size == 1
+layout (constant_id = 2) const int scale_out_data_size = 0; // main() uses scale_out_value when this is 1, scale_out_blob otherwise
+layout (constant_id = 3) const float scale_out_value = 1.f; // only read when scale_out_data_size == 1
+layout (constant_id = 4) const int bias_data_size = 0; // 0: bias is zero, 1: bias_value, anything else: bias_blob
+layout (constant_id = 5) const float bias_value = 0.f; // only read when bias_data_size == 1
+layout (constant_id = 6) const int activation_type = 0; // forwarded unchanged to activation_afpvec8() in main()
+layout (constant_id = 7) const float activation_param_0 = 0; // forwarded unchanged to activation_afpvec8() in main()
+layout (constant_id = 8) const float activation_param_1 = 0; // forwarded unchanged to activation_afpvec8() in main()
+
+#define shape_constant_id_offset 9
+layout (constant_id = shape_constant_id_offset + 0) const uint c = 0; // number of rows; in_stride * c bounds the dispatch
+layout (constant_id = shape_constant_id_offset + 1) const uint in_stride = 0; // 8-wide elements per source row
+layout (constant_id = shape_constant_id_offset + 2) const uint out_stride = 0; // distance between destination rows
+
+layout (binding = 0) readonly buffer bottom_blob { ivec8 bottom_blob_data[]; }; // 8-wide int32 source
+layout (binding = 1) writeonly buffer top_blob { sint8vec8 top_blob_data[]; }; // 8-wide quantized destination
+layout (binding = 2) readonly buffer scale_in_blob { sfpvec8 scale_in_blob_data[]; }; // indexed by row (gy) when scale_in_data_size != 1
+layout (binding = 3) readonly buffer scale_out_blob { sfpvec8 scale_out_blob_data[]; }; // indexed by row (gy) when scale_out_data_size != 1
+layout (binding = 4) readonly buffer bias_blob { sfpvec8 bias_blob_data[]; }; // indexed by row (gy) when bias_data_size is neither 0 nor 1
+
+layout (push_constant) uniform parameter // same three fields as the shape spec constants above
+{
+    uint c;
+    uint in_stride;
+    uint out_stride;
+} p; // NOTE(review): psc(x) is defined outside this file; presumably it reads p.x when spec constant x is 0 - confirm
+
+void main() // requantizes one 8-wide int32 element as two 4-wide halves: scale_in, bias, activation, scale_out, clamp, round
+{
+    const uint gi = gl_GlobalInvocationID.x; // flat source index
+
+    if (gi >= psc(in_stride) * psc(c)) // past the last of c rows of in_stride elements
+        return;
+
+    const uint gy = gi / psc(in_stride); // row of this element
+    const uint gx = gi % psc(in_stride); // offset inside the row
+
+    ivec8 v = bottom_blob_data[gi];
+
+    afpvec8 scale_in;
+    if (scale_in_data_size == 1) // single input scale: the spec constant is broadcast to both halves
+    {
+        scale_in = afpvec8(afpvec4(scale_in_value), afpvec4(scale_in_value));
+    }
+    else // otherwise one 8-wide input scale per row, indexed by gy
+    {
+        scale_in = buffer_ld8(scale_in_blob_data, gy);
+    }
+
+    afpvec8 bias;
+    if (bias_data_size == 0) // no bias: add zero
+    {
+        bias = afpvec8(afpvec4(0.f), afpvec4(0.f));
+    }
+    else if (bias_data_size == 1) // single bias: the spec constant is broadcast to both halves
+    {
+        bias = afpvec8(afpvec4(bias_value), afpvec4(bias_value));
+    }
+    else // otherwise one 8-wide bias per row, indexed by gy
+    {
+        bias = buffer_ld8(bias_blob_data, gy);
+    }
+
+    afpvec8 v_fp;
+    v_fp[0] = afpvec4(v.abcd) * scale_in[0] + bias[0]; // first half: convert to floating point, scale, add bias
+    v_fp[1] = afpvec4(v.efgh) * scale_in[1] + bias[1]; // second half, same treatment
+
+    v_fp = activation_afpvec8(v_fp, activation_type, activation_param_0, activation_param_1); // presumably provided by the included vulkan_activation.comp - confirm
+
+    afpvec8 scale_out;
+    if (scale_out_data_size == 1) // single output scale: the spec constant is broadcast to both halves
+    {
+        scale_out = afpvec8(afpvec4(scale_out_value), afpvec4(scale_out_value));
+    }
+    else // otherwise one 8-wide output scale per row, indexed by gy
+    {
+        scale_out = buffer_ld8(scale_out_blob_data, gy);
+    }
+
+    ivec8 v_int;
+    v_int.abcd = ivec4(round(clamp(v_fp[0] * scale_out[0], afp(-127.f), afp(127.f)))); // first half; clamp before rounding
+    v_int.efgh = ivec4(round(clamp(v_fp[1] * scale_out[1], afp(-127.f), afp(127.f)))); // second half, same treatment
+
+    const uint outgi = gy * psc(out_stride) + gx; // same row and offset, but destination rows are out_stride apart
+
+    i8buffer_st8(top_blob_data, outgi, v_int);
+}
diff --git a/src/net.cpp b/src/net.cpp
index 21b99fcf5..05f121dd9 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -1043,6 +1043,9 @@ int Net::load_param(const DataReader& dr)
         // fp16a makes no sense when fp16 storage disabled
         if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
 
+        // int8a makes no sense when int8 storage disabled
+        if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false;
+
         // fp16 uniform makes no sense when fp16 arithmetic disabled
         if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
     }
@@ -1339,6 +1342,9 @@ int Net::load_param_bin(const DataReader& dr)
         // fp16a makes no sense when fp16 storage disabled
         if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
 
+        // int8a makes no sense when int8 storage disabled
+        if (!opt.use_int8_packed && !opt.use_int8_storage) opt.use_int8_arithmetic = false;
+
         // fp16 uniform makes no sense when fp16 arithmetic disabled
         if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
     }
diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp
index 431201b2e..f80684539 100644
--- a/tests/test_dequantize.cpp
+++ b/tests/test_dequantize.cpp
@@ -142,12 +142,8 @@ static int test_dequantize_3()
            || test_dequantize_pack8(RandomIntMat(15, 24), 24, 24)
            || test_dequantize_pack8(RandomIntMat(15, 24), 24, 1)
            || test_dequantize_pack8(RandomIntMat(15, 24), 24, 0)
-           || test_dequantize_pack8(RandomIntMat(128), 1, 128)
            || test_dequantize_pack8(RandomIntMat(128), 1, 1)
-           || test_dequantize_pack8(RandomIntMat(128), 1, 0)
-           || test_dequantize_pack8(RandomIntMat(128), 128, 128)
-           || test_dequantize_pack8(RandomIntMat(128), 128, 1)
-           || test_dequantize_pack8(RandomIntMat(128), 128, 0);
+           || test_dequantize_pack8(RandomIntMat(128), 1, 0);
 }
 
 int main()
diff --git a/tests/test_packing.cpp b/tests/test_packing.cpp
index 2d84199eb..a8e5c6c28 100644
--- a/tests/test_packing.cpp
+++ b/tests/test_packing.cpp
@@ -217,15 +217,12 @@ static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempac
 }
 
 #if NCNN_VULKAN
-
-static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_elempack)
+static int test_packing_gpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack)
 {
     ncnn::ParamDict pd;
     pd.set(0, out_elempack);
     pd.set(2, 1); // cast_type_from
     pd.set(3, 1); // cast_type_to
-    pd.set(4, 0); // storage_type_from
-    pd.set(5, 0); // storage_type_to
 
     std::vector<ncnn::Mat> weights(0);
 
@@ -297,12 +294,112 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_
 
     if (CompareMat(b, d, 0.001) != 0)
     {
-        fprintf(stderr, "test_packing_gpu_buffer failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
+        fprintf(stderr, "test_packing_gpu failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
         return -1;
     }
 
     return 0;
 }
+
+static int test_packing_gpu_int8(const ncnn::Mat& a, int in_elempack, int out_elempack)
+{
+    // Vulkan Packing with cast type 4 -> 4 vs packing_cpu_naive on RandomS8Mat data shaped like a; returns 0 or -1.
+    ncnn::ParamDict pd;
+    pd.set(0, out_elempack);
+    pd.set(2, 4); // cast_type_from
+    pd.set(3, 4); // cast_type_to
+
+    std::vector<ncnn::Mat> weights(0);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_vulkan_compute = true;
+    opt.use_int8_inference = false;
+    opt.use_fp16_packed = false;
+    opt.use_fp16_storage = false;
+    opt.use_fp16_arithmetic = false;
+    opt.use_int8_storage = false;
+    opt.use_int8_arithmetic = false;
+    opt.use_packing_layout = true;
+    opt.use_shader_pack8 = true;
+
+    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();
+
+    // acquired from the device here and handed back with reclaim_*_allocator() below
+    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
+    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
+
+    opt.blob_vkallocator = blob_vkallocator;
+    opt.workspace_vkallocator = blob_vkallocator;
+    opt.staging_vkallocator = staging_vkallocator;
+
+    if (!vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
+    if (!vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+
+    ncnn::Layer* op = ncnn::create_layer_vulkan("Packing");
+    op->vkdev = vkdev;
+    op->load_param(pd);
+    ncnn::ModelBinFromMatArray mb(weights.data());
+    op->load_model(mb);
+    op->create_pipeline(opt);
+
+    ncnn::Mat a8;
+    if (a.dims == 1) a8 = RandomS8Mat(a.w);
+    if (a.dims == 2) a8 = RandomS8Mat(a.w, a.h);
+    if (a.dims == 3) a8 = RandomS8Mat(a.w, a.h, a.c);
+    if (a.dims == 4) a8 = RandomS8Mat(a.w, a.h, a.d, a.c);
+
+    ncnn::Mat ap;
+    ncnn::convert_packing(a8, ap, in_elempack, opt);
+
+    ncnn::Mat b;
+    packing_cpu_naive(ap, b, out_elempack);
+
+    ncnn::Mat c;
+
+    { // scoped so the command object and GPU mats are destroyed before the allocators are returned
+        // forward
+        ncnn::VkCompute cmd(vkdev);
+
+        // upload
+        ncnn::VkMat a_gpu;
+        cmd.record_clone(ap, a_gpu, opt);
+
+        ncnn::VkMat c_gpu;
+        op->forward(a_gpu, c_gpu, cmd, opt);
+
+        // download
+        cmd.record_clone(c_gpu, c, opt);
+        cmd.submit_and_wait();
+    }
+
+    op->destroy_pipeline(opt);
+    delete op;
+
+    // hand both allocators back to the device; otherwise each call leaves them unreturned
+    vkdev->reclaim_blob_allocator(blob_vkallocator);
+    vkdev->reclaim_staging_allocator(staging_vkallocator);
+
+    ncnn::Mat b32;
+    ncnn::cast_int8_to_float32(b, b32, opt);
+    ncnn::Mat c32;
+    ncnn::cast_int8_to_float32(c, c32, opt);
+
+    if (CompareMat(b32, c32, 0.001) != 0)
+    {
+        fprintf(stderr, "test_packing_gpu_int8 failed a.dims=%d a=(%d %d %d %d) in_elempack=%d out_elempack=%d\n", a.dims, a.w, a.h, a.d, a.c, in_elempack, out_elempack);
+        return -1;
+    }
+
+    return 0;
+}
+
+static int test_packing_gpu(const ncnn::Mat& a, int in_elempack, int out_elempack)
+{
+    // Runs the fp32 variant, then the int8 one; the int8 run is skipped once fp32 reports non-zero.
+    const bool fp32_failed = test_packing_gpu_fp32(a, in_elempack, out_elempack) != 0;
+    return (fp32_failed || test_packing_gpu_int8(a, in_elempack, out_elempack) != 0) ? 1 : 0;
+}
 #endif
 
 static int test_packing_cpu(const ncnn::Mat& a)
@@ -329,15 +426,15 @@ static int test_packing_cpu(const ncnn::Mat& a)
 static int test_packing_gpu(const ncnn::Mat& a)
 {
     return 0
-           || test_packing_gpu_buffer(a, 1, 1)
-           || test_packing_gpu_buffer(a, 4, 4)
-           || test_packing_gpu_buffer(a, 8, 8)
-           || test_packing_gpu_buffer(a, 1, 4)
-           || test_packing_gpu_buffer(a, 4, 1)
-           || test_packing_gpu_buffer(a, 1, 8)
-           || test_packing_gpu_buffer(a, 8, 1)
-           || test_packing_gpu_buffer(a, 4, 8)
-           || test_packing_gpu_buffer(a, 8, 4);
+           || test_packing_gpu(a, 1, 1) // each call runs the fp32 and then the int8 variant
+           || test_packing_gpu(a, 4, 4)
+           || test_packing_gpu(a, 8, 8)
+           || test_packing_gpu(a, 1, 4) // from here on in_elempack and out_elempack differ
+           || test_packing_gpu(a, 4, 1)
+           || test_packing_gpu(a, 1, 8)
+           || test_packing_gpu(a, 8, 1)
+           || test_packing_gpu(a, 4, 8)
+           || test_packing_gpu(a, 8, 4);
 }
 #endif // NCNN_VULKAN
 
diff --git a/tests/test_quantize.cpp b/tests/test_quantize.cpp
index a6e67b23d..be137a49f 100644
--- a/tests/test_quantize.cpp
+++ b/tests/test_quantize.cpp
@@ -24,7 +24,7 @@ static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high)
     }
     else
     {
-        if (a.dims == 1) scale_data.create(a.w);
+        if (a.dims == 1) scale_data.create(1);
         if (a.dims == 2) scale_data.create(a.h);
         if (a.dims == 3) scale_data.create(a.c);
         Randomize(scale_data, scale_low, scale_high);
diff --git a/tests/test_quantize_oom.cpp b/tests/test_quantize_oom.cpp
index ca78535ed..cc029e0bb 100644
--- a/tests/test_quantize_oom.cpp
+++ b/tests/test_quantize_oom.cpp
@@ -24,7 +24,7 @@ static int test_quantize_oom(const ncnn::Mat& a, float scale_low, float scale_hi
     }
     else
     {
-        if (a.dims == 1) scale_data.create(a.w);
+        if (a.dims == 1) scale_data.create(1);
         if (a.dims == 2) scale_data.create(a.h);
         if (a.dims == 3) scale_data.create(a.c);
         Randomize(scale_data, scale_low, scale_high);
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
index fa2f0cc01..db7b4ca8e 100644
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -759,7 +759,32 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
             std::vector<ncnn::VkMat> a_gpu(a.size());
             for (size_t i = 0; i < a_gpu.size(); i++)
             {
-                cmd.record_upload(a[i], a_gpu[i], opt);
+                if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
+                {
+                    // resolve dst_elempack
+                    int dims = a[i].dims;
+                    int elemcount = 0;
+                    if (dims == 1) elemcount = a[i].elempack * a[i].w;
+                    if (dims == 2) elemcount = a[i].elempack * a[i].h;
+                    if (dims == 3 || dims == 4) elemcount = a[i].elempack * a[i].c;
+
+                    const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;
+
+                    ncnn::Mat a4;
+                    ncnn::convert_packing(a[i], a4, dst_elempack, opt);
+
+                    ncnn::Option opt_upload = opt;
+                    opt_upload.use_fp16_packed = false;
+                    opt_upload.use_fp16_storage = false;
+                    opt_upload.use_int8_packed = false;
+                    opt_upload.use_int8_storage = false;
+
+                    cmd.record_clone(a4, a_gpu[i], opt_upload);
+                }
+                else
+                {
+                    cmd.record_upload(a[i], a_gpu[i], opt);
+                }
             }
 
             std::vector<ncnn::VkMat> d_gpu(top_blob_count);
@@ -1082,7 +1107,33 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<n
         {
             // upload
             ncnn::VkMat a_gpu;
-            cmd.record_upload(a, a_gpu, opt);
+
+            if (flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)
+            {
+                // resolve dst_elempack
+                int dims = a.dims;
+                int elemcount = 0;
+                if (dims == 1) elemcount = a.elempack * a.w;
+                if (dims == 2) elemcount = a.elempack * a.h;
+                if (dims == 3 || dims == 4) elemcount = a.elempack * a.c;
+
+                const int dst_elempack = (opt.use_shader_pack8 && elemcount % 8 == 0) ? 8 : elemcount % 4 == 0 ? 4 : 1;
+
+                ncnn::Mat a4;
+                ncnn::convert_packing(a, a4, dst_elempack, opt);
+
+                ncnn::Option opt_upload = opt;
+                opt_upload.use_fp16_packed = false;
+                opt_upload.use_fp16_storage = false;
+                opt_upload.use_int8_packed = false;
+                opt_upload.use_int8_storage = false;
+
+                cmd.record_clone(a4, a_gpu, opt_upload);
+            }
+            else
+            {
+                cmd.record_upload(a, a_gpu, opt);
+            }
 
             ncnn::VkMat d_gpu;
             if (op->support_inplace)