diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index 1a31015e5..fc6d0ddc1 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -98,8 +98,6 @@ public:
 static int g_warmup_loop_count = 3;
 static int g_loop_count = 4;
 
-static ncnn::Option g_default_option;
-
 static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
 static ncnn::PoolAllocator g_workspace_pool_allocator;
 
@@ -109,11 +107,11 @@ static ncnn::VkAllocator* g_blob_vkallocator = 0;
 static ncnn::VkAllocator* g_staging_vkallocator = 0;
 #endif // NCNN_VULKAN
 
-void benchmark(const char* comment, const ncnn::Mat& in)
+void benchmark(const char* comment, const ncnn::Mat& in, const ncnn::Option& opt)
 {
     ncnn::BenchNet net;
 
-    net.opt = g_default_option;
+    net.opt = opt;
 
 #if NCNN_VULKAN
     if (net.opt.use_vulkan_compute)
@@ -228,24 +226,26 @@ int main(int argc, char** argv)
 #endif // NCNN_VULKAN
 
     // default option
-    g_default_option.lightmode = true;
-    g_default_option.num_threads = num_threads;
-    g_default_option.blob_allocator = &g_blob_pool_allocator;
-    g_default_option.workspace_allocator = &g_workspace_pool_allocator;
+    ncnn::Option opt;
+    opt.lightmode = true;
+    opt.num_threads = num_threads;
+    opt.blob_allocator = &g_blob_pool_allocator;
+    opt.workspace_allocator = &g_workspace_pool_allocator;
 #if NCNN_VULKAN
-    g_default_option.blob_vkallocator = g_blob_vkallocator;
-    g_default_option.workspace_vkallocator = g_blob_vkallocator;
-    g_default_option.staging_vkallocator = g_staging_vkallocator;
+    opt.blob_vkallocator = g_blob_vkallocator;
+    opt.workspace_vkallocator = g_blob_vkallocator;
+    opt.staging_vkallocator = g_staging_vkallocator;
 #endif // NCNN_VULKAN
-    g_default_option.use_winograd_convolution = true;
-    g_default_option.use_sgemm_convolution = true;
-    g_default_option.use_int8_inference = true;
-    g_default_option.use_vulkan_compute = use_vulkan_compute;
-    g_default_option.use_fp16_packed = true;
-    g_default_option.use_fp16_storage = true;
-    g_default_option.use_fp16_arithmetic = true;
-    g_default_option.use_int8_storage = true;
-    g_default_option.use_int8_arithmetic = true;
+    opt.use_winograd_convolution = true;
+    opt.use_sgemm_convolution = true;
+    opt.use_int8_inference = true;
+    opt.use_vulkan_compute = use_vulkan_compute;
+    opt.use_fp16_packed = true;
+    opt.use_fp16_storage = true;
+    opt.use_fp16_arithmetic = true;
+    opt.use_int8_storage = true;
+    opt.use_int8_arithmetic = true;
+    opt.use_packing_layout = true;
 
     ncnn::set_cpu_powersave(powersave);
 
@@ -258,84 +258,116 @@ int main(int argc, char** argv)
     fprintf(stderr, "gpu_device = %d\n", gpu_device);
 
     // run
-    benchmark("squeezenet", ncnn::Mat(227, 227, 3));
+    benchmark("squeezenet", ncnn::Mat(227, 227, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3), opt_int8);
+    }
 
-    benchmark("mobilenet", ncnn::Mat(224, 224, 3));
+    benchmark("mobilenet", ncnn::Mat(224, 224, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3), opt_int8);
+    }
 
-    benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3));
+    benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3), opt);
 
 // #if NCNN_VULKAN
 //     if (!use_vulkan_compute)
 // #endif // NCNN_VULKAN
-//     benchmark("mobilenet_v2_int8", ncnn::Mat(224, 224, 3));
+//     benchmark("mobilenet_v2_int8", ncnn::Mat(224, 224, 3), opt);
 
-    benchmark("mobilenet_v3", ncnn::Mat(224, 224, 3));
+    benchmark("mobilenet_v3", ncnn::Mat(224, 224, 3), opt);
 
-    benchmark("shufflenet", ncnn::Mat(224, 224, 3));
+    benchmark("shufflenet", ncnn::Mat(224, 224, 3), opt);
 
-    benchmark("shufflenet_v2", ncnn::Mat(224, 224, 3));
+    benchmark("shufflenet_v2", ncnn::Mat(224, 224, 3), opt);
 
-    benchmark("mnasnet", ncnn::Mat(224, 224, 3));
+    benchmark("mnasnet", ncnn::Mat(224, 224, 3), opt);
 
-    benchmark("proxylessnasnet", ncnn::Mat(224, 224, 3));
+    benchmark("proxylessnasnet", ncnn::Mat(224, 224, 3), opt);
 
-    benchmark("googlenet", ncnn::Mat(224, 224, 3));
+    benchmark("googlenet", ncnn::Mat(224, 224, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("googlenet_int8", ncnn::Mat(224, 224, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("googlenet_int8", ncnn::Mat(224, 224, 3), opt_int8);
+    }
 
-    benchmark("resnet18", ncnn::Mat(224, 224, 3));
+    benchmark("resnet18", ncnn::Mat(224, 224, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("resnet18_int8", ncnn::Mat(224, 224, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("resnet18_int8", ncnn::Mat(224, 224, 3), opt_int8);
+    }
 
-    benchmark("alexnet", ncnn::Mat(227, 227, 3));
+    benchmark("alexnet", ncnn::Mat(227, 227, 3), opt);
 
-    benchmark("vgg16", ncnn::Mat(224, 224, 3));
+    benchmark("vgg16", ncnn::Mat(224, 224, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("vgg16_int8", ncnn::Mat(224, 224, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("vgg16_int8", ncnn::Mat(224, 224, 3), opt_int8);
+    }
 
-    benchmark("resnet50", ncnn::Mat(224, 224, 3));
+    benchmark("resnet50", ncnn::Mat(224, 224, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("resnet50_int8", ncnn::Mat(224, 224, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("resnet50_int8", ncnn::Mat(224, 224, 3), opt_int8);
+    }
 
-    benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3));
+    benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3), opt_int8);
+    }
 
-    benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3));
+    benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3), opt);
 
 #if NCNN_VULKAN
     if (!use_vulkan_compute)
 #endif // NCNN_VULKAN
-    benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3));
+    {
+        ncnn::Option opt_int8 = opt; // scoped copy, the shared opt is never modified
+        opt_int8.use_packing_layout = false; // int8 models are benchmarked with packing layout off
+        benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3), opt_int8);
+    }
 
-    benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3));
+    benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3), opt);
 
-    benchmark("mobilenetv2_yolov3", ncnn::Mat(352, 352, 3));
+    benchmark("mobilenetv2_yolov3", ncnn::Mat(352, 352, 3), opt);
 
 #if NCNN_VULKAN
     delete g_blob_vkallocator;
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index 686013b8d..855c0f962 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -54,6 +54,7 @@ Convolution_arm::Convolution_arm()
 {
 #if __ARM_NEON
     support_packing = true;
+    use_fp32_packing_inference = false;
 #endif // __ARM_NEON
 
     activation = 0;
@@ -102,7 +103,16 @@ int Convolution_arm::create_pipeline(const Option& opt)
     int num_input = weight_data_size / maxk / num_output;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); // weights stored as 4-byte elements
+    // take the packed fp32 path only when the option asks for it, weights are float32 and int8 inference is off
+    use_fp32_packing_inference = opt.use_packing_layout && weight_data_is_float32 && !use_int8_inference;
+
+    if (use_int8_inference)
+    {
+        support_packing = false; // int8 inference: this layer stops advertising packing support
+    }
+
+    if (use_fp32_packing_inference)
     {
 
     // pack4
@@ -188,6 +198,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
                 }
             }
         }
+
+        return 0;
     }
 
     // pack1to4
@@ -230,6 +242,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
                 }
             }
         }
+
+        return 0;
     }
 
     // pack4to1
@@ -281,6 +295,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
                 }
             }
         }
+
+        return 0;
     }
 
     } // opt.use_packing_layout
@@ -525,7 +541,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     // value = value + bias
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    if (use_fp32_packing_inference)
     {
 
     int w = bottom_blob.w;
diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h
index 49afd35cd..093b0ec02 100644
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -46,6 +46,8 @@ public:
     Mat weight_sgemm_data;
     std::vector<Mat> weight_3x3_winograd23_int8_data;
 
+    bool use_fp32_packing_inference; // computed in create_pipeline(); forward() takes the packed path only when true
+
     // pack4
     Mat weight_data_pack4;
     Mat weight_data_pack1to4;
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
index 159e07018..de25f645a 100644
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -40,6 +40,7 @@ ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
 {
 #if __ARM_NEON
     support_packing = true;
+    use_fp32_packing_inference = false;
 #endif // __ARM_NEON
 
     activation = 0;
@@ -89,7 +90,16 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); // weights stored as 4-byte elements
+    // take the packed fp32 path only when the option asks for it, weights are float32 and int8 inference is off
+    use_fp32_packing_inference = opt.use_packing_layout && weight_data_is_float32 && !use_int8_inference;
+
+    if (use_int8_inference)
+    {
+        support_packing = false; // int8 inference: this layer stops advertising packing support
+    }
+
+    if (use_fp32_packing_inference)
     {
 
     // depth-wise
@@ -100,6 +110,8 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
         {
             Mat weight_data_r2 = weight_data.reshape(maxk, group);
             convert_packing(weight_data_r2, weight_data_pack4, 4);
+
+            return 0;
         }
     }
 
@@ -298,7 +310,7 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
     size_t out_elemsize = elemsize / elempack * out_elempack;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    if (use_fp32_packing_inference)
     {
 
     const int maxk = kernel_w * kernel_h;
diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h
index 3eff74a4f..0707b2112 100644
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -33,6 +33,8 @@ public:
     Layer* activation;
     std::vector<ncnn::Layer*> group_ops;
 
+    bool use_fp32_packing_inference; // computed in create_pipeline(); forward() takes the packed path only when true
+
     // packing
     Mat weight_data_pack4;
 };
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
index 6721103d6..456ffc7e0 100644
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -30,6 +30,7 @@ InnerProduct_arm::InnerProduct_arm()
 {
 #if __ARM_NEON
     support_packing = true;
+    use_fp32_packing_inference = false;
 #endif // __ARM_NEON
 
     flatten = 0;
@@ -38,7 +39,16 @@ InnerProduct_arm::InnerProduct_arm()
 int InnerProduct_arm::create_pipeline(const Option& opt)
 {
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); // weights stored as 4-byte elements
+    // take the packed fp32 path only when the option asks for it, weights are float32 and int8 inference is off
+    use_fp32_packing_inference = opt.use_packing_layout && weight_data_is_float32 && !use_int8_inference;
+
+    if (use_int8_inference)
+    {
+        support_packing = false; // int8 inference: this layer stops advertising packing support
+    }
+
+    if (use_fp32_packing_inference)
     {
 
     {
@@ -85,7 +95,7 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
     int size = w * h;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    if (use_fp32_packing_inference)
     {
 
     if (elempack == 4)
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
index 0233ea506..dbced4604 100644
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -30,6 +30,8 @@ public:
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
+    bool use_fp32_packing_inference; // computed in create_pipeline(); forward() takes the packed path only when true
+
     ncnn::Layer* flatten;
 };
 
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index c96f27923..b0b6d3e72 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -77,20 +77,17 @@ int Convolution::load_model(const ModelBin& mb)
 
 int Convolution::create_pipeline(const Option& opt)
 {
-    use_int8_inference = opt.use_int8_inference;
-
-    if (int8_scale_term == 0)
-        use_int8_inference = false;
-
     bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
     bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
 
-    if (weight_data_is_int8 && !use_int8_inference)
+    if (weight_data_is_int8 && !opt.use_int8_inference)
     {
         fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
         return -1;
     }
 
+    use_int8_inference = opt.use_int8_inference && (weight_data_is_int8 || (weight_data_is_float32 && int8_scale_term));
+
     // runtime quantize the weight data
     if (weight_data_is_float32 && use_int8_inference)
     {
diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index 946ff83db..70cd80c8f 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -100,20 +100,17 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
 
 int ConvolutionDepthWise::create_pipeline(const Option& opt)
 {
-    use_int8_inference = opt.use_int8_inference;
-
-    if (int8_scale_term == 0)
-        use_int8_inference = false;
-
     bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
     bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
 
-    if (weight_data_is_int8 && !use_int8_inference)
+    if (weight_data_is_int8 && !opt.use_int8_inference)
     {
         fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
         return -1;
     }
 
+    use_int8_inference = opt.use_int8_inference && (weight_data_is_int8 || (weight_data_is_float32 && int8_scale_term));
+
     if (weight_data_is_float32 && use_int8_inference)
     {
         // quantize weight to int8
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
index f4b0e97f5..b27551500 100644
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -64,20 +64,17 @@ int InnerProduct::load_model(const ModelBin& mb)
 
 int InnerProduct::create_pipeline(const Option& opt)
 {
-    use_int8_inference = opt.use_int8_inference;
-
-    if (int8_scale_term == 0)
-        use_int8_inference = false;
-
     bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
     bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
 
-    if (weight_data_is_int8 && !use_int8_inference)
+    if (weight_data_is_int8 && !opt.use_int8_inference)
     {
         fprintf(stderr, "quantized int8 weight loaded but use_int8_inference disabled\n");
         return -1;
     }
 
+    use_int8_inference = opt.use_int8_inference && (weight_data_is_int8 || (weight_data_is_float32 && int8_scale_term));
+
     // initial the quantize,dequantize op layer
     if (use_int8_inference)
     {