diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index 5bcadee1e..7ea854363 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -409,20 +409,20 @@ void mobilenet_yolo_run(const ncnn::Net& net)
     ncnn::Mat in(416, 416, 3);
     ex.input("data", in);
 
-#if NCNN_VULKAN
-    // TODO chain conv22 conv23
-    if (g_use_vulkan_compute)
-    {
-        ncnn::Mat conv22;
-        ex.extract("conv22", conv22);
-
-        ncnn::Mat conv23;
-        ex.extract("conv23", conv23);
-
-        // use cpu for detection_out
-        ex.set_vulkan_compute(false);
-    }
-#endif // NCNN_VULKAN
+// #if NCNN_VULKAN
+//     // TODO chain conv22 conv23
+//     if (g_use_vulkan_compute)
+//     {
+//         ncnn::Mat conv22;
+//         ex.extract("conv22", conv22);
+//
+//         ncnn::Mat conv23;
+//         ex.extract("conv23", conv23);
+//
+//         // use cpu for detection_out
+//         ex.set_vulkan_compute(false);
+//     }
+// #endif // NCNN_VULKAN
 
     ncnn::Mat out;
     ex.extract("detection_out", out);
@@ -440,20 +440,20 @@ void mobilenet_yolov3_run(const ncnn::Net& net)
     ncnn::Mat in(416, 416, 3);
     ex.input("data", in);
 
-#if NCNN_VULKAN
-    // TODO chain conv19 conv20
-    if (g_use_vulkan_compute)
-    {
-        ncnn::Mat conv19;
-        ex.extract("conv19", conv19);
-
-        ncnn::Mat conv20;
-        ex.extract("conv20", conv20);
-
-        // use cpu for detection_out
-        ex.set_vulkan_compute(false);
-    }
-#endif // NCNN_VULKAN
+// #if NCNN_VULKAN
+//     // TODO chain conv19 conv20
+//     if (g_use_vulkan_compute)
+//     {
+//         ncnn::Mat conv19;
+//         ex.extract("conv19", conv19);
+//
+//         ncnn::Mat conv20;
+//         ex.extract("conv20", conv20);
+//
+//         // use cpu for detection_out
+//         ex.set_vulkan_compute(false);
+//     }
+// #endif // NCNN_VULKAN
 
     ncnn::Mat out;
     ex.extract("detection_out", out);
diff --git a/src/layer/permute.cpp b/src/layer/permute.cpp
index 33948d39c..f6e00f8df 100644
--- a/src/layer/permute.cpp
+++ b/src/layer/permute.cpp
@@ -317,8 +317,6 @@ int Permute::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd,
         }
     }
 
-    fprintf(stderr, "%d %d %d\n", top_blob.w, top_blob.h, top_blob.c);
-
 //     fprintf(stderr, "Permute::forward %p %p\n", bottom_blob.buffer(), top_blob.buffer());
 
     std::vector<VkMat> bindings(2);
diff --git a/src/net.cpp b/src/net.cpp
index 9b63cf97a..25d202572 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -1145,201 +1145,425 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt
 }
 
 #if NCNN_VULKAN
-int Net::forward_layer(int layer_index, std::vector<VkMat>& blob_mats, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const
+int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const
 {
     const Layer* layer = layers[layer_index];
 
-//     fprintf(stderr, "forward_layer %d %s\n", layer_index, layer->name.c_str());
+//     fprintf(stderr, "forward_layer %d %d %s\n", layer->support_vulkan, layer_index, layer->name.c_str());
 
-    if (layer->one_blob_only)
+    if (layer->support_vulkan)
     {
-        // load bottom blob
-        int bottom_blob_index = layer->bottoms[0];
-        int top_blob_index = layer->tops[0];
-
-        if (blob_mats[bottom_blob_index].dims == 0)
-        {
-            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, wait_barrier_counts, cmd, opt);
-            if (ret != 0)
-                return ret;
-        }
-        else if (blob_mats[bottom_blob_index].staging_data)
+        if (layer->one_blob_only)
         {
-            // upload
-            const VkMat& bottom_blob = blob_mats[bottom_blob_index];
-            cmd.record_prepare_transfer_barrier(bottom_blob);
-            cmd.record_upload(bottom_blob);
-        }
+            // load bottom blob
+            int bottom_blob_index = layer->bottoms[0];
+            int top_blob_index = layer->tops[0];
 
-        VkMat bottom_blob = blob_mats[bottom_blob_index];
+            if (blob_mats_gpu[bottom_blob_index].dims == 0)
+            {
+                if (blob_mats[bottom_blob_index].dims == 0)
+                {
+                    int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
+                    if (ret != 0)
+                        return ret;
+                }
 
-        if (opt.lightmode)
-        {
+                if (blob_mats_gpu[bottom_blob_index].dims == 0)
+                {
+                    // upload
+                    VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
+                    bottom_blob.create_like(blob_mats[bottom_blob_index], opt.blob_vkallocator, opt.staging_vkallocator);
+
+                    if (!bottom_blob.allocator->mappable)
+                    {
+                        bottom_blob.prepare_staging_buffer();
+                    }
 
-            wait_barrier_counts[bottom_blob_index] += layer->tops.size();
+                    bottom_blob.upload(blob_mats[bottom_blob_index]);
 
-            // deep copy for inplace forward if data is shared
-            if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
+                    cmd.record_prepare_transfer_barrier(bottom_blob);
+                    cmd.record_upload(bottom_blob);
+
+                    // TODO convert packing
+//                     fprintf(stderr, "upload %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
+                }
+            }
+
+            VkMat bottom_blob = blob_mats_gpu[bottom_blob_index];
+
+            if (opt.lightmode)
             {
-                VkMat bottom_blob_copy;
-                bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);
 
-//                 fprintf(stderr, "clone %p %p\n", bottom_blob.buffer(), bottom_blob_copy.buffer());
+                wait_barrier_counts[bottom_blob_index] += layer->tops.size();
+
+                // deep copy for inplace forward if data is shared
+                if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
+                {
+                    VkMat bottom_blob_copy;
+                    bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);
+
+//                     fprintf(stderr, "clone %p %p\n", bottom_blob.buffer(), bottom_blob_copy.buffer());
 
-                cmd.record_prepare_transfer_barrier(bottom_blob);
-                cmd.record_clone(bottom_blob, bottom_blob_copy);
-                bottom_blob = bottom_blob_copy;
+                    cmd.record_prepare_transfer_barrier(bottom_blob);
+                    cmd.record_clone(bottom_blob, bottom_blob_copy);
+                    bottom_blob = bottom_blob_copy;
 
-                wait_barrier_counts[bottom_blob_index]--;
+                    wait_barrier_counts[bottom_blob_index]--;
+                }
             }
-        }
 
-        // forward
-        if (opt.lightmode && layer->support_inplace)
-        {
-            VkMat& bottom_top_blob = bottom_blob;
-            int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
-            if (ret != 0)
-                return ret;
+            // forward
+            if (opt.lightmode && layer->support_inplace)
+            {
+                VkMat& bottom_top_blob = bottom_blob;
+                int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
+                if (ret != 0)
+                    return ret;
 
-            // store top blob
-            blob_mats[top_blob_index] = bottom_top_blob;
+                // store top blob
+                blob_mats_gpu[top_blob_index] = bottom_top_blob;
+            }
+            else
+            {
+                VkMat top_blob;
+                int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
+                if (ret != 0)
+                    return ret;
+
+                // store top blob
+                blob_mats_gpu[top_blob_index] = top_blob;
+            }
+
+            if (opt.lightmode)
+            {
+                // reclaim producer bottom_blob as free when consuming bottom_blob
+                const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
+                for (size_t i=0; i<producer->bottoms.size(); i++)
+                {
+                    int producer_bottom_blob_index = producer->bottoms[i];
+
+                    wait_barrier_counts[producer_bottom_blob_index]--;
+                    if (wait_barrier_counts[producer_bottom_blob_index] == 0)
+                    {
+//                         fprintf(stderr, "reclaim free %p\n", blob_mats_gpu[producer_bottom_blob_index].buffer());
+
+                        blob_mats_gpu[producer_bottom_blob_index].release();
+                    }
+                }
+            }
         }
         else
         {
-            VkMat top_blob;
-            int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
-            if (ret != 0)
-                return ret;
+            // load bottom blobs
+            std::vector<VkMat> bottom_blobs;
+            bottom_blobs.resize(layer->bottoms.size());
+            for (size_t i=0; i<layer->bottoms.size(); i++)
+            {
+                int bottom_blob_index = layer->bottoms[i];
 
-            // store top blob
-            blob_mats[top_blob_index] = top_blob;
-        }
+                if (blob_mats_gpu[bottom_blob_index].dims == 0)
+                {
+                    if (blob_mats[bottom_blob_index].dims == 0)
+                    {
+                        int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
+                        if (ret != 0)
+                            return ret;
+                    }
 
-        if (opt.lightmode)
-        {
-            // reclaim producer bottom_blob as free when consuming bottom_blob
-            const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
-            for (size_t i=0; i<producer->bottoms.size(); i++)
-            {
-                int producer_bottom_blob_index = producer->bottoms[i];
+                    if (blob_mats_gpu[bottom_blob_index].dims == 0)
+                    {
+                        // upload
+                        VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
+                        bottom_blob.create_like(blob_mats[bottom_blob_index], opt.blob_vkallocator, opt.staging_vkallocator);
+
+                        if (!bottom_blob.allocator->mappable)
+                        {
+                            bottom_blob.prepare_staging_buffer();
+                        }
+
+                        bottom_blob.upload(blob_mats[bottom_blob_index]);
+
+                        cmd.record_prepare_transfer_barrier(bottom_blob);
+                        cmd.record_upload(bottom_blob);
+
+                        // TODO convert packing
+//                         fprintf(stderr, "upload %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
+                    }
+                }
 
-                wait_barrier_counts[producer_bottom_blob_index]--;
-                if (wait_barrier_counts[producer_bottom_blob_index] == 0)
+                bottom_blobs[i] = blob_mats_gpu[bottom_blob_index];
+
+                if (opt.lightmode)
                 {
-//                     fprintf(stderr, "reclaim free %p\n", blob_mats[producer_bottom_blob_index].buffer());
 
-                    blob_mats[producer_bottom_blob_index].release();
+                    wait_barrier_counts[bottom_blob_index] = layer->tops.size();
+
+                    // deep copy for inplace forward if data is shared
+                    if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
+                    {
+                        VkMat bottom_blob_copy;
+                        bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);
+
+//                         fprintf(stderr, "clone %p %p\n", bottom_blobs[i].buffer(), bottom_blob_copy.buffer());
+
+                        cmd.record_prepare_transfer_barrier(bottom_blobs[i]);
+                        cmd.record_clone(bottom_blobs[i], bottom_blob_copy);
+                        bottom_blobs[i] = bottom_blob_copy;
+
+                        wait_barrier_counts[bottom_blob_index]--;
+                    }
                 }
             }
-        }
-    }
-    else
-    {
-        // load bottom blobs
-        std::vector<VkMat> bottom_blobs;
-        bottom_blobs.resize(layer->bottoms.size());
-        for (size_t i=0; i<layer->bottoms.size(); i++)
-        {
-            int bottom_blob_index = layer->bottoms[i];
 
-            if (blob_mats[bottom_blob_index].dims == 0)
+            // forward
+            if (opt.lightmode && layer->support_inplace)
             {
-                int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, wait_barrier_counts, cmd, opt);
+                std::vector<VkMat>& bottom_top_blobs = bottom_blobs;
+                int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
                 if (ret != 0)
                     return ret;
+
+                // store top blobs
+                for (size_t i=0; i<layer->tops.size(); i++)
+                {
+                    int top_blob_index = layer->tops[i];
+
+                    blob_mats_gpu[top_blob_index] = bottom_top_blobs[i];
+                }
             }
-            else if (blob_mats[bottom_blob_index].staging_data)
+            else
             {
-                // upload
-                const VkMat& bottom_blob = blob_mats[bottom_blob_index];
-                cmd.record_prepare_transfer_barrier(bottom_blob);
-                cmd.record_upload(bottom_blob);
-            }
+                std::vector<VkMat> top_blobs;
+                top_blobs.resize(layer->tops.size());
+                int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
+                if (ret != 0)
+                    return ret;
 
-            bottom_blobs[i] = blob_mats[bottom_blob_index];
+                // store top blobs
+                for (size_t i=0; i<layer->tops.size(); i++)
+                {
+                    int top_blob_index = layer->tops[i];
+
+                    blob_mats_gpu[top_blob_index] = top_blobs[i];
+                }
+            }
 
             if (opt.lightmode)
             {
-
-                wait_barrier_counts[bottom_blob_index] = layer->tops.size();
-
-                // deep copy for inplace forward if data is shared
-                if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
+                for (size_t i=0; i<layer->bottoms.size(); i++)
                 {
-                    VkMat bottom_blob_copy;
-                    bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);
+                    int bottom_blob_index = layer->bottoms[i];
 
-//                     fprintf(stderr, "clone %p %p\n", bottom_blobs[i].buffer(), bottom_blob_copy.buffer());
+                    // reclaim producer bottom_blob as free when consuming bottom_blob
+                    const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
+                    for (size_t i=0; i<producer->bottoms.size(); i++)
+                    {
+                        int producer_bottom_blob_index = producer->bottoms[i];
 
-                    cmd.record_prepare_transfer_barrier(bottom_blobs[i]);
-                    cmd.record_clone(bottom_blobs[i], bottom_blob_copy);
-                    bottom_blobs[i] = bottom_blob_copy;
+                        wait_barrier_counts[producer_bottom_blob_index]--;
+                        if (wait_barrier_counts[producer_bottom_blob_index] == 0)
+                        {
+//                             fprintf(stderr, "reclaim free %p\n", blob_mats_gpu[producer_bottom_blob_index].buffer());
 
-                    wait_barrier_counts[bottom_blob_index]--;
+                            blob_mats_gpu[producer_bottom_blob_index].release();
+                        }
+                    }
                 }
             }
+
         }
 
-        // forward
-        if (opt.lightmode && layer->support_inplace)
+    }
+    else
+    {
+        if (layer->one_blob_only)
         {
-            std::vector<VkMat>& bottom_top_blobs = bottom_blobs;
-            int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
-            if (ret != 0)
-                return ret;
+            // load bottom blob
+            int bottom_blob_index = layer->bottoms[0];
+            int top_blob_index = layer->tops[0];
 
-            // store top blobs
-            for (size_t i=0; i<layer->tops.size(); i++)
+            if (blob_mats[bottom_blob_index].dims == 0)
             {
-                int top_blob_index = layer->tops[i];
+                if (blob_mats_gpu[bottom_blob_index].dims == 0)
+                {
+                    int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
+                    if (ret != 0)
+                        return ret;
+                }
 
-                blob_mats[top_blob_index] = bottom_top_blobs[i];
+                if (blob_mats[bottom_blob_index].dims == 0)
+                {
+                    // download
+                    VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
+
+                    if (!bottom_blob.allocator->mappable)
+                    {
+                        cmd.record_prepare_transfer_barrier(bottom_blob);
+                        bottom_blob.prepare_staging_buffer();
+                        cmd.record_download(bottom_blob);
+                    }
+
+                    cmd.submit();
+
+                    cmd.wait();
+
+                    cmd.reset();
+
+                    blob_mats[bottom_blob_index].create_like(bottom_blob, opt.blob_allocator);
+                    bottom_blob.download(blob_mats[bottom_blob_index]);
+
+                    if (!bottom_blob.allocator->mappable)
+                    {
+                        bottom_blob.discard_staging_buffer();
+                    }
+
+                    // TODO convert packing
+//                     fprintf(stderr, "download %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
+                }
             }
+
+            Mat bottom_blob = blob_mats[bottom_blob_index];
+
+            if (opt.lightmode)
+            {
+                // delete after taken in light mode
+                blob_mats[bottom_blob_index].release();
+                // deep copy for inplace forward if data is shared
+                if (layer->support_inplace && *bottom_blob.refcount != 1)
+                {
+                    bottom_blob = bottom_blob.clone();
+                }
+            }
+
+            // forward
+            if (opt.lightmode && layer->support_inplace)
+            {
+                Mat& bottom_top_blob = bottom_blob;
+
+                int ret = layer->forward_inplace(bottom_top_blob, opt);
+                if (ret != 0)
+                    return ret;
+
+                // store top blob
+                blob_mats[top_blob_index] = bottom_top_blob;
+            }
+            else
+            {
+                Mat top_blob;
+
+                int ret = layer->forward(bottom_blob, top_blob, opt);
+                if (ret != 0)
+                    return ret;
+
+                // store top blob
+                blob_mats[top_blob_index] = top_blob;
+            }
+
         }
         else
         {
-            std::vector<VkMat> top_blobs;
-            top_blobs.resize(layer->tops.size());
-            int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
-            if (ret != 0)
-                return ret;
+            // load bottom blobs
+            std::vector<Mat> bottom_blobs;
+            bottom_blobs.resize(layer->bottoms.size());
+            for (size_t i=0; i<layer->bottoms.size(); i++)
+            {
+                int bottom_blob_index = layer->bottoms[i];
+
+                if (blob_mats[bottom_blob_index].dims == 0)
+                {
+                    if (blob_mats_gpu[bottom_blob_index].dims == 0)
+                    {
+                        int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
+                        if (ret != 0)
+                            return ret;
+                    }
+
+                    if (blob_mats[bottom_blob_index].dims == 0)
+                    {
+                        // download
+                        VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
+
+                        if (!bottom_blob.allocator->mappable)
+                        {
+                            cmd.record_prepare_transfer_barrier(bottom_blob);
+                            bottom_blob.prepare_staging_buffer();
+                            cmd.record_download(bottom_blob);
+                        }
+                    }
+                }
+            }
 
-            // store top blobs
-            for (size_t i=0; i<layer->tops.size(); i++)
             {
-                int top_blob_index = layer->tops[i];
+                cmd.submit();
 
-                blob_mats[top_blob_index] = top_blobs[i];
+                cmd.wait();
+
+                cmd.reset();
             }
-        }
 
-        if (opt.lightmode)
-        {
             for (size_t i=0; i<layer->bottoms.size(); i++)
             {
                 int bottom_blob_index = layer->bottoms[i];
 
-                // reclaim producer bottom_blob as free when consuming bottom_blob
-                const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
-                for (size_t i=0; i<producer->bottoms.size(); i++)
+                if (blob_mats[bottom_blob_index].dims == 0)
                 {
-                    int producer_bottom_blob_index = producer->bottoms[i];
+                    // download
+                    VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
 
-                    wait_barrier_counts[producer_bottom_blob_index]--;
-                    if (wait_barrier_counts[producer_bottom_blob_index] == 0)
-                    {
-//                         fprintf(stderr, "reclaim free %p\n", blob_mats[producer_bottom_blob_index].buffer());
+                    blob_mats[bottom_blob_index].create_like(bottom_blob, opt.blob_allocator);
+                    bottom_blob.download(blob_mats[bottom_blob_index]);
 
-                        blob_mats[producer_bottom_blob_index].release();
+                    if (!bottom_blob.allocator->mappable)
+                    {
+                        bottom_blob.discard_staging_buffer();
                     }
+
+                    // TODO convert packing
+//                     fprintf(stderr, "download %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
                 }
+
+                bottom_blobs[i] = blob_mats[bottom_blob_index];
             }
-        }
 
+            // forward
+            if (opt.lightmode && layer->support_inplace)
+            {
+                std::vector<Mat>& bottom_top_blobs = bottom_blobs;
+
+                int ret = layer->forward_inplace(bottom_top_blobs, opt);
+                if (ret != 0)
+                    return ret;
+
+                // store top blobs
+                for (size_t i=0; i<layer->tops.size(); i++)
+                {
+                    int top_blob_index = layer->tops[i];
+
+                    blob_mats[top_blob_index] = bottom_top_blobs[i];
+                }
+            }
+            else
+            {
+                std::vector<Mat> top_blobs;
+                top_blobs.resize(layer->tops.size());
+
+                int ret = layer->forward(bottom_blobs, top_blobs, opt);
+                if (ret != 0)
+                    return ret;
+
+                // store top blobs
+                for (size_t i=0; i<layer->tops.size(); i++)
+                {
+                    int top_blob_index = layer->tops[i];
+
+                    blob_mats[top_blob_index] = top_blobs[i];
+                }
+            }
+
+        }
     }
 
-//     fprintf(stderr, "forward_layer %d %s done\n", layer_index, layer->name.c_str());
+//     fprintf(stderr, "forward_layer %d %d %s done\n", layer->support_vulkan, layer_index, layer->name.c_str());
 
     return 0;
 }
@@ -1440,22 +1664,6 @@ int Extractor::input(int blob_index, const Mat& in)
 
     blob_mats[blob_index] = in;
 
-#if NCNN_VULKAN
-    if (opt.vulkan_compute)
-    {
-        VkMat& in_gpu = blob_mats_gpu[blob_index];
-
-        in_gpu.create_like(in, opt.blob_vkallocator, opt.staging_vkallocator);
-
-        if (!in_gpu.allocator->mappable)
-        {
-            in_gpu.prepare_staging_buffer();
-        }
-
-        in_gpu.upload(in);
-    }
-#endif // NCNN_VULKAN
-
     return 0;
 }
 
@@ -1473,33 +1681,35 @@ int Extractor::extract(int blob_index, Mat& feat)
 #if NCNN_VULKAN
         if (opt.vulkan_compute)
         {
-            VkMat feat_gpu;
-
             ncnn::VkCompute cmd(net->vkdev);
 
+            VkMat feat_gpu;
             ret = extract(blob_index, feat_gpu, cmd);
 
-            if (!feat_gpu.allocator->mappable)
+            if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
             {
                 // download
-                cmd.record_prepare_transfer_barrier(feat_gpu);
+                if (!feat_gpu.allocator->mappable)
+                {
+                    cmd.record_prepare_transfer_barrier(feat_gpu);
 
-                feat_gpu.prepare_staging_buffer();
+                    feat_gpu.prepare_staging_buffer();
 
-                cmd.record_download(feat_gpu);
-            }
+                    cmd.record_download(feat_gpu);
+                }
 
-            cmd.submit();
+                cmd.submit();
 
-            cmd.wait();
+                cmd.wait();
 
-            blob_mats[blob_index].create_like(feat_gpu, opt.blob_allocator);
+                blob_mats[blob_index].create_like(feat_gpu, opt.blob_allocator);
 
-            feat_gpu.download(blob_mats[blob_index]);
+                feat_gpu.download(blob_mats[blob_index]);
 
-            if (!feat_gpu.allocator->mappable)
-            {
-                feat_gpu.discard_staging_buffer();
+                if (!feat_gpu.allocator->mappable)
+                {
+                    feat_gpu.discard_staging_buffer();
+                }
             }
         }
         else
@@ -1558,7 +1768,7 @@ int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd)
     if (blob_mats_gpu[blob_index].dims == 0)
     {
         int layer_index = net->blobs[blob_index].producer;
-        ret = net->forward_layer(layer_index, blob_mats_gpu, wait_barrier_counts, cmd, opt);
+        ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
     }
 
     feat = blob_mats_gpu[blob_index];
diff --git a/src/net.h b/src/net.h
index 25b68b34a..c4b9d1966 100644
--- a/src/net.h
+++ b/src/net.h
@@ -122,7 +122,7 @@ protected:
     int forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const;
 
 #if NCNN_VULKAN
-    int forward_layer(int layer_index, std::vector<VkMat>& blob_mats, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const;
+    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const;
 #endif // NCNN_VULKAN
 
 protected: