fallback to cpu forward if layer not support vulkan, automatically!

7 years ago · 83efa73cf6
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -409,20 +409,20 @@ void mobilenet_yolo_run(const ncnn::Net& net)
    ncnn::Mat in(416, 416, 3);
    ex.input("data", in);

 #if NCNN_VULKAN
    // TODO chain conv22 conv23
    if (g_use_vulkan_compute)
    {
        ncnn::Mat conv22;
        ex.extract("conv22", conv22);

        ncnn::Mat conv23;
        ex.extract("conv23", conv23);

        // use cpu for detection_out
        ex.set_vulkan_compute(false);
    }
 #endif // NCNN_VULKAN
 // #if NCNN_VULKAN
 //     // TODO chain conv22 conv23
 //     if (g_use_vulkan_compute)
 //     {
 //         ncnn::Mat conv22;
 //         ex.extract("conv22", conv22);
 //
 //         ncnn::Mat conv23;
 //         ex.extract("conv23", conv23);
 //
 //         // use cpu for detection_out
 //         ex.set_vulkan_compute(false);
 //     }
 // #endif // NCNN_VULKAN

    ncnn::Mat out;
    ex.extract("detection_out", out);
@@ -440,20 +440,20 @@ void mobilenet_yolov3_run(const ncnn::Net& net)
    ncnn::Mat in(416, 416, 3);
    ex.input("data", in);

 #if NCNN_VULKAN
    // TODO chain conv19 conv20
    if (g_use_vulkan_compute)
    {
        ncnn::Mat conv19;
        ex.extract("conv19", conv19);

        ncnn::Mat conv20;
        ex.extract("conv20", conv20);

        // use cpu for detection_out
        ex.set_vulkan_compute(false);
    }
 #endif // NCNN_VULKAN
 // #if NCNN_VULKAN
 //     // TODO chain conv19 conv20
 //     if (g_use_vulkan_compute)
 //     {
 //         ncnn::Mat conv19;
 //         ex.extract("conv19", conv19);
 //
 //         ncnn::Mat conv20;
 //         ex.extract("conv20", conv20);
 //
 //         // use cpu for detection_out
 //         ex.set_vulkan_compute(false);
 //     }
 // #endif // NCNN_VULKAN

    ncnn::Mat out;
    ex.extract("detection_out", out);
--- a/src/layer/permute.cpp
+++ b/src/layer/permute.cpp
@@ -317,8 +317,6 @@ int Permute::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd,
        }
    }

    fprintf(stderr, "%d %d %d\n", top_blob.w, top_blob.h, top_blob.c);

 //     fprintf(stderr, "Permute::forward %p %p\n", bottom_blob.buffer(), top_blob.buffer());

    std::vector<VkMat> bindings(2);
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -1145,201 +1145,425 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt
 }

 #if NCNN_VULKAN
 int Net::forward_layer(int layer_index, std::vector<VkMat>& blob_mats, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const
 int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const
 {
    const Layer* layer = layers[layer_index];

 //     fprintf(stderr, "forward_layer %d %s\n", layer_index, layer->name.c_str());
 //     fprintf(stderr, "forward_layer %d %d %s\n", layer->support_vulkan, layer_index, layer->name.c_str());

    if (layer->one_blob_only)
    if (layer->support_vulkan)
    {
        // load bottom blob
        int bottom_blob_index = layer->bottoms[0];
        int top_blob_index = layer->tops[0];

        if (blob_mats[bottom_blob_index].dims == 0)
        {
            int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, wait_barrier_counts, cmd, opt);
            if (ret != 0)
                return ret;
        }
        else if (blob_mats[bottom_blob_index].staging_data)
        if (layer->one_blob_only)
        {
            // upload
            const VkMat& bottom_blob = blob_mats[bottom_blob_index];
            cmd.record_prepare_transfer_barrier(bottom_blob);
            cmd.record_upload(bottom_blob);
        }
            // load bottom blob
            int bottom_blob_index = layer->bottoms[0];
            int top_blob_index = layer->tops[0];

        VkMat bottom_blob = blob_mats[bottom_blob_index];
            if (blob_mats_gpu[bottom_blob_index].dims == 0)
            {
                if (blob_mats[bottom_blob_index].dims == 0)
                {
                    int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
                    if (ret != 0)
                        return ret;
                }

        if (opt.lightmode)
        {
                if (blob_mats_gpu[bottom_blob_index].dims == 0)
                {
                    // upload
                    VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
                    bottom_blob.create_like(blob_mats[bottom_blob_index], opt.blob_vkallocator, opt.staging_vkallocator);

                    if (!bottom_blob.allocator->mappable)
                    {
                        bottom_blob.prepare_staging_buffer();
                    }

            wait_barrier_counts[bottom_blob_index] += layer->tops.size();
                    bottom_blob.upload(blob_mats[bottom_blob_index]);

            // deep copy for inplace forward if data is shared
            if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
                    cmd.record_prepare_transfer_barrier(bottom_blob);
                    cmd.record_upload(bottom_blob);

                    // TODO convert packing
 //                     fprintf(stderr, "upload %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
                }
            }

            VkMat bottom_blob = blob_mats_gpu[bottom_blob_index];

            if (opt.lightmode)
            {
                VkMat bottom_blob_copy;
                bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

 //                 fprintf(stderr, "clone %p %p\n", bottom_blob.buffer(), bottom_blob_copy.buffer());
                wait_barrier_counts[bottom_blob_index] += layer->tops.size();

                // deep copy for inplace forward if data is shared
                if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
                {
                    VkMat bottom_blob_copy;
                    bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

 //                     fprintf(stderr, "clone %p %p\n", bottom_blob.buffer(), bottom_blob_copy.buffer());

                cmd.record_prepare_transfer_barrier(bottom_blob);
                cmd.record_clone(bottom_blob, bottom_blob_copy);
                bottom_blob = bottom_blob_copy;
                    cmd.record_prepare_transfer_barrier(bottom_blob);
                    cmd.record_clone(bottom_blob, bottom_blob_copy);
                    bottom_blob = bottom_blob_copy;

                wait_barrier_counts[bottom_blob_index]--;
                    wait_barrier_counts[bottom_blob_index]--;
                }
            }
        }

        // forward
        if (opt.lightmode && layer->support_inplace)
        {
            VkMat& bottom_top_blob = bottom_blob;
            int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
            if (ret != 0)
                return ret;
            // forward
            if (opt.lightmode && layer->support_inplace)
            {
                VkMat& bottom_top_blob = bottom_blob;
                int ret = layer->forward_inplace(bottom_top_blob, cmd, opt);
                if (ret != 0)
                    return ret;

            // store top blob
            blob_mats[top_blob_index] = bottom_top_blob;
                // store top blob
                blob_mats_gpu[top_blob_index] = bottom_top_blob;
            }
            else
            {
                VkMat top_blob;
                int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
                if (ret != 0)
                    return ret;

                // store top blob
                blob_mats_gpu[top_blob_index] = top_blob;
            }

            if (opt.lightmode)
            {
                // reclaim producer bottom_blob as free when consuming bottom_blob
                const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
                for (size_t i=0; i<producer->bottoms.size(); i++)
                {
                    int producer_bottom_blob_index = producer->bottoms[i];

                    wait_barrier_counts[producer_bottom_blob_index]--;
                    if (wait_barrier_counts[producer_bottom_blob_index] == 0)
                    {
 //                         fprintf(stderr, "reclaim free %p\n", blob_mats_gpu[producer_bottom_blob_index].buffer());

                        blob_mats_gpu[producer_bottom_blob_index].release();
                    }
                }
            }
        }
        else
        {
            VkMat top_blob;
            int ret = layer->forward(bottom_blob, top_blob, cmd, opt);
            if (ret != 0)
                return ret;
            // load bottom blobs
            std::vector<VkMat> bottom_blobs;
            bottom_blobs.resize(layer->bottoms.size());
            for (size_t i=0; i<layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];

            // store top blob
            blob_mats[top_blob_index] = top_blob;
        }
                if (blob_mats_gpu[bottom_blob_index].dims == 0)
                {
                    if (blob_mats[bottom_blob_index].dims == 0)
                    {
                        int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
                        if (ret != 0)
                            return ret;
                    }

        if (opt.lightmode)
        {
            // reclaim producer bottom_blob as free when consuming bottom_blob
            const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
            for (size_t i=0; i<producer->bottoms.size(); i++)
            {
                int producer_bottom_blob_index = producer->bottoms[i];
                    if (blob_mats_gpu[bottom_blob_index].dims == 0)
                    {
                        // upload
                        VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];
                        bottom_blob.create_like(blob_mats[bottom_blob_index], opt.blob_vkallocator, opt.staging_vkallocator);

                        if (!bottom_blob.allocator->mappable)
                        {
                            bottom_blob.prepare_staging_buffer();
                        }

                        bottom_blob.upload(blob_mats[bottom_blob_index]);

                        cmd.record_prepare_transfer_barrier(bottom_blob);
                        cmd.record_upload(bottom_blob);

                        // TODO convert packing
 //                         fprintf(stderr, "upload %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
                    }
                }

                wait_barrier_counts[producer_bottom_blob_index]--;
                if (wait_barrier_counts[producer_bottom_blob_index] == 0)
                bottom_blobs[i] = blob_mats_gpu[bottom_blob_index];

                if (opt.lightmode)
                {
 //                     fprintf(stderr, "reclaim free %p\n", blob_mats[producer_bottom_blob_index].buffer());

                    blob_mats[producer_bottom_blob_index].release();
                    wait_barrier_counts[bottom_blob_index] = layer->tops.size();

                    // deep copy for inplace forward if data is shared
                    if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
                    {
                        VkMat bottom_blob_copy;
                        bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);

 //                         fprintf(stderr, "clone %p %p\n", bottom_blobs[i].buffer(), bottom_blob_copy.buffer());

                        cmd.record_prepare_transfer_barrier(bottom_blobs[i]);
                        cmd.record_clone(bottom_blobs[i], bottom_blob_copy);
                        bottom_blobs[i] = bottom_blob_copy;

                        wait_barrier_counts[bottom_blob_index]--;
                    }
                }
            }
        }
    }
    else
    {
        // load bottom blobs
        std::vector<VkMat> bottom_blobs;
        bottom_blobs.resize(layer->bottoms.size());
        for (size_t i=0; i<layer->bottoms.size(); i++)
        {
            int bottom_blob_index = layer->bottoms[i];

            if (blob_mats[bottom_blob_index].dims == 0)
            // forward
            if (opt.lightmode && layer->support_inplace)
            {
                int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, wait_barrier_counts, cmd, opt);
                std::vector<VkMat>& bottom_top_blobs = bottom_blobs;
                int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
                if (ret != 0)
                    return ret;

                // store top blobs
                for (size_t i=0; i<layer->tops.size(); i++)
                {
                    int top_blob_index = layer->tops[i];

                    blob_mats_gpu[top_blob_index] = bottom_top_blobs[i];
                }
            }
            else if (blob_mats[bottom_blob_index].staging_data)
            else
            {
                // upload
                const VkMat& bottom_blob = blob_mats[bottom_blob_index];
                cmd.record_prepare_transfer_barrier(bottom_blob);
                cmd.record_upload(bottom_blob);
            }
                std::vector<VkMat> top_blobs;
                top_blobs.resize(layer->tops.size());
                int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
                if (ret != 0)
                    return ret;

            bottom_blobs[i] = blob_mats[bottom_blob_index];
                // store top blobs
                for (size_t i=0; i<layer->tops.size(); i++)
                {
                    int top_blob_index = layer->tops[i];

                    blob_mats_gpu[top_blob_index] = top_blobs[i];
                }
            }

            if (opt.lightmode)
            {

                wait_barrier_counts[bottom_blob_index] = layer->tops.size();

                // deep copy for inplace forward if data is shared
                if (layer->support_inplace && wait_barrier_counts[bottom_blob_index] != 1)
                for (size_t i=0; i<layer->bottoms.size(); i++)
                {
                    VkMat bottom_blob_copy;
                    bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);
                    int bottom_blob_index = layer->bottoms[i];

 //                     fprintf(stderr, "clone %p %p\n", bottom_blobs[i].buffer(), bottom_blob_copy.buffer());
                    // reclaim producer bottom_blob as free when consuming bottom_blob
                    const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
                    for (size_t i=0; i<producer->bottoms.size(); i++)
                    {
                        int producer_bottom_blob_index = producer->bottoms[i];

                    cmd.record_prepare_transfer_barrier(bottom_blobs[i]);
                    cmd.record_clone(bottom_blobs[i], bottom_blob_copy);
                    bottom_blobs[i] = bottom_blob_copy;
                        wait_barrier_counts[producer_bottom_blob_index]--;
                        if (wait_barrier_counts[producer_bottom_blob_index] == 0)
                        {
 //                             fprintf(stderr, "reclaim free %p\n", blob_mats_gpu[producer_bottom_blob_index].buffer());

                    wait_barrier_counts[bottom_blob_index]--;
                            blob_mats_gpu[producer_bottom_blob_index].release();
                        }
                    }
                }
            }

        }

        // forward
        if (opt.lightmode && layer->support_inplace)
    }
    else
    {
        if (layer->one_blob_only)
        {
            std::vector<VkMat>& bottom_top_blobs = bottom_blobs;
            int ret = layer->forward_inplace(bottom_top_blobs, cmd, opt);
            if (ret != 0)
                return ret;
            // load bottom blob
            int bottom_blob_index = layer->bottoms[0];
            int top_blob_index = layer->tops[0];

            // store top blobs
            for (size_t i=0; i<layer->tops.size(); i++)
            if (blob_mats[bottom_blob_index].dims == 0)
            {
                int top_blob_index = layer->tops[i];
                if (blob_mats_gpu[bottom_blob_index].dims == 0)
                {
                    int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
                    if (ret != 0)
                        return ret;
                }

                blob_mats[top_blob_index] = bottom_top_blobs[i];
                if (blob_mats[bottom_blob_index].dims == 0)
                {
                    // download
                    VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];

                    if (!bottom_blob.allocator->mappable)
                    {
                        cmd.record_prepare_transfer_barrier(bottom_blob);
                        bottom_blob.prepare_staging_buffer();
                        cmd.record_download(bottom_blob);
                    }

                    cmd.submit();

                    cmd.wait();

                    cmd.reset();

                    blob_mats[bottom_blob_index].create_like(bottom_blob, opt.blob_allocator);
                    bottom_blob.download(blob_mats[bottom_blob_index]);

                    if (!bottom_blob.allocator->mappable)
                    {
                        bottom_blob.discard_staging_buffer();
                    }

                    // TODO convert packing
 //                     fprintf(stderr, "download %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
                }
            }

            Mat bottom_blob = blob_mats[bottom_blob_index];

            if (opt.lightmode)
            {
                // delete after taken in light mode
                blob_mats[bottom_blob_index].release();
                // deep copy for inplace forward if data is shared
                if (layer->support_inplace && *bottom_blob.refcount != 1)
                {
                    bottom_blob = bottom_blob.clone();
                }
            }

            // forward
            if (opt.lightmode && layer->support_inplace)
            {
                Mat& bottom_top_blob = bottom_blob;

                int ret = layer->forward_inplace(bottom_top_blob, opt);
                if (ret != 0)
                    return ret;

                // store top blob
                blob_mats[top_blob_index] = bottom_top_blob;
            }
            else
            {
                Mat top_blob;

                int ret = layer->forward(bottom_blob, top_blob, opt);
                if (ret != 0)
                    return ret;

                // store top blob
                blob_mats[top_blob_index] = top_blob;
            }

        }
        else
        {
            std::vector<VkMat> top_blobs;
            top_blobs.resize(layer->tops.size());
            int ret = layer->forward(bottom_blobs, top_blobs, cmd, opt);
            if (ret != 0)
                return ret;
            // load bottom blobs
            std::vector<Mat> bottom_blobs;
            bottom_blobs.resize(layer->bottoms.size());
            for (size_t i=0; i<layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];

                if (blob_mats[bottom_blob_index].dims == 0)
                {
                    if (blob_mats_gpu[bottom_blob_index].dims == 0)
                    {
                        int ret = forward_layer(blobs[bottom_blob_index].producer, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
                        if (ret != 0)
                            return ret;
                    }

                    if (blob_mats[bottom_blob_index].dims == 0)
                    {
                        // download
                        VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];

                        if (!bottom_blob.allocator->mappable)
                        {
                            cmd.record_prepare_transfer_barrier(bottom_blob);
                            bottom_blob.prepare_staging_buffer();
                            cmd.record_download(bottom_blob);
                        }
                    }
                }
            }

            // store top blobs
            for (size_t i=0; i<layer->tops.size(); i++)
            {
                int top_blob_index = layer->tops[i];
                cmd.submit();

                blob_mats[top_blob_index] = top_blobs[i];
                cmd.wait();

                cmd.reset();
            }
        }

        if (opt.lightmode)
        {
            for (size_t i=0; i<layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];

                // reclaim producer bottom_blob as free when consuming bottom_blob
                const Layer* producer = layers[ blobs[bottom_blob_index].producer ];
                for (size_t i=0; i<producer->bottoms.size(); i++)
                if (blob_mats[bottom_blob_index].dims == 0)
                {
                    int producer_bottom_blob_index = producer->bottoms[i];
                    // download
                    VkMat& bottom_blob = blob_mats_gpu[bottom_blob_index];

                    wait_barrier_counts[producer_bottom_blob_index]--;
                    if (wait_barrier_counts[producer_bottom_blob_index] == 0)
                    {
 //                         fprintf(stderr, "reclaim free %p\n", blob_mats[producer_bottom_blob_index].buffer());
                    blob_mats[bottom_blob_index].create_like(bottom_blob, opt.blob_allocator);
                    bottom_blob.download(blob_mats[bottom_blob_index]);

                        blob_mats[producer_bottom_blob_index].release();
                    if (!bottom_blob.allocator->mappable)
                    {
                        bottom_blob.discard_staging_buffer();
                    }

                    // TODO convert packing
 //                     fprintf(stderr, "download %d %d %d  %lu %d\n", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elemsize, bottom_blob.packing);
                }

                bottom_blobs[i] = blob_mats[bottom_blob_index];
            }
        }

            // forward
            if (opt.lightmode && layer->support_inplace)
            {
                std::vector<Mat>& bottom_top_blobs = bottom_blobs;

                int ret = layer->forward_inplace(bottom_top_blobs, opt);
                if (ret != 0)
                    return ret;

                // store top blobs
                for (size_t i=0; i<layer->tops.size(); i++)
                {
                    int top_blob_index = layer->tops[i];

                    blob_mats[top_blob_index] = bottom_top_blobs[i];
                }
            }
            else
            {
                std::vector<Mat> top_blobs;
                top_blobs.resize(layer->tops.size());

                int ret = layer->forward(bottom_blobs, top_blobs, opt);
                if (ret != 0)
                    return ret;

                // store top blobs
                for (size_t i=0; i<layer->tops.size(); i++)
                {
                    int top_blob_index = layer->tops[i];

                    blob_mats[top_blob_index] = top_blobs[i];
                }
            }

        }
    }

 //     fprintf(stderr, "forward_layer %d %s done\n", layer_index, layer->name.c_str());
 //     fprintf(stderr, "forward_layer %d %d %s done\n", layer->support_vulkan, layer_index, layer->name.c_str());

    return 0;
 }
@@ -1440,22 +1664,6 @@ int Extractor::input(int blob_index, const Mat& in)

    blob_mats[blob_index] = in;

 #if NCNN_VULKAN
    if (opt.vulkan_compute)
    {
        VkMat& in_gpu = blob_mats_gpu[blob_index];

        in_gpu.create_like(in, opt.blob_vkallocator, opt.staging_vkallocator);

        if (!in_gpu.allocator->mappable)
        {
            in_gpu.prepare_staging_buffer();
        }

        in_gpu.upload(in);
    }
 #endif // NCNN_VULKAN

    return 0;
 }

@@ -1473,33 +1681,35 @@ int Extractor::extract(int blob_index, Mat& feat)
 #if NCNN_VULKAN
        if (opt.vulkan_compute)
        {
            VkMat feat_gpu;

            ncnn::VkCompute cmd(net->vkdev);

            VkMat feat_gpu;
            ret = extract(blob_index, feat_gpu, cmd);

            if (!feat_gpu.allocator->mappable)
            if (blob_mats[blob_index].dims == 0 && feat_gpu.dims != 0)
            {
                // download
                cmd.record_prepare_transfer_barrier(feat_gpu);
                if (!feat_gpu.allocator->mappable)
                {
                    cmd.record_prepare_transfer_barrier(feat_gpu);

                feat_gpu.prepare_staging_buffer();
                    feat_gpu.prepare_staging_buffer();

                cmd.record_download(feat_gpu);
            }
                    cmd.record_download(feat_gpu);
                }

            cmd.submit();
                cmd.submit();

            cmd.wait();
                cmd.wait();

            blob_mats[blob_index].create_like(feat_gpu, opt.blob_allocator);
                blob_mats[blob_index].create_like(feat_gpu, opt.blob_allocator);

            feat_gpu.download(blob_mats[blob_index]);
                feat_gpu.download(blob_mats[blob_index]);

            if (!feat_gpu.allocator->mappable)
            {
                feat_gpu.discard_staging_buffer();
                if (!feat_gpu.allocator->mappable)
                {
                    feat_gpu.discard_staging_buffer();
                }
            }
        }
        else
@@ -1558,7 +1768,7 @@ int Extractor::extract(int blob_index, VkMat& feat, VkCompute& cmd)
    if (blob_mats_gpu[blob_index].dims == 0)
    {
        int layer_index = net->blobs[blob_index].producer;
        ret = net->forward_layer(layer_index, blob_mats_gpu, wait_barrier_counts, cmd, opt);
        ret = net->forward_layer(layer_index, blob_mats, blob_mats_gpu, wait_barrier_counts, cmd, opt);
    }

    feat = blob_mats_gpu[blob_index];
--- a/src/net.h
+++ b/src/net.h
@@ -122,7 +122,7 @@ protected:
    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, Option& opt) const;

 #if NCNN_VULKAN
    int forward_layer(int layer_index, std::vector<VkMat>& blob_mats, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const;
    int forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector<VkMat>& blob_mats_gpu, std::vector<int>& wait_barrier_counts, VkCompute& cmd, Option& opt) const;
 #endif // NCNN_VULKAN

 protected: