vkmat and command api breaks (#1689)

* vkmat and command api breaks
* always use compute queue for compute buffer transfer
* no barrier for readonly weight buffer
* record clone, drop queue_owner
* bring back layer forward
* fix validation errors
* lifecycle inside command makes life easier
* update doc
* record_import_android_hardware_buffer
6 years ago · 7365bb80a2
--- a/docs/developer-guide/low-level-operation-api.md
+++ b/docs/developer-guide/low-level-operation-api.md
@@ -150,6 +150,17 @@ ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev);
 ncnn::Layer* convolution = ncnn::create_layer("Convolution");
 convolution->vkdev = vkdev;

 // set option
 ncnn::Option opt;
 opt.lightmode = true;
 opt.num_threads = 4;
 opt.blob_allocator = 0;
 opt.workspace_allocator = 0;
 opt.vulkan_compute = true;
 opt.blob_vkallocator = &g_blob_vkallocator;
 opt.workspace_vkallocator = &g_blob_vkallocator;
 opt.staging_vkallocator = &g_staging_vkallocator;

 // load param
 {
 ncnn::ParamDict pd;
@@ -171,76 +182,42 @@ ncnn::ModelBinFromMatArray mb(weights);
 convolution->load_model(mb);
 }

 // upload model
 {
 ncnn::VkTransfer cmd(vkdev);
 cmd.weight_vkallocator = &g_weight_vkallocator;
 cmd.staging_vkallocator = &g_weight_staging_vkallocator;

 convolution->upload_model(cmd);

 cmd.submit();
 cmd.wait();

 g_weight_staging_vkallocator.clear();
 }

 // create pipeline
 convolution->create_pipeline(opt);

 // set default option
 // upload model
 {
 ncnn::Option opt = ncnn::get_default_option();
 ncnn::VkTransfer cmd(vkdev);

 opt.lightmode = true;
 opt.num_threads = 4;
 opt.blob_allocator = 0;
 opt.workspace_allocator = 0;
 ncnn::Option opt_upload = opt;
 opt_upload.blob_vkallocator = &g_weight_vkallocator;
 opt_upload.workspace_vkallocator = &g_weight_vkallocator;
 opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

 opt.vulkan_compute = true;
 opt.blob_vkallocator = &g_blob_vkallocator;
 opt.workspace_vkallocator = &g_blob_vkallocator;
 opt.staging_vkallocator = &g_staging_vkallocator;
 convolution->upload_model(cmd, opt_upload);

 ncnn::set_default_option(opt);
 cmd.submit_and_wait();
 }

 ncnn::Mat bottom = random_mat(w, h, inch);

 ncnn::VkMat bottom_gpu;

 // copy bottom to bottom_gpu
 {
 bottom_gpu.create_like(bottom, &g_blob_vkallocator, &g_staging_vkallocator);
 bottom_gpu.prepare_staging_buffer();
 bottom_gpu.upload(bottom);
 }

 ncnn::VkMat top_gpu;
 ncnn::Mat top;

 // forward
 {
 ncnn::VkCompute cmd(vkdev);

 cmd.record_upload(bottom_gpu);
 ncnn::VkMat bottom_gpu;
 cmd.record_upload(bottom, bottom_gpu, opt);

 ncnn::VkMat top_gpu;
 convolution->forward(bottom_gpu, top_gpu, cmd, opt);

 top_gpu.prepare_staging_buffer();

 cmd.record_download(top_gpu);
 cmd.record_download(top_gpu, top, opt);

 cmd.submit_and_wait();
 }

 ncnn::Mat top;

 // copy top_gpu to top
 {
 top.create_like(top_gpu);
 top_gpu.download(top);
 }

 convolution->destroy_pipeline(opt);

 delete convolution;
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -470,7 +470,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
            ptr->memory = buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr;
            ptr->state = 1;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            // adjust budgets
            if (budget_size == aligned_size)
@@ -540,7 +541,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->state = 1;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // adjust budgets
    std::list< std::pair<size_t, size_t> > budget;
@@ -715,7 +717,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
        ptr->memory = buffer_blocks[block_index]->memory;
        ptr->capacity = aligned_size;
        ptr->mapped_ptr = buffer_blocks[block_index]->mapped_ptr;
        ptr->state = 1;
        ptr->access_flags = 0;
        ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

        buffer_block_free_spaces[block_index] -= aligned_size;

@@ -790,7 +793,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
            ptr->memory = block->memory;
            ptr->capacity = new_block_size;
            ptr->mapped_ptr = block->mapped_ptr;
            ptr->state = 1;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            return ptr;
        }
@@ -841,7 +845,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->state = 1;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
 }
@@ -940,7 +945,8 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size)

    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->state = 1;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

 //     fprintf(stderr, "VkStagingBufferAllocator M %p %lu\n", ptr->buffer, size);

@@ -989,7 +995,8 @@ VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size)

    vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);

    ptr->state = 1;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

 //     fprintf(stderr, "VkWeightStagingBufferAllocator M %p %lu\n", ptr->buffer, size);

@@ -1137,7 +1144,8 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->state = 1;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
 }
@@ -1290,7 +1298,8 @@ VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*width*/,
    ptr->image = image;
    ptr->memory = memory;
    ptr->imageview = imageview;
    ptr->state = 1;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
 }
--- a/src/allocator.h
+++ b/src/allocator.h
@@ -192,12 +192,8 @@ public:
    void* mapped_ptr;

    // buffer state, modified by command functions internally
    // 0=null
    // 1=created
    // 2=transfer
    // 3=compute
    // 4=readonly
    mutable int state;
    mutable VkAccessFlags access_flags;
    mutable VkPipelineStageFlags stage_flags;

    // initialize and modified by mat
    int refcount;
@@ -311,13 +307,9 @@ public:

    VkDeviceMemory memory;

    // buffer state, modified by command functions internally
    // 0=null
    // 1=created
    // 2=transfer
    // 3=compute
    // 4=readonly
    mutable int state;
    // image state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkPipelineStageFlags stage_flags;

    // initialize and modified by mat
    int refcount;
--- a/src/command.cpp
+++ b/src/command.cpp
--- a/src/command.h
+++ b/src/command.h
@@ -1,6 +1,6 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
@@ -22,61 +22,31 @@
 #include <vector>
 #include <vulkan/vulkan.h>
 #include "mat.h"
 #include "pipeline.h"

 namespace ncnn {

 // Base class for Vulkan command recorders. It bundles one command pool, one
 // command buffer and one fence together with the device and queue family
 // they are associated with. VkCompute and VkTransfer are declared in this
 // header as `: public Command`.
 class Command
 {
 public:
    // vkdev: device the command objects are tied to.
    // queue_family_index: queue family for this command — presumably the one
    // used for the command pool and for submission; confirm in command.cpp.
    Command(const VulkanDevice* vkdev, uint32_t queue_family_index);
    // virtual so that derived recorders can be destroyed through a Command pointer
    virtual ~Command();

 protected:
    // Helpers that set up command_pool / command_buffer.
    // NOTE(review): the int return is presumably 0 on success — confirm in command.cpp.
    int create_command_pool();
    int create_command_buffer();

    // record issue
    // begin/end bracket the recording of command_buffer; the last helper, going
    // by its name, submits to a queue and waits on `fence` — confirm in command.cpp.
    int begin_command_buffer();
    int end_command_buffer();
    int queue_submit_and_wait_fence();

 protected:
    // device and queue family, matching the constructor parameters
    const VulkanDevice* vkdev;
    uint32_t queue_family_index;

    // Vulkan handles used for recording
    VkCommandPool command_pool;
    VkCommandBuffer command_buffer;

    // fence handle — presumably the one waited on by queue_submit_and_wait_fence(); verify
    VkFence fence;
 };

 class VkCompute : public Command
 class Pipeline;
 class VkCompute
 {
 public:
    VkCompute(const VulkanDevice* vkdev);
    ~VkCompute();

    void record_upload(const VkMat& m);

    void record_download(const VkMat& m);
    virtual ~VkCompute();

    void record_clone(const VkMat& src, const VkMat& dst);
 public:
    void record_upload(const Mat& src, VkMat& dst, const Option& opt);

    void record_copy_region(const VkMat& src, const VkMat& dst, const VkBufferCopy& region);
    void record_download(const VkMat& src, Mat& dst, const Option& opt);

    void record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector<VkBufferCopy>& regions);
    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);

    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& m);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);

 #if NCNN_BENCHMARK
    void record_write_timestamp(uint32_t query);
 #endif // NCNN_BENCHMARK

    void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index);

 #if __ANDROID_API__ >= 26
    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m);
    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
 #endif // __ANDROID_API__ >= 26

    int submit_and_wait();
@@ -90,116 +60,73 @@ public:
 #endif // NCNN_BENCHMARK

 protected:
    // record pipeline things
    void record_bind_pipeline(VkPipeline pipeline);
    void record_update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector<VkMat>& bindings);
    void record_push_constants(VkPipelineLayout pipeline_layout, const std::vector<vk_constant_type>& constants);
    void record_dispatch(const uint32_t* group_count_xyz);

    // record barrier things
    void record_transfer_compute_barrier(const VkMat& m);
    void record_compute_transfer_barrier(const VkMat& m);
    void record_compute_compute_barrier(const VkMat& m);
    void record_transfer_transfer_barrier(const VkMat& m);
    void record_host_transfer_barrier(const VkMat& m);
    void record_transfer_host_barrier(const VkMat& m);
    void record_host_compute_barrier(const VkMat& m);
    void record_compute_host_barrier(const VkMat& m);

    // record prepare things
    void record_prepare_transfer_barrier(const VkMat& m);
    void record_prepare_compute_barrier(const VkMat& m);
    void record_prepare_host_barrier(const VkMat& m);

    void record_initial_image_compute_barrier(const VkImageMat& im);
    int init();
    int begin_command_buffer();
    int end_command_buffer();

 #if __ANDROID_API__ >= 26
    void record_update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, VkSampler sampler, const VkImageMat& im, const VkMat& m);
 #endif // __ANDROID_API__ >= 26
 protected:
    const VulkanDevice* vkdev;

 #if NCNN_BENCHMARK
    void reset_query_pool();
 #endif // NCNN_BENCHMARK
    VkCommandPool compute_command_pool;

 protected:
    // recording issue
    void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
    void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
    void bind_pipeline(VkPipeline pipeline);
    void bind_descriptorset(VkPipelineLayout pipeline_layout, VkDescriptorSet descriptorset);
    void update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector<VkDescriptorBufferInfo>& descriptorBufferInfos);
    void push_constants(VkPipelineLayout pipeline_layout, const std::vector<vk_constant_type>& constants);
    void dispatch(const uint32_t* group_count_xyz);
    void transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
    void compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
    void compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
    void transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
    void host_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
    void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size);
    void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
    void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size);
    void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index);
    void initial_image_compute_barrier(VkImage image);
 #if __ANDROID_API__ >= 26
    void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo);
 #endif // __ANDROID_API__ >= 26
 #if NCNN_BENCHMARK
    void write_timestamp(uint32_t query);
 #endif // NCNN_BENCHMARK
    VkCommandBuffer compute_command_buffer;

    VkFence compute_command_fence;

    std::vector<VkMat> upload_staging_buffers;
    std::vector<VkMat> download_post_buffers;
    std::vector<Mat> download_post_mats;

 protected:
    // delayed record
    // the good-old path for device without VK_KHR_push_descriptor
    std::vector<VkDescriptorPool> descriptor_pools;
    std::vector<VkDescriptorSet> descriptorsets;
    struct record_type

    struct record
    {
        // 0=copy
        // 1=copy regions
        // 2=bind pipeline
        // 3=bind descriptorset
        // 4=push constants
        // 5=dispatch
        // 6=transfer-compute barrier
        // 7=compute-transfer barrier
        // 8=compute-compute barrier
        // 9=transfer-transfer barrier
        // 10=write timestamp
        // 11=initial image compute barrier
        // 12=host-transfer barrier
        // 13=transfer-host barrier
        // 14=host-compute barrier
        // 15=compute-host barrier
        // 16=queue-transfer-acquire barrier
        enum
        {
            TYPE_copy_buffer,
            TYPE_bind_pipeline,
            TYPE_bind_descriptorsets,
            TYPE_push_constants,
            TYPE_dispatch,
            TYPE_memory_barrers,
            TYPE_buffer_barrers,
            TYPE_image_barrers,

 #if NCNN_BENCHMARK
            TYPE_write_timestamp,
 #endif // NCNN_BENCHMARK

            TYPE_post_download,
        };

        int type;
        VkCommandBuffer command_buffer;

        union
        {
        struct { VkBuffer src; size_t src_offset; VkBuffer dst; size_t dst_offset; size_t size; } copy;
        struct { VkBuffer src; VkBuffer dst; } copy_regions;
        struct { VkPipeline pipeline; } bind_pipeline;
        struct { VkPipelineLayout pipeline_layout; VkDescriptorSet descriptorset; } bind_descriptorset;
        struct { VkPipelineLayout pipeline_layout; } push_constants;
        struct { uint32_t group_count_xyz[3]; } dispatch;
        struct { VkBuffer buffer; size_t offset; size_t size; } transfer_compute_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } compute_transfer_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } compute_compute_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } transfer_transfer_barrier;
        struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer;

        struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline;
        struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets;
        struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants;

        struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch;

        struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers;
        struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers;
        struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers;

 #if NCNN_BENCHMARK
        struct { uint32_t query; } write_timestamp;
 #endif // NCNN_BENCHMARK
        struct { VkImage image; } initial_image_compute_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } host_transfer_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier;
        struct { VkBuffer buffer; size_t offset; size_t size; size_t src_queue_family_index; } queue_transfer_acquire_barrier;
        };

        std::vector<VkBufferCopy> regions;
        std::vector<vk_constant_type> constants;
        struct { uint32_t download_post_buffer_mat_offset; } post_download;
        };
    };
    std::vector<record_type> delayed_records;

    std::vector<record> delayed_records;

 #if NCNN_BENCHMARK
    uint32_t query_count;
@@ -207,38 +134,37 @@ protected:
 #endif // NCNN_BENCHMARK
 };

 class VkTransfer : public Command
 class VkTransfer
 {
 public:
    VkTransfer(const VulkanDevice* vkdev);
    ~VkTransfer();

 public:
    void record_upload(const Mat& src, VkMat& dst, const Option& opt);

    int submit_and_wait();

 public:
    VkAllocator* weight_vkallocator;
    VkAllocator* staging_vkallocator;

 protected:
    // recording issue
    void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
    void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
    void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t target_queue_family_index);
    int init();
    int begin_command_buffer();
    int end_command_buffer();

 protected:
    size_t buffer_offset_alignment;
    VkBufferMemory* staging_data;
    const VulkanDevice* vkdev;

    // delayed record
    struct record_type
    {
        size_t size;
        Mat mat;
        VkMat vkmat;
    };
    std::vector<record_type> delayed_records;
    VkCommandPool compute_command_pool;
    VkCommandPool transfer_command_pool;

    VkCommandBuffer upload_command_buffer;
    VkCommandBuffer compute_command_buffer;

    VkSemaphore upload_compute_semaphore;

    VkFence upload_command_fence;
    VkFence compute_command_fence;

    std::vector<VkMat> upload_staging_buffers;
 };

 } // namespace ncnn
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -628,6 +628,8 @@ int create_gpu_instance()
        gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount;
        gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount;

        gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index;

        // cache memory properties
        vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties);

--- a/src/gpu.h
+++ b/src/gpu.h
@@ -111,6 +111,9 @@ public:
    uint32_t graphics_queue_count;
    uint32_t transfer_queue_count;

    // property
    bool unified_compute_transfer_queue;

    // bug is not feature
    bool bug_local_size_spec_const;

--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -121,11 +121,7 @@ int Layer::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& t
    top_blobs.resize(bottom_blobs.size());
    for (int i = 0; i < (int)top_blobs.size(); i++)
    {
        top_blobs[i].create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);
        if (top_blobs[i].empty())
            return -100;

        cmd.record_clone(bottom_blobs[i], top_blobs[i]);
        cmd.record_clone(bottom_blobs[i], top_blobs[i], opt);
    }

    return forward_inplace(top_blobs, cmd, opt);
@@ -136,11 +132,7 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co
    if (!support_inplace)
        return -1;

    top_blob.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);
    if (top_blob.empty())
        return -100;

    cmd.record_clone(bottom_blob, top_blob);
    cmd.record_clone(bottom_blob, top_blob, opt);

    return forward_inplace(top_blob, cmd, opt);
 }
--- a/src/layer/vulkan/binaryop_vulkan.cpp
+++ b/src/layer/vulkan/binaryop_vulkan.cpp
@@ -317,21 +317,21 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
    // broadcast
    if (bottom_blob.dims > bottom_blob1.dims)
    {
        top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create_like(bottom_blob, opt.blob_vkallocator);
    }
    else if (bottom_blob.dims < bottom_blob1.dims)
    {
        top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create_like(bottom_blob1, opt.blob_vkallocator);
    }
    else // if (bottom_blob.dims == bottom_blob1.dims)
    {
        if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack)
        {
            top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create_like(bottom_blob, opt.blob_vkallocator);
        }
        else
        {
            top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create_like(bottom_blob1, opt.blob_vkallocator);
        }
    }
    if (top_blob.empty())
--- a/src/layer/vulkan/cast_vulkan.cpp
+++ b/src/layer/vulkan/cast_vulkan.cpp
@@ -234,15 +234,15 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator);
    }
    else if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator);
    }
    else if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator);
    }
    if (top_blob.empty())
        return -100;
--- a/src/layer/vulkan/concat_vulkan.cpp
+++ b/src/layer/vulkan/concat_vulkan.cpp
@@ -312,14 +312,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        VkMat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator);
            if (top_blob_unpacked.empty())
                return -100;
        }
@@ -415,14 +415,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        VkMat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator);
            if (top_blob_unpacked.empty())
                return -100;
        }
@@ -506,7 +506,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

@@ -573,14 +573,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        VkMat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator);
            if (top_blob_unpacked.empty())
                return -100;
        }
@@ -665,7 +665,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

@@ -720,7 +720,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -1010,8 +1010,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad / 2;
@@ -1037,8 +1036,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad - hpad / 2;
@@ -1089,8 +1087,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = 0;
@@ -1110,7 +1107,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
        // transform input
        VkMat bottom_tm_blob;
        {
            bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
            if (bottom_tm_blob.empty())
                return -100;

@@ -1138,7 +1135,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
        // gemm
        VkMat top_tm_blob;
        {
            top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
            if (top_tm_blob.empty())
                return -100;

@@ -1165,7 +1162,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
        // transform output
        VkMat top_blob_bordered;
        {
            top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob_bordered.empty())
                return -100;

@@ -1193,8 +1190,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom

        // crop top_blob
        {
            VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            crop_param_blob.prepare_staging_buffer();
            VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
            int* crop_params = crop_param_blob.mapped();

            crop_params[0] = 0;
@@ -1232,8 +1228,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = 0;
@@ -1253,7 +1248,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
        // transform input
        VkMat bottom_tm_blob;
        {
            bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
            if (bottom_tm_blob.empty())
                return -100;

@@ -1281,7 +1276,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
        // gemm
        VkMat top_tm_blob;
        {
            top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
            top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
            if (top_tm_blob.empty())
                return -100;

@@ -1308,7 +1303,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
        // transform output
        VkMat top_blob_bordered;
        {
            top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob_bordered.empty())
                return -100;

@@ -1336,8 +1331,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom

        // crop top_blob
        {
            VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            crop_param_blob.prepare_staging_buffer();
            VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator);
            int* crop_params = crop_param_blob.mapped();

            crop_params[0] = 0;
@@ -1360,7 +1354,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
    }


    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -534,8 +534,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad / 2;
@@ -561,8 +560,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad - hpad / 2;
@@ -595,7 +593,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

@@ -656,7 +654,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
    VkMat top_blob_unpacked = top_blob;
    if (out_elempack_g < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator);
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
        if (top_blob_unpacked.empty())
            return -100;
    }
--- a/src/layer/vulkan/crop_vulkan.cpp
+++ b/src/layer/vulkan/crop_vulkan.cpp
@@ -381,7 +381,7 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
            packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
        }

        top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

@@ -515,7 +515,7 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM

        VkMat& top_blob = top_blobs[0];

        top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -414,11 +414,11 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
    VkMat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
    }
    else
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    }
    if (top_blob_bordered.empty())
        return -100;
@@ -528,8 +528,7 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
        int wcut = top_blob_bordered_adj.w - output_w;
        int hcut = top_blob_bordered_adj.h - output_h;

        VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
        crop_param_blob.prepare_staging_buffer();
        VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
        int* crop_params = crop_param_blob.mapped();

        if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -597,11 +597,11 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
    VkMat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
    }
    else
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    }
    if (top_blob_bordered.empty())
        return -100;
@@ -681,8 +681,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
            int wcut = top_blob_bordered_adj.w - output_w;
            int hcut = top_blob_bordered_adj.h - output_h;

            VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            crop_param_blob.prepare_staging_buffer();
            VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* crop_params = crop_param_blob.mapped();

            if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
@@ -763,7 +762,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
    VkMat top_blob_unpacked = top_blob_bordered;
    if (out_elempack_g < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator);
        top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator);
        if (top_blob_unpacked.empty())
            return -100;
    }
@@ -883,8 +882,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_
        int wcut = top_blob_bordered_adj.w - output_w;
        int hcut = top_blob_bordered_adj.h - output_h;

        VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
        crop_param_blob.prepare_staging_buffer();
        VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
        int* crop_params = crop_param_blob.mapped();

        if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
--- a/src/layer/vulkan/deepcopy_vulkan.cpp
+++ b/src/layer/vulkan/deepcopy_vulkan.cpp
@@ -144,7 +144,7 @@ int DeepCopy_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkComput
 {
    int elempack = bottom_blob.elempack;

    top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create_like(bottom_blob, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/eltwise_vulkan.cpp
+++ b/src/layer/vulkan/eltwise_vulkan.cpp
@@ -157,7 +157,7 @@ int Eltwise_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
    int elempack = bottom_blob.elempack;

    VkMat& top_blob = top_blobs[0];
    top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/flatten_vulkan.cpp
+++ b/src/layer/vulkan/flatten_vulkan.cpp
@@ -205,7 +205,7 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        return 0;
    }

    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -306,7 +306,7 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/instancenorm_vulkan.cpp
+++ b/src/layer/vulkan/instancenorm_vulkan.cpp
@@ -380,7 +380,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
    int elempack = bottom_top_blob.elempack;

    // mean
    VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
    VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator);
    {
        // reduce sum
        VkMat sum_workspace;
@@ -389,7 +389,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
        int reduced_h = 1;
        int reduced_c = bottom_top_blob.c;

        sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);
        {
        std::vector<VkMat> bindings(2);
        bindings[0] = bottom_top_blob;
@@ -419,7 +419,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
        int reduced_c = sum_workspace.c;

        VkMat sum_workspace_reduced;
        sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);

        {
        std::vector<VkMat> bindings(2);
@@ -466,11 +466,11 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
    }

    // var
    VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
    VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator);
    {
        // sub mean and square
        VkMat square_workspace;
        square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator);
        {
        std::vector<VkMat> bindings(3);
        bindings[0] = bottom_top_blob;
@@ -509,7 +509,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
        int reduced_c = sqsum_workspace.c;

        VkMat sqsum_workspace_reduced;
        sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);

        {
        std::vector<VkMat> bindings(2);
@@ -557,7 +557,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,

    // coeffs
    VkMat coeffs_workspace;
    coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator, opt.staging_vkallocator);
    coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator);
    {
    std::vector<VkMat> bindings(5);
    bindings[0] = coeffs_workspace;
--- a/src/layer/vulkan/interp_vulkan.cpp
+++ b/src/layer/vulkan/interp_vulkan.cpp
@@ -274,7 +274,7 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

@@ -306,11 +306,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
    }
    else if (resize_type == 3) // bicubic
    {
        VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator);
        VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator);
        if (alpha.empty())
            return -100;

        VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
        VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator);
        if (xofs.empty())
            return -100;

@@ -328,11 +328,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
            cmd.record_pipeline(pipeline_interp_bicubic_coeffs_x, bindings, constants, alpha);
        }

        VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator);
        VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator);
        if (beta.empty())
            return -100;

        VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
        VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator);
        if (yofs.empty())
            return -100;

--- a/src/layer/vulkan/lrn_vulkan.cpp
+++ b/src/layer/vulkan/lrn_vulkan.cpp
@@ -254,11 +254,11 @@ int LRN_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Op
    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        // always create scalar square workspace blob for norm across channel
        square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
        square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator);
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator);
    }

    // square pad
--- a/src/layer/vulkan/normalize_vulkan.cpp
+++ b/src/layer/vulkan/normalize_vulkan.cpp
@@ -298,7 +298,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
            reduced_c = (bottom_top_blob.c + 3) / 4;
        }

        sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);
        {
        std::vector<VkMat> bindings(2);
        bindings[0] = bottom_top_blob;
@@ -347,7 +347,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
        }

        VkMat sqsum_workspace_reduced;
        sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator);

        {
        std::vector<VkMat> bindings(2);
@@ -377,7 +377,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co

    // coeffs
    VkMat coeffs_workspace;
    coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
    coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator);
    {
        std::vector<VkMat> bindings(2);
        bindings[0] = sqsum_workspace;
--- a/src/layer/vulkan/packing_vulkan.cpp
+++ b/src/layer/vulkan/packing_vulkan.cpp
@@ -203,7 +203,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            if (out_elempack == 1) out_elemsize = 4u;
        }

        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;
    }
@@ -219,7 +219,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            if (out_elempack == 1) out_elemsize = 4u;
        }

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;
    }
@@ -235,7 +235,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            if (out_elempack == 1) out_elemsize = 4u;
        }

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;
    }
--- a/src/layer/vulkan/padding_vulkan.cpp
+++ b/src/layer/vulkan/padding_vulkan.cpp
@@ -170,7 +170,7 @@ int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    int outw = w + left + right;
    int outh = h + top + bottom;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

@@ -239,7 +239,7 @@ int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<
    int outw = w + _left + _right;
    int outh = h + _top + _bottom;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/permute_vulkan.cpp
+++ b/src/layer/vulkan/permute_vulkan.cpp
@@ -270,7 +270,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            if (out_elempack == 1) out_elemsize = 4u;
        }

        top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;
    }
@@ -329,7 +329,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            if (out_elempack == 1) out_elemsize = 4u;
        }

        top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;
    }
--- a/src/layer/vulkan/pixelshuffle_vulkan.cpp
+++ b/src/layer/vulkan/pixelshuffle_vulkan.cpp
@@ -200,7 +200,7 @@ int PixelShuffle_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/pooling_vulkan.cpp
+++ b/src/layer/vulkan/pooling_vulkan.cpp
@@ -287,7 +287,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute

    if (global_pooling)
    {
        top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

@@ -295,7 +295,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        bindings[0] = bottom_blob;
        bindings[1] = top_blob;

        std::vector<vk_constant_type> constants(12);
        std::vector<vk_constant_type> constants(10);
        constants[0].i = bottom_blob.dims;
        constants[1].i = bottom_blob.w;
        constants[2].i = bottom_blob.h;
@@ -306,8 +306,6 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        constants[7].i = top_blob.h;
        constants[8].i = top_blob.c;
        constants[9].i = top_blob.cstep;
        constants[10].i = 0;
        constants[11].i = 0;

        const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8
                                 : elempack == 4 ? pipeline_pooling_global_pack4
@@ -336,8 +334,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        Option opt_pad = opt;
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;

        VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
        padding_param_blob.prepare_staging_buffer();
        VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
        int* padding_params = padding_param_blob.mapped();

        padding_params[0] = pad_top;
@@ -369,8 +366,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad / 2;
@@ -396,8 +392,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            Option opt_pad = opt;
            opt_pad.blob_vkallocator = opt.workspace_vkallocator;

            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
            padding_param_blob.prepare_staging_buffer();
            VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator);
            int* padding_params = padding_param_blob.mapped();

            padding_params[0] = hpad - hpad / 2;
@@ -421,7 +416,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/priorbox_vulkan.cpp
+++ b/src/layer/vulkan/priorbox_vulkan.cpp
@@ -163,7 +163,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
        }

        VkMat& top_blob = top_blobs[0];
        top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

@@ -217,7 +217,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
    }

    VkMat& top_blob = top_blobs[0];
    top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/reorg_vulkan.cpp
+++ b/src/layer/vulkan/reorg_vulkan.cpp
@@ -192,7 +192,7 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
        if (out_elempack == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/reshape_vulkan.cpp
+++ b/src/layer/vulkan/reshape_vulkan.cpp
@@ -275,7 +275,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            return 0;
        }

        top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    }
    else if (ndim == 2)
    {
@@ -308,7 +308,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            return 0;
        }

        top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    }
    else // if (ndim == 3)
    {
@@ -348,7 +348,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
            return 0;
        }

        top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
    }

    if (top_blob.empty())
--- a/src/layer/vulkan/shufflechannel_vulkan.cpp
+++ b/src/layer/vulkan/shufflechannel_vulkan.cpp
@@ -142,7 +142,7 @@ int ShuffleChannel_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, Vk
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/slice_vulkan.cpp
+++ b/src/layer/vulkan/slice_vulkan.cpp
@@ -314,7 +314,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
            }

            VkMat& top_blob = top_blobs[i];
            top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

@@ -416,7 +416,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
            }

            VkMat& top_blob = top_blobs[i];
            top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

@@ -508,7 +508,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
            }

            VkMat& top_blob = top_blobs[i];
            top_blob.create(slice, h, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create(slice, h, elemsize, elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

@@ -576,7 +576,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
            }

            VkMat& top_blob = top_blobs[i];
            top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

@@ -669,7 +669,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
            }

            VkMat& top_blob = top_blobs[i];
            top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

@@ -727,7 +727,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk
            }

            VkMat& top_blob = top_blobs[i];
            top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator);
            top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

--- a/src/layer/vulkan/softmax_vulkan.cpp
+++ b/src/layer/vulkan/softmax_vulkan.cpp
@@ -283,33 +283,33 @@ int Softmax_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons

    if (dims == 1) // axis == 0
    {
        max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator);
        sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator);
    }
    else if (dims == 2 && axis == 0)
    {
        max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator);
        sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator);
    }
    else if (dims == 2 && axis == 1)
    {
        max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator);
        sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator);
    }
    else if (dims == 3 && axis == 0)
    {
        max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator);
        sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator);
    }
    else if (dims == 3 && axis == 1)
    {
        max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator);
        sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator);
    }
    else if (dims == 3 && axis == 2)
    {
        max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator);
        max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator);
        sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator);
    }

    // reduce max
--- a/src/mat.h
+++ b/src/mat.h
@@ -261,59 +261,51 @@ public:
    // empty
    VkMat();
    // vec
    VkMat(int w, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, size_t elemsize, VkAllocator* allocator);
    // image
    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
    // dim
    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
    // packed vec
    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
    // packed image
    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
    // packed dim
    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
    // copy
    VkMat(const VkMat& m);
    // external vec
    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
    // external image
    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
    // external dim
    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
    // external packed vec
    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
    // external packed image
    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
    // external packed dim
    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
    // release
    ~VkMat();
    // assign
    VkMat& operator=(const VkMat& m);
    // allocate vec
    void create(int w, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create(int w, size_t elemsize, VkAllocator* allocator);
    // allocate image
    void create(int w, int h, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
    // allocate dim
    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
    // allocate packed vec
    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
    // allocate packed image
    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
    // allocate packed dim
    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
    // allocate like
    void create_like(const Mat& m, VkAllocator* allocator, VkAllocator* staging_allocator);
    void create_like(const Mat& m, VkAllocator* allocator);
    // allocate like
    void create_like(const VkMat& m, VkAllocator* allocator, VkAllocator* staging_allocator);

    // staging buffer
    void prepare_staging_buffer();
    void discard_staging_buffer();

    // copy
    void upload(const Mat& m);
    void download(Mat& m) const;
    void create_like(const VkMat& m, VkAllocator* allocator);

    // mapped
    Mat mapped() const;
@@ -333,19 +325,14 @@ public:
    // low-level reference
    VkBuffer buffer() const;
    size_t buffer_offset() const;
    VkBuffer staging_buffer() const;
    size_t staging_buffer_offset() const;
    size_t buffer_capacity() const;

    // device buffer
    VkBufferMemory* data;

    // staging buffer
    VkBufferMemory* staging_data;

    // pointer to the reference counter
    // when points to user-allocated data, the pointer is NULL
    int* refcount;
    int* staging_refcount;

    // element size in bytes
    // 4 = float32/int32
@@ -362,7 +349,6 @@ public:

    // the allocator
    VkAllocator* allocator;
    VkAllocator* staging_allocator;

    // the dimension rank
    int dims;
@@ -1234,90 +1220,87 @@ inline const float& Mat::operator[](size_t i) const
 #if NCNN_VULKAN

 inline VkMat::VkMat()
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
 }

 inline VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 inline VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
    create(_w, _elemsize, _allocator, _staging_allocator);
    create(_w, _elemsize, _allocator);
 }

 inline VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 inline VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
    create(_w, _h, _elemsize, _allocator, _staging_allocator);
    create(_w, _h, _elemsize, _allocator);
 }

 inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
    create(_w, _h, _c, _elemsize, _allocator, _staging_allocator);
    create(_w, _h, _c, _elemsize, _allocator);
 }

 inline VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 inline VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
    create(_w, _elemsize, _elempack, _allocator, _staging_allocator);
    create(_w, _elemsize, _elempack, _allocator);
 }

 inline VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 inline VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
    create(_w, _h, _elemsize, _elempack, _allocator, _staging_allocator);
    create(_w, _h, _elemsize, _elempack, _allocator);
 }

 inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0)
 {
    create(_w, _h, _c, _elemsize, _elempack, _allocator, _staging_allocator);
    create(_w, _h, _c, _elemsize, _elempack, _allocator);
 }

 inline VkMat::VkMat(const VkMat& m)
    : data(m.data), staging_data(m.staging_data), refcount(m.refcount), staging_refcount(m.staging_refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), staging_allocator(m.staging_allocator), dims(m.dims), w(m.w), h(m.h), c(m.c)
    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c)
 {
    if (refcount)
        NCNN_XADD(refcount, 1);

    if (staging_refcount)
        NCNN_XADD(staging_refcount, 1);

    cstep = m.cstep;
 }

 inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(1), w(_w), h(1), c(1)
 inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1)
 {
    cstep = w;
 }

 inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(2), w(_w), h(_h), c(1)
 inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1)
 {
    cstep = w * h;
 }

 inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(3), w(_w), h(_h), c(_c)
 inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c)
 {
    cstep = alignSize(w * h * elemsize, 16) / elemsize;
 }

 inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(1), w(_w), h(1), c(1)
 inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1)
 {
    cstep = w;
 }

 inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(2), w(_w), h(_h), c(1)
 inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1)
 {
    cstep = w * h;
 }

 inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
    : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(3), w(_w), h(_h), c(_c)
 inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c)
 {
    cstep = alignSize(w * h * elemsize, 16) / elemsize;
 }
@@ -1335,19 +1318,13 @@ inline VkMat& VkMat::operator=(const VkMat& m)
    if (m.refcount)
        NCNN_XADD(m.refcount, 1);

    if (m.staging_refcount)
        NCNN_XADD(m.staging_refcount, 1);

    release();

    data = m.data;
    staging_data = m.staging_data;
    refcount = m.refcount;
    staging_refcount = m.staging_refcount;
    elemsize = m.elemsize;
    elempack = m.elempack;
    allocator = m.allocator;
    staging_allocator = m.staging_allocator;

    dims = m.dims;
    w = m.w;
@@ -1359,9 +1336,9 @@ inline VkMat& VkMat::operator=(const VkMat& m)
    return *this;
 }

 inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator)
 {
    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator)
    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
        return;

    release();
@@ -1369,7 +1346,6 @@ inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkA
    elemsize = _elemsize;
    elempack = 1;
    allocator = _allocator;
    staging_allocator = _staging_allocator;

    dims = 1;
    w = _w;
@@ -1389,9 +1365,9 @@ inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkA
    }
 }

 inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
 {
    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator)
    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
        return;

    release();
@@ -1399,7 +1375,6 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _alloca
    elemsize = _elemsize;
    elempack = 1;
    allocator = _allocator;
    staging_allocator = _staging_allocator;

    dims = 2;
    w = _w;
@@ -1419,9 +1394,9 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _alloca
    }
 }

 inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
 {
    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator)
    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
        return;

    release();
@@ -1429,7 +1404,6 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator*
    elemsize = _elemsize;
    elempack = 1;
    allocator = _allocator;
    staging_allocator = _staging_allocator;

    dims = 3;
    w = _w;
@@ -1449,9 +1423,9 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator*
    }
 }

 inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
 {
    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator)
    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
        return;

    release();
@@ -1459,7 +1433,6 @@ inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator*
    elemsize = _elemsize;
    elempack = _elempack;
    allocator = _allocator;
    staging_allocator = _staging_allocator;

    dims = 1;
    w = _w;
@@ -1479,9 +1452,9 @@ inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator*
    }
 }

 inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
 {
    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator)
    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
        return;

    release();
@@ -1489,7 +1462,6 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAll
    elemsize = _elemsize;
    elempack = _elempack;
    allocator = _allocator;
    staging_allocator = _staging_allocator;

    dims = 2;
    w = _w;
@@ -1509,9 +1481,9 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAll
    }
 }

 inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
 {
    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator)
    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
        return;

    release();
@@ -1519,7 +1491,6 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempac
    elemsize = _elemsize;
    elempack = _elempack;
    allocator = _allocator;
    staging_allocator = _staging_allocator;

    dims = 3;
    w = _w;
@@ -1539,82 +1510,33 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempac
    }
 }

 inline void VkMat::create_like(const Mat& m, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create_like(const Mat& m, VkAllocator* _allocator)
 {
    int _dims = m.dims;
    if (_dims == 1)
        create(m.w, m.elemsize, m.elempack, _allocator, _staging_allocator);
        create(m.w, m.elemsize, m.elempack, _allocator);
    if (_dims == 2)
        create(m.w, m.h, m.elemsize, m.elempack, _allocator, _staging_allocator);
        create(m.w, m.h, m.elemsize, m.elempack, _allocator);
    if (_dims == 3)
        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator, _staging_allocator);
        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
 }

 inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator, VkAllocator* _staging_allocator)
 inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator)
 {
    int _dims = m.dims;
    if (_dims == 1)
        create(m.w, m.elemsize, m.elempack, _allocator, _staging_allocator);
        create(m.w, m.elemsize, m.elempack, _allocator);
    if (_dims == 2)
        create(m.w, m.h, m.elemsize, m.elempack, _allocator, _staging_allocator);
        create(m.w, m.h, m.elemsize, m.elempack, _allocator);
    if (_dims == 3)
        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator, _staging_allocator);
 }

 inline void VkMat::prepare_staging_buffer()
 {
    if (allocator->mappable)
        return;

    if (staging_allocator && staging_data)
        return;

    size_t totalsize = alignSize(total() * elemsize, 4);
    staging_data = staging_allocator->fastMalloc(totalsize);

    staging_refcount = (int*)((unsigned char*)staging_data + offsetof(VkBufferMemory, refcount));
    *staging_refcount = 1;
 }

 inline void VkMat::discard_staging_buffer()
 {
    if (allocator->mappable)
        return;

    if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
    {
        if (staging_allocator && staging_data)
        {
            staging_allocator->fastFree(staging_data);
        }
    }

    staging_data = 0;
    staging_refcount = 0;
 }

 inline void VkMat::upload(const Mat& m)
 {
    memcpy(mapped_ptr(), m.data, m.total() * m.elemsize);

    if (allocator->mappable)
    {
        allocator->flush(data);
    }
 }

 inline void VkMat::download(Mat& m) const
 {
    if (allocator->mappable)
    {
        allocator->invalidate(data);
    }

    memcpy(m.data, mapped_ptr(), total() * elemsize);
        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
 }

 inline Mat VkMat::mapped() const
 {
    if (!allocator->mappable)
        return Mat();

    if (dims == 1)
        return Mat(w, mapped_ptr(), elemsize, elempack, 0);

@@ -1629,17 +1551,16 @@ inline Mat VkMat::mapped() const

 inline void* VkMat::mapped_ptr() const
 {
    VkBufferMemory* mappable_data = allocator->mappable ? data : staging_data;
    return (unsigned char*)mappable_data->mapped_ptr + mappable_data->offset;
    if (!allocator->mappable)
        return 0;

    return (unsigned char*)data->mapped_ptr + data->offset;
 }

 inline void VkMat::addref()
 {
    if (refcount)
        NCNN_XADD(refcount, 1);

    if (staging_refcount)
        NCNN_XADD(staging_refcount, 1);
 }

 inline void VkMat::release()
@@ -1652,16 +1573,7 @@ inline void VkMat::release()
        }
    }

    if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1)
    {
        if (staging_allocator && staging_data)
        {
            staging_allocator->fastFree(staging_data);
        }
    }

    data = 0;
    staging_data = 0;

    elemsize = 0;
    elempack = 0;
@@ -1674,7 +1586,6 @@ inline void VkMat::release()
    cstep = 0;

    refcount = 0;
    staging_refcount = 0;
 }

 inline bool VkMat::empty() const
@@ -1709,14 +1620,9 @@ inline size_t VkMat::buffer_offset() const
    return data->offset;
 }

 inline VkBuffer VkMat::staging_buffer() const
 {
    return staging_data->buffer;
 }

 inline size_t VkMat::staging_buffer_offset() const
 inline size_t VkMat::buffer_capacity() const
 {
    return staging_data->offset;
    return data->capacity;
 }

 inline VkImageMat::VkImageMat()
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -924,14 +924,16 @@ int Net::upload_model()
        weight_staging_vkallocator = new VkWeightStagingBufferAllocator(vkdev);
    }

    cmd.weight_vkallocator = weight_vkallocator;
    cmd.staging_vkallocator = weight_staging_vkallocator;
    Option opt_upload = opt;
    opt_upload.blob_vkallocator = weight_vkallocator;
    opt_upload.workspace_vkallocator = weight_vkallocator;
    opt_upload.staging_vkallocator = weight_staging_vkallocator;

    for (size_t i=0; i<layers.size(); i++)
    {
        if (layers[i]->support_vulkan)
        {
            int uret = layers[i]->upload_model(cmd, opt);
            int uret = layers[i]->upload_model(cmd, opt_upload);
            if (uret != 0)
            {
                fprintf(stderr, "layer upload_model %d failed\n", (int)i);
@@ -1347,12 +1349,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

                    // upload
                    VkMat bottom_blob_unpacked;
                    bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator);

                    bottom_blob_unpacked.prepare_staging_buffer();
                    bottom_blob_unpacked.upload(bottom_blob_cpu_fp16);

                    cmd.record_upload(bottom_blob_unpacked);
                    cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt);

                    // cast to fp16 (integrated gpu)
                    VkMat bottom_blob_unpacked_fp16;
@@ -1390,11 +1387,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                if (layer->support_inplace && *bottom_blob.refcount != 1)
                {
                    VkMat bottom_blob_copy;
                    bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

                    cmd.record_clone(bottom_blob, bottom_blob_copy, opt);
 //                     fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

                    cmd.record_clone(bottom_blob, bottom_blob_copy);
                    bottom_blob = bottom_blob_copy;
                }
            }
@@ -1437,7 +1431,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
        {
            // load bottom blobs
            std::vector<VkMat> bottom_blobs(layer->bottoms.size());
            std::vector<VkMat> bottom_blobs_unpacked(layer->bottoms.size());
            for (size_t i=0; i<layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];
@@ -1471,13 +1464,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                        }

                        // upload
                        VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i];
                        bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator);

                        bottom_blob_unpacked.prepare_staging_buffer();
                        bottom_blob_unpacked.upload(bottom_blob_cpu_fp16);

                        cmd.record_upload(bottom_blob_unpacked);
                        VkMat bottom_blob_unpacked;
                        cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt);

                        // cast to fp16 (integrated gpu)
                        VkMat bottom_blob_unpacked_fp16;
@@ -1515,11 +1503,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                    if (layer->support_inplace && *bottom_blobs[i].refcount != 1)
                    {
                        VkMat bottom_blob_copy;
                        bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator);

                        cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt);
 //                         fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

                        cmd.record_clone(bottom_blobs[i], bottom_blob_copy);
                        bottom_blobs[i] = bottom_blob_copy;
                    }
                }
@@ -1602,11 +1587,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                        if (layer->support_inplace && *bottom_blob.refcount != 1)
                        {
                            VkMat bottom_blob_copy;
                            bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

                            cmd.record_clone(bottom_blob, bottom_blob_copy, opt);
 //                             fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

                            cmd.record_clone(bottom_blob, bottom_blob_copy);
                            bottom_blob = bottom_blob_copy;
                        }
                    }
@@ -1614,7 +1596,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                    VkMat bottom_blob_unpacked_fp16;
                    if (opt.use_packing_layout && layer->support_packing)
                    {
 //                         bottom_blob_unpacked_fp16 = bottom_blob;
                        packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt);
                    }
                    else
@@ -1635,8 +1616,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                    }

                    // download
                    bottom_blob_unpacked.prepare_staging_buffer();
                    cmd.record_download(bottom_blob_unpacked);
                    Mat bottom_blob_cpu_fp16;
                    cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt);

                    cmd.submit_and_wait();

@@ -1657,12 +1638,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

                    cmd.reset();

                    Mat bottom_blob_cpu_fp16;
                    bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator);
                    bottom_blob_unpacked.download(bottom_blob_cpu_fp16);

                    bottom_blob_unpacked.discard_staging_buffer();

                    // cast to fp32 (discrete gpu)
                    Mat& bottom_blob_cpu = blob_mats[bottom_blob_index];
                    if (opt.use_fp16_storage && vkdev->info.type == 0)
@@ -1742,7 +1717,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
        else
        {
            // load bottom blobs
            std::vector<VkMat> bottom_blobs_unpacked(layer->bottoms.size());
            std::vector<Mat> bottom_blobs_cpu_fp16(layer->bottoms.size());
            for (size_t i=0; i<layer->bottoms.size(); i++)
            {
                int bottom_blob_index = layer->bottoms[i];
@@ -1770,11 +1745,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                            if (layer->support_inplace && *bottom_blob.refcount != 1)
                            {
                                VkMat bottom_blob_copy;
                                bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator);

                                cmd.record_clone(bottom_blob, bottom_blob_copy, opt);
 //                                 fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset());

                                cmd.record_clone(bottom_blob, bottom_blob_copy);
                                bottom_blob = bottom_blob_copy;
                            }
                        }
@@ -1782,7 +1754,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                        VkMat bottom_blob_unpacked_fp16;
                        if (opt.use_packing_layout && layer->support_packing)
                        {
 //                             bottom_blob_unpacked_fp16 = bottom_blob;
                            packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt);
                        }
                        else
@@ -1792,7 +1763,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                        }

                        // cast to fp32 (integrated gpu)
                        VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i];
                        VkMat bottom_blob_unpacked;
                        if (opt.use_fp16_storage && vkdev->info.type != 0)
                        {
                            cast_float16_to_float32->forward(bottom_blob_unpacked_fp16, bottom_blob_unpacked, cmd, opt);
@@ -1803,8 +1774,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                        }

                        // download
                        bottom_blob_unpacked.prepare_staging_buffer();
                        cmd.record_download(bottom_blob_unpacked);
                        Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i];
                        cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt);
                    }
                }
            }
@@ -1837,13 +1808,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

                if (blob_mats[bottom_blob_index].dims == 0)
                {
                    VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i];

                    Mat bottom_blob_cpu_fp16;
                    bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator);
                    bottom_blob_unpacked.download(bottom_blob_cpu_fp16);

                    bottom_blob_unpacked.discard_staging_buffer();
                    const Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i];

                    // cast to fp32 (discrete gpu)
                    Mat& bottom_blob_cpu = blob_mats[bottom_blob_index];
@@ -1884,7 +1849,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                }
            }

            bottom_blobs_unpacked.clear();
            bottom_blobs_cpu_fp16.clear();

            // forward
            if (opt.lightmode && layer->support_inplace)
@@ -2113,15 +2078,15 @@ int Extractor::extract(int blob_index, Mat& feat)
                }

                // download
                feat_gpu_unpacked.prepare_staging_buffer();
                cmd.record_download(feat_gpu_unpacked);
                Mat feat_cpu_fp16;
                cmd.record_download(feat_gpu_unpacked, feat_cpu_fp16, opt);

                cmd.submit_and_wait();

 #if NCNN_BENCHMARK
                std::vector<uint64_t> results(net->layers.size() * 2);
                cmd.get_query_pool_results(0, net->layers.size() * 2, results);
                for (int i=0; i<net->layers.size(); i++)
                for (size_t i=0; i<net->layers.size(); i++)
                {
                    uint64_t start = results[i*2];
                    uint64_t end = results[i*2+1];
@@ -2133,12 +2098,6 @@ int Extractor::extract(int blob_index, Mat& feat)
                }
 #endif // NCNN_BENCHMARK

                Mat feat_cpu_fp16;
                feat_cpu_fp16.create_like(feat_gpu_unpacked, opt.blob_allocator);
                feat_gpu_unpacked.download(feat_cpu_fp16);

                feat_gpu_unpacked.discard_staging_buffer();

                // cast to fp32 (discrete gpu)
                Mat& feat_cpu = blob_mats[blob_index];
                if (opt.use_fp16_storage && net->vkdev->info.type == 0)
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -267,7 +267,7 @@ int Pipeline::create_pipeline_layout(int push_constant_count)
    VkPushConstantRange pushConstantRange;
    pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
    pushConstantRange.offset = 0;
    pushConstantRange.size = sizeof(int) * push_constant_count;
    pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count;

    VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo;
    pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
--- a/tests/test_cast.cpp
+++ b/tests/test_cast.cpp
@@ -207,39 +207,30 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to)
        a4_fp16 = a4;
    }

    // upload
    ncnn::VkMat a4_gpu;
    a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator);
    a4_gpu.prepare_staging_buffer();
    a4_gpu.upload(a4_fp16);

    // forward
    ncnn::VkCompute cmd(vkdev);

    cmd.record_upload(a4_gpu);
    // upload
    ncnn::VkMat a4_gpu;
    cmd.record_upload(a4_fp16, a4_gpu, opt);

    ncnn::VkMat d4_gpu;
    if (op->support_inplace)
    {
        d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator);
        cmd.record_clone(a4_gpu, d4_gpu);
        op->forward_inplace(d4_gpu, cmd, opt);
        op->forward_inplace(a4_gpu, cmd, opt);

        d4_gpu = a4_gpu;
    }
    else
    {
        op->forward(a4_gpu, d4_gpu, cmd, opt);
    }

    d4_gpu.prepare_staging_buffer();

    cmd.record_download(d4_gpu);
    // download
    cmd.record_download(d4_gpu, d, opt);

    cmd.submit_and_wait();

    // download
    d.create_like(d4_gpu);
    d4_gpu.download(d);

    op->destroy_pipeline(opt);

    delete op;
@@ -331,39 +322,30 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type
        a4_fp16 = a4;
    }

    // upload
    ncnn::VkMat a4_gpu;
    a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator);
    a4_gpu.prepare_staging_buffer();
    a4_gpu.upload(a4_fp16);

    // forward
    ncnn::VkCompute cmd(vkdev);

    cmd.record_upload(a4_gpu);
    // upload
    ncnn::VkMat a4_gpu;
    cmd.record_upload(a4_fp16, a4_gpu, opt);

    ncnn::VkMat d4_gpu;
    if (op->support_inplace)
    {
        d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator);
        cmd.record_clone(a4_gpu, d4_gpu);
        op->forward_inplace(d4_gpu, cmd, opt);
        op->forward_inplace(a4_gpu, cmd, opt);

        d4_gpu = a4_gpu;
    }
    else
    {
        op->forward(a4_gpu, d4_gpu, cmd, opt);
    }

    d4_gpu.prepare_staging_buffer();

    cmd.record_download(d4_gpu);
    // download
    cmd.record_download(d4_gpu, d, opt);

    cmd.submit_and_wait();

    // download
    d.create_like(d4_gpu);
    d4_gpu.download(d);

    op->destroy_pipeline(opt);

    delete op;
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -261,10 +261,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
    if (opt.use_vulkan_compute)
    {
        ncnn::VkTransfer cmd(vkdev);
        cmd.weight_vkallocator = &g_weight_vkallocator;
        cmd.staging_vkallocator = &g_weight_staging_vkallocator;

        op->upload_model(cmd, opt);
        ncnn::Option opt_upload = opt;
        opt_upload.blob_vkallocator = &g_weight_vkallocator;
        opt_upload.workspace_vkallocator = &g_weight_vkallocator;
        opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

        op->upload_model(cmd, opt_upload);

        cmd.submit_and_wait();
    }
@@ -367,57 +370,35 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
            }
        }

        // upload
        std::vector<ncnn::VkMat> a4_fp16_gpu(a4_fp16.size());
        for (size_t i=0; i<a4_fp16.size(); i++)
        {
            a4_fp16_gpu[i].create_like(a4_fp16[i], opt.blob_vkallocator, opt.staging_vkallocator);
            a4_fp16_gpu[i].prepare_staging_buffer();
            a4_fp16_gpu[i].upload(a4_fp16[i]);
        }

        // forward
        ncnn::VkCompute cmd(vkdev);

        // upload
        std::vector<ncnn::VkMat> a4_fp16_gpu(a4_fp16.size());
        for (size_t i=0; i<a4_fp16_gpu.size(); i++)
        {
            cmd.record_upload(a4_fp16_gpu[i]);
            cmd.record_upload(a4_fp16[i], a4_fp16_gpu[i], opt);
        }

        std::vector<ncnn::VkMat> d4_fp16_gpu(top_blob_count);
        if (op->support_inplace)
        {
            for (size_t i=0; i<a4_fp16_gpu.size(); i++)
            {
                d4_fp16_gpu[i].create_like(a4_fp16_gpu[i], a4_fp16_gpu[i].allocator, a4_fp16_gpu[i].staging_allocator);
                cmd.record_clone(a4_fp16_gpu[i], d4_fp16_gpu[i]);
            }
            op->forward_inplace(a4_fp16_gpu, cmd, opt);

            op->forward_inplace(d4_fp16_gpu, cmd, opt);
            d4_fp16_gpu = a4_fp16_gpu;
        }
        else
        {
            op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt);
        }

        // download
        for (size_t i=0; i<d4_fp16_gpu.size(); i++)
        {
            d4_fp16_gpu[i].prepare_staging_buffer();
        }

        for (size_t i=0; i<d4_fp16_gpu.size(); i++)
        {
            cmd.record_download(d4_fp16_gpu[i]);
            cmd.record_download(d4_fp16_gpu[i], d[i], opt);
        }

        cmd.submit_and_wait();

        // download
        for (size_t i=0; i<d4_fp16_gpu.size(); i++)
        {
            d[i].create_like(d4_fp16_gpu[i]);
            d4_fp16_gpu[i].download(d[i]);
        }
    }
 #endif // NCNN_VULKAN

@@ -509,14 +490,15 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
    if (opt.use_vulkan_compute)
    {
        ncnn::VkTransfer cmd(vkdev);
        cmd.weight_vkallocator = &g_weight_vkallocator;
        cmd.staging_vkallocator = &g_weight_staging_vkallocator;

        op->upload_model(cmd, opt);
        ncnn::Option opt_upload = opt;
        opt_upload.blob_vkallocator = &g_weight_vkallocator;
        opt_upload.workspace_vkallocator = &g_weight_vkallocator;
        opt_upload.staging_vkallocator = &g_weight_staging_vkallocator;

        cmd.submit_and_wait();
        op->upload_model(cmd, opt_upload);

        g_weight_staging_vkallocator.clear();
        cmd.submit_and_wait();
    }
 #endif // NCNN_VULKAN

@@ -594,38 +576,29 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn:
            a4_fp16 = a4;
        }

        // upload
        ncnn::VkMat a4_fp16_gpu;
        a4_fp16_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator);
        a4_fp16_gpu.prepare_staging_buffer();
        a4_fp16_gpu.upload(a4_fp16);

        // forward
        ncnn::VkCompute cmd(vkdev);

        cmd.record_upload(a4_fp16_gpu);
        // upload
        ncnn::VkMat a4_fp16_gpu;
        cmd.record_upload(a4_fp16, a4_fp16_gpu, opt);

        ncnn::VkMat d4_fp16_gpu;
        if (op->support_inplace)
        {
            d4_fp16_gpu.create_like(a4_fp16_gpu, a4_fp16_gpu.allocator, a4_fp16_gpu.staging_allocator);
            cmd.record_clone(a4_fp16_gpu, d4_fp16_gpu);
            op->forward_inplace(d4_fp16_gpu, cmd, opt);
            op->forward_inplace(a4_fp16_gpu, cmd, opt);

            d4_fp16_gpu = a4_fp16_gpu;
        }
        else
        {
            op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt);
        }

        d4_fp16_gpu.prepare_staging_buffer();

        cmd.record_download(d4_fp16_gpu);
        // download
        cmd.record_download(d4_fp16_gpu, d, opt);

        cmd.submit_and_wait();

        // download
        d.create_like(d4_fp16_gpu);
        d4_fp16_gpu.download(d);
    }
 #endif // NCNN_VULKAN