More fixes for fp16p; still disabled by default

7 years ago · cd7559c639
--- a/src/command.cpp
+++ b/src/command.cpp
@@ -731,7 +731,7 @@ void VkCompute::dispatch(const uint32_t* group_count_xyz)

 void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size)
 {
 //     fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu]\n", buffer, offset);
 //     fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu] %lu\n", buffer, offset, size);

    VkBufferMemoryBarrier bufferBarrier;
    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -752,7 +752,7 @@ void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t

 void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size)
 {
 //     fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu]\n", buffer, offset);
 //     fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size);

    VkBufferMemoryBarrier bufferBarrier;
    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -773,7 +773,7 @@ void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t

 void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size)
 {
 //     fprintf(stderr, "cmd compute_compute_barrier %p[+%lu]\n", buffer, offset);
 //     fprintf(stderr, "cmd compute_compute_barrier %p[+%lu] %lu\n", buffer, offset, size);

    VkBufferMemoryBarrier bufferBarrier;
    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -794,7 +794,7 @@ void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t s

 void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size)
 {
 //     fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu]\n", buffer, offset);
 //     fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size);

    VkBufferMemoryBarrier bufferBarrier;
    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -843,30 +843,35 @@ VkTransfer::~VkTransfer()

 // Upload a host-side weight Mat into a device-side VkMat.
 //
 // src: host data to upload.
 // dst: destination VkMat; it is (re)created here from weight_vkallocator /
 //      staging_vkallocator and marked read-only.
 //
 // When each scalar of src is 4 bytes wide (elemsize / packing == 4) and the
 // device supports fp16 storage -- or fp16 packed with a packing that is a
 // multiple of 4 -- the data is first converted with cast_float32_to_float16()
 // and the converted Mat is uploaded instead.
 // NOTE(review): the recursion presumably terminates because the converted Mat
 // no longer has 4-byte scalars -- confirm against cast_float32_to_float16().
 //
 // The data is flattened to one dimension before upload. If the destination
 // allocator is mappable the copy happens immediately; otherwise the upload is
 // queued in delayed_records.
 void VkTransfer::record_upload(const Mat& src, VkMat& dst)
 {
    if (src.elemsize / src.packing == 4)
    {
        // fp16 packed is only usable when the elements come in groups of 4
        if (vkdev->info.support_fp16_storage || (vkdev->info.support_fp16_packed && src.packing % 4 == 0))
        {
            Mat src_fp16;
            cast_float32_to_float16(src, src_fp16);

            record_upload(src_fp16, dst);

            return;
        }
    }

    // upload as a flat 1-D blob of w * h * c elements
    Mat src_flattened = src.reshape(src.w * src.h * src.c);

    dst.create_like(src_flattened, weight_vkallocator, staging_vkallocator);

    // set weight blob as readonly
    dst.data->state = 4;

    if (dst.allocator->mappable)
    {
        // destination memory is host-mappable: copy right away
        dst.upload(src_flattened);
        return;
    }

    // not mappable: remember the transfer so it can be performed later
    record_type r;
    r.size = src_flattened.total() * src_flattened.elemsize;
    r.mat = src_flattened;
    r.vkmat = dst;
    delayed_records.push_back(r);
 }
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -712,8 +712,8 @@ int create_gpu_instance()
                gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count,
                gpu_info.unified_memory_index, gpu_info.device_local_memory_index, gpu_info.host_visible_memory_index);

        fprintf(stderr, "[%u %s]  fp16s=%d  fp16a=%d  int8s=%d  int8a=%d\n", i, physicalDeviceProperties.deviceName,
                gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
        fprintf(stderr, "[%u %s]  fp16p=%d  fp16s=%d  fp16a=%d  int8s=%d  int8a=%d\n", i, physicalDeviceProperties.deviceName,
                gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
                gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);

        gpu_info_index++;
@@ -999,6 +999,15 @@ VkAllocator* VulkanDevice::staging_allocator() const
    return staging_buffer_allocator;
 }

 // Returns true when the NUL-terminated string `name` ends with the suffix
 // "_fp16p", false otherwise (including when `name` is shorter than the suffix).
 // `name` must not be null.
 static inline bool string_ends_with_fp16p(const char* name)
 {
    static const char suffix[] = "_fp16p";
    const size_t suffix_len = sizeof(suffix) - 1; // exclude the trailing NUL

    // strlen() yields size_t; keep it unsigned instead of narrowing to int
    const size_t len = strlen(name);
    if (len < suffix_len)
        return false;

    return memcmp(name + len - suffix_len, suffix, suffix_len) == 0;
 }

 static inline bool string_ends_with_fp16s(const char* name)
 {
    int len = strlen(name);
@@ -1025,15 +1034,15 @@ int VulkanDevice::create_shader_module()
    {
        const char* shader_name = layer_shader_registry[i].name;

        if (!info.support_fp16_storage)
        if (!info.support_fp16_packed)
        {
            if (string_ends_with_fp16s(shader_name))
                continue;

            if (strcmp(shader_name, "cast_fp16_to_fp32") == 0 || strcmp(shader_name, "cast_fp16_to_fp32_pack4") == 0)
            if (string_ends_with_fp16p(shader_name))
                continue;
        }

            if (strcmp(shader_name, "cast_fp32_to_fp16") == 0 || strcmp(shader_name, "cast_fp32_to_fp16_pack4") == 0)
        if (!info.support_fp16_storage)
        {
            if (string_ends_with_fp16s(shader_name))
                continue;
        }

--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -315,7 +315,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

            weight_data_pack4.create(16*maxk, num_input/4, num_output/4);
            weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16);

            for (int q=0; q+3<num_output; q+=4)
            {
@@ -378,7 +378,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4 = weight_data_pack4.reshape(16*maxk * (num_input/4) * (num_output/4));
        cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);

        if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && num_input >= 16 && num_output >= 16)
@@ -434,7 +433,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
            // dst = 4a-4b-16-inch/4a-outch/4b
            Mat weight_data_pack4_tm;
            {
                weight_data_pack4_tm.create(16*16, num_input/4, num_output/4);
                weight_data_pack4_tm.create(16, num_input/4, num_output/4, (size_t)4*16, 16);

                for (int q=0; q+3<num_output; q+=4)
                {
@@ -497,7 +496,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
                }
            }

            weight_data_pack4_tm = weight_data_pack4_tm.reshape(16*16 * (num_input/4) * (num_output/4));
            cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm);
        }
    }
@@ -511,7 +509,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

            weight_data_pack1to4.create(4*maxk, num_input, num_output/4);
            weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4);

            for (int q=0; q+3<num_output; q+=4)
            {
@@ -544,7 +542,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack1to4 = weight_data_pack1to4.reshape(4*maxk * num_input * (num_output/4));
        cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4);
    }

@@ -557,7 +554,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

            weight_data_pack4to1.create(4*maxk, num_input/4, num_output);
            weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4);

            for (int q=0; q<num_output; q++)
            {
@@ -586,7 +583,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4to1 = weight_data_pack4to1.reshape(4*maxk * (num_input/4) * num_output);
        cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1);
    }

@@ -739,6 +735,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
    int out_packing = num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
    if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16)
    {
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -224,7 +224,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_pack4, 4);

            weight_data_pack4 = weight_data_pack4.reshape(maxk * (group/4));
            cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);
        }

@@ -265,7 +264,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);
            weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16);

            for (int g=0; g<group; g++)
            {
@@ -335,7 +334,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
        cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
    }

@@ -348,7 +346,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);
            weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4);

            for (int g=0; g<group; g++)
            {
@@ -388,7 +386,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
        cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
    }

@@ -401,7 +398,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);
            weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4);

            for (int g=0; g<group; g++)
            {
@@ -437,7 +434,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
        cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
    }

@@ -517,6 +513,12 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
    int out_packing = num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;
--- a/src/layer/vulkan/crop_vulkan.cpp
+++ b/src/layer/vulkan/crop_vulkan.cpp
@@ -98,6 +98,12 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
    int out_packing = _outc % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;
@@ -189,6 +195,12 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM
    int out_packing = _outc % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    VkMat& top_blob = top_blobs[0];

    top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -158,7 +158,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

            weight_data_pack4.create(16*maxk, num_input/4, num_output/4);
            weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16);

            for (int q=0; q+3<num_output; q+=4)
            {
@@ -221,7 +221,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4 = weight_data_pack4.reshape(16*maxk * (num_input/4) * (num_output/4));
        cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);
    }

@@ -234,7 +233,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

            weight_data_pack1to4.create(4*maxk, num_input, num_output/4);
            weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4);

            for (int q=0; q+3<num_output; q+=4)
            {
@@ -267,7 +266,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack1to4 = weight_data_pack1to4.reshape(4*maxk * num_input * (num_output/4));
        cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4);
    }

@@ -280,7 +278,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

            weight_data_pack4to1.create(4*maxk, num_input/4, num_output);
            weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4);

            for (int q=0; q<num_output; q++)
            {
@@ -309,7 +307,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4to1 = weight_data_pack4to1.reshape(4*maxk * (num_input/4) * num_output);
        cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1);
    }

--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -281,7 +281,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);
            weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16);

            for (int g=0; g<group; g++)
            {
@@ -351,7 +351,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
        cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
    }

@@ -364,7 +363,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);
            weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4);

            for (int g=0; g<group; g++)
            {
@@ -404,7 +403,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
        cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
    }

@@ -417,7 +415,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);
            weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4);

            for (int g=0; g<group; g++)
            {
@@ -453,7 +451,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
        cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
    }

--- a/src/layer/vulkan/flatten_vulkan.cpp
+++ b/src/layer/vulkan/flatten_vulkan.cpp
@@ -24,6 +24,7 @@ Flatten_vulkan::Flatten_vulkan()

    pipeline_flatten = 0;
    pipeline_flatten_pack4 = 0;
    pipeline_flatten_pack1to4 = 0;
 }

 int Flatten_vulkan::create_pipeline(const Option& opt)
@@ -44,6 +45,13 @@ int Flatten_vulkan::create_pipeline(const Option& opt)
        pipeline_flatten_pack4->create("flatten_pack4", specializations, 2, 10);
    }

    // pack1to4
    {
        pipeline_flatten_pack1to4 = new Pipeline(vkdev);
        pipeline_flatten_pack1to4->set_optimal_local_size_xyz();
        pipeline_flatten_pack1to4->create("flatten_pack1to4", specializations, 2, 10);
    }

    return 0;
 }

@@ -55,6 +63,9 @@ int Flatten_vulkan::destroy_pipeline(const Option& opt)
    delete pipeline_flatten_pack4;
    pipeline_flatten_pack4 = 0;

    delete pipeline_flatten_pack1to4;
    pipeline_flatten_pack1to4 = 0;

    return 0;
 }

@@ -79,6 +90,12 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    int out_packing = total % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    if (dims == 2 && packing == 1)
    {
        top_blob = bottom_blob;
@@ -106,25 +123,26 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    constants[3].i = bottom_blob.c;
    constants[4].i = bottom_blob.cstep;
    constants[5].i = top_blob.dims;
    constants[6].i = (packing == 1 && out_packing == 4) ? total : top_blob.w;
    constants[6].i = top_blob.w;
    constants[7].i = top_blob.h;
    constants[8].i = top_blob.c;
    constants[9].i = top_blob.cstep;

    const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten;

    if (packing == 1 && out_packing == 4)
    const Pipeline* pipeline = 0;
    if (packing == 1 && out_packing == 1)
    {
        VkMat dispatcher;
        dispatcher.w = total;
        dispatcher.h = 1;
        dispatcher.c = 1;
        cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
        pipeline = pipeline_flatten;
    }
    else
    else if (packing == 4 /*&& out_packing == 4*/)
    {
        cmd.record_pipeline(pipeline, bindings, constants, top_blob);
        pipeline = pipeline_flatten_pack4;
    }
    else if (packing == 1 && out_packing == 4)
    {
        pipeline = pipeline_flatten_pack1to4;
    }

    cmd.record_pipeline(pipeline, bindings, constants, top_blob);

    return 0;
 }
--- a/src/layer/vulkan/flatten_vulkan.h
+++ b/src/layer/vulkan/flatten_vulkan.h
@@ -32,6 +32,7 @@ public:
 public:
    Pipeline* pipeline_flatten;
    Pipeline* pipeline_flatten_pack4;
    Pipeline* pipeline_flatten_pack1to4;
 };

 } // namespace ncnn
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -141,7 +141,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

            weight_data_pack4.create(16, num_input/4, num_output/4);
            weight_data_pack4.create(num_input/4, num_output/4, (size_t)4*16, 16);

            for (int q=0; q+3<num_output; q+=4)
            {
@@ -150,7 +150,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
                const float* k2 = weight_data_r2.row(q+2);
                const float* k3 = weight_data_r2.row(q+3);

                float* g00 = weight_data_pack4.channel(q/4);
                float* g00 = weight_data_pack4.row(q/4);

                for (int p=0; p+3<num_input; p+=4)
                {
@@ -183,7 +183,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4 = weight_data_pack4.reshape(16 * (num_input/4) * (num_output/4));
        cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);
    }

@@ -196,7 +195,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

            weight_data_pack1to4.create(4, num_input, num_output/4);
            weight_data_pack1to4.create(num_input, num_output/4, (size_t)4*4, 4);

            for (int q=0; q+3<num_output; q+=4)
            {
@@ -205,7 +204,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
                const float* k2 = weight_data_r2.row(q+2);
                const float* k3 = weight_data_r2.row(q+3);

                float* g00 = weight_data_pack1to4.channel(q/4);
                float* g00 = weight_data_pack1to4.row(q/4);

                for (int p=0; p<num_input; p++)
                {
@@ -219,7 +218,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack1to4 = weight_data_pack1to4.reshape(4 * num_input * (num_output/4));
        cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4);
    }

@@ -232,13 +230,13 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
        {
            Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

            weight_data_pack4to1.create(4, num_input/4, num_output);
            weight_data_pack4to1.create(num_input/4, num_output, (size_t)4*4, 4);

            for (int q=0; q<num_output; q++)
            {
                const float* k0 = weight_data_r2.row(q);

                float* g00 = weight_data_pack4to1.channel(q);
                float* g00 = weight_data_pack4to1.row(q);

                for (int p=0; p+3<num_input; p+=4)
                {
@@ -253,7 +251,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
            }
        }

        weight_data_pack4to1 = weight_data_pack4to1.reshape(4 * (num_input/4) * num_output);
        cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1);
    }

@@ -293,6 +290,12 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
    int out_packing = num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    top_blob.create(num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;
--- a/src/layer/vulkan/packing_vulkan.cpp
+++ b/src/layer/vulkan/packing_vulkan.cpp
@@ -96,7 +96,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute

    if (dims == 1)
    {
        if (out_packing == 1)
        if (vkdev->info.support_fp16_storage && out_packing == 1)
        {
            top_blob = bottom_blob;
            top_blob.w = w * packing;
@@ -108,6 +108,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute

        int outw = (w * packing + out_packing - 1) / out_packing;
        size_t out_elemsize = elemsize / packing * out_packing;
        if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
        {
            if (out_packing == 4) out_elemsize = 4*2u;
            if (out_packing == 1) out_elemsize = 4u;
        }

        top_blob.create(outw, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
        if (top_blob.empty())
@@ -118,6 +123,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    {
        int outh = (h * packing + out_packing - 1) / out_packing;
        size_t out_elemsize = elemsize / packing * out_packing;
        if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
        {
            if (out_packing == 4) out_elemsize = 4*2u;
            if (out_packing == 1) out_elemsize = 4u;
        }

        top_blob.create(w, outh, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
        if (top_blob.empty())
@@ -128,6 +138,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    {
        int outc = (channels * packing + out_packing - 1) / out_packing;
        size_t out_elemsize = elemsize / packing * out_packing;
        if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
        {
            if (out_packing == 4) out_elemsize = 4*2u;
            if (out_packing == 1) out_elemsize = 4u;
        }

        top_blob.create(w, h, outc, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
        if (top_blob.empty())
--- a/src/layer/vulkan/permute_vulkan.cpp
+++ b/src/layer/vulkan/permute_vulkan.cpp
@@ -72,6 +72,12 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
    int out_packing = 1;
    size_t out_elemsize = elemsize / packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    if (dims == 2)
    {
        // order_type
--- a/src/layer/vulkan/priorbox_vulkan.cpp
+++ b/src/layer/vulkan/priorbox_vulkan.cpp
@@ -107,6 +107,12 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
 {
    int w = bottom_blobs[0].w;
    int h = bottom_blobs[0].h;
    size_t elemsize = 4u;

    if (vkdev->info.support_fp16_storage)
    {
        elemsize = 2u;
    }

    if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty())
    {
@@ -124,7 +130,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
        int num_prior = num_sizes - 1 + num_ratios;

        VkMat& top_blob = top_blobs[0];
        top_blob.create(4 * w * h * num_prior, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
        top_blob.create(4 * w * h * num_prior, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator);
        if (top_blob.empty())
            return -100;

@@ -172,7 +178,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
        num_prior += num_min_size * num_aspect_ratio;

    VkMat& top_blob = top_blobs[0];
    top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;

--- a/src/layer/vulkan/reorg_vulkan.cpp
+++ b/src/layer/vulkan/reorg_vulkan.cpp
@@ -85,6 +85,12 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
    int out_packing = outc % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
    {
        if (out_packing == 4) out_elemsize = 4*2u;
        if (out_packing == 1) out_elemsize = 4u;
    }

    top_blob.create(outw, outh, outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;
--- a/src/layer/vulkan/reshape_vulkan.cpp
+++ b/src/layer/vulkan/reshape_vulkan.cpp
@@ -105,6 +105,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        out_packing = _w % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / packing * out_packing;

        if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
        {
            if (out_packing == 4) out_elemsize = 4*2u;
            if (out_packing == 1) out_elemsize = 4u;
        }

        if (dims == 1 && bottom_blob.w == _w && packing == out_packing)
        {
            top_blob = bottom_blob;
@@ -131,6 +137,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        out_packing = _h % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / packing * out_packing;

        if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
        {
            if (out_packing == 4) out_elemsize = 4*2u;
            if (out_packing == 1) out_elemsize = 4u;
        }

        if (dims == 2 && bottom_blob.h == _h && packing == out_packing)
        {
            top_blob = bottom_blob;
@@ -162,6 +174,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
        out_packing = _c % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / packing * out_packing;

        if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
        {
            if (out_packing == 4) out_elemsize = 4*2u;
            if (out_packing == 1) out_elemsize = 4u;
        }

        if (dims == 3 && bottom_blob.c == _c && packing == out_packing)
        {
            top_blob = bottom_blob;
--- a/src/layer/vulkan/shader/flatten_pack1to4.comp
+++ b/src/layer/vulkan/shader/flatten_pack1to4.comp
@@ -0,0 +1,76 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_AMD_gpu_shader_half_float: require
 #endif

 layout (local_size_x_id = 233) in; // workgroup size x is taken from specialization constant 233
 layout (local_size_y_id = 234) in; // workgroup size y is taken from specialization constant 234
 layout (local_size_z_id = 235) in; // workgroup size z is taken from specialization constant 235

 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // input, read one scalar element at a time; sfp is not defined in this file
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // output, one 4-component element written per invocation; sfpvec4 is not defined in this file

 layout (push_constant) uniform parameter // push constants; main() in this shader reads only w, h, cstep and outw
 {
    int dims; // not read by main()
    int w; // input row length: divisor/modulus for the row/column split of the flat index
    int h; // w * h is the divisor/modulus for the channel split of the flat index
    int c; // not read by main()
    int cstep; // multiplied by the channel index when forming the input offset

    int outdims; // not read by main()
    int outw; // bound on gx: invocations with gx >= outw return without writing
    int outh; // not read by main()
    int outc; // not read by main()
    int outcstep; // not read by main()
 } p;

 void main() // gathers four consecutive flat-indexed input scalars into one 4-component output element
 {
    int gx = int(gl_GlobalInvocationID.x); // index of the output element this invocation writes
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= 1 || gz >= 1) // only gy == 0, gz == 0 and gx < outw do any work
        return;

    ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); // flat indices 4*gx .. 4*gx+3 of the four inputs to gather

    int size = p.w * p.h; // number of flat indices covered by one channel index

    ivec4 z4 = i4 / size; // channel index of each of the four inputs
    ivec4 y4 = i4 % size / p.w; // row index within that channel
    ivec4 x4 = i4 % size % p.w; // column index within that row

    ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; // input buffer offsets; cstep (not w*h) is the channel stride

 #if NCNN_fp16_packed
    vec2 v0 = vec2(bottom_blob_data[v_offset.r], bottom_blob_data[v_offset.g]); // first two gathered values
    vec2 v1 = vec2(bottom_blob_data[v_offset.b], bottom_blob_data[v_offset.a]); // last two gathered values

    top_blob_data[gx] = uvec2(packHalf2x16(v0), packHalf2x16(v1)); // each pair packed as two halfs in one uint
 #else
    top_blob_data[gx].r = bottom_blob_data[v_offset.r]; // non-packed path: copy component by component
    top_blob_data[gx].g = bottom_blob_data[v_offset.g];
    top_blob_data[gx].b = bottom_blob_data[v_offset.b];
    top_blob_data[gx].a = bottom_blob_data[v_offset.a];
 #endif
 }
--- a/src/layer/vulkan/shader/padding_pack4.comp
+++ b/src/layer/vulkan/shader/padding_pack4.comp
@@ -70,7 +70,12 @@ void main()
        }
        else
        {
 #if NCNN_fp16_packed
            uint v = packHalf2x16(vec2(value));
            top_blob_data[gz * p.outcstep + gy * p.outw + gx] = uvec2(v, v);
 #else
            top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value);
 #endif
        }
    }
    else if (type == 1)
--- a/src/layer/vulkan/shader/priorbox.comp
+++ b/src/layer/vulkan/shader/priorbox.comp
@@ -37,7 +37,11 @@ layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 #if NCNN_fp16_packed
 layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; };
 #else
 layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 #endif
 layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; };
 layout (binding = 2) readonly buffer max_sizes { sfp max_sizes_data[]; };
 layout (binding = 3) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; };
@@ -80,9 +84,7 @@ void main()
    afp min_size = sfp2afp(min_sizes_data[gx]);

 #if NCNN_fp16_packed
    vec2 v0 = vec2(variances_0, variances_1);
    vec2 v1 = vec2(variances_2, variances_3);
    uvec2 variances = uvec2(packHalf2x16(v0), packHalf2x16(v1));
    vec4 variances = vec4(variances_0, variances_1, variances_2, variances_3);
 #elif !NCNN_fp16_storage
    // per component assignment makes qcom-adreno driver unhappy :(
    sfpvec4 variances = sfpvec4(variances_0, variances_1, variances_2, variances_3);
@@ -94,7 +96,11 @@ void main()
    box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;
    box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

 #if NCNN_fp16_packed
    top_blob_data[v_offset] = vec4(box);
 #else
    top_blob_data[v_offset] = afp2sfpvec4(box);
 #endif
 #if NCNN_fp16_packed || !NCNN_fp16_storage
    top_blob_data[var_offset] = variances;
 #else
@@ -117,7 +123,11 @@ void main()
        box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;
        box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

 #if NCNN_fp16_packed
        top_blob_data[v_offset] = vec4(box);
 #else
        top_blob_data[v_offset] = afp2sfpvec4(box);
 #endif
 #if NCNN_fp16_packed || !NCNN_fp16_storage
        top_blob_data[var_offset] = variances;
 #else
@@ -141,7 +151,11 @@ void main()
        box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;
        box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

 #if NCNN_fp16_packed
        top_blob_data[v_offset] = vec4(box);
 #else
        top_blob_data[v_offset] = afp2sfpvec4(box);
 #endif
 #if NCNN_fp16_packed || !NCNN_fp16_storage
        top_blob_data[var_offset] = variances;
 #else
@@ -159,7 +173,11 @@ void main()
            box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm;
            box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

 #if NCNN_fp16_packed
            top_blob_data[v_offset] = vec4(box);
 #else
            top_blob_data[v_offset] = afp2sfpvec4(box);
 #endif
 #if NCNN_fp16_packed || !NCNN_fp16_storage
            top_blob_data[var_offset] = variances;
 #else
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -1407,7 +1407,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

                    // cast to fp16
                    VkMat bottom_blob_unpacked_fp16;
                    if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
                    if (vkdev->info.support_fp16_storage)
                    {
                        cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt);
                    }
@@ -1509,7 +1509,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

                        // cast to fp16
                        VkMat bottom_blob_unpacked_fp16;
                        if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
                        if (vkdev->info.support_fp16_storage)
                        {
                            cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt);
                        }
@@ -1638,7 +1638,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

                    // cast to fp32
                    VkMat bottom_blob_unpacked_fp32;
                    if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
                    if (vkdev->info.support_fp16_storage)
                    {
                        cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp32, cmd, opt);
                    }
@@ -1773,7 +1773,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
                        packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt);

                        // cast to fp32
                        if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
                        if (vkdev->info.support_fp16_storage)
                        {
                            cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blobs_unpacked_fp32[i], cmd, opt);
                        }