| @@ -731,7 +731,7 @@ void VkCompute::dispatch(const uint32_t* group_count_xyz) | |||
| void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size) | |||
| { | |||
| // fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu]\n", buffer, offset); | |||
| // fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); | |||
| VkBufferMemoryBarrier bufferBarrier; | |||
| bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; | |||
| @@ -752,7 +752,7 @@ void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t | |||
| void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) | |||
| { | |||
| // fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu]\n", buffer, offset); | |||
| // fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); | |||
| VkBufferMemoryBarrier bufferBarrier; | |||
| bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; | |||
| @@ -773,7 +773,7 @@ void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t | |||
| void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size) | |||
| { | |||
| // fprintf(stderr, "cmd compute_compute_barrier %p[+%lu]\n", buffer, offset); | |||
| // fprintf(stderr, "cmd compute_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); | |||
| VkBufferMemoryBarrier bufferBarrier; | |||
| bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; | |||
| @@ -794,7 +794,7 @@ void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t s | |||
| void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) | |||
| { | |||
| // fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu]\n", buffer, offset); | |||
| // fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); | |||
| VkBufferMemoryBarrier bufferBarrier; | |||
| bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; | |||
| @@ -843,30 +843,35 @@ VkTransfer::~VkTransfer() | |||
// Uploads the host-side Mat `src` into the device-side VkMat `dst`.
//
// When src.elemsize / src.packing == 4 (presumably 32-bit float scalars — TODO confirm) and the
// device info reports fp16 support, the data is converted with cast_float32_to_float16() and the
// function calls itself on the converted Mat. Otherwise `dst` is created to mirror the source,
// its state is set to 4 (read-only weight blob), and the data is either uploaded right away when
// the allocator is mappable or appended to delayed_records.
//
// NOTE(review): this listing appears to interleave the pre-change and post-change lines of a patch
// (see the duplicated record_upload()/return pairs and the `src` vs. `src_flattened` variants) —
// confirm which lines are live before reading it as one function body.
| void VkTransfer::record_upload(const Mat& src, VkMat& dst) | |||
| { | |||
| if ((vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) && src.elemsize / src.packing == 4) | |||
| if (src.elemsize / src.packing == 4) | |||
| { | |||
| Mat src_fp16; | |||
| cast_float32_to_float16(src, src_fp16); | |||
// fp16 storage accepts any packing; the fp16-packed path is only taken when packing is a multiple of 4
| if (vkdev->info.support_fp16_storage || (vkdev->info.support_fp16_packed && src.packing % 4 == 0)) | |||
| { | |||
| Mat src_fp16; | |||
| cast_float32_to_float16(src, src_fp16); | |||
// recurse with the converted data; the converted Mat no longer satisfies the 4-byte check above
// (presumably 2 bytes per scalar after the cast — TODO confirm)
| record_upload(src_fp16, dst); | |||
| record_upload(src_fp16, dst); | |||
| return; | |||
| return; | |||
| } | |||
| } | |||
| dst.create_like(src, weight_vkallocator, staging_vkallocator); | |||
// reshape the source to a single extent of w*h*c before mirroring it on the device
| Mat src_flattened = src.reshape(src.w * src.h * src.c); | |||
| dst.create_like(src_flattened, weight_vkallocator, staging_vkallocator); | |||
| // set weight blob as readonly | |||
| dst.data->state = 4; | |||
// mappable allocator: copy directly, nothing needs to be recorded
| if (dst.allocator->mappable) | |||
| { | |||
| dst.upload(src); | |||
| dst.upload(src_flattened); | |||
| return; | |||
| } | |||
// otherwise remember byte size, source Mat and destination VkMat in delayed_records
| record_type r; | |||
| r.size = src.total() * src.elemsize; | |||
| r.mat = src; | |||
| r.size = src_flattened.total() * src_flattened.elemsize; | |||
| r.mat = src_flattened; | |||
| r.vkmat = dst; | |||
| delayed_records.push_back(r); | |||
| } | |||
| @@ -712,8 +712,8 @@ int create_gpu_instance() | |||
| gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count, | |||
| gpu_info.unified_memory_index, gpu_info.device_local_memory_index, gpu_info.host_visible_memory_index); | |||
| fprintf(stderr, "[%u %s] fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, | |||
| gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, | |||
| fprintf(stderr, "[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, | |||
| gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, | |||
| gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); | |||
| gpu_info_index++; | |||
| @@ -999,6 +999,15 @@ VkAllocator* VulkanDevice::staging_allocator() const | |||
| return staging_buffer_allocator; | |||
| } | |||
// Returns true when `name` ends with the "_fp16p" suffix, false otherwise.
//
// name  NUL-terminated string to inspect; a null pointer is treated as "no match".
static inline bool string_ends_with_fp16p(const char* name)
{
    static const char suffix[] = "_fp16p";
    const size_t suffix_len = sizeof(suffix) - 1; // exclude the terminating NUL

    if (!name)
        return false;

    // strlen() returns size_t; keeping that type avoids narrowing into int
    const size_t len = strlen(name);
    if (len < suffix_len)
        return false;

    return memcmp(name + (len - suffix_len), suffix, suffix_len) == 0;
}
| static inline bool string_ends_with_fp16s(const char* name) | |||
| { | |||
| int len = strlen(name); | |||
| @@ -1025,15 +1034,15 @@ int VulkanDevice::create_shader_module() | |||
| { | |||
| const char* shader_name = layer_shader_registry[i].name; | |||
| if (!info.support_fp16_storage) | |||
| if (!info.support_fp16_packed) | |||
| { | |||
| if (string_ends_with_fp16s(shader_name)) | |||
| continue; | |||
| if (strcmp(shader_name, "cast_fp16_to_fp32") == 0 || strcmp(shader_name, "cast_fp16_to_fp32_pack4") == 0) | |||
| if (string_ends_with_fp16p(shader_name)) | |||
| continue; | |||
| } | |||
| if (strcmp(shader_name, "cast_fp32_to_fp16") == 0 || strcmp(shader_name, "cast_fp32_to_fp16_pack4") == 0) | |||
| if (!info.support_fp16_storage) | |||
| { | |||
| if (string_ends_with_fp16s(shader_name)) | |||
| continue; | |||
| } | |||
| @@ -315,7 +315,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); | |||
| weight_data_pack4.create(16*maxk, num_input/4, num_output/4); | |||
| weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -378,7 +378,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4 = weight_data_pack4.reshape(16*maxk * (num_input/4) * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4); | |||
| if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && num_input >= 16 && num_output >= 16) | |||
| @@ -434,7 +433,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| // dst = 4a-4b-16-inch/4a-outch/4b | |||
| Mat weight_data_pack4_tm; | |||
| { | |||
| weight_data_pack4_tm.create(16*16, num_input/4, num_output/4); | |||
| weight_data_pack4_tm.create(16, num_input/4, num_output/4, (size_t)4*16, 16); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -497,7 +496,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4_tm = weight_data_pack4_tm.reshape(16*16 * (num_input/4) * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm); | |||
| } | |||
| } | |||
| @@ -511,7 +509,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); | |||
| weight_data_pack1to4.create(4*maxk, num_input, num_output/4); | |||
| weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -544,7 +542,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack1to4 = weight_data_pack1to4.reshape(4*maxk * num_input * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4); | |||
| } | |||
| @@ -557,7 +554,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); | |||
| weight_data_pack4to1.create(4*maxk, num_input/4, num_output); | |||
| weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4); | |||
| for (int q=0; q<num_output; q++) | |||
| { | |||
| @@ -586,7 +583,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4to1 = weight_data_pack4to1.reshape(4*maxk * (num_input/4) * num_output); | |||
| cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1); | |||
| } | |||
| @@ -739,6 +735,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| int out_packing = num_output % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; | |||
| if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16) | |||
| { | |||
| @@ -224,7 +224,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, group); | |||
| convert_packing(weight_data_r2, weight_data_pack4, 4); | |||
| weight_data_pack4 = weight_data_pack4.reshape(maxk * (group/4)); | |||
| cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4); | |||
| } | |||
| @@ -265,7 +264,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group); | |||
| weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group); | |||
| weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -335,7 +334,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group); | |||
| cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4); | |||
| } | |||
| @@ -348,7 +346,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group); | |||
| weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group); | |||
| weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -388,7 +386,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group); | |||
| cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4); | |||
| } | |||
| @@ -401,7 +398,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group); | |||
| weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group); | |||
| weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -437,7 +434,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group); | |||
| cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1); | |||
| } | |||
| @@ -517,6 +513,12 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl | |||
| int out_packing = num_output % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -98,6 +98,12 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c | |||
| int out_packing = _outc % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -189,6 +195,12 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM | |||
| int out_packing = _outc % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| @@ -158,7 +158,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); | |||
| weight_data_pack4.create(16*maxk, num_input/4, num_output/4); | |||
| weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -221,7 +221,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4 = weight_data_pack4.reshape(16*maxk * (num_input/4) * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4); | |||
| } | |||
| @@ -234,7 +233,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); | |||
| weight_data_pack1to4.create(4*maxk, num_input, num_output/4); | |||
| weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -267,7 +266,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack1to4 = weight_data_pack1to4.reshape(4*maxk * num_input * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4); | |||
| } | |||
| @@ -280,7 +278,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); | |||
| weight_data_pack4to1.create(4*maxk, num_input/4, num_output); | |||
| weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4); | |||
| for (int q=0; q<num_output; q++) | |||
| { | |||
| @@ -309,7 +307,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4to1 = weight_data_pack4to1.reshape(4*maxk * (num_input/4) * num_output); | |||
| cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1); | |||
| } | |||
| @@ -281,7 +281,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group); | |||
| weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group); | |||
| weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -351,7 +351,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group); | |||
| cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4); | |||
| } | |||
| @@ -364,7 +363,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group); | |||
| weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group); | |||
| weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -404,7 +403,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group); | |||
| cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4); | |||
| } | |||
| @@ -417,7 +415,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group); | |||
| weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group); | |||
| weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| @@ -453,7 +451,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group); | |||
| cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1); | |||
| } | |||
| @@ -24,6 +24,7 @@ Flatten_vulkan::Flatten_vulkan() | |||
| pipeline_flatten = 0; | |||
| pipeline_flatten_pack4 = 0; | |||
| pipeline_flatten_pack1to4 = 0; | |||
| } | |||
| int Flatten_vulkan::create_pipeline(const Option& opt) | |||
| @@ -44,6 +45,13 @@ int Flatten_vulkan::create_pipeline(const Option& opt) | |||
| pipeline_flatten_pack4->create("flatten_pack4", specializations, 2, 10); | |||
| } | |||
| // pack1to4 | |||
| { | |||
| pipeline_flatten_pack1to4 = new Pipeline(vkdev); | |||
| pipeline_flatten_pack1to4->set_optimal_local_size_xyz(); | |||
| pipeline_flatten_pack1to4->create("flatten_pack1to4", specializations, 2, 10); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -55,6 +63,9 @@ int Flatten_vulkan::destroy_pipeline(const Option& opt) | |||
| delete pipeline_flatten_pack4; | |||
| pipeline_flatten_pack4 = 0; | |||
| delete pipeline_flatten_pack1to4; | |||
| pipeline_flatten_pack1to4 = 0; | |||
| return 0; | |||
| } | |||
| @@ -79,6 +90,12 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| int out_packing = total % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| if (dims == 2 && packing == 1) | |||
| { | |||
| top_blob = bottom_blob; | |||
| @@ -106,25 +123,26 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| constants[3].i = bottom_blob.c; | |||
| constants[4].i = bottom_blob.cstep; | |||
| constants[5].i = top_blob.dims; | |||
| constants[6].i = (packing == 1 && out_packing == 4) ? total : top_blob.w; | |||
| constants[6].i = top_blob.w; | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = top_blob.cstep; | |||
| const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten; | |||
| if (packing == 1 && out_packing == 4) | |||
| const Pipeline* pipeline = 0; | |||
| if (packing == 1 && out_packing == 1) | |||
| { | |||
| VkMat dispatcher; | |||
| dispatcher.w = total; | |||
| dispatcher.h = 1; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline, bindings, constants, dispatcher); | |||
| pipeline = pipeline_flatten; | |||
| } | |||
| else | |||
| else if (packing == 4 /*&& out_packing == 4*/) | |||
| { | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| pipeline = pipeline_flatten_pack4; | |||
| } | |||
| else if (packing == 1 && out_packing == 4) | |||
| { | |||
| pipeline = pipeline_flatten_pack1to4; | |||
| } | |||
| cmd.record_pipeline(pipeline, bindings, constants, top_blob); | |||
| return 0; | |||
| } | |||
| @@ -32,6 +32,7 @@ public: | |||
| public: | |||
| Pipeline* pipeline_flatten; | |||
| Pipeline* pipeline_flatten_pack4; | |||
| Pipeline* pipeline_flatten_pack1to4; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -141,7 +141,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(num_input, num_output); | |||
| weight_data_pack4.create(16, num_input/4, num_output/4); | |||
| weight_data_pack4.create(num_input/4, num_output/4, (size_t)4*16, 16); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -150,7 +150,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| const float* k2 = weight_data_r2.row(q+2); | |||
| const float* k3 = weight_data_r2.row(q+3); | |||
| float* g00 = weight_data_pack4.channel(q/4); | |||
| float* g00 = weight_data_pack4.row(q/4); | |||
| for (int p=0; p+3<num_input; p+=4) | |||
| { | |||
| @@ -183,7 +183,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4 = weight_data_pack4.reshape(16 * (num_input/4) * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4); | |||
| } | |||
| @@ -196,7 +195,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(num_input, num_output); | |||
| weight_data_pack1to4.create(4, num_input, num_output/4); | |||
| weight_data_pack1to4.create(num_input, num_output/4, (size_t)4*4, 4); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| @@ -205,7 +204,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| const float* k2 = weight_data_r2.row(q+2); | |||
| const float* k3 = weight_data_r2.row(q+3); | |||
| float* g00 = weight_data_pack1to4.channel(q/4); | |||
| float* g00 = weight_data_pack1to4.row(q/4); | |||
| for (int p=0; p<num_input; p++) | |||
| { | |||
| @@ -219,7 +218,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack1to4 = weight_data_pack1to4.reshape(4 * num_input * (num_output/4)); | |||
| cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4); | |||
| } | |||
| @@ -232,13 +230,13 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(num_input, num_output); | |||
| weight_data_pack4to1.create(4, num_input/4, num_output); | |||
| weight_data_pack4to1.create(num_input/4, num_output, (size_t)4*4, 4); | |||
| for (int q=0; q<num_output; q++) | |||
| { | |||
| const float* k0 = weight_data_r2.row(q); | |||
| float* g00 = weight_data_pack4to1.channel(q); | |||
| float* g00 = weight_data_pack4to1.row(q); | |||
| for (int p=0; p+3<num_input; p+=4) | |||
| { | |||
| @@ -253,7 +251,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) | |||
| } | |||
| } | |||
| weight_data_pack4to1 = weight_data_pack4to1.reshape(4 * (num_input/4) * num_output); | |||
| cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1); | |||
| } | |||
| @@ -293,6 +290,12 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo | |||
| int out_packing = num_output % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -96,7 +96,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (dims == 1) | |||
| { | |||
| if (out_packing == 1) | |||
| if (vkdev->info.support_fp16_storage && out_packing == 1) | |||
| { | |||
| top_blob = bottom_blob; | |||
| top_blob.w = w * packing; | |||
| @@ -108,6 +108,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| int outw = (w * packing + out_packing - 1) / out_packing; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| @@ -118,6 +123,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| { | |||
| int outh = (h * packing + out_packing - 1) / out_packing; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(w, outh, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| @@ -128,6 +138,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| { | |||
| int outc = (channels * packing + out_packing - 1) / out_packing; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(w, h, outc, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| @@ -72,6 +72,12 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| int out_packing = 1; | |||
| size_t out_elemsize = elemsize / packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| // order_type | |||
| @@ -107,6 +107,12 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| { | |||
| int w = bottom_blobs[0].w; | |||
| int h = bottom_blobs[0].h; | |||
| size_t elemsize = 4u; | |||
| if (vkdev->info.support_fp16_storage) | |||
| { | |||
| elemsize = 2u; | |||
| } | |||
| if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty()) | |||
| { | |||
| @@ -124,7 +130,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| int num_prior = num_sizes - 1 + num_ratios; | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(4 * w * h * num_prior, 4u, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(4 * w * h * num_prior, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -172,7 +178,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| num_prior += num_min_size * num_aspect_ratio; | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -85,6 +85,12 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& | |||
| int out_packing = outc % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -105,6 +105,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| out_packing = _w % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| if (dims == 1 && bottom_blob.w == _w && packing == out_packing) | |||
| { | |||
| top_blob = bottom_blob; | |||
| @@ -131,6 +137,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| out_packing = _h % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| if (dims == 2 && bottom_blob.h == _h && packing == out_packing) | |||
| { | |||
| top_blob = bottom_blob; | |||
| @@ -162,6 +174,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| out_packing = _c % 4 == 0 ? 4 : 1; | |||
| size_t out_elemsize = elemsize / packing * out_packing; | |||
| if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) | |||
| { | |||
| if (out_packing == 4) out_elemsize = 4*2u; | |||
| if (out_packing == 1) out_elemsize = 4u; | |||
| } | |||
| if (dims == 3 && bottom_blob.c == _c && packing == out_packing) | |||
| { | |||
| top_blob = bottom_blob; | |||
| @@ -0,0 +1,76 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_AMD_gpu_shader_half_float: require | |||
| #endif | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
// Binding 0: read-only input buffer of `sfp` elements.
// Binding 1: write-only output buffer of `sfpvec4` elements.
// NOTE(review): sfp / sfpvec4 are not defined in this file; presumably they are the storage scalar
// and 4-component storage types injected by the shader build — TODO confirm.
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
// Push constants. main() reads w, h and cstep to locate input elements and outw to bound the
// dispatch; the remaining fields are not referenced by main() in this shader.
// NOTE(review): presumably the first five fields describe the input blob and the out* fields the
// output blob, filled in this order by the host — verify against the caller.
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Gathers four consecutive flattened input elements per invocation and writes them as one
// packed output element at index gl_GlobalInvocationID.x.
void main()
{
    ivec3 gid = ivec3(int(gl_GlobalInvocationID.x), int(gl_GlobalInvocationID.y), int(gl_GlobalInvocationID.z));

    // only the x dimension carries work; anything past outw or off the first row/slice exits
    if (gid.x >= p.outw || gid.y >= 1 || gid.z >= 1)
        return;

    // flattened indices of the four elements that make up this output element
    ivec4 flat = gid.x * 4 + ivec4(0, 1, 2, 3);

    // split each flattened index into channel / row / column of the input
    int plane = p.w * p.h;
    ivec4 in_plane = flat % plane;
    ivec4 chan = flat / plane;
    ivec4 row = in_plane / p.w;
    ivec4 col = in_plane % p.w;

    // channels are cstep apart, rows are w apart
    ivec4 src = chan * p.cstep + row * p.w + col;

#if NCNN_fp16_packed
    // pack the four values as two half2 words
    vec2 lo = vec2(bottom_blob_data[src.x], bottom_blob_data[src.y]);
    vec2 hi = vec2(bottom_blob_data[src.z], bottom_blob_data[src.w]);

    top_blob_data[gid.x] = uvec2(packHalf2x16(lo), packHalf2x16(hi));
#else
    // component-wise stores, one input element per lane
    top_blob_data[gid.x].x = bottom_blob_data[src.x];
    top_blob_data[gid.x].y = bottom_blob_data[src.y];
    top_blob_data[gid.x].z = bottom_blob_data[src.z];
    top_blob_data[gid.x].w = bottom_blob_data[src.w];
#endif
}
| @@ -70,7 +70,12 @@ void main() | |||
| } | |||
| else | |||
| { | |||
| #if NCNN_fp16_packed | |||
| uint v = packHalf2x16(vec2(value)); | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = uvec2(v, v); | |||
| #else | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value); | |||
| #endif | |||
| } | |||
| } | |||
| else if (type == 1) | |||
| @@ -37,7 +37,11 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_fp16_packed | |||
| layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; | |||
| #else | |||
| layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| #endif | |||
| layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; }; | |||
| layout (binding = 2) readonly buffer max_sizes { sfp max_sizes_data[]; }; | |||
| layout (binding = 3) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; }; | |||
| @@ -80,9 +84,7 @@ void main() | |||
| afp min_size = sfp2afp(min_sizes_data[gx]); | |||
| #if NCNN_fp16_packed | |||
| vec2 v0 = vec2(variances_0, variances_1); | |||
| vec2 v1 = vec2(variances_2, variances_3); | |||
| uvec2 variances = uvec2(packHalf2x16(v0), packHalf2x16(v1)); | |||
| vec4 variances = vec4(variances_0, variances_1, variances_2, variances_3); | |||
| #elif !NCNN_fp16_storage | |||
| // per component assignment makes qcom-adreno driver unhappy :( | |||
| sfpvec4 variances = sfpvec4(variances_0, variances_1, variances_2, variances_3); | |||
| @@ -94,7 +96,11 @@ void main() | |||
| box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | |||
| box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; | |||
| #if NCNN_fp16_packed | |||
| top_blob_data[v_offset] = vec4(box); | |||
| #else | |||
| top_blob_data[v_offset] = afp2sfpvec4(box); | |||
| #endif | |||
| #if NCNN_fp16_packed || !NCNN_fp16_storage | |||
| top_blob_data[var_offset] = variances; | |||
| #else | |||
| @@ -117,7 +123,11 @@ void main() | |||
| box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | |||
| box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; | |||
| #if NCNN_fp16_packed | |||
| top_blob_data[v_offset] = vec4(box); | |||
| #else | |||
| top_blob_data[v_offset] = afp2sfpvec4(box); | |||
| #endif | |||
| #if NCNN_fp16_packed || !NCNN_fp16_storage | |||
| top_blob_data[var_offset] = variances; | |||
| #else | |||
| @@ -141,7 +151,11 @@ void main() | |||
| box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | |||
| box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; | |||
| #if NCNN_fp16_packed | |||
| top_blob_data[v_offset] = vec4(box); | |||
| #else | |||
| top_blob_data[v_offset] = afp2sfpvec4(box); | |||
| #endif | |||
| #if NCNN_fp16_packed || !NCNN_fp16_storage | |||
| top_blob_data[var_offset] = variances; | |||
| #else | |||
| @@ -159,7 +173,11 @@ void main() | |||
| box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; | |||
| box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; | |||
| #if NCNN_fp16_packed | |||
| top_blob_data[v_offset] = vec4(box); | |||
| #else | |||
| top_blob_data[v_offset] = afp2sfpvec4(box); | |||
| #endif | |||
| #if NCNN_fp16_packed || !NCNN_fp16_storage | |||
| top_blob_data[var_offset] = variances; | |||
| #else | |||
| @@ -1407,7 +1407,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| // cast to fp16 | |||
| VkMat bottom_blob_unpacked_fp16; | |||
| if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) | |||
| if (vkdev->info.support_fp16_storage) | |||
| { | |||
| cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt); | |||
| } | |||
| @@ -1509,7 +1509,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| // cast to fp16 | |||
| VkMat bottom_blob_unpacked_fp16; | |||
| if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) | |||
| if (vkdev->info.support_fp16_storage) | |||
| { | |||
| cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt); | |||
| } | |||
| @@ -1638,7 +1638,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| // cast to fp32 | |||
| VkMat bottom_blob_unpacked_fp32; | |||
| if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) | |||
| if (vkdev->info.support_fp16_storage) | |||
| { | |||
| cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp32, cmd, opt); | |||
| } | |||
| @@ -1773,7 +1773,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt); | |||
| // cast to fp32 | |||
| if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) | |||
| if (vkdev->info.support_fp16_storage) | |||
| { | |||
| cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blobs_unpacked_fp32[i], cmd, opt); | |||
| } | |||