Browse Source

More fixes for fp16p; still disabled by default

tags/20190611
nihuini 7 years ago
parent
commit
cd7559c639
19 changed files with 274 additions and 78 deletions
  1. +18
    -13
      src/command.cpp
  2. +17
    -8
      src/gpu.cpp
  3. +10
    -8
      src/layer/vulkan/convolution_vulkan.cpp
  4. +9
    -7
      src/layer/vulkan/convolutiondepthwise_vulkan.cpp
  5. +12
    -0
      src/layer/vulkan/crop_vulkan.cpp
  6. +3
    -6
      src/layer/vulkan/deconvolution_vulkan.cpp
  7. +3
    -6
      src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
  8. +29
    -11
      src/layer/vulkan/flatten_vulkan.cpp
  9. +1
    -0
      src/layer/vulkan/flatten_vulkan.h
  10. +12
    -9
      src/layer/vulkan/innerproduct_vulkan.cpp
  11. +16
    -1
      src/layer/vulkan/packing_vulkan.cpp
  12. +6
    -0
      src/layer/vulkan/permute_vulkan.cpp
  13. +8
    -2
      src/layer/vulkan/priorbox_vulkan.cpp
  14. +6
    -0
      src/layer/vulkan/reorg_vulkan.cpp
  15. +18
    -0
      src/layer/vulkan/reshape_vulkan.cpp
  16. +76
    -0
      src/layer/vulkan/shader/flatten_pack1to4.comp
  17. +5
    -0
      src/layer/vulkan/shader/padding_pack4.comp
  18. +21
    -3
      src/layer/vulkan/shader/priorbox.comp
  19. +4
    -4
      src/net.cpp

+ 18
- 13
src/command.cpp View File

@@ -731,7 +731,7 @@ void VkCompute::dispatch(const uint32_t* group_count_xyz)

void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size)
{
// fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu]\n", buffer, offset);
// fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu] %lu\n", buffer, offset, size);

VkBufferMemoryBarrier bufferBarrier;
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -752,7 +752,7 @@ void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t

void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size)
{
// fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu]\n", buffer, offset);
// fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size);

VkBufferMemoryBarrier bufferBarrier;
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -773,7 +773,7 @@ void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t

void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size)
{
// fprintf(stderr, "cmd compute_compute_barrier %p[+%lu]\n", buffer, offset);
// fprintf(stderr, "cmd compute_compute_barrier %p[+%lu] %lu\n", buffer, offset, size);

VkBufferMemoryBarrier bufferBarrier;
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -794,7 +794,7 @@ void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t s

void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size)
{
// fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu]\n", buffer, offset);
// fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size);

VkBufferMemoryBarrier bufferBarrier;
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -843,30 +843,35 @@ VkTransfer::~VkTransfer()

void VkTransfer::record_upload(const Mat& src, VkMat& dst)
{
if ((vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) && src.elemsize / src.packing == 4)
if (src.elemsize / src.packing == 4)
{
Mat src_fp16;
cast_float32_to_float16(src, src_fp16);
if (vkdev->info.support_fp16_storage || (vkdev->info.support_fp16_packed && src.packing % 4 == 0))
{
Mat src_fp16;
cast_float32_to_float16(src, src_fp16);

record_upload(src_fp16, dst);
record_upload(src_fp16, dst);

return;
return;
}
}

dst.create_like(src, weight_vkallocator, staging_vkallocator);
Mat src_flattened = src.reshape(src.w * src.h * src.c);

dst.create_like(src_flattened, weight_vkallocator, staging_vkallocator);

// set weight blob as readonly
dst.data->state = 4;

if (dst.allocator->mappable)
{
dst.upload(src);
dst.upload(src_flattened);
return;
}

record_type r;
r.size = src.total() * src.elemsize;
r.mat = src;
r.size = src_flattened.total() * src_flattened.elemsize;
r.mat = src_flattened;
r.vkmat = dst;
delayed_records.push_back(r);
}


+ 17
- 8
src/gpu.cpp View File

@@ -712,8 +712,8 @@ int create_gpu_instance()
gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count,
gpu_info.unified_memory_index, gpu_info.device_local_memory_index, gpu_info.host_visible_memory_index);

fprintf(stderr, "[%u %s] fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName,
gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
fprintf(stderr, "[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName,
gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);

gpu_info_index++;
@@ -999,6 +999,15 @@ VkAllocator* VulkanDevice::staging_allocator() const
return staging_buffer_allocator;
}

// Returns true when the NUL-terminated string `name` ends with the literal
// suffix "_fp16p", false otherwise (including when `name` is shorter than
// the suffix). `name` must not be null.
//
// The length is kept as size_t so the result of strlen() is not narrowed
// into an int, and the suffix length is derived from the literal instead of
// being repeated as a magic number.
static inline bool string_ends_with_fp16p(const char* name)
{
    static const char suffix[] = "_fp16p";
    const size_t suffix_len = sizeof(suffix) - 1; // exclude the trailing NUL

    const size_t len = strlen(name);
    if (len < suffix_len)
        return false;

    // compare only the last suffix_len bytes of name
    return memcmp(name + len - suffix_len, suffix, suffix_len) == 0;
}

static inline bool string_ends_with_fp16s(const char* name)
{
int len = strlen(name);
@@ -1025,15 +1034,15 @@ int VulkanDevice::create_shader_module()
{
const char* shader_name = layer_shader_registry[i].name;

if (!info.support_fp16_storage)
if (!info.support_fp16_packed)
{
if (string_ends_with_fp16s(shader_name))
continue;

if (strcmp(shader_name, "cast_fp16_to_fp32") == 0 || strcmp(shader_name, "cast_fp16_to_fp32_pack4") == 0)
if (string_ends_with_fp16p(shader_name))
continue;
}

if (strcmp(shader_name, "cast_fp32_to_fp16") == 0 || strcmp(shader_name, "cast_fp32_to_fp16_pack4") == 0)
if (!info.support_fp16_storage)
{
if (string_ends_with_fp16s(shader_name))
continue;
}



+ 10
- 8
src/layer/vulkan/convolution_vulkan.cpp View File

@@ -315,7 +315,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_pack4.create(16*maxk, num_input/4, num_output/4);
weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16);

for (int q=0; q+3<num_output; q+=4)
{
@@ -378,7 +378,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4 = weight_data_pack4.reshape(16*maxk * (num_input/4) * (num_output/4));
cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);

if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && num_input >= 16 && num_output >= 16)
@@ -434,7 +433,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
// dst = 4a-4b-16-inch/4a-outch/4b
Mat weight_data_pack4_tm;
{
weight_data_pack4_tm.create(16*16, num_input/4, num_output/4);
weight_data_pack4_tm.create(16, num_input/4, num_output/4, (size_t)4*16, 16);

for (int q=0; q+3<num_output; q+=4)
{
@@ -497,7 +496,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4_tm = weight_data_pack4_tm.reshape(16*16 * (num_input/4) * (num_output/4));
cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm);
}
}
@@ -511,7 +509,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_pack1to4.create(4*maxk, num_input, num_output/4);
weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4);

for (int q=0; q+3<num_output; q+=4)
{
@@ -544,7 +542,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack1to4 = weight_data_pack1to4.reshape(4*maxk * num_input * (num_output/4));
cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4);
}

@@ -557,7 +554,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_pack4to1.create(4*maxk, num_input/4, num_output);
weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4);

for (int q=0; q<num_output; q++)
{
@@ -586,7 +583,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4to1 = weight_data_pack4to1.reshape(4*maxk * (num_input/4) * num_output);
cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1);
}

@@ -739,6 +735,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
int out_packing = num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16)
{


+ 9
- 7
src/layer/vulkan/convolutiondepthwise_vulkan.cpp View File

@@ -224,7 +224,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
Mat weight_data_r2 = weight_data.reshape(maxk, group);
convert_packing(weight_data_r2, weight_data_pack4, 4);

weight_data_pack4 = weight_data_pack4.reshape(maxk * (group/4));
cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);
}

@@ -265,7 +264,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);
weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16);

for (int g=0; g<group; g++)
{
@@ -335,7 +334,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
}

@@ -348,7 +346,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);
weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4);

for (int g=0; g<group; g++)
{
@@ -388,7 +386,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
}

@@ -401,7 +398,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);
weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4);

for (int g=0; g<group; g++)
{
@@ -437,7 +434,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
}

@@ -517,6 +513,12 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl
int out_packing = num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;


+ 12
- 0
src/layer/vulkan/crop_vulkan.cpp View File

@@ -98,6 +98,12 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c
int out_packing = _outc % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;
@@ -189,6 +195,12 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM
int out_packing = _outc % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

VkMat& top_blob = top_blobs[0];

top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);


+ 3
- 6
src/layer/vulkan/deconvolution_vulkan.cpp View File

@@ -158,7 +158,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

weight_data_pack4.create(16*maxk, num_input/4, num_output/4);
weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16);

for (int q=0; q+3<num_output; q+=4)
{
@@ -221,7 +221,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4 = weight_data_pack4.reshape(16*maxk * (num_input/4) * (num_output/4));
cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);
}

@@ -234,7 +233,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

weight_data_pack1to4.create(4*maxk, num_input, num_output/4);
weight_data_pack1to4.create(maxk, num_input, num_output/4, (size_t)4*4, 4);

for (int q=0; q+3<num_output; q+=4)
{
@@ -267,7 +266,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack1to4 = weight_data_pack1to4.reshape(4*maxk * num_input * (num_output/4));
cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4);
}

@@ -280,7 +278,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

weight_data_pack4to1.create(4*maxk, num_input/4, num_output);
weight_data_pack4to1.create(maxk, num_input/4, num_output, (size_t)4*4, 4);

for (int q=0; q<num_output; q++)
{
@@ -309,7 +307,6 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4to1 = weight_data_pack4to1.reshape(4*maxk * (num_input/4) * num_output);
cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1);
}



+ 3
- 6
src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp View File

@@ -281,7 +281,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);
weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16);

for (int g=0; g<group; g++)
{
@@ -351,7 +351,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
}

@@ -364,7 +363,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);
weight_data_pack1to4_groups.create(maxk, channels_g, num_output_g/4 * group, (size_t)4*4, 4);

for (int g=0; g<group; g++)
{
@@ -404,7 +403,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
}

@@ -417,7 +415,7 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);
weight_data_pack4to1_groups.create(maxk, channels_g/4, num_output_g * group, (size_t)4*4, 4);

for (int g=0; g<group; g++)
{
@@ -453,7 +451,6 @@ int DeconvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
}



+ 29
- 11
src/layer/vulkan/flatten_vulkan.cpp View File

@@ -24,6 +24,7 @@ Flatten_vulkan::Flatten_vulkan()

pipeline_flatten = 0;
pipeline_flatten_pack4 = 0;
pipeline_flatten_pack1to4 = 0;
}

int Flatten_vulkan::create_pipeline(const Option& opt)
@@ -44,6 +45,13 @@ int Flatten_vulkan::create_pipeline(const Option& opt)
pipeline_flatten_pack4->create("flatten_pack4", specializations, 2, 10);
}

// pack1to4
{
pipeline_flatten_pack1to4 = new Pipeline(vkdev);
pipeline_flatten_pack1to4->set_optimal_local_size_xyz();
pipeline_flatten_pack1to4->create("flatten_pack1to4", specializations, 2, 10);
}

return 0;
}

@@ -55,6 +63,9 @@ int Flatten_vulkan::destroy_pipeline(const Option& opt)
delete pipeline_flatten_pack4;
pipeline_flatten_pack4 = 0;

delete pipeline_flatten_pack1to4;
pipeline_flatten_pack1to4 = 0;

return 0;
}

@@ -79,6 +90,12 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
int out_packing = total % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

if (dims == 2 && packing == 1)
{
top_blob = bottom_blob;
@@ -106,25 +123,26 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob.dims;
constants[6].i = (packing == 1 && out_packing == 4) ? total : top_blob.w;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;

const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten;

if (packing == 1 && out_packing == 4)
const Pipeline* pipeline = 0;
if (packing == 1 && out_packing == 1)
{
VkMat dispatcher;
dispatcher.w = total;
dispatcher.h = 1;
dispatcher.c = 1;
cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
pipeline = pipeline_flatten;
}
else
else if (packing == 4 /*&& out_packing == 4*/)
{
cmd.record_pipeline(pipeline, bindings, constants, top_blob);
pipeline = pipeline_flatten_pack4;
}
else if (packing == 1 && out_packing == 4)
{
pipeline = pipeline_flatten_pack1to4;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);

return 0;
}


+ 1
- 0
src/layer/vulkan/flatten_vulkan.h View File

@@ -32,6 +32,7 @@ public:
public:
Pipeline* pipeline_flatten;
Pipeline* pipeline_flatten_pack4;
Pipeline* pipeline_flatten_pack1to4;
};

} // namespace ncnn


+ 12
- 9
src/layer/vulkan/innerproduct_vulkan.cpp View File

@@ -141,7 +141,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

weight_data_pack4.create(16, num_input/4, num_output/4);
weight_data_pack4.create(num_input/4, num_output/4, (size_t)4*16, 16);

for (int q=0; q+3<num_output; q+=4)
{
@@ -150,7 +150,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
const float* k2 = weight_data_r2.row(q+2);
const float* k3 = weight_data_r2.row(q+3);

float* g00 = weight_data_pack4.channel(q/4);
float* g00 = weight_data_pack4.row(q/4);

for (int p=0; p+3<num_input; p+=4)
{
@@ -183,7 +183,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4 = weight_data_pack4.reshape(16 * (num_input/4) * (num_output/4));
cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4);
}

@@ -196,7 +195,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

weight_data_pack1to4.create(4, num_input, num_output/4);
weight_data_pack1to4.create(num_input, num_output/4, (size_t)4*4, 4);

for (int q=0; q+3<num_output; q+=4)
{
@@ -205,7 +204,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
const float* k2 = weight_data_r2.row(q+2);
const float* k3 = weight_data_r2.row(q+3);

float* g00 = weight_data_pack1to4.channel(q/4);
float* g00 = weight_data_pack1to4.row(q/4);

for (int p=0; p<num_input; p++)
{
@@ -219,7 +218,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack1to4 = weight_data_pack1to4.reshape(4 * num_input * (num_output/4));
cmd.record_upload(weight_data_pack1to4, weight_data_gpu_pack1to4);
}

@@ -232,13 +230,13 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
{
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

weight_data_pack4to1.create(4, num_input/4, num_output);
weight_data_pack4to1.create(num_input/4, num_output, (size_t)4*4, 4);

for (int q=0; q<num_output; q++)
{
const float* k0 = weight_data_r2.row(q);

float* g00 = weight_data_pack4to1.channel(q);
float* g00 = weight_data_pack4to1.row(q);

for (int p=0; p+3<num_input; p+=4)
{
@@ -253,7 +251,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd)
}
}

weight_data_pack4to1 = weight_data_pack4to1.reshape(4 * (num_input/4) * num_output);
cmd.record_upload(weight_data_pack4to1, weight_data_gpu_pack4to1);
}

@@ -293,6 +290,12 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
int out_packing = num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;


+ 16
- 1
src/layer/vulkan/packing_vulkan.cpp View File

@@ -96,7 +96,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute

if (dims == 1)
{
if (out_packing == 1)
if (vkdev->info.support_fp16_storage && out_packing == 1)
{
top_blob = bottom_blob;
top_blob.w = w * packing;
@@ -108,6 +108,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute

int outw = (w * packing + out_packing - 1) / out_packing;
size_t out_elemsize = elemsize / packing * out_packing;
if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(outw, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
@@ -118,6 +123,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
{
int outh = (h * packing + out_packing - 1) / out_packing;
size_t out_elemsize = elemsize / packing * out_packing;
if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(w, outh, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
@@ -128,6 +138,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
{
int outc = (channels * packing + out_packing - 1) / out_packing;
size_t out_elemsize = elemsize / packing * out_packing;
if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(w, h, outc, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())


+ 6
- 0
src/layer/vulkan/permute_vulkan.cpp View File

@@ -72,6 +72,12 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
int out_packing = 1;
size_t out_elemsize = elemsize / packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

if (dims == 2)
{
// order_type


+ 8
- 2
src/layer/vulkan/priorbox_vulkan.cpp View File

@@ -107,6 +107,12 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
{
int w = bottom_blobs[0].w;
int h = bottom_blobs[0].h;
size_t elemsize = 4u;

if (vkdev->info.support_fp16_storage)
{
elemsize = 2u;
}

if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty())
{
@@ -124,7 +130,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
int num_prior = num_sizes - 1 + num_ratios;

VkMat& top_blob = top_blobs[0];
top_blob.create(4 * w * h * num_prior, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(4 * w * h * num_prior, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

@@ -172,7 +178,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector
num_prior += num_min_size * num_aspect_ratio;

VkMat& top_blob = top_blobs[0];
top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;



+ 6
- 0
src/layer/vulkan/reorg_vulkan.cpp View File

@@ -85,6 +85,12 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute&
int out_packing = outc % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

top_blob.create(outw, outh, outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;


+ 18
- 0
src/layer/vulkan/reshape_vulkan.cpp View File

@@ -105,6 +105,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
out_packing = _w % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

if (dims == 1 && bottom_blob.w == _w && packing == out_packing)
{
top_blob = bottom_blob;
@@ -131,6 +137,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
out_packing = _h % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

if (dims == 2 && bottom_blob.h == _h && packing == out_packing)
{
top_blob = bottom_blob;
@@ -162,6 +174,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute
out_packing = _c % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage)
{
if (out_packing == 4) out_elemsize = 4*2u;
if (out_packing == 1) out_elemsize = 4u;
}

if (dims == 3 && bottom_blob.c == _c && packing == out_packing)
{
top_blob = bottom_blob;


+ 76
- 0
src/layer/vulkan/shader/flatten_pack1to4.comp View File

@@ -0,0 +1,76 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require
#endif

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Flatten an unpacked (one scalar per element) bottom blob into a top blob
// that stores four values per element. Each invocation along x writes one
// output element top_blob_data[gx], gathered from four consecutive flat
// indices gx*4 .. gx*4+3 of the input.
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

// only the x dimension carries work; any invocation with gy or gz != 0,
// or with gx past the output width, exits without writing
if (gx >= p.outw || gy >= 1 || gz >= 1)
return;

// flat indices of the four source values for this output element
ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3);

// number of elements in one w*h plane of the input
int size = p.w * p.h;

// decompose each flat index into (z, y, x) coordinates of the input
ivec4 z4 = i4 / size;
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;

// buffer offsets of the four source values; planes are p.cstep apart
// (presumably cstep may exceed w*h due to padding -- confirm against VkMat)
ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4;

#if NCNN_fp16_packed
// packed-fp16 path: gather as two float pairs, then pack each pair into one
// uint with packHalf2x16 so the four values are stored as a uvec2
vec2 v0 = vec2(bottom_blob_data[v_offset.r], bottom_blob_data[v_offset.g]);
vec2 v1 = vec2(bottom_blob_data[v_offset.b], bottom_blob_data[v_offset.a]);

top_blob_data[gx] = uvec2(packHalf2x16(v0), packHalf2x16(v1));
#else
// non-packed path: copy the four source values component by component
top_blob_data[gx].r = bottom_blob_data[v_offset.r];
top_blob_data[gx].g = bottom_blob_data[v_offset.g];
top_blob_data[gx].b = bottom_blob_data[v_offset.b];
top_blob_data[gx].a = bottom_blob_data[v_offset.a];
#endif
}

+ 5
- 0
src/layer/vulkan/shader/padding_pack4.comp View File

@@ -70,7 +70,12 @@ void main()
}
else
{
#if NCNN_fp16_packed
uint v = packHalf2x16(vec2(value));
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = uvec2(v, v);
#else
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value);
#endif
}
}
else if (type == 1)


+ 21
- 3
src/layer/vulkan/shader/priorbox.comp View File

@@ -37,7 +37,11 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_fp16_packed
layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; };
#else
layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#endif
layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; };
layout (binding = 2) readonly buffer max_sizes { sfp max_sizes_data[]; };
layout (binding = 3) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; };
@@ -80,9 +84,7 @@ void main()
afp min_size = sfp2afp(min_sizes_data[gx]);

#if NCNN_fp16_packed
vec2 v0 = vec2(variances_0, variances_1);
vec2 v1 = vec2(variances_2, variances_3);
uvec2 variances = uvec2(packHalf2x16(v0), packHalf2x16(v1));
vec4 variances = vec4(variances_0, variances_1, variances_2, variances_3);
#elif !NCNN_fp16_storage
// per component assignment makes qcom-adreno driver unhappy :(
sfpvec4 variances = sfpvec4(variances_0, variances_1, variances_2, variances_3);
@@ -94,7 +96,11 @@ void main()
box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;
box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

#if NCNN_fp16_packed
top_blob_data[v_offset] = vec4(box);
#else
top_blob_data[v_offset] = afp2sfpvec4(box);
#endif
#if NCNN_fp16_packed || !NCNN_fp16_storage
top_blob_data[var_offset] = variances;
#else
@@ -117,7 +123,11 @@ void main()
box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;
box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

#if NCNN_fp16_packed
top_blob_data[v_offset] = vec4(box);
#else
top_blob_data[v_offset] = afp2sfpvec4(box);
#endif
#if NCNN_fp16_packed || !NCNN_fp16_storage
top_blob_data[var_offset] = variances;
#else
@@ -141,7 +151,11 @@ void main()
box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;
box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

#if NCNN_fp16_packed
top_blob_data[v_offset] = vec4(box);
#else
top_blob_data[v_offset] = afp2sfpvec4(box);
#endif
#if NCNN_fp16_packed || !NCNN_fp16_storage
top_blob_data[var_offset] = variances;
#else
@@ -159,7 +173,11 @@ void main()
box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm;
box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box;

#if NCNN_fp16_packed
top_blob_data[v_offset] = vec4(box);
#else
top_blob_data[v_offset] = afp2sfpvec4(box);
#endif
#if NCNN_fp16_packed || !NCNN_fp16_storage
top_blob_data[var_offset] = variances;
#else


+ 4
- 4
src/net.cpp View File

@@ -1407,7 +1407,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

// cast to fp16
VkMat bottom_blob_unpacked_fp16;
if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
if (vkdev->info.support_fp16_storage)
{
cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt);
}
@@ -1509,7 +1509,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

// cast to fp16
VkMat bottom_blob_unpacked_fp16;
if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
if (vkdev->info.support_fp16_storage)
{
cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt);
}
@@ -1638,7 +1638,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector

// cast to fp32
VkMat bottom_blob_unpacked_fp32;
if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
if (vkdev->info.support_fp16_storage)
{
cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp32, cmd, opt);
}
@@ -1773,7 +1773,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector
packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt);

// cast to fp32
if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage)
if (vkdev->info.support_fp16_storage)
{
cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blobs_unpacked_fp32[i], cmd, opt);
}


Loading…
Cancel
Save