| @@ -224,6 +224,7 @@ void VkCompute::record_download(const VkMat& m) | |||
| void VkCompute::record_clone(const VkMat& src, const VkMat& dst) | |||
| { | |||
| record_prepare_transfer_barrier(src); | |||
| record_prepare_transfer_barrier(dst); | |||
| if (vkdev->info.support_VK_KHR_push_descriptor) | |||
| return copy_buffer(src.buffer(), src.buffer_offset(), dst.buffer(), dst.buffer_offset(), src.total() * src.elemsize); | |||
| @@ -249,6 +250,7 @@ void VkCompute::record_copy_region(const VkMat& src, const VkMat& dst, const VkB | |||
| void VkCompute::record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector<VkBufferCopy>& regions) | |||
| { | |||
| record_prepare_transfer_barrier(src); | |||
| record_prepare_transfer_barrier(dst); | |||
| if (vkdev->info.support_VK_KHR_push_descriptor) | |||
| return copy_buffer_regions(src.buffer(), dst.buffer(), regions); | |||
| @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; }; | |||
| layout (binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -56,54 +56,42 @@ void main() | |||
| if (p.dims == 1) // axis == 0 | |||
| { | |||
| afp sum = afp(sum_workspace_data[0]); | |||
| afpvec4 v = afpvec4(bottom_top_blob_data[gx]); | |||
| bottom_top_blob_data[gx] = sfpvec4(v / sum); | |||
| bottom_top_blob_data[gx] = sfpvec4(afpvec4(bottom_top_blob_data[gx]) / afpvec4(sum_workspace_data[0])); | |||
| return; | |||
| } | |||
| if (p.dims == 2 && axis == 0) | |||
| { | |||
| int gi = gy * p.w + gx; | |||
| afp sum = afp(sum_workspace_data[gx]); | |||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | |||
| bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gx])); | |||
| return; | |||
| } | |||
| if (p.dims == 2 && axis == 1) | |||
| { | |||
| int gi = gy * p.w + gx; | |||
| afp sum = afp(sum_workspace_data[gy]); | |||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | |||
| bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gy])); | |||
| return; | |||
| } | |||
| if (p.dims == 3 && axis == 0) | |||
| { | |||
| int gi = gz * p.cstep + gy * p.w + gx; | |||
| afp sum = afp(sum_workspace_data[gy * p.w + gx]); | |||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | |||
| bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gy * p.w + gx])); | |||
| return; | |||
| } | |||
| if (p.dims == 3 && axis == 1) | |||
| { | |||
| int gi = gz * p.cstep + gy * p.w + gx; | |||
| afp sum = afp(sum_workspace_data[gz * p.w + gx]); | |||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | |||
| bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gz * p.w + gx])); | |||
| return; | |||
| } | |||
| if (p.dims == 3 && axis == 2) | |||
| { | |||
| int gi = gz * p.cstep + gy * p.w + gx; | |||
| afp sum = afp(sum_workspace_data[gz * p.h + gy]); | |||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | |||
| bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gz * p.h + gy])); | |||
| return; | |||
| } | |||
| } | |||
| @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; }; | |||
| layout (binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -56,42 +56,42 @@ void main() | |||
| if (p.dims == 1) // axis == 0 | |||
| { | |||
| bottom_top_blob_data[gx] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gx]) - afp(max_workspace_data[0]))); | |||
| bottom_top_blob_data[gx] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gx]) - afpvec4(max_workspace_data[0]))); | |||
| return; | |||
| } | |||
| if (p.dims == 2 && axis == 0) | |||
| { | |||
| int gi = gy * p.w + gx; | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gx]))); | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gx]))); | |||
| return; | |||
| } | |||
| if (p.dims == 2 && axis == 1) | |||
| { | |||
| int gi = gy * p.w + gx; | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gy]))); | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gy]))); | |||
| return; | |||
| } | |||
| if (p.dims == 3 && axis == 0) | |||
| { | |||
| int gi = gz * p.cstep + gy * p.w + gx; | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gy * p.w + gx]))); | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gy * p.w + gx]))); | |||
| return; | |||
| } | |||
| if (p.dims == 3 && axis == 1) | |||
| { | |||
| int gi = gz * p.cstep + gy * p.w + gx; | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gz * p.w + gx]))); | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gz * p.w + gx]))); | |||
| return; | |||
| } | |||
| if (p.dims == 3 && axis == 2) | |||
| { | |||
| int gi = gz * p.cstep + gy * p.w + gx; | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gz * p.h + gy]))); | |||
| bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gz * p.h + gy]))); | |||
| return; | |||
| } | |||
| } | |||
| @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; }; | |||
| layout (binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -62,7 +62,7 @@ void main() | |||
| max_value = max(max_value, afpvec4(bottom_top_blob_data[i])); | |||
| } | |||
| afpvec2 max2 = max(max_value.rg, max_value.ba); | |||
| max_workspace_data[0] = sfp(max(max2.r, max2.g)); | |||
| max_workspace_data[0] = sfpvec4(max(max2.r, max2.g)); | |||
| return; | |||
| } | |||
| @@ -76,7 +76,7 @@ void main() | |||
| max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); | |||
| } | |||
| afpvec2 max2 = max(max_value.rg, max_value.ba); | |||
| max_workspace_data[gx] = sfp(max(max2.r, max2.g)); | |||
| max_workspace_data[gx] = sfpvec4(max(max2.r, max2.g)); | |||
| return; | |||
| } | |||
| @@ -89,8 +89,7 @@ void main() | |||
| int v_offset = gx * p.w + i; | |||
| max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); | |||
| } | |||
| afpvec2 max2 = max(max_value.rg, max_value.ba); | |||
| max_workspace_data[gx] = sfp(max(max2.r, max2.g)); | |||
| max_workspace_data[gx] = sfpvec4(max_value); | |||
| return; | |||
| } | |||
| @@ -104,7 +103,7 @@ void main() | |||
| max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); | |||
| } | |||
| afpvec2 max2 = max(max_value.rg, max_value.ba); | |||
| max_workspace_data[gy * p.w + gx] = sfp(max(max2.r, max2.g)); | |||
| max_workspace_data[gy * p.w + gx] = sfpvec4(max(max2.r, max2.g)); | |||
| return; | |||
| } | |||
| @@ -117,8 +116,7 @@ void main() | |||
| int v_offset = gy * p.cstep + i * p.w + gx; | |||
| max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); | |||
| } | |||
| afpvec2 max2 = max(max_value.rg, max_value.ba); | |||
| max_workspace_data[gy * p.w + gx] = sfp(max(max2.r, max2.g)); | |||
| max_workspace_data[gy * p.w + gx] = sfpvec4(max_value); | |||
| return; | |||
| } | |||
| @@ -131,8 +129,7 @@ void main() | |||
| int v_offset = gy * p.cstep + gx * p.w + i; | |||
| max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); | |||
| } | |||
| afpvec2 max2 = max(max_value.rg, max_value.ba); | |||
| max_workspace_data[gy * p.h + gx] = sfp(max(max2.r, max2.g)); | |||
| max_workspace_data[gy * p.h + gx] = sfpvec4(max_value); | |||
| return; | |||
| } | |||
| @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; }; | |||
| layout (binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| @@ -62,7 +62,7 @@ void main() | |||
| sum_value += afpvec4(bottom_top_blob_data[i]); | |||
| } | |||
| afpvec2 sum2 = sum_value.rg + sum_value.ba; | |||
| sum_workspace_data[0] = sfp(sum2.r + sum2.g); | |||
| sum_workspace_data[0] = sfpvec4(sum2.r + sum2.g); | |||
| return; | |||
| } | |||
| @@ -76,7 +76,7 @@ void main() | |||
| sum_value += afpvec4(bottom_top_blob_data[v_offset]); | |||
| } | |||
| afpvec2 sum2 = sum_value.rg + sum_value.ba; | |||
| sum_workspace_data[gx] = sfp(sum2.r + sum2.g); | |||
| sum_workspace_data[gx] = sfpvec4(sum2.r + sum2.g); | |||
| return; | |||
| } | |||
| @@ -89,8 +89,7 @@ void main() | |||
| int v_offset = gx * p.w + i; | |||
| sum_value += afpvec4(bottom_top_blob_data[v_offset]); | |||
| } | |||
| afpvec2 sum2 = sum_value.rg + sum_value.ba; | |||
| sum_workspace_data[gx] = sfp(sum2.r + sum2.g); | |||
| sum_workspace_data[gx] = sfpvec4(sum_value); | |||
| return; | |||
| } | |||
| @@ -104,7 +103,7 @@ void main() | |||
| sum_value += afpvec4(bottom_top_blob_data[v_offset]); | |||
| } | |||
| afpvec2 sum2 = sum_value.rg + sum_value.ba; | |||
| sum_workspace_data[ gy * p.w + gx ] = sfp(sum2.r + sum2.g); | |||
| sum_workspace_data[ gy * p.w + gx ] = sfpvec4(sum2.r + sum2.g); | |||
| return; | |||
| } | |||
| @@ -117,8 +116,7 @@ void main() | |||
| int v_offset = gy * p.cstep + i * p.w + gx; | |||
| sum_value += afpvec4(bottom_top_blob_data[v_offset]); | |||
| } | |||
| afpvec2 sum2 = sum_value.rg + sum_value.ba; | |||
| sum_workspace_data[gy * p.w + gx] = sfp(sum2.r + sum2.g); | |||
| sum_workspace_data[gy * p.w + gx] = sfpvec4(sum_value); | |||
| return; | |||
| } | |||
| @@ -131,8 +129,7 @@ void main() | |||
| int v_offset = gy * p.cstep + gx * p.w + i; | |||
| sum_value += afpvec4(bottom_top_blob_data[v_offset]); | |||
| } | |||
| afpvec2 sum2 = sum_value.rg + sum_value.ba; | |||
| sum_workspace_data[gy * p.h + gx] = sfp(sum2.r + sum2.g); | |||
| sum_workspace_data[gy * p.h + gx] = sfpvec4(sum_value); | |||
| return; | |||
| } | |||
| @@ -412,6 +412,7 @@ int Softmax::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Optio | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| size_t elemsize = bottom_top_blob.elemsize; | |||
| int packing = bottom_top_blob.packing; | |||
| VkMat max_workspace; | |||
| @@ -419,33 +420,33 @@ int Softmax::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Optio | |||
| if (dims == 1) // axis == 0 | |||
| { | |||
| max_workspace.create(1, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(1, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(1, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(1, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| } | |||
| else if (dims == 2 && axis == 0) | |||
| { | |||
| max_workspace.create(w, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(w, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| } | |||
| else if (dims == 2 && axis == 1) | |||
| { | |||
| max_workspace.create(h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| } | |||
| else if (dims == 3 && axis == 0) | |||
| { | |||
| max_workspace.create(w, h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(w, h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| } | |||
| else if (dims == 3 && axis == 1) | |||
| { | |||
| max_workspace.create(w, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(w, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| } | |||
| else if (dims == 3 && axis == 2) | |||
| { | |||
| max_workspace.create(h, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(h, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(h, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(h, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| } | |||
| // fprintf(stderr, "Softmax::forward_inplace %p\n", bottom_top_blob.buffer()); | |||