diff --git a/src/command.cpp b/src/command.cpp index 7d305cd0b..27643e832 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -224,6 +224,7 @@ void VkCompute::record_download(const VkMat& m) void VkCompute::record_clone(const VkMat& src, const VkMat& dst) { record_prepare_transfer_barrier(src); + record_prepare_transfer_barrier(dst); if (vkdev->info.support_VK_KHR_push_descriptor) return copy_buffer(src.buffer(), src.buffer_offset(), dst.buffer(), dst.buffer_offset(), src.total() * src.elemsize); @@ -249,6 +250,7 @@ void VkCompute::record_copy_region(const VkMat& src, const VkMat& dst, const VkB void VkCompute::record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector& regions) { record_prepare_transfer_barrier(src); + record_prepare_transfer_barrier(dst); if (vkdev->info.support_VK_KHR_push_descriptor) return copy_buffer_regions(src.buffer(), dst.buffer(), regions); diff --git a/src/layer/shader/softmax_div_sum_pack4.comp b/src/layer/shader/softmax_div_sum_pack4.comp index 14aeef06a..24326f3ff 100644 --- a/src/layer/shader/softmax_div_sum_pack4.comp +++ b/src/layer/shader/softmax_div_sum_pack4.comp @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout (binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; layout (push_constant) uniform parameter { @@ -56,54 +56,42 @@ void main() if (p.dims == 1) // axis == 0 { - afp sum = afp(sum_workspace_data[0]); - afpvec4 v = afpvec4(bottom_top_blob_data[gx]); - bottom_top_blob_data[gx] = sfpvec4(v / sum); + bottom_top_blob_data[gx] = sfpvec4(afpvec4(bottom_top_blob_data[gx]) / afpvec4(sum_workspace_data[0])); return; } if (p.dims == 2 && axis == 0) { int gi = gy * p.w + gx; - afp sum = afp(sum_workspace_data[gx]); - afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - bottom_top_blob_data[gi] = sfpvec4(v / sum); + bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gx])); return; } if (p.dims == 2 && axis == 1) { int gi = gy * p.w + gx; - afp sum = afp(sum_workspace_data[gy]); - afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - bottom_top_blob_data[gi] = sfpvec4(v / sum); + bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gy])); return; } if (p.dims == 3 && axis == 0) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = afp(sum_workspace_data[gy * p.w + gx]); - afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - bottom_top_blob_data[gi] = sfpvec4(v / sum); + bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gy * p.w + gx])); return; } if (p.dims == 3 && axis == 1) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = afp(sum_workspace_data[gz * p.w + gx]); - afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - bottom_top_blob_data[gi] = sfpvec4(v / sum); + bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gz * p.w + gx])); return; } if (p.dims == 3 && axis == 2) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = afp(sum_workspace_data[gz * p.h + gy]); - afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - bottom_top_blob_data[gi] = sfpvec4(v / sum); + bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gz * p.h + gy])); return; } } diff --git a/src/layer/shader/softmax_exp_sub_max_pack4.comp b/src/layer/shader/softmax_exp_sub_max_pack4.comp index 9ff016169..07292bffc 100644 --- a/src/layer/shader/softmax_exp_sub_max_pack4.comp +++ b/src/layer/shader/softmax_exp_sub_max_pack4.comp @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout (binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; }; +layout (binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; layout (push_constant) uniform parameter { @@ -56,42 +56,42 @@ void main() if (p.dims == 1) // axis == 0 { - bottom_top_blob_data[gx] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gx]) - afp(max_workspace_data[0]))); + bottom_top_blob_data[gx] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gx]) - afpvec4(max_workspace_data[0]))); return; } if (p.dims == 2 && axis == 0) { int gi = gy * p.w + gx; - bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gx]))); + bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gx]))); return; } if (p.dims == 2 && axis == 1) { int gi = gy * p.w + gx; - bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gy]))); + bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gy]))); return; } if (p.dims == 3 && axis == 0) { int gi = gz * p.cstep + gy * p.w + gx; - bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gy * p.w + gx]))); + bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gy * p.w + gx]))); return; } if (p.dims == 3 && axis == 1) { int gi = gz * p.cstep + gy * p.w + gx; - bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gz * p.w + gx]))); + bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gz * p.w + gx]))); return; } if (p.dims == 3 && axis == 2) { int gi = gz * p.cstep + gy * p.w + gx; - bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gz * p.h + gy]))); + bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gz * p.h + gy]))); return; } } diff --git a/src/layer/shader/softmax_reduce_max_pack4.comp b/src/layer/shader/softmax_reduce_max_pack4.comp index 9dc074ae6..978535b53 100644 --- a/src/layer/shader/softmax_reduce_max_pack4.comp +++ b/src/layer/shader/softmax_reduce_max_pack4.comp @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout (binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; layout (push_constant) uniform parameter { @@ -62,7 +62,7 @@ void main() max_value = max(max_value, afpvec4(bottom_top_blob_data[i])); } afpvec2 max2 = max(max_value.rg, max_value.ba); - max_workspace_data[0] = sfp(max(max2.r, max2.g)); + max_workspace_data[0] = sfpvec4(max(max2.r, max2.g)); return; } @@ -76,7 +76,7 @@ void main() max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); } afpvec2 max2 = max(max_value.rg, max_value.ba); - max_workspace_data[gx] = sfp(max(max2.r, max2.g)); + max_workspace_data[gx] = sfpvec4(max(max2.r, max2.g)); return; } @@ -89,8 +89,7 @@ void main() int v_offset = gx * p.w + i; max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_workspace_data[gx] = sfp(max(max2.r, max2.g)); + max_workspace_data[gx] = sfpvec4(max_value); return; } @@ -104,7 +103,7 @@ void main() max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); } afpvec2 max2 = max(max_value.rg, max_value.ba); - max_workspace_data[gy * p.w + gx] = sfp(max(max2.r, max2.g)); + max_workspace_data[gy * p.w + gx] = sfpvec4(max(max2.r, max2.g)); return; } @@ -117,8 +116,7 @@ void main() int v_offset = gy * p.cstep + i * p.w + gx; max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_workspace_data[gy * p.w + gx] = sfp(max(max2.r, max2.g)); + max_workspace_data[gy * p.w + gx] = sfpvec4(max_value); return; } @@ -131,8 +129,7 @@ void main() int v_offset = gy * p.cstep + gx * p.w + i; max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset])); } - afpvec2 max2 = max(max_value.rg, max_value.ba); - max_workspace_data[gy * p.h + gx] = sfp(max(max2.r, max2.g)); + max_workspace_data[gy * p.h + gx] = sfpvec4(max_value); return; } diff --git a/src/layer/shader/softmax_reduce_sum_pack4.comp b/src/layer/shader/softmax_reduce_sum_pack4.comp index 4b3f0826b..1ba17f3b5 100644 --- a/src/layer/shader/softmax_reduce_sum_pack4.comp +++ b/src/layer/shader/softmax_reduce_sum_pack4.comp @@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; -layout (binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; layout (push_constant) uniform parameter { @@ -62,7 +62,7 @@ void main() sum_value += afpvec4(bottom_top_blob_data[i]); } afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_workspace_data[0] = sfp(sum2.r + sum2.g); + sum_workspace_data[0] = sfpvec4(sum2.r + sum2.g); return; } @@ -76,7 +76,7 @@ void main() sum_value += afpvec4(bottom_top_blob_data[v_offset]); } afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_workspace_data[gx] = sfp(sum2.r + sum2.g); + sum_workspace_data[gx] = sfpvec4(sum2.r + sum2.g); return; } @@ -89,8 +89,7 @@ void main() int v_offset = gx * p.w + i; sum_value += afpvec4(bottom_top_blob_data[v_offset]); } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_workspace_data[gx] = sfp(sum2.r + sum2.g); + sum_workspace_data[gx] = sfpvec4(sum_value); return; } @@ -104,7 +103,7 @@ void main() sum_value += afpvec4(bottom_top_blob_data[v_offset]); } afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_workspace_data[ gy * p.w + gx ] = sfp(sum2.r + sum2.g); + sum_workspace_data[ gy * p.w + gx ] = sfpvec4(sum2.r + sum2.g); return; } @@ -117,8 +116,7 @@ void main() int v_offset = gy * p.cstep + i * p.w + gx; sum_value += afpvec4(bottom_top_blob_data[v_offset]); } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_workspace_data[gy * p.w + gx] = sfp(sum2.r + sum2.g); + sum_workspace_data[gy * p.w + gx] = sfpvec4(sum_value); return; } @@ -131,8 +129,7 @@ void main() int v_offset = gy * p.cstep + gx * p.w + i; sum_value += afpvec4(bottom_top_blob_data[v_offset]); } - afpvec2 sum2 = sum_value.rg + sum_value.ba; - sum_workspace_data[gy * p.h + gx] = sfp(sum2.r + sum2.g); + sum_workspace_data[gy * p.h + gx] = sfpvec4(sum_value); return; } diff --git a/src/layer/softmax.cpp b/src/layer/softmax.cpp index 20325b323..8c1bbf7c2 100644 --- a/src/layer/softmax.cpp +++ b/src/layer/softmax.cpp @@ -412,6 +412,7 @@ int Softmax::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Optio int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; + size_t elemsize = bottom_top_blob.elemsize; int packing = bottom_top_blob.packing; VkMat max_workspace; @@ -419,33 +420,33 @@ int Softmax::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Optio if (dims == 1) // axis == 0 { - max_workspace.create(1, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); - sum_workspace.create(1, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); + max_workspace.create(1, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(1, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); } else if (dims == 2 && axis == 0) { - max_workspace.create(w, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); - sum_workspace.create(w, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); + max_workspace.create(w, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(w, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); } else if (dims == 2 && axis == 1) { - max_workspace.create(h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); - sum_workspace.create(h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); + max_workspace.create(h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); } else if (dims == 3 && axis == 0) { - max_workspace.create(w, h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); - sum_workspace.create(w, h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); + max_workspace.create(w, h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(w, h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); } else if (dims == 3 && axis == 1) { - max_workspace.create(w, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); - sum_workspace.create(w, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); + max_workspace.create(w, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(w, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); } else if (dims == 3 && axis == 2) { - max_workspace.create(h, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); - sum_workspace.create(h, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator); + max_workspace.create(h, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(h, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); } // fprintf(stderr, "Softmax::forward_inplace %p\n", bottom_top_blob.buffer());